heapam.c
1 /*-------------------------------------------------------------------------
2  *
3  * heapam.c
4  * heap access method code
5  *
6  * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  *
10  * IDENTIFICATION
11  * src/backend/access/heap/heapam.c
12  *
13  *
14  * INTERFACE ROUTINES
15  * heap_beginscan - begin relation scan
16  * heap_rescan - restart a relation scan
17  * heap_endscan - end relation scan
18  * heap_getnext - retrieve next tuple in scan
19  * heap_fetch - retrieve tuple with given tid
20  * heap_insert - insert tuple into a relation
21  * heap_multi_insert - insert multiple tuples into a relation
22  * heap_delete - delete a tuple from a relation
23  * heap_update - replace a tuple in a relation with another tuple
24  *
25  * NOTES
26  * This file contains the heap_ routines which implement
27  * the POSTGRES heap access method used for all POSTGRES
28  * relations.
29  *
30  *-------------------------------------------------------------------------
31  */
32 #include "postgres.h"
33 
34 #include "access/bufmask.h"
35 #include "access/genam.h"
36 #include "access/heapam.h"
37 #include "access/heapam_xlog.h"
38 #include "access/heaptoast.h"
39 #include "access/hio.h"
40 #include "access/multixact.h"
41 #include "access/parallel.h"
42 #include "access/relscan.h"
43 #include "access/subtrans.h"
44 #include "access/syncscan.h"
45 #include "access/sysattr.h"
46 #include "access/tableam.h"
47 #include "access/transam.h"
48 #include "access/valid.h"
49 #include "access/visibilitymap.h"
50 #include "access/xact.h"
51 #include "access/xlog.h"
52 #include "access/xloginsert.h"
53 #include "access/xlogutils.h"
54 #include "catalog/catalog.h"
55 #include "miscadmin.h"
56 #include "pgstat.h"
57 #include "port/atomics.h"
58 #include "port/pg_bitutils.h"
59 #include "storage/bufmgr.h"
60 #include "storage/freespace.h"
61 #include "storage/lmgr.h"
62 #include "storage/predicate.h"
63 #include "storage/procarray.h"
64 #include "storage/smgr.h"
65 #include "storage/spin.h"
66 #include "storage/standby.h"
67 #include "utils/datum.h"
68 #include "utils/inval.h"
69 #include "utils/lsyscache.h"
70 #include "utils/relcache.h"
71 #include "utils/snapmgr.h"
72 #include "utils/spccache.h"
73 
74 
75 static HeapTuple heap_prepare_insert(Relation relation, HeapTuple tup,
76  TransactionId xid, CommandId cid, int options);
77 static XLogRecPtr log_heap_update(Relation reln, Buffer oldbuf,
78  Buffer newbuf, HeapTuple oldtup,
79  HeapTuple newtup, HeapTuple old_key_tuple,
80  bool all_visible_cleared, bool new_all_visible_cleared);
81 static Bitmapset *HeapDetermineModifiedColumns(Relation relation,
82  Bitmapset *interesting_cols,
83  HeapTuple oldtup, HeapTuple newtup);
84 static bool heap_acquire_tuplock(Relation relation, ItemPointer tid,
85  LockTupleMode mode, LockWaitPolicy wait_policy,
86  bool *have_tuple_lock);
87 static void compute_new_xmax_infomask(TransactionId xmax, uint16 old_infomask,
88  uint16 old_infomask2, TransactionId add_to_xmax,
89  LockTupleMode mode, bool is_update,
90  TransactionId *result_xmax, uint16 *result_infomask,
91  uint16 *result_infomask2);
92 static TM_Result heap_lock_updated_tuple(Relation rel, HeapTuple tuple,
93  ItemPointer ctid, TransactionId xid,
94  LockTupleMode mode);
95 static void GetMultiXactIdHintBits(MultiXactId multi, uint16 *new_infomask,
96  uint16 *new_infomask2);
97 static TransactionId MultiXactIdGetUpdateXid(TransactionId xmax,
98  uint16 t_infomask);
99 static bool DoesMultiXactIdConflict(MultiXactId multi, uint16 infomask,
100  LockTupleMode lockmode, bool *current_is_member);
101 static void MultiXactIdWait(MultiXactId multi, MultiXactStatus status, uint16 infomask,
102  Relation rel, ItemPointer ctid, XLTW_Oper oper,
103  int *remaining);
104 static bool ConditionalMultiXactIdWait(MultiXactId multi, MultiXactStatus status,
105  uint16 infomask, Relation rel, int *remaining);
106 static void index_delete_sort(TM_IndexDeleteOp *delstate);
107 static int bottomup_sort_and_shrink(TM_IndexDeleteOp *delstate);
108 static XLogRecPtr log_heap_new_cid(Relation relation, HeapTuple tup);
109 static HeapTuple ExtractReplicaIdentity(Relation rel, HeapTuple tup, bool key_changed,
110  bool *copy);
111 
112 
113 /*
114  * Each tuple lock mode has a corresponding heavyweight lock, and one or two
115  * corresponding MultiXactStatuses (one to merely lock tuples, another one to
116  * update them). This table (and the macros below) helps us determine the
117  * heavyweight lock mode and MultiXactStatus values to use for any particular
118  * tuple lock strength.
119  *
120  * Don't look at lockstatus/updstatus directly! Use get_mxact_status_for_lock
121  * instead.
122  */
123 static const struct
124 {
125  LOCKMODE hwlock;
126  int lockstatus;
127  int updstatus;
128 }
129 
130  tupleLockExtraInfo[MaxLockTupleMode + 1] =
131 {
132  { /* LockTupleKeyShare */
133  AccessShareLock,
134  MultiXactStatusForKeyShare,
135  -1 /* KeyShare does not allow updating tuples */
136  },
137  { /* LockTupleShare */
138  RowShareLock,
139  MultiXactStatusForShare,
140  -1 /* Share does not allow updating tuples */
141  },
142  { /* LockTupleNoKeyExclusive */
143  ExclusiveLock,
144  MultiXactStatusForNoKeyUpdate,
145  MultiXactStatusNoKeyUpdate
146  },
147  { /* LockTupleExclusive */
148  AccessExclusiveLock,
149  MultiXactStatusForUpdate,
150  MultiXactStatusUpdate
151  }
152 };
153 
154 /* Get the LOCKMODE for a given MultiXactStatus */
155 #define LOCKMODE_from_mxstatus(status) \
156  (tupleLockExtraInfo[TUPLOCK_from_mxstatus((status))].hwlock)
157 
158 /*
159  * Acquire heavyweight locks on tuples, using a LockTupleMode strength value.
160  * This is more readable than having every caller translate it to lock.h's
161  * LOCKMODE.
162  */
163 #define LockTupleTuplock(rel, tup, mode) \
164  LockTuple((rel), (tup), tupleLockExtraInfo[mode].hwlock)
165 #define UnlockTupleTuplock(rel, tup, mode) \
166  UnlockTuple((rel), (tup), tupleLockExtraInfo[mode].hwlock)
167 #define ConditionalLockTupleTuplock(rel, tup, mode) \
168  ConditionalLockTuple((rel), (tup), tupleLockExtraInfo[mode].hwlock)
169 
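/*
 * A minimal usage sketch of the tuple-lock macros above.  The helper below is
 * hypothetical (not part of the heap AM); it assumes "relation" and "tid" are
 * supplied by the caller, and simply shows a LockTupleMode being mapped to
 * its heavyweight lock and released again.
 */
static void
example_lock_one_tuple(Relation relation, ItemPointer tid)
{
	/* map LockTupleExclusive to its heavyweight lock and acquire it */
	LockTupleTuplock(relation, tid, LockTupleExclusive);

	/* ... examine or modify the tuple while the lock is held ... */

	/* release the same heavyweight lock */
	UnlockTupleTuplock(relation, tid, LockTupleExclusive);
}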
170 #ifdef USE_PREFETCH
171 /*
172  * heap_index_delete_tuples and index_delete_prefetch_buffer use this
173  * structure to coordinate prefetching activity
174  */
175 typedef struct
176 {
177  BlockNumber cur_hblkno;
178  int next_item;
179  int ndeltids;
180  TM_IndexDelete *deltids;
181 } IndexDeletePrefetchState;
182 #endif
183 
184 /* heap_index_delete_tuples bottom-up index deletion costing constants */
185 #define BOTTOMUP_MAX_NBLOCKS 6
186 #define BOTTOMUP_TOLERANCE_NBLOCKS 3
187 
188 /*
189  * heap_index_delete_tuples uses this when determining which heap blocks it
190  * must visit to help its bottom-up index deletion caller
191  */
192 typedef struct IndexDeleteCounts
193 {
194  int16 npromisingtids; /* Number of "promising" TIDs in group */
195  int16 ntids; /* Number of TIDs in group */
196  int16 ifirsttid; /* Offset to group's first deltid */
197 } IndexDeleteCounts;
198 
199 /*
200  * This table maps each particular MultiXactStatus value to the
201  * corresponding tuple lock strength.
202  */
203 static const int MultiXactStatusLock[MaxMultiXactStatus + 1] =
204 {
205  LockTupleKeyShare, /* ForKeyShare */
206  LockTupleShare, /* ForShare */
207  LockTupleNoKeyExclusive, /* ForNoKeyUpdate */
208  LockTupleExclusive, /* ForUpdate */
209  LockTupleNoKeyExclusive, /* NoKeyUpdate */
210  LockTupleExclusive /* Update */
211 };
212 
213 /* Get the LockTupleMode for a given MultiXactStatus */
214 #define TUPLOCK_from_mxstatus(status) \
215  (MultiXactStatusLock[(status)])
216 
217 /* ----------------------------------------------------------------
218  * heap support routines
219  * ----------------------------------------------------------------
220  */
221 
222 /* ----------------
223  * initscan - scan code common to heap_beginscan and heap_rescan
224  * ----------------
225  */
226 static void
227 initscan(HeapScanDesc scan, ScanKey key, bool keep_startblock)
228 {
229  ParallelBlockTableScanDesc bpscan = NULL;
230  bool allow_strat;
231  bool allow_sync;
232 
233  /*
234  * Determine the number of blocks we have to scan.
235  *
236  * It is sufficient to do this once at scan start, since any tuples added
237  * while the scan is in progress will be invisible to my snapshot anyway.
238  * (That is not true when using a non-MVCC snapshot. However, we couldn't
239  * guarantee to return tuples added after scan start anyway, since they
240  * might go into pages we already scanned. To guarantee consistent
241  * results for a non-MVCC snapshot, the caller must hold some higher-level
242  * lock that ensures the interesting tuple(s) won't change.)
243  */
244  if (scan->rs_base.rs_parallel != NULL)
245  {
246  bpscan = (ParallelBlockTableScanDesc) scan->rs_base.rs_parallel;
247  scan->rs_nblocks = bpscan->phs_nblocks;
248  }
249  else
250  scan->rs_nblocks = RelationGetNumberOfBlocks(scan->rs_base.rs_rd);
251 
252  /*
253  * If the table is large relative to NBuffers, use a bulk-read access
254  * strategy and enable synchronized scanning (see syncscan.c). Although
255  * the thresholds for these features could be different, we make them the
256  * same so that there are only two behaviors to tune rather than four.
257  * (However, some callers need to be able to disable one or both of these
258  * behaviors, independently of the size of the table; also there is a GUC
259  * variable that can disable synchronized scanning.)
260  *
261  * Note that table_block_parallelscan_initialize has a very similar test;
262  * if you change this, consider changing that one, too.
263  */
264  if (!RelationUsesLocalBuffers(scan->rs_base.rs_rd) &&
265  scan->rs_nblocks > NBuffers / 4)
266  {
267  allow_strat = (scan->rs_base.rs_flags & SO_ALLOW_STRAT) != 0;
268  allow_sync = (scan->rs_base.rs_flags & SO_ALLOW_SYNC) != 0;
269  }
270  else
271  allow_strat = allow_sync = false;
272 
273  if (allow_strat)
274  {
275  /* During a rescan, keep the previous strategy object. */
276  if (scan->rs_strategy == NULL)
277  scan->rs_strategy = GetAccessStrategy(BAS_BULKREAD);
278  }
279  else
280  {
281  if (scan->rs_strategy != NULL)
282  FreeAccessStrategy(scan->rs_strategy);
283  scan->rs_strategy = NULL;
284  }
285 
286  if (scan->rs_base.rs_parallel != NULL)
287  {
288  /* For parallel scan, believe whatever ParallelTableScanDesc says. */
289  if (scan->rs_base.rs_parallel->phs_syncscan)
290  scan->rs_base.rs_flags |= SO_ALLOW_SYNC;
291  else
292  scan->rs_base.rs_flags &= ~SO_ALLOW_SYNC;
293  }
294  else if (keep_startblock)
295  {
296  /*
297  * When rescanning, we want to keep the previous startblock setting,
298  * so that rewinding a cursor doesn't generate surprising results.
299  * Reset the active syncscan setting, though.
300  */
301  if (allow_sync && synchronize_seqscans)
302  scan->rs_base.rs_flags |= SO_ALLOW_SYNC;
303  else
304  scan->rs_base.rs_flags &= ~SO_ALLOW_SYNC;
305  }
306  else if (allow_sync && synchronize_seqscans)
307  {
308  scan->rs_base.rs_flags |= SO_ALLOW_SYNC;
309  scan->rs_startblock = ss_get_location(scan->rs_base.rs_rd, scan->rs_nblocks);
310  }
311  else
312  {
313  scan->rs_base.rs_flags &= ~SO_ALLOW_SYNC;
314  scan->rs_startblock = 0;
315  }
316 
317  scan->rs_numblocks = InvalidBlockNumber;
318  scan->rs_inited = false;
319  scan->rs_ctup.t_data = NULL;
320  ItemPointerSetInvalid(&scan->rs_ctup.t_self);
321  scan->rs_cbuf = InvalidBuffer;
322  scan->rs_cblock = InvalidBlockNumber;
323 
324  /* page-at-a-time fields are always invalid when not rs_inited */
325 
326  /*
327  * copy the scan key, if appropriate
328  */
329  if (key != NULL)
330  memcpy(scan->rs_base.rs_key, key, scan->rs_base.rs_nkeys * sizeof(ScanKeyData));
331 
332  /*
333  * Currently, we only have a stats counter for sequential heap scans (but
334  * e.g. for bitmap scans the underlying bitmap index scans will be counted,
335  * and for sample scans we update stats for tuple fetches).
336  */
337  if (scan->rs_base.rs_flags & SO_TYPE_SEQSCAN)
338  pgstat_count_heap_scan(scan->rs_base.rs_rd);
339 }
340 
341 /*
342  * heap_setscanlimits - restrict range of a heapscan
343  *
344  * startBlk is the page to start at
345  * numBlks is number of pages to scan (InvalidBlockNumber means "all")
346  */
347 void
348 heap_setscanlimits(TableScanDesc sscan, BlockNumber startBlk, BlockNumber numBlks)
349 {
350  HeapScanDesc scan = (HeapScanDesc) sscan;
351 
352  Assert(!scan->rs_inited); /* else too late to change */
353  /* else rs_startblock is significant */
354  Assert(!(scan->rs_base.rs_flags & SO_ALLOW_SYNC));
355 
356  /* Check startBlk is valid (but allow case of zero blocks...) */
357  Assert(startBlk == 0 || startBlk < scan->rs_nblocks);
358 
359  scan->rs_startblock = startBlk;
360  scan->rs_numblocks = numBlks;
361 }
362 
363 /*
364  * heapgetpage - subroutine for heapgettup()
365  *
366  * This routine reads and pins the specified page of the relation.
367  * In page-at-a-time mode it performs additional work, namely determining
368  * which tuples on the page are visible.
369  */
370 void
371 heapgetpage(TableScanDesc sscan, BlockNumber page)
372 {
373  HeapScanDesc scan = (HeapScanDesc) sscan;
374  Buffer buffer;
375  Snapshot snapshot;
376  Page dp;
377  int lines;
378  int ntup;
379  OffsetNumber lineoff;
380  ItemId lpp;
381  bool all_visible;
382 
383  Assert(page < scan->rs_nblocks);
384 
385  /* release previous scan buffer, if any */
386  if (BufferIsValid(scan->rs_cbuf))
387  {
388  ReleaseBuffer(scan->rs_cbuf);
389  scan->rs_cbuf = InvalidBuffer;
390  }
391 
392  /*
393  * Be sure to check for interrupts at least once per page. Checks at
394  * higher code levels won't be able to stop a seqscan that encounters many
395  * pages' worth of consecutive dead tuples.
396  */
397  CHECK_FOR_INTERRUPTS();
398 
399  /* read page using selected strategy */
400  scan->rs_cbuf = ReadBufferExtended(scan->rs_base.rs_rd, MAIN_FORKNUM, page,
401  RBM_NORMAL, scan->rs_strategy);
402  scan->rs_cblock = page;
403 
404  if (!(scan->rs_base.rs_flags & SO_ALLOW_PAGEMODE))
405  return;
406 
407  buffer = scan->rs_cbuf;
408  snapshot = scan->rs_base.rs_snapshot;
409 
410  /*
411  * Prune and repair fragmentation for the whole page, if possible.
412  */
413  heap_page_prune_opt(scan->rs_base.rs_rd, buffer);
414 
415  /*
416  * We must hold share lock on the buffer content while examining tuple
417  * visibility. Afterwards, however, the tuples we have found to be
418  * visible are guaranteed good as long as we hold the buffer pin.
419  */
420  LockBuffer(buffer, BUFFER_LOCK_SHARE);
421 
422  dp = BufferGetPage(buffer);
423  TestForOldSnapshot(snapshot, scan->rs_base.rs_rd, dp);
424  lines = PageGetMaxOffsetNumber(dp);
425  ntup = 0;
426 
427  /*
428  * If the all-visible flag indicates that all tuples on the page are
429  * visible to everyone, we can skip the per-tuple visibility tests.
430  *
431  * Note: In hot standby, a tuple that's already visible to all
432  * transactions on the primary might still be invisible to a read-only
433  * transaction in the standby. We partly handle this problem by tracking
434  * the minimum xmin of visible tuples as the cut-off XID while marking a
435  * page all-visible on the primary and WAL log that along with the
436  * visibility map SET operation. In hot standby, we wait for (or abort)
437  * all transactions that might not see one or more tuples on
438  * the page. That's how index-only scans work fine in hot standby. A
439  * crucial difference between index-only scans and heap scans is that the
440  * index-only scan completely relies on the visibility map whereas the heap
441  * scan looks at the page-level PD_ALL_VISIBLE flag. We are not sure if
442  * the page-level flag can be trusted in the same way, because it might
443  * get propagated somehow without being explicitly WAL-logged, e.g. via a
444  * full page write. Until we can prove that beyond doubt, let's check each
445  * tuple for visibility the hard way.
446  */
447  all_visible = PageIsAllVisible(dp) && !snapshot->takenDuringRecovery;
448 
449  for (lineoff = FirstOffsetNumber, lpp = PageGetItemId(dp, lineoff);
450  lineoff <= lines;
451  lineoff++, lpp++)
452  {
453  if (ItemIdIsNormal(lpp))
454  {
455  HeapTupleData loctup;
456  bool valid;
457 
458  loctup.t_tableOid = RelationGetRelid(scan->rs_base.rs_rd);
459  loctup.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
460  loctup.t_len = ItemIdGetLength(lpp);
461  ItemPointerSet(&(loctup.t_self), page, lineoff);
462 
463  if (all_visible)
464  valid = true;
465  else
466  valid = HeapTupleSatisfiesVisibility(&loctup, snapshot, buffer);
467 
468  HeapCheckForSerializableConflictOut(valid, scan->rs_base.rs_rd,
469  &loctup, buffer, snapshot);
470 
471  if (valid)
472  scan->rs_vistuples[ntup++] = lineoff;
473  }
474  }
475 
476  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
477 
478  Assert(ntup <= MaxHeapTuplesPerPage);
479  scan->rs_ntuples = ntup;
480 }
481 
482 /* ----------------
483  * heapgettup - fetch next heap tuple
484  *
485  * Initialize the scan if not already done; then advance to the next
486  * tuple as indicated by "dir"; return the next tuple in scan->rs_ctup,
487  * or set scan->rs_ctup.t_data = NULL if no more tuples.
488  *
489  * dir == NoMovementScanDirection means "re-fetch the tuple indicated
490  * by scan->rs_ctup".
491  *
492  * Note: the reason nkeys/key are passed separately, even though they are
493  * kept in the scan descriptor, is that the caller may not want us to check
494  * the scankeys.
495  *
496  * Note: when we fall off the end of the scan in either direction, we
497  * reset rs_inited. This means that a further request with the same
498  * scan direction will restart the scan, which is a bit odd, but a
499  * request with the opposite scan direction will start a fresh scan
500  * in the proper direction. The latter is required behavior for cursors,
501  * while the former case is generally undefined behavior in Postgres
502  * so we don't care too much.
503  * ----------------
504  */
505 static void
506 heapgettup(HeapScanDesc scan,
507  ScanDirection dir,
508  int nkeys,
509  ScanKey key)
510 {
511  HeapTuple tuple = &(scan->rs_ctup);
512  Snapshot snapshot = scan->rs_base.rs_snapshot;
513  bool backward = ScanDirectionIsBackward(dir);
514  BlockNumber page;
515  bool finished;
516  Page dp;
517  int lines;
518  OffsetNumber lineoff;
519  int linesleft;
520  ItemId lpp;
521 
522  /*
523  * calculate next starting lineoff, given scan direction
524  */
525  if (ScanDirectionIsForward(dir))
526  {
527  if (!scan->rs_inited)
528  {
529  /*
530  * return null immediately if relation is empty
531  */
532  if (scan->rs_nblocks == 0 || scan->rs_numblocks == 0)
533  {
534  Assert(!BufferIsValid(scan->rs_cbuf));
535  tuple->t_data = NULL;
536  return;
537  }
538  if (scan->rs_base.rs_parallel != NULL)
539  {
540  ParallelBlockTableScanDesc pbscan =
541  (ParallelBlockTableScanDesc) scan->rs_base.rs_parallel;
542  ParallelBlockTableScanWorker pbscanwork =
543  scan->rs_parallelworkerdata;
544 
545  table_block_parallelscan_startblock_init(scan->rs_base.rs_rd,
546  pbscanwork, pbscan);
547 
548  page = table_block_parallelscan_nextpage(scan->rs_base.rs_rd,
549  pbscanwork, pbscan);
550 
551  /* Other processes might have already finished the scan. */
552  if (page == InvalidBlockNumber)
553  {
554  Assert(!BufferIsValid(scan->rs_cbuf));
555  tuple->t_data = NULL;
556  return;
557  }
558  }
559  else
560  page = scan->rs_startblock; /* first page */
561  heapgetpage((TableScanDesc) scan, page);
562  lineoff = FirstOffsetNumber; /* first offnum */
563  scan->rs_inited = true;
564  }
565  else
566  {
567  /* continue from previously returned page/tuple */
568  page = scan->rs_cblock; /* current page */
569  lineoff = /* next offnum */
570  OffsetNumberNext(ItemPointerGetOffsetNumber(&(tuple->t_self)));
571  }
572 
573  LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);
574 
575  dp = BufferGetPage(scan->rs_cbuf);
576  TestForOldSnapshot(snapshot, scan->rs_base.rs_rd, dp);
577  lines = PageGetMaxOffsetNumber(dp);
578  /* page and lineoff now reference the physically next tid */
579 
580  linesleft = lines - lineoff + 1;
581  }
582  else if (backward)
583  {
584  /* backward parallel scan not supported */
585  Assert(scan->rs_base.rs_parallel == NULL);
586 
587  if (!scan->rs_inited)
588  {
589  /*
590  * return null immediately if relation is empty
591  */
592  if (scan->rs_nblocks == 0 || scan->rs_numblocks == 0)
593  {
594  Assert(!BufferIsValid(scan->rs_cbuf));
595  tuple->t_data = NULL;
596  return;
597  }
598 
599  /*
600  * Disable reporting to syncscan logic in a backwards scan; it's
601  * not very likely anyone else is doing the same thing at the same
602  * time, and much more likely that we'll just bollix things for
603  * forward scanners.
604  */
605  scan->rs_base.rs_flags &= ~SO_ALLOW_SYNC;
606 
607  /*
608  * Start from last page of the scan. Ensure we take into account
609  * rs_numblocks if it's been adjusted by heap_setscanlimits().
610  */
611  if (scan->rs_numblocks != InvalidBlockNumber)
612  page = (scan->rs_startblock + scan->rs_numblocks - 1) % scan->rs_nblocks;
613  else if (scan->rs_startblock > 0)
614  page = scan->rs_startblock - 1;
615  else
616  page = scan->rs_nblocks - 1;
617  heapgetpage((TableScanDesc) scan, page);
618  }
619  else
620  {
621  /* continue from previously returned page/tuple */
622  page = scan->rs_cblock; /* current page */
623  }
624 
625  LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);
626 
627  dp = BufferGetPage(scan->rs_cbuf);
628  TestForOldSnapshot(snapshot, scan->rs_base.rs_rd, dp);
629  lines = PageGetMaxOffsetNumber(dp);
630 
631  if (!scan->rs_inited)
632  {
633  lineoff = lines; /* final offnum */
634  scan->rs_inited = true;
635  }
636  else
637  {
638  /*
639  * The previous returned tuple may have been vacuumed since the
640  * previous scan when we use a non-MVCC snapshot, so we must
641  * re-establish the lineoff <= PageGetMaxOffsetNumber(dp)
642  * invariant
643  */
644  lineoff = /* previous offnum */
645  Min(lines,
646  OffsetNumberPrev(ItemPointerGetOffsetNumber(&(tuple->t_self))));
647  }
648  /* page and lineoff now reference the physically previous tid */
649 
650  linesleft = lineoff;
651  }
652  else
653  {
654  /*
655  * ``no movement'' scan direction: refetch prior tuple
656  */
657  if (!scan->rs_inited)
658  {
659  Assert(!BufferIsValid(scan->rs_cbuf));
660  tuple->t_data = NULL;
661  return;
662  }
663 
664  page = ItemPointerGetBlockNumber(&(tuple->t_self));
665  if (page != scan->rs_cblock)
666  heapgetpage((TableScanDesc) scan, page);
667 
668  /* Since the tuple was previously fetched, needn't lock page here */
669  dp = BufferGetPage(scan->rs_cbuf);
670  TestForOldSnapshot(snapshot, scan->rs_base.rs_rd, dp);
671  lineoff = ItemPointerGetOffsetNumber(&(tuple->t_self));
672  lpp = PageGetItemId(dp, lineoff);
673  Assert(ItemIdIsNormal(lpp));
674 
675  tuple->t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
676  tuple->t_len = ItemIdGetLength(lpp);
677 
678  return;
679  }
680 
681  /*
682  * advance the scan until we find a qualifying tuple or run out of stuff
683  * to scan
684  */
685  lpp = PageGetItemId(dp, lineoff);
686  for (;;)
687  {
688  /*
689  * Only continue scanning the page while we have lines left.
690  *
691  * Note that this protects us from accessing line pointers past
692  * PageGetMaxOffsetNumber(); both for forward scans when we resume the
693  * table scan, and for when we start scanning a new page.
694  */
695  while (linesleft > 0)
696  {
697  if (ItemIdIsNormal(lpp))
698  {
699  bool valid;
700 
701  tuple->t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
702  tuple->t_len = ItemIdGetLength(lpp);
703  ItemPointerSet(&(tuple->t_self), page, lineoff);
704 
705  /*
706  * if current tuple qualifies, return it.
707  */
708  valid = HeapTupleSatisfiesVisibility(tuple,
709  snapshot,
710  scan->rs_cbuf);
711 
712  HeapCheckForSerializableConflictOut(valid, scan->rs_base.rs_rd,
713  tuple, scan->rs_cbuf,
714  snapshot);
715 
716  if (valid && key != NULL)
717  HeapKeyTest(tuple, RelationGetDescr(scan->rs_base.rs_rd),
718  nkeys, key, valid);
719 
720  if (valid)
721  {
722  LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK);
723  return;
724  }
725  }
726 
727  /*
728  * otherwise move to the next item on the page
729  */
730  --linesleft;
731  if (backward)
732  {
733  --lpp; /* move back in this page's ItemId array */
734  --lineoff;
735  }
736  else
737  {
738  ++lpp; /* move forward in this page's ItemId array */
739  ++lineoff;
740  }
741  }
742 
743  /*
744  * if we get here, it means we've exhausted the items on this page and
745  * it's time to move to the next.
746  */
747  LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK);
748 
749  /*
750  * advance to next/prior page and detect end of scan
751  */
752  if (backward)
753  {
754  finished = (page == scan->rs_startblock) ||
755  (scan->rs_numblocks != InvalidBlockNumber ? --scan->rs_numblocks == 0 : false);
756  if (page == 0)
757  page = scan->rs_nblocks;
758  page--;
759  }
760  else if (scan->rs_base.rs_parallel != NULL)
761  {
762  ParallelBlockTableScanDesc pbscan =
763  (ParallelBlockTableScanDesc) scan->rs_base.rs_parallel;
764  ParallelBlockTableScanWorker pbscanwork =
765  scan->rs_parallelworkerdata;
766 
767  page = table_block_parallelscan_nextpage(scan->rs_base.rs_rd,
768  pbscanwork, pbscan);
769  finished = (page == InvalidBlockNumber);
770  }
771  else
772  {
773  page++;
774  if (page >= scan->rs_nblocks)
775  page = 0;
776  finished = (page == scan->rs_startblock) ||
777  (scan->rs_numblocks != InvalidBlockNumber ? --scan->rs_numblocks == 0 : false);
778 
779  /*
780  * Report our new scan position for synchronization purposes. We
781  * don't do that when moving backwards, however. That would just
782  * mess up any other forward-moving scanners.
783  *
784  * Note: we do this before checking for end of scan so that the
785  * final state of the position hint is back at the start of the
786  * rel. That's not strictly necessary, but otherwise when you run
787  * the same query multiple times the starting position would shift
788  * a little bit backwards on every invocation, which is confusing.
789  * We don't guarantee any specific ordering in general, though.
790  */
791  if (scan->rs_base.rs_flags & SO_ALLOW_SYNC)
792  ss_report_location(scan->rs_base.rs_rd, page);
793  }
794 
795  /*
796  * return NULL if we've exhausted all the pages
797  */
798  if (finished)
799  {
800  if (BufferIsValid(scan->rs_cbuf))
801  ReleaseBuffer(scan->rs_cbuf);
802  scan->rs_cbuf = InvalidBuffer;
803  scan->rs_cblock = InvalidBlockNumber;
804  tuple->t_data = NULL;
805  scan->rs_inited = false;
806  return;
807  }
808 
809  heapgetpage((TableScanDesc) scan, page);
810 
811  LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);
812 
813  dp = BufferGetPage(scan->rs_cbuf);
814  TestForOldSnapshot(snapshot, scan->rs_base.rs_rd, dp);
815  lines = PageGetMaxOffsetNumber((Page) dp);
816  linesleft = lines;
817  if (backward)
818  {
819  lineoff = lines;
820  lpp = PageGetItemId(dp, lines);
821  }
822  else
823  {
824  lineoff = FirstOffsetNumber;
825  lpp = PageGetItemId(dp, FirstOffsetNumber);
826  }
827  }
828 }
829 
830 /* ----------------
831  * heapgettup_pagemode - fetch next heap tuple in page-at-a-time mode
832  *
833  * Same API as heapgettup, but used in page-at-a-time mode
834  *
835  * The internal logic is much the same as heapgettup's too, but there are some
836  * differences: we do not take the buffer content lock (that only needs to
837  * happen inside heapgetpage), and we iterate through just the tuples listed
838  * in rs_vistuples[] rather than all tuples on the page. Notice that
839  * lineindex is 0-based, where the corresponding loop variable lineoff in
840  * heapgettup is 1-based.
841  * ----------------
842  */
843 static void
844 heapgettup_pagemode(HeapScanDesc scan,
845  ScanDirection dir,
846  int nkeys,
847  ScanKey key)
848 {
849  HeapTuple tuple = &(scan->rs_ctup);
850  bool backward = ScanDirectionIsBackward(dir);
851  BlockNumber page;
852  bool finished;
853  Page dp;
854  int lines;
855  int lineindex;
856  OffsetNumber lineoff;
857  int linesleft;
858  ItemId lpp;
859 
860  /*
861  * calculate next starting lineindex, given scan direction
862  */
863  if (ScanDirectionIsForward(dir))
864  {
865  if (!scan->rs_inited)
866  {
867  /*
868  * return null immediately if relation is empty
869  */
870  if (scan->rs_nblocks == 0 || scan->rs_numblocks == 0)
871  {
872  Assert(!BufferIsValid(scan->rs_cbuf));
873  tuple->t_data = NULL;
874  return;
875  }
876  if (scan->rs_base.rs_parallel != NULL)
877  {
878  ParallelBlockTableScanDesc pbscan =
879  (ParallelBlockTableScanDesc) scan->rs_base.rs_parallel;
880  ParallelBlockTableScanWorker pbscanwork =
881  scan->rs_parallelworkerdata;
882 
883  table_block_parallelscan_startblock_init(scan->rs_base.rs_rd,
884  pbscanwork, pbscan);
885 
886  page = table_block_parallelscan_nextpage(scan->rs_base.rs_rd,
887  pbscanwork, pbscan);
888 
889  /* Other processes might have already finished the scan. */
890  if (page == InvalidBlockNumber)
891  {
892  Assert(!BufferIsValid(scan->rs_cbuf));
893  tuple->t_data = NULL;
894  return;
895  }
896  }
897  else
898  page = scan->rs_startblock; /* first page */
899  heapgetpage((TableScanDesc) scan, page);
900  lineindex = 0;
901  scan->rs_inited = true;
902  }
903  else
904  {
905  /* continue from previously returned page/tuple */
906  page = scan->rs_cblock; /* current page */
907  lineindex = scan->rs_cindex + 1;
908  }
909 
910  dp = BufferGetPage(scan->rs_cbuf);
911  TestForOldSnapshot(scan->rs_base.rs_snapshot, scan->rs_base.rs_rd, dp);
912  lines = scan->rs_ntuples;
913  /* page and lineindex now reference the next visible tid */
914 
915  linesleft = lines - lineindex;
916  }
917  else if (backward)
918  {
919  /* backward parallel scan not supported */
920  Assert(scan->rs_base.rs_parallel == NULL);
921 
922  if (!scan->rs_inited)
923  {
924  /*
925  * return null immediately if relation is empty
926  */
927  if (scan->rs_nblocks == 0 || scan->rs_numblocks == 0)
928  {
929  Assert(!BufferIsValid(scan->rs_cbuf));
930  tuple->t_data = NULL;
931  return;
932  }
933 
934  /*
935  * Disable reporting to syncscan logic in a backwards scan; it's
936  * not very likely anyone else is doing the same thing at the same
937  * time, and much more likely that we'll just bollix things for
938  * forward scanners.
939  */
940  scan->rs_base.rs_flags &= ~SO_ALLOW_SYNC;
941 
942  /*
943  * Start from last page of the scan. Ensure we take into account
944  * rs_numblocks if it's been adjusted by heap_setscanlimits().
945  */
946  if (scan->rs_numblocks != InvalidBlockNumber)
947  page = (scan->rs_startblock + scan->rs_numblocks - 1) % scan->rs_nblocks;
948  else if (scan->rs_startblock > 0)
949  page = scan->rs_startblock - 1;
950  else
951  page = scan->rs_nblocks - 1;
952  heapgetpage((TableScanDesc) scan, page);
953  }
954  else
955  {
956  /* continue from previously returned page/tuple */
957  page = scan->rs_cblock; /* current page */
958  }
959 
960  dp = BufferGetPage(scan->rs_cbuf);
961  TestForOldSnapshot(scan->rs_base.rs_snapshot, scan->rs_base.rs_rd, dp);
962  lines = scan->rs_ntuples;
963 
964  if (!scan->rs_inited)
965  {
966  lineindex = lines - 1;
967  scan->rs_inited = true;
968  }
969  else
970  {
971  lineindex = scan->rs_cindex - 1;
972  }
973  /* page and lineindex now reference the previous visible tid */
974 
975  linesleft = lineindex + 1;
976  }
977  else
978  {
979  /*
980  * ``no movement'' scan direction: refetch prior tuple
981  */
982  if (!scan->rs_inited)
983  {
984  Assert(!BufferIsValid(scan->rs_cbuf));
985  tuple->t_data = NULL;
986  return;
987  }
988 
989  page = ItemPointerGetBlockNumber(&(tuple->t_self));
990  if (page != scan->rs_cblock)
991  heapgetpage((TableScanDesc) scan, page);
992 
993  /* Since the tuple was previously fetched, needn't lock page here */
994  dp = BufferGetPage(scan->rs_cbuf);
995  TestForOldSnapshot(scan->rs_base.rs_snapshot, scan->rs_base.rs_rd, dp);
996  lineoff = ItemPointerGetOffsetNumber(&(tuple->t_self));
997  lpp = PageGetItemId(dp, lineoff);
998  Assert(ItemIdIsNormal(lpp));
999 
1000  tuple->t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
1001  tuple->t_len = ItemIdGetLength(lpp);
1002 
1003  /* check that rs_cindex is in sync */
1004  Assert(scan->rs_cindex < scan->rs_ntuples);
1005  Assert(lineoff == scan->rs_vistuples[scan->rs_cindex]);
1006 
1007  return;
1008  }
1009 
1010  /*
1011  * advance the scan until we find a qualifying tuple or run out of stuff
1012  * to scan
1013  */
1014  for (;;)
1015  {
1016  while (linesleft > 0)
1017  {
1018  lineoff = scan->rs_vistuples[lineindex];
1019  lpp = PageGetItemId(dp, lineoff);
1020  Assert(ItemIdIsNormal(lpp));
1021 
1022  tuple->t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
1023  tuple->t_len = ItemIdGetLength(lpp);
1024  ItemPointerSet(&(tuple->t_self), page, lineoff);
1025 
1026  /*
1027  * if current tuple qualifies, return it.
1028  */
1029  if (key != NULL)
1030  {
1031  bool valid;
1032 
1033  HeapKeyTest(tuple, RelationGetDescr(scan->rs_base.rs_rd),
1034  nkeys, key, valid);
1035  if (valid)
1036  {
1037  scan->rs_cindex = lineindex;
1038  return;
1039  }
1040  }
1041  else
1042  {
1043  scan->rs_cindex = lineindex;
1044  return;
1045  }
1046 
1047  /*
1048  * otherwise move to the next item on the page
1049  */
1050  --linesleft;
1051  if (backward)
1052  --lineindex;
1053  else
1054  ++lineindex;
1055  }
1056 
1057  /*
1058  * if we get here, it means we've exhausted the items on this page and
1059  * it's time to move to the next.
1060  */
1061  if (backward)
1062  {
1063  finished = (page == scan->rs_startblock) ||
1064  (scan->rs_numblocks != InvalidBlockNumber ? --scan->rs_numblocks == 0 : false);
1065  if (page == 0)
1066  page = scan->rs_nblocks;
1067  page--;
1068  }
1069  else if (scan->rs_base.rs_parallel != NULL)
1070  {
1071  ParallelBlockTableScanDesc pbscan =
1072  (ParallelBlockTableScanDesc) scan->rs_base.rs_parallel;
1073  ParallelBlockTableScanWorker pbscanwork =
1074  scan->rs_parallelworkerdata;
1075 
1076  page = table_block_parallelscan_nextpage(scan->rs_base.rs_rd,
1077  pbscanwork, pbscan);
1078  finished = (page == InvalidBlockNumber);
1079  }
1080  else
1081  {
1082  page++;
1083  if (page >= scan->rs_nblocks)
1084  page = 0;
1085  finished = (page == scan->rs_startblock) ||
1086  (scan->rs_numblocks != InvalidBlockNumber ? --scan->rs_numblocks == 0 : false);
1087 
1088  /*
1089  * Report our new scan position for synchronization purposes. We
1090  * don't do that when moving backwards, however. That would just
1091  * mess up any other forward-moving scanners.
1092  *
1093  * Note: we do this before checking for end of scan so that the
1094  * final state of the position hint is back at the start of the
1095  * rel. That's not strictly necessary, but otherwise when you run
1096  * the same query multiple times the starting position would shift
1097  * a little bit backwards on every invocation, which is confusing.
1098  * We don't guarantee any specific ordering in general, though.
1099  */
1100  if (scan->rs_base.rs_flags & SO_ALLOW_SYNC)
1101  ss_report_location(scan->rs_base.rs_rd, page);
1102  }
1103 
1104  /*
1105  * return NULL if we've exhausted all the pages
1106  */
1107  if (finished)
1108  {
1109  if (BufferIsValid(scan->rs_cbuf))
1110  ReleaseBuffer(scan->rs_cbuf);
1111  scan->rs_cbuf = InvalidBuffer;
1112  scan->rs_cblock = InvalidBlockNumber;
1113  tuple->t_data = NULL;
1114  scan->rs_inited = false;
1115  return;
1116  }
1117 
1118  heapgetpage((TableScanDesc) scan, page);
1119 
1120  dp = BufferGetPage(scan->rs_cbuf);
1121  TestForOldSnapshot(scan->rs_base.rs_snapshot, scan->rs_base.rs_rd, dp);
1122  lines = scan->rs_ntuples;
1123  linesleft = lines;
1124  if (backward)
1125  lineindex = lines - 1;
1126  else
1127  lineindex = 0;
1128  }
1129 }
1130 
1131 
1132 #if defined(DISABLE_COMPLEX_MACRO)
1133 /*
1134  * This is formatted so oddly so that the correspondence to the macro
1135  * definition in access/htup_details.h is maintained.
1136  */
1137 Datum
1138 fastgetattr(HeapTuple tup, int attnum, TupleDesc tupleDesc,
1139  bool *isnull)
1140 {
1141  return (
1142  (attnum) > 0 ?
1143  (
1144  (*(isnull) = false),
1145  HeapTupleNoNulls(tup) ?
1146  (
1147  TupleDescAttr((tupleDesc), (attnum) - 1)->attcacheoff >= 0 ?
1148  (
1149  fetchatt(TupleDescAttr((tupleDesc), (attnum) - 1),
1150  (char *) (tup)->t_data + (tup)->t_data->t_hoff +
1151  TupleDescAttr((tupleDesc), (attnum) - 1)->attcacheoff)
1152  )
1153  :
1154  nocachegetattr((tup), (attnum), (tupleDesc))
1155  )
1156  :
1157  (
1158  att_isnull((attnum) - 1, (tup)->t_data->t_bits) ?
1159  (
1160  (*(isnull) = true),
1161  (Datum) NULL
1162  )
1163  :
1164  (
1165  nocachegetattr((tup), (attnum), (tupleDesc))
1166  )
1167  )
1168  )
1169  :
1170  (
1171  (Datum) NULL
1172  )
1173  );
1174 }
1175 #endif /* defined(DISABLE_COMPLEX_MACRO) */
1176 
1177 
1178 /* ----------------------------------------------------------------
1179  * heap access method interface
1180  * ----------------------------------------------------------------
1181  */
1182 
1183 
1184 TableScanDesc
1185 heap_beginscan(Relation relation, Snapshot snapshot,
1186  int nkeys, ScanKey key,
1187  ParallelTableScanDesc parallel_scan,
1188  uint32 flags)
1189 {
1190  HeapScanDesc scan;
1191 
1192  /*
1193  * increment relation ref count while scanning relation
1194  *
1195  * This is just to make really sure the relcache entry won't go away while
1196  * the scan has a pointer to it. Caller should be holding the rel open
1197  * anyway, so this is redundant in all normal scenarios...
1198  */
1199  RelationIncrementReferenceCount(relation);
1200 
1201  /*
1202  * allocate and initialize scan descriptor
1203  */
1204  scan = (HeapScanDesc) palloc(sizeof(HeapScanDescData));
1205 
1206  scan->rs_base.rs_rd = relation;
1207  scan->rs_base.rs_snapshot = snapshot;
1208  scan->rs_base.rs_nkeys = nkeys;
1209  scan->rs_base.rs_flags = flags;
1210  scan->rs_base.rs_parallel = parallel_scan;
1211  scan->rs_strategy = NULL; /* set in initscan */
1212 
1213  /*
1214  * Disable page-at-a-time mode if it's not a MVCC-safe snapshot.
1215  */
1216  if (!(snapshot && IsMVCCSnapshot(snapshot)))
1217  scan->rs_base.rs_flags &= ~SO_ALLOW_PAGEMODE;
1218 
1219  /*
1220  * For seqscan and sample scans in a serializable transaction, acquire a
1221  * predicate lock on the entire relation. This is required not only to
1222  * lock all the matching tuples, but also to conflict with new insertions
1223  * into the table. In an indexscan, we take page locks on the index pages
1224  * covering the range specified in the scan qual, but in a heap scan there
1225  * is nothing more fine-grained to lock. A bitmap scan is a different
1226  * story, there we have already scanned the index and locked the index
1227  * pages covering the predicate. But in that case we still have to lock
1228  * any matching heap tuples. For sample scan we could optimize the locking
1229  * to be at least page-level granularity, but we'd need to add per-tuple
1230  * locking for that.
1231  */
1232  if (scan->rs_base.rs_flags & (SO_TYPE_SEQSCAN | SO_TYPE_SAMPLESCAN))
1233  {
1234  /*
1235  * Ensure a missing snapshot is noticed reliably, even if the
1236  * isolation mode means predicate locking isn't performed (and
1237  * therefore the snapshot isn't used here).
1238  */
1239  Assert(snapshot);
1240  PredicateLockRelation(relation, snapshot);
1241  }
1242 
1243  /* we only need to set this up once */
1244  scan->rs_ctup.t_tableOid = RelationGetRelid(relation);
1245 
1246  /*
1247  * Allocate memory to keep track of page allocation for parallel workers
1248  * when doing a parallel scan.
1249  */
1250  if (parallel_scan != NULL)
1251  scan->rs_parallelworkerdata = palloc(sizeof(ParallelBlockTableScanWorkerData));
1252  else
1253  scan->rs_parallelworkerdata = NULL;
1254 
1255  /*
1256  * we do this here instead of in initscan() because heap_rescan also calls
1257  * initscan() and we don't want to allocate memory again
1258  */
1259  if (nkeys > 0)
1260  scan->rs_base.rs_key = (ScanKey) palloc(sizeof(ScanKeyData) * nkeys);
1261  else
1262  scan->rs_base.rs_key = NULL;
1263 
1264  initscan(scan, key, false);
1265 
1266  return (TableScanDesc) scan;
1267 }
1268 
1269 void
1270 heap_rescan(TableScanDesc sscan, ScanKey key, bool set_params,
1271  bool allow_strat, bool allow_sync, bool allow_pagemode)
1272 {
1273  HeapScanDesc scan = (HeapScanDesc) sscan;
1274 
1275  if (set_params)
1276  {
1277  if (allow_strat)
1278  scan->rs_base.rs_flags |= SO_ALLOW_STRAT;
1279  else
1280  scan->rs_base.rs_flags &= ~SO_ALLOW_STRAT;
1281 
1282  if (allow_sync)
1283  scan->rs_base.rs_flags |= SO_ALLOW_SYNC;
1284  else
1285  scan->rs_base.rs_flags &= ~SO_ALLOW_SYNC;
1286 
1287  if (allow_pagemode && scan->rs_base.rs_snapshot &&
1288  IsMVCCSnapshot(scan->rs_base.rs_snapshot))
1289  scan->rs_base.rs_flags |= SO_ALLOW_PAGEMODE;
1290  else
1291  scan->rs_base.rs_flags &= ~SO_ALLOW_PAGEMODE;
1292  }
1293 
1294  /*
1295  * unpin scan buffers
1296  */
1297  if (BufferIsValid(scan->rs_cbuf))
1298  ReleaseBuffer(scan->rs_cbuf);
1299 
1300  /*
1301  * reinitialize scan descriptor
1302  */
1303  initscan(scan, key, true);
1304 }
1305 
1306 void
1307 heap_endscan(TableScanDesc sscan)
1308 {
1309  HeapScanDesc scan = (HeapScanDesc) sscan;
1310 
1311  /* Note: no locking manipulations needed */
1312 
1313  /*
1314  * unpin scan buffers
1315  */
1316  if (BufferIsValid(scan->rs_cbuf))
1317  ReleaseBuffer(scan->rs_cbuf);
1318 
1319  /*
1320  * decrement relation reference count and free scan descriptor storage
1321  */
1322  RelationDecrementReferenceCount(scan->rs_base.rs_rd);
1323 
1324  if (scan->rs_base.rs_key)
1325  pfree(scan->rs_base.rs_key);
1326 
1327  if (scan->rs_strategy != NULL)
1328  FreeAccessStrategy(scan->rs_strategy);
1329 
1330  if (scan->rs_parallelworkerdata != NULL)
1331  pfree(scan->rs_parallelworkerdata);
1332 
1333  if (scan->rs_base.rs_flags & SO_TEMP_SNAPSHOT)
1334  UnregisterSnapshot(scan->rs_base.rs_snapshot);
1335 
1336  pfree(scan);
1337 }
1338 
1339 HeapTuple
1340 heap_getnext(TableScanDesc sscan, ScanDirection direction)
1341 {
1342  HeapScanDesc scan = (HeapScanDesc) sscan;
1343 
1344  /*
1345  * This is still widely used directly, without going through table AM, so
1346  * add a safety check. It's possible we should, at a later point,
1347  * downgrade this to an assert. The reason for checking the AM routine,
1348  * rather than the AM oid, is that this allows to write regression tests
1349  * rather than the AM oid, is that this allows writing regression tests
1350  */
1351  if (unlikely(sscan->rs_rd->rd_tableam != GetHeapamTableAmRoutine()))
1352  ereport(ERROR,
1353  (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1354  errmsg_internal("only heap AM is supported")));
1355 
1356  /*
1357  * We don't expect direct calls to heap_getnext with valid CheckXidAlive
1358  * for catalog or regular tables. See detailed comments in xact.c where
1359  * these variables are declared. Normally we have such a check at tableam
1360  * level API but this is called from many places so we need to ensure it
1361  * here.
1362  */
1363  if (unlikely(TransactionIdIsValid(CheckXidAlive) && !bsysscan))
1364  elog(ERROR, "unexpected heap_getnext call during logical decoding");
1365 
1366  /* Note: no locking manipulations needed */
1367 
1368  if (scan->rs_base.rs_flags & SO_ALLOW_PAGEMODE)
1369  heapgettup_pagemode(scan, direction,
1370  scan->rs_base.rs_nkeys, scan->rs_base.rs_key);
1371  else
1372  heapgettup(scan, direction,
1373  scan->rs_base.rs_nkeys, scan->rs_base.rs_key);
1374 
1375  if (scan->rs_ctup.t_data == NULL)
1376  return NULL;
1377 
1378  /*
1379  * if we get here it means we have a new current scan tuple, so point to
1380  * the proper return buffer and return the tuple.
1381  */
1382 
1383  pgstat_count_heap_getnext(scan->rs_base.rs_rd);
1384 
1385  return &scan->rs_ctup;
1386 }
1387 
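/*
 * A minimal sketch (hypothetical helper, not part of the heap AM) of how a
 * caller drives the scan interface above: begin a scan, walk the visible
 * tuples with heap_getnext, then end the scan.  It assumes the caller holds
 * a suitable lock on "relation" and supplies a registered MVCC "snapshot";
 * the flag set mirrors what a plain sequential scan would request.
 */
static void
example_seqscan(Relation relation, Snapshot snapshot)
{
	TableScanDesc scan;
	HeapTuple	tuple;

	scan = heap_beginscan(relation, snapshot, 0, NULL, NULL,
						  SO_TYPE_SEQSCAN | SO_ALLOW_STRAT |
						  SO_ALLOW_SYNC | SO_ALLOW_PAGEMODE);

	while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
	{
		/* tuple points into the current scan buffer; process it here */
	}

	heap_endscan(scan);
}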
1388 bool
1389 heap_getnextslot(TableScanDesc sscan, ScanDirection direction, TupleTableSlot *slot)
1390 {
1391  HeapScanDesc scan = (HeapScanDesc) sscan;
1392 
1393  /* Note: no locking manipulations needed */
1394 
1395  if (sscan->rs_flags & SO_ALLOW_PAGEMODE)
1396  heapgettup_pagemode(scan, direction, sscan->rs_nkeys, sscan->rs_key);
1397  else
1398  heapgettup(scan, direction, sscan->rs_nkeys, sscan->rs_key);
1399 
1400  if (scan->rs_ctup.t_data == NULL)
1401  {
1402  ExecClearTuple(slot);
1403  return false;
1404  }
1405 
1406  /*
1407  * if we get here it means we have a new current scan tuple, so point to
1408  * the proper return buffer and return the tuple.
1409  */
1410 
1411  pgstat_count_heap_getnext(scan->rs_base.rs_rd);
1412 
1413  ExecStoreBufferHeapTuple(&scan->rs_ctup, slot,
1414  scan->rs_cbuf);
1415  return true;
1416 }
1417 
1418 void
1419 heap_set_tidrange(TableScanDesc sscan, ItemPointer mintid,
1420  ItemPointer maxtid)
1421 {
1422  HeapScanDesc scan = (HeapScanDesc) sscan;
1423  BlockNumber startBlk;
1424  BlockNumber numBlks;
1425  ItemPointerData highestItem;
1426  ItemPointerData lowestItem;
1427 
1428  /*
1429  * For relations without any pages, we can simply leave the TID range
1430  * unset. There will be no tuples to scan, therefore no tuples outside
1431  * the given TID range.
1432  */
1433  if (scan->rs_nblocks == 0)
1434  return;
1435 
1436  /*
1437  * Set up some ItemPointers which point to the first and last possible
1438  * tuples in the heap.
1439  */
1440  ItemPointerSet(&highestItem, scan->rs_nblocks - 1, MaxOffsetNumber);
1441  ItemPointerSet(&lowestItem, 0, FirstOffsetNumber);
1442 
1443  /*
1444  * If the given maximum TID is below the highest possible TID in the
1445  * relation, then restrict the range to that, otherwise we scan to the end
1446  * of the relation.
1447  */
1448  if (ItemPointerCompare(maxtid, &highestItem) < 0)
1449  ItemPointerCopy(maxtid, &highestItem);
1450 
1451  /*
1452  * If the given minimum TID is above the lowest possible TID in the
1453  * relation, then restrict the range to only scan for TIDs above that.
1454  */
1455  if (ItemPointerCompare(mintid, &lowestItem) > 0)
1456  ItemPointerCopy(mintid, &lowestItem);
1457 
1458  /*
1459  * Check for an empty range and protect against would-be negative results
1460  * from the numBlks calculation below.
1461  */
1462  if (ItemPointerCompare(&highestItem, &lowestItem) < 0)
1463  {
1464  /* Set an empty range of blocks to scan */
1465  heap_setscanlimits(sscan, 0, 0);
1466  return;
1467  }
1468 
1469  /*
1470  * Calculate the first block and the number of blocks we must scan. We
1471  * could be more aggressive here and perform some more validation to try
1472  * and further narrow the scope of blocks to scan by checking if the
1473  * lowestItem has an offset above MaxOffsetNumber. In this case, we could
1474  * advance startBlk by one. Likewise, if highestItem has an offset of 0
1475  * we could scan one fewer blocks. However, such an optimization does not
1476  * seem worth troubling over, currently.
1477  */
1478  startBlk = ItemPointerGetBlockNumberNoCheck(&lowestItem);
1479 
1480  numBlks = ItemPointerGetBlockNumberNoCheck(&highestItem) -
1481  ItemPointerGetBlockNumberNoCheck(&lowestItem) + 1;
1482 
1483  /* Set the start block and number of blocks to scan */
1484  heap_setscanlimits(sscan, startBlk, numBlks);
1485 
1486  /* Finally, set the TID range in sscan */
1487  ItemPointerCopy(&lowestItem, &sscan->rs_mintid);
1488  ItemPointerCopy(&highestItem, &sscan->rs_maxtid);
1489 }
1490 
1491 bool
1492 heap_getnextslot_tidrange(TableScanDesc sscan, ScanDirection direction,
1493  TupleTableSlot *slot)
1494 {
1495  HeapScanDesc scan = (HeapScanDesc) sscan;
1496  ItemPointer mintid = &sscan->rs_mintid;
1497  ItemPointer maxtid = &sscan->rs_maxtid;
1498 
1499  /* Note: no locking manipulations needed */
1500  for (;;)
1501  {
1502  if (sscan->rs_flags & SO_ALLOW_PAGEMODE)
1503  heapgettup_pagemode(scan, direction, sscan->rs_nkeys, sscan->rs_key);
1504  else
1505  heapgettup(scan, direction, sscan->rs_nkeys, sscan->rs_key);
1506 
1507  if (scan->rs_ctup.t_data == NULL)
1508  {
1509  ExecClearTuple(slot);
1510  return false;
1511  }
1512 
1513  /*
1514  * heap_set_tidrange will have used heap_setscanlimits to limit the
1515  * range of pages we scan to only ones that can contain the TID range
1516  * we're scanning for. Here we must filter out any tuples from these
1517  * pages that are outside that range.
1518  */
1519  if (ItemPointerCompare(&scan->rs_ctup.t_self, mintid) < 0)
1520  {
1521  ExecClearTuple(slot);
1522 
1523  /*
1524  * When scanning backwards, the TIDs will be in descending order.
1525  * Future tuples in this direction will be lower still, so we can
1526  * just return false to indicate there will be no more tuples.
1527  */
1528  if (ScanDirectionIsBackward(direction))
1529  return false;
1530 
1531  continue;
1532  }
1533 
1534  /*
1535  * Likewise for the final page, we must filter out TIDs greater than
1536  * maxtid.
1537  */
1538  if (ItemPointerCompare(&scan->rs_ctup.t_self, maxtid) > 0)
1539  {
1540  ExecClearTuple(slot);
1541 
1542  /*
1543  * When scanning forward, the TIDs will be in ascending order.
1544  * Future tuples in this direction will be higher still, so we can
1545  * just return false to indicate there will be no more tuples.
1546  */
1547  if (ScanDirectionIsForward(direction))
1548  return false;
1549  continue;
1550  }
1551 
1552  break;
1553  }
1554 
1555  /*
1556  * if we get here it means we have a new current scan tuple, so point to
1557  * the proper return buffer and return the tuple.
1558  */
1559  pgstat_count_heap_getnext(scan->rs_base.rs_rd);
1560 
1561  ExecStoreBufferHeapTuple(&scan->rs_ctup, slot, scan->rs_cbuf);
1562  return true;
1563 }
1564 
1565 /*
1566  * heap_fetch - retrieve tuple with given tid
1567  *
1568  * On entry, tuple->t_self is the TID to fetch. We pin the buffer holding
1569  * the tuple, fill in the remaining fields of *tuple, and check the tuple
1570  * against the specified snapshot.
1571  *
1572  * If successful (tuple found and passes snapshot time qual), then *userbuf
1573  * is set to the buffer holding the tuple and true is returned. The caller
1574  * must unpin the buffer when done with the tuple.
1575  *
1576  * If the tuple is not found (ie, item number references a deleted slot),
1577  * then tuple->t_data is set to NULL and false is returned.
1578  *
1579  * If the tuple is found but fails the time qual check, then false is returned
1580  * but tuple->t_data is left pointing to the tuple.
1581  *
1582  * heap_fetch does not follow HOT chains: only the exact TID requested will
1583  * be fetched.
1584  *
1585  * It is somewhat inconsistent that we ereport() on invalid block number but
1586  * return false on invalid item number. There are a couple of reasons though.
1587  * One is that the caller can relatively easily check the block number for
1588  * validity, but cannot check the item number without reading the page
1589  * themselves. Another is that when we are following a t_ctid link, we can be
1590  * reasonably confident that the page number is valid (since VACUUM shouldn't
1591  * truncate off the destination page without having killed the referencing
1592  * tuple first), but the item number might well not be good.
1593  */
1594 bool
1595 heap_fetch(Relation relation,
1596  Snapshot snapshot,
1597  HeapTuple tuple,
1598  Buffer *userbuf)
1599 {
1600  ItemPointer tid = &(tuple->t_self);
1601  ItemId lp;
1602  Buffer buffer;
1603  Page page;
1604  OffsetNumber offnum;
1605  bool valid;
1606 
1607  /*
1608  * Fetch and pin the appropriate page of the relation.
1609  */
1610  buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid));
1611 
1612  /*
1613  * Need share lock on buffer to examine tuple commit status.
1614  */
1615  LockBuffer(buffer, BUFFER_LOCK_SHARE);
1616  page = BufferGetPage(buffer);
1617  TestForOldSnapshot(snapshot, relation, page);
1618 
1619  /*
1620  * We'd better check for out-of-range offnum in case of VACUUM since the
1621  * TID was obtained.
1622  */
1623  offnum = ItemPointerGetOffsetNumber(tid);
1624  if (offnum < FirstOffsetNumber || offnum > PageGetMaxOffsetNumber(page))
1625  {
1626  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
1627  ReleaseBuffer(buffer);
1628  *userbuf = InvalidBuffer;
1629  tuple->t_data = NULL;
1630  return false;
1631  }
1632 
1633  /*
1634  * get the item line pointer corresponding to the requested tid
1635  */
1636  lp = PageGetItemId(page, offnum);
1637 
1638  /*
1639  * Must check for deleted tuple.
1640  */
1641  if (!ItemIdIsNormal(lp))
1642  {
1643  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
1644  ReleaseBuffer(buffer);
1645  *userbuf = InvalidBuffer;
1646  tuple->t_data = NULL;
1647  return false;
1648  }
1649 
1650  /*
1651  * fill in *tuple fields
1652  */
1653  tuple->t_data = (HeapTupleHeader) PageGetItem(page, lp);
1654  tuple->t_len = ItemIdGetLength(lp);
1655  tuple->t_tableOid = RelationGetRelid(relation);
1656 
1657  /*
1658  * check tuple visibility, then release lock
1659  */
1660  valid = HeapTupleSatisfiesVisibility(tuple, snapshot, buffer);
1661 
1662  if (valid)
1663  PredicateLockTID(relation, &(tuple->t_self), snapshot,
1664  HeapTupleHeaderGetXmin(tuple->t_data));
1665 
1666  HeapCheckForSerializableConflictOut(valid, relation, tuple, buffer, snapshot);
1667 
1668  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
1669 
1670  if (valid)
1671  {
1672  /*
1673  * All checks passed, so return the tuple as valid. Caller is now
1674  * responsible for releasing the buffer.
1675  */
1676  *userbuf = buffer;
1677 
1678  return true;
1679  }
1680 
1681  /* Tuple failed time qual */
1682  ReleaseBuffer(buffer);
1683  *userbuf = InvalidBuffer;
1684 
1685  return false;
1686 }
1687 
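/*
 * A minimal sketch (hypothetical helper, not part of the heap AM) of a
 * heap_fetch caller: fetch one tuple by TID, use it, then release the buffer
 * pin as the contract above requires.  "relation", "snapshot", and "tid" are
 * assumed to be supplied by the caller.
 */
static bool
example_fetch_by_tid(Relation relation, Snapshot snapshot, ItemPointer tid)
{
	HeapTupleData tuple;
	Buffer		buffer;

	tuple.t_self = *tid;
	if (!heap_fetch(relation, snapshot, &tuple, &buffer))
		return false;			/* deleted slot, or fails the snapshot test */

	/* ... inspect tuple.t_data / tuple.t_len while the pin is held ... */

	ReleaseBuffer(buffer);		/* caller must unpin when done */
	return true;
}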
1688 /*
1689  * heap_hot_search_buffer - search HOT chain for tuple satisfying snapshot
1690  *
1691  * On entry, *tid is the TID of a tuple (either a simple tuple, or the root
1692  * of a HOT chain), and buffer is the buffer holding this tuple. We search
1693  * for the first chain member satisfying the given snapshot. If one is
1694  * found, we update *tid to reference that tuple's offset number, and
1695  * return true. If no match, return false without modifying *tid.
1696  *
1697  * heapTuple is a caller-supplied buffer. When a match is found, we return
1698  * the tuple here, in addition to updating *tid. If no match is found, the
1699  * contents of this buffer on return are undefined.
1700  *
1701  * If all_dead is not NULL, we check non-visible tuples to see if they are
1702  * globally dead; *all_dead is set true if all members of the HOT chain
1703  * are vacuumable, false if not.
1704  *
1705  * Unlike heap_fetch, the caller must already have pin and (at least) share
1706  * lock on the buffer; it is still pinned/locked at exit. Also unlike
1707  * heap_fetch, we do not report any pgstats count; caller may do so if wanted.
1708  */
1709 bool
1710 heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer,
1711  Snapshot snapshot, HeapTuple heapTuple,
1712  bool *all_dead, bool first_call)
1713 {
1714  Page dp = (Page) BufferGetPage(buffer);
1715  TransactionId prev_xmax = InvalidTransactionId;
1716  BlockNumber blkno;
1717  OffsetNumber offnum;
1718  bool at_chain_start;
1719  bool valid;
1720  bool skip;
1721  GlobalVisState *vistest = NULL;
1722 
1723  /* If this is not the first call, previous call returned a (live!) tuple */
1724  if (all_dead)
1725  *all_dead = first_call;
1726 
1727  blkno = ItemPointerGetBlockNumber(tid);
1728  offnum = ItemPointerGetOffsetNumber(tid);
1729  at_chain_start = first_call;
1730  skip = !first_call;
1731 
1732  /* XXX: we should assert that a snapshot is pushed or registered */
1733  Assert(TransactionIdIsValid(RecentXmin));
1734  Assert(BufferGetBlockNumber(buffer) == blkno);
1735 
1736  /* Scan through possible multiple members of HOT-chain */
1737  for (;;)
1738  {
1739  ItemId lp;
1740 
1741  /* check for bogus TID */
1742  if (offnum < FirstOffsetNumber || offnum > PageGetMaxOffsetNumber(dp))
1743  break;
1744 
1745  lp = PageGetItemId(dp, offnum);
1746 
1747  /* check for unused, dead, or redirected items */
1748  if (!ItemIdIsNormal(lp))
1749  {
1750  /* We should only see a redirect at start of chain */
1751  if (ItemIdIsRedirected(lp) && at_chain_start)
1752  {
1753  /* Follow the redirect */
1754  offnum = ItemIdGetRedirect(lp);
1755  at_chain_start = false;
1756  continue;
1757  }
1758  /* else must be end of chain */
1759  break;
1760  }
1761 
1762  /*
1763  * Update heapTuple to point to the element of the HOT chain we're
1764  * currently investigating. Having t_self set correctly is important
1765  * because the SSI checks and the *Satisfies routine for historical
1766  * MVCC snapshots need the correct tid to decide about the visibility.
1767  */
1768  heapTuple->t_data = (HeapTupleHeader) PageGetItem(dp, lp);
1769  heapTuple->t_len = ItemIdGetLength(lp);
1770  heapTuple->t_tableOid = RelationGetRelid(relation);
1771  ItemPointerSet(&heapTuple->t_self, blkno, offnum);
1772 
1773  /*
1774  * Shouldn't see a HEAP_ONLY tuple at chain start.
1775  */
1776  if (at_chain_start && HeapTupleIsHeapOnly(heapTuple))
1777  break;
1778 
1779  /*
1780  * The xmin should match the previous xmax value, else chain is
1781  * broken.
1782  */
1783  if (TransactionIdIsValid(prev_xmax) &&
1784  !TransactionIdEquals(prev_xmax,
1785  HeapTupleHeaderGetXmin(heapTuple->t_data)))
1786  break;
1787 
1788  /*
1789  * When first_call is true (and thus, skip is initially false) we'll
1790  * return the first tuple we find. But on later passes, heapTuple
1791  * will initially be pointing to the tuple we returned last time.
1792  * Returning it again would be incorrect (and would loop forever), so
1793  * we skip it and return the next match we find.
1794  */
1795  if (!skip)
1796  {
1797  /* If it's visible per the snapshot, we must return it */
1798  valid = HeapTupleSatisfiesVisibility(heapTuple, snapshot, buffer);
1799  HeapCheckForSerializableConflictOut(valid, relation, heapTuple,
1800  buffer, snapshot);
1801 
1802  if (valid)
1803  {
1804  ItemPointerSetOffsetNumber(tid, offnum);
1805  PredicateLockTID(relation, &heapTuple->t_self, snapshot,
1806  HeapTupleHeaderGetXmin(heapTuple->t_data));
1807  if (all_dead)
1808  *all_dead = false;
1809  return true;
1810  }
1811  }
1812  skip = false;
1813 
1814  /*
1815  * If we can't see it, maybe no one else can either. At caller
1816  * request, check whether all chain members are dead to all
1817  * transactions.
1818  *
1819  * Note: if you change the criterion here for what is "dead", fix the
1820  * planner's get_actual_variable_range() function to match.
1821  */
1822  if (all_dead && *all_dead)
1823  {
1824  if (!vistest)
1825  vistest = GlobalVisTestFor(relation);
1826 
1827  if (!HeapTupleIsSurelyDead(heapTuple, vistest))
1828  *all_dead = false;
1829  }
1830 
1831  /*
1832  * Check to see if HOT chain continues past this tuple; if so fetch
1833  * the next offnum and loop around.
1834  */
1835  if (HeapTupleIsHotUpdated(heapTuple))
1836  {
1837  Assert(ItemPointerGetBlockNumber(&heapTuple->t_data->t_ctid) ==
1838  blkno);
1839  offnum = ItemPointerGetOffsetNumber(&heapTuple->t_data->t_ctid);
1840  at_chain_start = false;
1841  prev_xmax = HeapTupleHeaderGetUpdateXid(heapTuple->t_data);
1842  }
1843  else
1844  break; /* end of chain */
1845  }
1846 
1847  return false;
1848 }
1849 
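/*
 * A minimal sketch (hypothetical helper, not part of the heap AM) of a
 * heap_hot_search_buffer caller: pin and share-lock the page holding the
 * root TID, search the HOT chain for a visible member, then drop the lock
 * and pin.  "relation", "snapshot", and "tid" are assumed to be supplied by
 * the caller; on success *tid is updated to the visible member's offset.
 */
static bool
example_hot_search(Relation relation, Snapshot snapshot, ItemPointer tid)
{
	Buffer		buffer;
	HeapTupleData heapTuple;
	bool		found;

	buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid));
	LockBuffer(buffer, BUFFER_LOCK_SHARE);

	found = heap_hot_search_buffer(tid, relation, buffer, snapshot,
								   &heapTuple, NULL, true);

	LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
	ReleaseBuffer(buffer);

	return found;
}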
1850 /*
1851  * heap_get_latest_tid - get the latest tid of a specified tuple
1852  *
1853  * Actually, this gets the latest version that is visible according to the
1854  * scan's snapshot. Create a scan using SnapshotDirty to get the very latest,
1855  * possibly uncommitted version.
1856  *
1857  * *tid is both an input and an output parameter: it is updated to
1858  * show the latest version of the row. Note that it will not be changed
1859  * if no version of the row passes the snapshot test.
1860  */
1861 void
1862 heap_get_latest_tid(TableScanDesc sscan,
1863  ItemPointer tid)
1864 {
1865  Relation relation = sscan->rs_rd;
1866  Snapshot snapshot = sscan->rs_snapshot;
1867  ItemPointerData ctid;
1868  TransactionId priorXmax;
1869 
1870  /*
1871  * table_tuple_get_latest_tid() verified that the passed in tid is valid.
1872  * Assume that t_ctid links are valid however - there shouldn't be invalid
1873  * ones in the table.
1874  */
1875  Assert(ItemPointerIsValid(tid));
1876 
1877  /*
1878  * Loop to chase down t_ctid links. At top of loop, ctid is the tuple we
1879  * need to examine, and *tid is the TID we will return if ctid turns out
1880  * to be bogus.
1881  *
1882  * Note that we will loop until we reach the end of the t_ctid chain.
1883  * Depending on the snapshot passed, there might be at most one visible
1884  * version of the row, but we don't try to optimize for that.
1885  */
1886  ctid = *tid;
1887  priorXmax = InvalidTransactionId; /* cannot check first XMIN */
1888  for (;;)
1889  {
1890  Buffer buffer;
1891  Page page;
1892  OffsetNumber offnum;
1893  ItemId lp;
1894  HeapTupleData tp;
1895  bool valid;
1896 
1897  /*
1898  * Read, pin, and lock the page.
1899  */
1900  buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(&ctid));
1901  LockBuffer(buffer, BUFFER_LOCK_SHARE);
1902  page = BufferGetPage(buffer);
1903  TestForOldSnapshot(snapshot, relation, page);
1904 
1905  /*
1906  * Check for bogus item number. This is not treated as an error
1907  * condition because it can happen while following a t_ctid link. We
1908  * just assume that the prior tid is OK and return it unchanged.
1909  */
1910  offnum = ItemPointerGetOffsetNumber(&ctid);
1911  if (offnum < FirstOffsetNumber || offnum > PageGetMaxOffsetNumber(page))
1912  {
1913  UnlockReleaseBuffer(buffer);
1914  break;
1915  }
1916  lp = PageGetItemId(page, offnum);
1917  if (!ItemIdIsNormal(lp))
1918  {
1919  UnlockReleaseBuffer(buffer);
1920  break;
1921  }
1922 
1923  /* OK to access the tuple */
1924  tp.t_self = ctid;
1925  tp.t_data = (HeapTupleHeader) PageGetItem(page, lp);
1926  tp.t_len = ItemIdGetLength(lp);
1927  tp.t_tableOid = RelationGetRelid(relation);
1928 
1929  /*
1930  * After following a t_ctid link, we might arrive at an unrelated
1931  * tuple. Check for XMIN match.
1932  */
1933  if (TransactionIdIsValid(priorXmax) &&
1934  !TransactionIdEquals(priorXmax, HeapTupleHeaderGetXmin(tp.t_data)))
1935  {
1936  UnlockReleaseBuffer(buffer);
1937  break;
1938  }
1939 
1940  /*
1941  * Check tuple visibility; if visible, set it as the new result
1942  * candidate.
1943  */
1944  valid = HeapTupleSatisfiesVisibility(&tp, snapshot, buffer);
1945  HeapCheckForSerializableConflictOut(valid, relation, &tp, buffer, snapshot);
1946  if (valid)
1947  *tid = ctid;
1948 
1949  /*
1950  * If there's a valid t_ctid link, follow it, else we're done.
1951  */
1952  if ((tp.t_data->t_infomask & HEAP_XMAX_INVALID) ||
1953  HeapTupleHeaderIsOnlyLocked(tp.t_data) ||
1954  HeapTupleHeaderIndicatesMovedPartitions(tp.t_data) ||
1955  ItemPointerEquals(&tp.t_self, &tp.t_data->t_ctid))
1956  {
1957  UnlockReleaseBuffer(buffer);
1958  break;
1959  }
1960 
1961  ctid = tp.t_data->t_ctid;
1962  priorXmax = HeapTupleHeaderGetUpdateXid(tp.t_data);
1963  UnlockReleaseBuffer(buffer);
1964  } /* end of loop */
1965 }
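
/*
 * Illustrative sketch (not part of heapam.c): resolving a possibly-stale TID
 * to the latest row version visible to the active snapshot, going through the
 * tableam wrapper rather than calling heap_get_latest_tid() directly.  "rel"
 * and the initial "tid" are assumed to be supplied by the caller and to point
 * at an existing block; a SnapshotDirty scan could be used instead to see the
 * very latest, possibly uncommitted version.
 */
static void
resolve_latest_tid(Relation rel, ItemPointer tid)
{
	TableScanDesc scan = table_beginscan(rel, GetActiveSnapshot(), 0, NULL);

	/* *tid is updated in place only if a newer visible version exists */
	table_tuple_get_latest_tid(scan, tid);

	table_endscan(scan);
}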
1966 
1967 
1968 /*
1969  * UpdateXmaxHintBits - update tuple hint bits after xmax transaction ends
1970  *
1971  * This is called after we have waited for the XMAX transaction to terminate.
1972  * If the transaction aborted, we guarantee the XMAX_INVALID hint bit will
1973  * be set on exit. If the transaction committed, we set the XMAX_COMMITTED
1974  * hint bit if possible --- but beware that that may not yet be possible,
1975  * if the transaction committed asynchronously.
1976  *
1977  * Note that if the transaction was a locker only, we set HEAP_XMAX_INVALID
1978  * even if it commits.
1979  *
1980  * Hence callers should look only at XMAX_INVALID.
1981  *
1982  * Note this is not allowed for tuples whose xmax is a multixact.
1983  */
1984 static void
1985  UpdateXmaxHintBits(HeapTupleHeader tuple, Buffer buffer, TransactionId xid)
1986  {
1987  Assert(TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple), xid));
1988  Assert(!(tuple->t_infomask & HEAP_XMAX_IS_MULTI));
1989 
1990  if (!(tuple->t_infomask & (HEAP_XMAX_COMMITTED | HEAP_XMAX_INVALID)))
1991  {
1992  if (!HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask) &&
1993  TransactionIdDidCommit(xid))
1994  HeapTupleSetHintBits(tuple, buffer, HEAP_XMAX_COMMITTED,
1995  xid);
1996  else
1997  HeapTupleSetHintBits(tuple, buffer, HEAP_XMAX_INVALID,
1998  InvalidTransactionId);
1999  }
2000 }
2001 
2002 
2003 /*
2004  * GetBulkInsertState - prepare status object for a bulk insert
2005  */
2006  BulkInsertState
2007  GetBulkInsertState(void)
2008  {
2009  BulkInsertState bistate;
2010 
2011  bistate = (BulkInsertState) palloc(sizeof(BulkInsertStateData));
2012  bistate->strategy = GetAccessStrategy(BAS_BULKWRITE);
2013  bistate->current_buf = InvalidBuffer;
2014  return bistate;
2015 }
2016 
2017 /*
2018  * FreeBulkInsertState - clean up after finishing a bulk insert
2019  */
2020 void
2021  FreeBulkInsertState(BulkInsertState bistate)
2022  {
2023  if (bistate->current_buf != InvalidBuffer)
2024  ReleaseBuffer(bistate->current_buf);
2025  FreeAccessStrategy(bistate->strategy);
2026  pfree(bistate);
2027 }
2028 
2029 /*
2030  * ReleaseBulkInsertStatePin - release a buffer currently held in bistate
2031  */
2032 void
2033  ReleaseBulkInsertStatePin(BulkInsertState bistate)
2034  {
2035  if (bistate->current_buf != InvalidBuffer)
2036  ReleaseBuffer(bistate->current_buf);
2037  bistate->current_buf = InvalidBuffer;
2038 }
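
/*
 * Illustrative sketch (not part of heapam.c): the intended lifecycle of a
 * BulkInsertState around a batch of heap_insert() calls, roughly the shape
 * used by COPY.  "rel" and the "tuples" array are assumed to be provided by
 * the caller; error handling is omitted.
 */
static void
bulk_insert_tuples(Relation rel, HeapTuple *tuples, int ntuples)
{
	BulkInsertState bistate = GetBulkInsertState();
	CommandId	cid = GetCurrentCommandId(true);
	int			i;

	for (i = 0; i < ntuples; i++)
		heap_insert(rel, tuples[i], cid, 0, bistate);

	/* releases the pin on the last-used buffer and frees the strategy */
	FreeBulkInsertState(bistate);
}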
2039 
2040 
2041 /*
2042  * heap_insert - insert tuple into a heap
2043  *
2044  * The new tuple is stamped with current transaction ID and the specified
2045  * command ID.
2046  *
2047  * See table_tuple_insert for comments about most of the input flags, except
2048  * that this routine directly takes a tuple rather than a slot.
2049  *
2050  * There's corresponding HEAP_INSERT_ options to all the TABLE_INSERT_
2051  * options, and there additionally is HEAP_INSERT_SPECULATIVE which is used to
2052  * implement table_tuple_insert_speculative().
2053  *
2054  * On return the header fields of *tup are updated to match the stored tuple;
2055  * in particular tup->t_self receives the actual TID where the tuple was
2056  * stored. But note that any toasting of fields within the tuple data is NOT
2057  * reflected into *tup.
2058  */
2059 void
2060  heap_insert(Relation relation, HeapTuple tup, CommandId cid,
2061  int options, BulkInsertState bistate)
2062 {
2063  TransactionId xid = GetCurrentTransactionId();
2064  HeapTuple heaptup;
2065  Buffer buffer;
2066  Buffer vmbuffer = InvalidBuffer;
2067  bool all_visible_cleared = false;
2068 
2069  /* Cheap, simplistic check that the tuple matches the rel's rowtype. */
2070  Assert(HeapTupleHeaderGetNatts(tup->t_data) <=
2071  RelationGetNumberOfAttributes(relation));
2072 
2073  /*
2074  * Fill in tuple header fields and toast the tuple if necessary.
2075  *
2076  * Note: below this point, heaptup is the data we actually intend to store
2077  * into the relation; tup is the caller's original untoasted data.
2078  */
2079  heaptup = heap_prepare_insert(relation, tup, xid, cid, options);
2080 
2081  /*
2082  * Find buffer to insert this tuple into. If the page is all visible,
2083  * this will also pin the requisite visibility map page.
2084  */
2085  buffer = RelationGetBufferForTuple(relation, heaptup->t_len,
2086  InvalidBuffer, options, bistate,
2087  &vmbuffer, NULL);
2088 
2089  /*
2090  * We're about to do the actual insert -- but check for conflict first, to
2091  * avoid possibly having to roll back work we've just done.
2092  *
2093  * This is safe without a recheck as long as there is no possibility of
2094  * another process scanning the page between this check and the insert
2095  * being visible to the scan (i.e., an exclusive buffer content lock is
2096  * continuously held from this point until the tuple insert is visible).
2097  *
2098  * For a heap insert, we only need to check for table-level SSI locks. Our
2099  * new tuple can't possibly conflict with existing tuple locks, and heap
2100  * page locks are only consolidated versions of tuple locks; they do not
2101  * lock "gaps" as index page locks do. So we don't need to specify a
2102  * buffer when making the call, which makes for a faster check.
2103  */
2104  CheckForSerializableConflictIn(relation, NULL, InvalidBlockNumber);
2105 
2106  /* NO EREPORT(ERROR) from here till changes are logged */
2107  START_CRIT_SECTION();
2108 
2109  RelationPutHeapTuple(relation, buffer, heaptup,
2110  (options & HEAP_INSERT_SPECULATIVE) != 0);
2111 
2112  if (PageIsAllVisible(BufferGetPage(buffer)))
2113  {
2114  all_visible_cleared = true;
2115  PageClearAllVisible(BufferGetPage(buffer));
2116  visibilitymap_clear(relation,
2117  ItemPointerGetBlockNumber(&(heaptup->t_self)),
2118  vmbuffer, VISIBILITYMAP_VALID_BITS);
2119  }
2120 
2121  /*
2122  * XXX Should we set PageSetPrunable on this page ?
2123  *
2124  * The inserting transaction may eventually abort thus making this tuple
2125  * DEAD and hence available for pruning. Though we don't want to optimize
2126  * for aborts, if no other tuple in this page is UPDATEd/DELETEd, the
2127  * aborted tuple will never be pruned until next vacuum is triggered.
2128  *
2129  * If you do add PageSetPrunable here, add it in heap_xlog_insert too.
2130  */
2131 
2132  MarkBufferDirty(buffer);
2133 
2134  /* XLOG stuff */
2135  if (RelationNeedsWAL(relation))
2136  {
2137  xl_heap_insert xlrec;
2138  xl_heap_header xlhdr;
2139  XLogRecPtr recptr;
2140  Page page = BufferGetPage(buffer);
2141  uint8 info = XLOG_HEAP_INSERT;
2142  int bufflags = 0;
2143 
2144  /*
2145  * If this is a catalog, we need to transmit combo CIDs to properly
2146  * decode, so log that as well.
2147  */
2148  if (RelationIsAccessibleInLogicalDecoding(relation))
2149  log_heap_new_cid(relation, heaptup);
2150 
2151  /*
2152  * If this is the single and first tuple on page, we can reinit the
2153  * page instead of restoring the whole thing. Set flag, and hide
2154  * buffer references from XLogInsert.
2155  */
2156  if (ItemPointerGetOffsetNumber(&(heaptup->t_self)) == FirstOffsetNumber &&
2157  PageGetMaxOffsetNumber(page) == FirstOffsetNumber)
2158  {
2159  info |= XLOG_HEAP_INIT_PAGE;
2160  bufflags |= REGBUF_WILL_INIT;
2161  }
2162 
2163  xlrec.offnum = ItemPointerGetOffsetNumber(&heaptup->t_self);
2164  xlrec.flags = 0;
2165  if (all_visible_cleared)
2166  xlrec.flags |= XLH_INSERT_ALL_VISIBLE_CLEARED;
2167  if (options & HEAP_INSERT_SPECULATIVE)
2168  xlrec.flags |= XLH_INSERT_IS_SPECULATIVE;
2169  Assert(ItemPointerGetBlockNumber(&heaptup->t_self) == BufferGetBlockNumber(buffer));
2170 
2171  /*
2172  * For logical decoding, we need the tuple even if we're doing a full
2173  * page write, so make sure it's included even if we take a full-page
2174  * image. (XXX We could alternatively store a pointer into the FPW).
2175  */
2176  if (RelationIsLogicallyLogged(relation) &&
2177  !(options & HEAP_INSERT_NO_LOGICAL))
2178  {
2179  xlrec.flags |= XLH_INSERT_CONTAINS_NEW_TUPLE;
2180  bufflags |= REGBUF_KEEP_DATA;
2181 
2182  if (IsToastRelation(relation))
2183  xlrec.flags |= XLH_INSERT_ON_TOAST_RELATION;
2184  }
2185 
2186  XLogBeginInsert();
2187  XLogRegisterData((char *) &xlrec, SizeOfHeapInsert);
2188 
2189  xlhdr.t_infomask2 = heaptup->t_data->t_infomask2;
2190  xlhdr.t_infomask = heaptup->t_data->t_infomask;
2191  xlhdr.t_hoff = heaptup->t_data->t_hoff;
2192 
2193  /*
2194  * note we mark xlhdr as belonging to buffer; if XLogInsert decides to
2195  * write the whole page to the xlog, we don't need to store
2196  * xl_heap_header in the xlog.
2197  */
2198  XLogRegisterBuffer(0, buffer, REGBUF_STANDARD | bufflags);
2199  XLogRegisterBufData(0, (char *) &xlhdr, SizeOfHeapHeader);
2200  /* PG73FORMAT: write bitmap [+ padding] [+ oid] + data */
2201  XLogRegisterBufData(0,
2202  (char *) heaptup->t_data + SizeofHeapTupleHeader,
2203  heaptup->t_len - SizeofHeapTupleHeader);
2204 
2205  /* filtering by origin on a row level is much more efficient */
2206  XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN);
2207 
2208  recptr = XLogInsert(RM_HEAP_ID, info);
2209 
2210  PageSetLSN(page, recptr);
2211  }
2212 
2213  END_CRIT_SECTION();
2214 
2215  UnlockReleaseBuffer(buffer);
2216  if (vmbuffer != InvalidBuffer)
2217  ReleaseBuffer(vmbuffer);
2218 
2219  /*
2220  * If tuple is cachable, mark it for invalidation from the caches in case
2221  * we abort. Note it is OK to do this after releasing the buffer, because
2222  * the heaptup data structure is all in local memory, not in the shared
2223  * buffer.
2224  */
2225  CacheInvalidateHeapTuple(relation, heaptup, NULL);
2226 
2227  /* Note: speculative insertions are counted too, even if aborted later */
2228  pgstat_count_heap_insert(relation, 1);
2229 
2230  /*
2231  * If heaptup is a private copy, release it. Don't forget to copy t_self
2232  * back to the caller's image, too.
2233  */
2234  if (heaptup != tup)
2235  {
2236  tup->t_self = heaptup->t_self;
2237  heap_freetuple(heaptup);
2238  }
2239 }
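
/*
 * Illustrative sketch (not part of heapam.c): forming a tuple and inserting
 * it with heap_insert(), then reading the assigned TID back from t_self.
 * "rel" and the values/isnull arrays are assumed to match the relation's
 * descriptor; most real callers go through table_tuple_insert() instead.
 */
static ItemPointerData
insert_one_tuple(Relation rel, Datum *values, bool *isnull)
{
	HeapTuple	tup;
	ItemPointerData tid;

	tup = heap_form_tuple(RelationGetDescr(rel), values, isnull);
	heap_insert(rel, tup, GetCurrentCommandId(true), 0, NULL);

	tid = tup->t_self;			/* filled in by heap_insert() */
	heap_freetuple(tup);
	return tid;
}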
2240 
2241 /*
2242  * Subroutine for heap_insert(). Prepares a tuple for insertion. This sets the
2243  * tuple header fields and toasts the tuple if necessary. Returns a toasted
2244  * version of the tuple if it was toasted, or the original tuple if not. Note
2245  * that in any case, the header fields are also set in the original tuple.
2246  */
2247 static HeapTuple
2248  heap_prepare_insert(Relation relation, HeapTuple tup, TransactionId xid,
2249  CommandId cid, int options)
2250 {
2251  /*
2252  * To allow parallel inserts, we need to ensure that they are safe to be
2253  * performed in workers. We have the infrastructure to allow parallel
2254  * inserts in general except for the cases where inserts generate a new
2255  * CommandId (e.g. inserts into a table having a foreign key column).
2256  */
2257  if (IsParallelWorker())
2258  ereport(ERROR,
2259  (errcode(ERRCODE_INVALID_TRANSACTION_STATE),
2260  errmsg("cannot insert tuples in a parallel worker")));
2261 
2262  tup->t_data->t_infomask &= ~(HEAP_XACT_MASK);
2263  tup->t_data->t_infomask2 &= ~(HEAP2_XACT_MASK);
2264  tup->t_data->t_infomask |= HEAP_XMAX_INVALID;
2265  HeapTupleHeaderSetXmin(tup->t_data, xid);
2266  if (options & HEAP_INSERT_FROZEN)
2267  HeapTupleHeaderSetXminFrozen(tup->t_data);
2268 
2269  HeapTupleHeaderSetCmin(tup->t_data, cid);
2270  HeapTupleHeaderSetXmax(tup->t_data, 0); /* for cleanliness */
2271  tup->t_tableOid = RelationGetRelid(relation);
2272 
2273  /*
2274  * If the new tuple is too big for storage or contains already toasted
2275  * out-of-line attributes from some other relation, invoke the toaster.
2276  */
2277  if (relation->rd_rel->relkind != RELKIND_RELATION &&
2278  relation->rd_rel->relkind != RELKIND_MATVIEW)
2279  {
2280  /* toast table entries should never be recursively toasted */
2281  Assert(!HeapTupleHasExternal(tup));
2282  return tup;
2283  }
2284  else if (HeapTupleHasExternal(tup) || tup->t_len > TOAST_TUPLE_THRESHOLD)
2285  return heap_toast_insert_or_update(relation, tup, NULL, options);
2286  else
2287  return tup;
2288 }
2289 
2290 /*
2291  * heap_multi_insert - insert multiple tuples into a heap
2292  *
2293  * This is like heap_insert(), but inserts multiple tuples in one operation.
2294  * That's faster than calling heap_insert() in a loop, because when multiple
2295  * tuples can be inserted on a single page, we can write just a single WAL
2296  * record covering all of them, and only need to lock/unlock the page once.
2297  *
2298  * Note: this leaks memory into the current memory context. You can create a
2299  * temporary context before calling this, if that's a problem.
2300  */
2301 void
2302 heap_multi_insert(Relation relation, TupleTableSlot **slots, int ntuples,
2303  CommandId cid, int options, BulkInsertState bistate)
2304 {
2305  TransactionId xid = GetCurrentTransactionId();
2306  HeapTuple *heaptuples;
2307  int i;
2308  int ndone;
2309  PGAlignedBlock scratch;
2310  Page page;
2311  Buffer vmbuffer = InvalidBuffer;
2312  bool needwal;
2313  Size saveFreeSpace;
2314  bool need_tuple_data = RelationIsLogicallyLogged(relation);
2315  bool need_cids = RelationIsAccessibleInLogicalDecoding(relation);
2316 
2317  /* currently not needed (thus unsupported) for heap_multi_insert() */
2318  AssertArg(!(options & HEAP_INSERT_NO_LOGICAL));
2319 
2320  needwal = RelationNeedsWAL(relation);
2321  saveFreeSpace = RelationGetTargetPageFreeSpace(relation,
2322  HEAP_DEFAULT_FILLFACTOR);
2323 
2324  /* Toast and set header data in all the slots */
2325  heaptuples = palloc(ntuples * sizeof(HeapTuple));
2326  for (i = 0; i < ntuples; i++)
2327  {
2328  HeapTuple tuple;
2329 
2330  tuple = ExecFetchSlotHeapTuple(slots[i], true, NULL);
2331  slots[i]->tts_tableOid = RelationGetRelid(relation);
2332  tuple->t_tableOid = slots[i]->tts_tableOid;
2333  heaptuples[i] = heap_prepare_insert(relation, tuple, xid, cid,
2334  options);
2335  }
2336 
2337  /*
2338  * We're about to do the actual inserts -- but check for conflict first,
2339  * to minimize the possibility of having to roll back work we've just
2340  * done.
2341  *
2342  * A check here does not definitively prevent a serialization anomaly;
2343  * that check MUST be done at least past the point of acquiring an
2344  * exclusive buffer content lock on every buffer that will be affected,
2345  * and MAY be done after all inserts are reflected in the buffers and
2346  * those locks are released; otherwise there is a race condition. Since
2347  * multiple buffers can be locked and unlocked in the loop below, and it
2348  * would not be feasible to identify and lock all of those buffers before
2349  * the loop, we must do a final check at the end.
2350  *
2351  * The check here could be omitted with no loss of correctness; it is
2352  * present strictly as an optimization.
2353  *
2354  * For heap inserts, we only need to check for table-level SSI locks. Our
2355  * new tuples can't possibly conflict with existing tuple locks, and heap
2356  * page locks are only consolidated versions of tuple locks; they do not
2357  * lock "gaps" as index page locks do. So we don't need to specify a
2358  * buffer when making the call, which makes for a faster check.
2359  */
2360  CheckForSerializableConflictIn(relation, NULL, InvalidBlockNumber);
2361 
2362  ndone = 0;
2363  while (ndone < ntuples)
2364  {
2365  Buffer buffer;
2366  bool starting_with_empty_page;
2367  bool all_visible_cleared = false;
2368  bool all_frozen_set = false;
2369  int nthispage;
2370 
2371  CHECK_FOR_INTERRUPTS();
2372 
2373  /*
2374  * Find buffer where at least the next tuple will fit. If the page is
2375  * all-visible, this will also pin the requisite visibility map page.
2376  *
2377  * Also pin visibility map page if COPY FREEZE inserts tuples into an
2378  * empty page. See all_frozen_set below.
2379  */
2380  buffer = RelationGetBufferForTuple(relation, heaptuples[ndone]->t_len,
2381  InvalidBuffer, options, bistate,
2382  &vmbuffer, NULL);
2383  page = BufferGetPage(buffer);
2384 
2385  starting_with_empty_page = PageGetMaxOffsetNumber(page) == 0;
2386 
2387  if (starting_with_empty_page && (options & HEAP_INSERT_FROZEN))
2388  all_frozen_set = true;
2389 
2390  /* NO EREPORT(ERROR) from here till changes are logged */
2391  START_CRIT_SECTION();
2392 
2393  /*
2394  * RelationGetBufferForTuple has ensured that the first tuple fits.
2395  * Put that on the page, and then as many other tuples as fit.
2396  */
2397  RelationPutHeapTuple(relation, buffer, heaptuples[ndone], false);
2398 
2399  /*
2400  * For logical decoding we need combo CIDs to properly decode the
2401  * catalog.
2402  */
2403  if (needwal && need_cids)
2404  log_heap_new_cid(relation, heaptuples[ndone]);
2405 
2406  for (nthispage = 1; ndone + nthispage < ntuples; nthispage++)
2407  {
2408  HeapTuple heaptup = heaptuples[ndone + nthispage];
2409 
2410  if (PageGetHeapFreeSpace(page) < MAXALIGN(heaptup->t_len) + saveFreeSpace)
2411  break;
2412 
2413  RelationPutHeapTuple(relation, buffer, heaptup, false);
2414 
2415  /*
2416  * For logical decoding we need combo CIDs to properly decode the
2417  * catalog.
2418  */
2419  if (needwal && need_cids)
2420  log_heap_new_cid(relation, heaptup);
2421  }
2422 
2423  /*
2424  * If the page is all visible, need to clear that, unless we're only
2425  * going to add further frozen rows to it.
2426  *
2427  * If we're only adding already frozen rows to a previously empty
2428  * page, mark it as all-visible.
2429  */
2430  if (PageIsAllVisible(page) && !(options & HEAP_INSERT_FROZEN))
2431  {
2432  all_visible_cleared = true;
2433  PageClearAllVisible(page);
2434  visibilitymap_clear(relation,
2435  BufferGetBlockNumber(buffer),
2436  vmbuffer, VISIBILITYMAP_VALID_BITS);
2437  }
2438  else if (all_frozen_set)
2439  PageSetAllVisible(page);
2440 
2441  /*
2442  * XXX Should we set PageSetPrunable on this page ? See heap_insert()
2443  */
2444 
2445  MarkBufferDirty(buffer);
2446 
2447  /* XLOG stuff */
2448  if (needwal)
2449  {
2450  XLogRecPtr recptr;
2451  xl_heap_multi_insert *xlrec;
2452  uint8 info = XLOG_HEAP2_MULTI_INSERT;
2453  char *tupledata;
2454  int totaldatalen;
2455  char *scratchptr = scratch.data;
2456  bool init;
2457  int bufflags = 0;
2458 
2459  /*
2460  * If the page was previously empty, we can reinit the page
2461  * instead of restoring the whole thing.
2462  */
2463  init = starting_with_empty_page;
2464 
2465  /* allocate xl_heap_multi_insert struct from the scratch area */
2466  xlrec = (xl_heap_multi_insert *) scratchptr;
2467  scratchptr += SizeOfHeapMultiInsert;
2468 
2469  /*
2470  * Allocate the offsets array, unless we're reinitializing the page;
2471  * in that case the tuples are stored in order starting at
2472  * FirstOffsetNumber and we don't need to store the offsets
2473  * explicitly.
2474  */
2475  if (!init)
2476  scratchptr += nthispage * sizeof(OffsetNumber);
2477 
2478  /* the rest of the scratch space is used for tuple data */
2479  tupledata = scratchptr;
2480 
2481  /* check that the mutually exclusive flags are not both set */
2482  Assert(!(all_visible_cleared && all_frozen_set));
2483 
2484  xlrec->flags = 0;
2485  if (all_visible_cleared)
2486  xlrec->flags |= XLH_INSERT_ALL_VISIBLE_CLEARED;
2487  if (all_frozen_set)
2488  xlrec->flags |= XLH_INSERT_ALL_FROZEN_SET;
2489 
2490  xlrec->ntuples = nthispage;
2491 
2492  /*
2493  * Write out an xl_multi_insert_tuple and the tuple data itself
2494  * for each tuple.
2495  */
2496  for (i = 0; i < nthispage; i++)
2497  {
2498  HeapTuple heaptup = heaptuples[ndone + i];
2499  xl_multi_insert_tuple *tuphdr;
2500  int datalen;
2501 
2502  if (!init)
2503  xlrec->offsets[i] = ItemPointerGetOffsetNumber(&heaptup->t_self);
2504  /* xl_multi_insert_tuple needs two-byte alignment. */
2505  tuphdr = (xl_multi_insert_tuple *) SHORTALIGN(scratchptr);
2506  scratchptr = ((char *) tuphdr) + SizeOfMultiInsertTuple;
2507 
2508  tuphdr->t_infomask2 = heaptup->t_data->t_infomask2;
2509  tuphdr->t_infomask = heaptup->t_data->t_infomask;
2510  tuphdr->t_hoff = heaptup->t_data->t_hoff;
2511 
2512  /* write bitmap [+ padding] [+ oid] + data */
2513  datalen = heaptup->t_len - SizeofHeapTupleHeader;
2514  memcpy(scratchptr,
2515  (char *) heaptup->t_data + SizeofHeapTupleHeader,
2516  datalen);
2517  tuphdr->datalen = datalen;
2518  scratchptr += datalen;
2519  }
2520  totaldatalen = scratchptr - tupledata;
2521  Assert((scratchptr - scratch.data) < BLCKSZ);
2522 
2523  if (need_tuple_data)
2524  xlrec->flags |= XLH_INSERT_CONTAINS_NEW_TUPLE;
2525 
2526  /*
2527  * Signal that this is the last xl_heap_multi_insert record
2528  * emitted by this call to heap_multi_insert(). Needed for logical
2529  * decoding so it knows when to cleanup temporary data.
2530  */
2531  if (ndone + nthispage == ntuples)
2532  xlrec->flags |= XLH_INSERT_LAST_IN_MULTI;
2533 
2534  if (init)
2535  {
2536  info |= XLOG_HEAP_INIT_PAGE;
2537  bufflags |= REGBUF_WILL_INIT;
2538  }
2539 
2540  /*
2541  * If we're doing logical decoding, include the new tuple data
2542  * even if we take a full-page image of the page.
2543  */
2544  if (need_tuple_data)
2545  bufflags |= REGBUF_KEEP_DATA;
2546 
2547  XLogBeginInsert();
2548  XLogRegisterData((char *) xlrec, tupledata - scratch.data);
2549  XLogRegisterBuffer(0, buffer, REGBUF_STANDARD | bufflags);
2550 
2551  XLogRegisterBufData(0, tupledata, totaldatalen);
2552 
2553  /* filtering by origin on a row level is much more efficient */
2554  XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN);
2555 
2556  recptr = XLogInsert(RM_HEAP2_ID, info);
2557 
2558  PageSetLSN(page, recptr);
2559  }
2560 
2561  END_CRIT_SECTION();
2562 
2563  /*
2564  * If we've frozen everything on the page, update the visibilitymap.
2565  * We're already holding pin on the vmbuffer.
2566  */
2567  if (all_frozen_set)
2568  {
2569  Assert(PageIsAllVisible(page));
2570  Assert(visibilitymap_pin_ok(BufferGetBlockNumber(buffer), vmbuffer));
2571 
2572  /*
2573  * It's fine to use InvalidTransactionId here - this is only used
2574  * when HEAP_INSERT_FROZEN is specified, which intentionally
2575  * violates visibility rules.
2576  */
2577  visibilitymap_set(relation, BufferGetBlockNumber(buffer), buffer,
2578  InvalidXLogRecPtr, vmbuffer,
2579  InvalidTransactionId,
2580  VISIBILITYMAP_ALL_VISIBLE | VISIBILITYMAP_ALL_FROZEN);
2581  }
2582 
2583  UnlockReleaseBuffer(buffer);
2584  ndone += nthispage;
2585 
2586  /*
2587  * NB: Only release vmbuffer after inserting all tuples - it's fairly
2588  * likely that we'll insert into subsequent heap pages that are likely
2589  * to use the same vm page.
2590  */
2591  }
2592 
2593  /* We're done with inserting all tuples, so release the last vmbuffer. */
2594  if (vmbuffer != InvalidBuffer)
2595  ReleaseBuffer(vmbuffer);
2596 
2597  /*
2598  * We're done with the actual inserts. Check for conflicts again, to
2599  * ensure that all rw-conflicts in to these inserts are detected. Without
2600  * this final check, a sequential scan of the heap may have locked the
2601  * table after the "before" check, missing one opportunity to detect the
2602  * conflict, and then scanned the table before the new tuples were there,
2603  * missing the other chance to detect the conflict.
2604  *
2605  * For heap inserts, we only need to check for table-level SSI locks. Our
2606  * new tuples can't possibly conflict with existing tuple locks, and heap
2607  * page locks are only consolidated versions of tuple locks; they do not
2608  * lock "gaps" as index page locks do. So we don't need to specify a
2609  * buffer when making the call.
2610  */
2611  CheckForSerializableConflictIn(relation, NULL, InvalidBlockNumber);
2612 
2613  /*
2614  * If tuples are cachable, mark them for invalidation from the caches in
2615  * case we abort. Note it is OK to do this after releasing the buffer,
2616  * because the heaptuples data structure is all in local memory, not in
2617  * the shared buffer.
2618  */
2619  if (IsCatalogRelation(relation))
2620  {
2621  for (i = 0; i < ntuples; i++)
2622  CacheInvalidateHeapTuple(relation, heaptuples[i], NULL);
2623  }
2624 
2625  /* copy t_self fields back to the caller's slots */
2626  for (i = 0; i < ntuples; i++)
2627  slots[i]->tts_tid = heaptuples[i]->t_self;
2628 
2629  pgstat_count_heap_insert(relation, ntuples);
2630 }
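
/*
 * Illustrative sketch (not part of heapam.c): driving heap_multi_insert()
 * from an array of heap tuples via TupleTableSlots, roughly the shape of the
 * tableam multi-insert path.  "rel" and "tuples" are assumed to be supplied
 * by the caller; in practice callers arrive here via table_multi_insert().
 */
static void
multi_insert_tuples(Relation rel, HeapTuple *tuples, int ntuples)
{
	TupleTableSlot **slots = palloc(ntuples * sizeof(TupleTableSlot *));
	BulkInsertState bistate = GetBulkInsertState();
	int			i;

	for (i = 0; i < ntuples; i++)
	{
		slots[i] = MakeSingleTupleTableSlot(RelationGetDescr(rel),
											&TTSOpsHeapTuple);
		ExecStoreHeapTuple(tuples[i], slots[i], false);
	}

	heap_multi_insert(rel, slots, ntuples, GetCurrentCommandId(true),
					  0, bistate);

	FreeBulkInsertState(bistate);
	for (i = 0; i < ntuples; i++)
		ExecDropSingleTupleTableSlot(slots[i]);
	pfree(slots);
}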
2631 
2632 /*
2633  * simple_heap_insert - insert a tuple
2634  *
2635  * Currently, this routine differs from heap_insert only in supplying
2636  * a default command ID and not allowing access to the speedup options.
2637  *
2638  * This should be used rather than using heap_insert directly in most places
2639  * where we are modifying system catalogs.
2640  */
2641 void
2642  simple_heap_insert(Relation relation, HeapTuple tup)
2643  {
2644  heap_insert(relation, tup, GetCurrentCommandId(true), 0, NULL);
2645 }
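
/*
 * Illustrative sketch (not part of heapam.c): simple_heap_insert() for a
 * catalog-style insertion where no speedup options or explicit command ID
 * are needed.  "rel" and "tup" are assumed to be supplied by the caller;
 * real catalog callers normally use CatalogTupleInsert(), which also
 * maintains the catalog's indexes.
 */
static void
insert_catalog_style(Relation rel, HeapTuple tup)
{
	simple_heap_insert(rel, tup);

	/* the TID assigned by the insert is now available in tup->t_self */
	elog(DEBUG1, "inserted at (%u,%u)",
		 ItemPointerGetBlockNumber(&tup->t_self),
		 ItemPointerGetOffsetNumber(&tup->t_self));
}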
2646 
2647 /*
2648  * Given infomask/infomask2, compute the bits that must be saved in the
2649  * "infobits" field of xl_heap_delete, xl_heap_update, xl_heap_lock,
2650  * xl_heap_lock_updated WAL records.
2651  *
2652  * See fix_infomask_from_infobits.
2653  */
2654 static uint8
2655 compute_infobits(uint16 infomask, uint16 infomask2)
2656 {
2657  return
2658  ((infomask & HEAP_XMAX_IS_MULTI) != 0 ? XLHL_XMAX_IS_MULTI : 0) |
2659  ((infomask & HEAP_XMAX_LOCK_ONLY) != 0 ? XLHL_XMAX_LOCK_ONLY : 0) |
2660  ((infomask & HEAP_XMAX_EXCL_LOCK) != 0 ? XLHL_XMAX_EXCL_LOCK : 0) |
2661  /* note we ignore HEAP_XMAX_SHR_LOCK here */
2662  ((infomask & HEAP_XMAX_KEYSHR_LOCK) != 0 ? XLHL_XMAX_KEYSHR_LOCK : 0) |
2663  ((infomask2 & HEAP_KEYS_UPDATED) != 0 ?
2664  XLHL_KEYS_UPDATED : 0);
2665 }
2666 
2667 /*
2668  * Given two versions of the same t_infomask for a tuple, compare them and
2669  * return whether the relevant status for a tuple Xmax has changed. This is
2670  * used after a buffer lock has been released and reacquired: we want to ensure
2671  * that the tuple state continues to be the same it was when we previously
2672  * examined it.
2673  *
2674  * Note the Xmax field itself must be compared separately.
2675  */
2676 static inline bool
2677 xmax_infomask_changed(uint16 new_infomask, uint16 old_infomask)
2678 {
2679  const uint16 interesting =
2680  HEAP_XMAX_IS_MULTI | HEAP_XMAX_LOCK_ONLY | HEAP_LOCK_MASK | HEAP_XMAX_INVALID;
2681 
2682  if ((new_infomask & interesting) != (old_infomask & interesting))
2683  return true;
2684 
2685  return false;
2686 }
2687 
2688 /*
2689  * heap_delete - delete a tuple
2690  *
2691  * See table_tuple_delete() for an explanation of the parameters, except that
2692  * this routine directly takes a tuple rather than a slot.
2693  *
2694  * In the failure cases, the routine fills *tmfd with the tuple's t_ctid,
2695  * t_xmax (resolving a possible MultiXact, if necessary), and t_cmax (the last
2696  * only for TM_SelfModified, since we cannot obtain cmax from a combo CID
2697  * generated by another transaction).
2698  */
2699 TM_Result
2700  heap_delete(Relation relation, ItemPointer tid,
2701  CommandId cid, Snapshot crosscheck, bool wait,
2702  TM_FailureData *tmfd, bool changingPart)
2703 {
2704  TM_Result result;
2705  TransactionId xid = GetCurrentTransactionId();
2706  ItemId lp;
2707  HeapTupleData tp;
2708  Page page;
2709  BlockNumber block;
2710  Buffer buffer;
2711  Buffer vmbuffer = InvalidBuffer;
2712  TransactionId new_xmax;
2713  uint16 new_infomask,
2714  new_infomask2;
2715  bool have_tuple_lock = false;
2716  bool iscombo;
2717  bool all_visible_cleared = false;
2718  HeapTuple old_key_tuple = NULL; /* replica identity of the tuple */
2719  bool old_key_copied = false;
2720 
2721  Assert(ItemPointerIsValid(tid));
2722 
2723  /*
2724  * Forbid this during a parallel operation, lest it allocate a combo CID.
2725  * Other workers might need that combo CID for visibility checks, and we
2726  * have no provision for broadcasting it to them.
2727  */
2728  if (IsInParallelMode())
2729  ereport(ERROR,
2730  (errcode(ERRCODE_INVALID_TRANSACTION_STATE),
2731  errmsg("cannot delete tuples during a parallel operation")));
2732 
2733  block = ItemPointerGetBlockNumber(tid);
2734  buffer = ReadBuffer(relation, block);
2735  page = BufferGetPage(buffer);
2736 
2737  /*
2738  * Before locking the buffer, pin the visibility map page if it appears to
2739  * be necessary. Since we haven't got the lock yet, someone else might be
2740  * in the middle of changing this, so we'll need to recheck after we have
2741  * the lock.
2742  */
2743  if (PageIsAllVisible(page))
2744  visibilitymap_pin(relation, block, &vmbuffer);
2745 
2745 
2746  LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
2747 
2748  /*
2749  * If we didn't pin the visibility map page and the page has become all
2750  * visible while we were busy locking the buffer, we'll have to unlock and
2751  * re-lock, to avoid holding the buffer lock across an I/O. That's a bit
2752  * unfortunate, but hopefully shouldn't happen often.
2753  */
2754  if (vmbuffer == InvalidBuffer && PageIsAllVisible(page))
2755  {
2756  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
2757  visibilitymap_pin(relation, block, &vmbuffer);
2758  LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
2759  }
2760 
2761  lp = PageGetItemId(page, ItemPointerGetOffsetNumber(tid));
2762  Assert(ItemIdIsNormal(lp));
2763 
2764  tp.t_tableOid = RelationGetRelid(relation);
2765  tp.t_data = (HeapTupleHeader) PageGetItem(page, lp);
2766  tp.t_len = ItemIdGetLength(lp);
2767  tp.t_self = *tid;
2768 
2769 l1:
2770  result = HeapTupleSatisfiesUpdate(&tp, cid, buffer);
2771 
2772  if (result == TM_Invisible)
2773  {
2774  UnlockReleaseBuffer(buffer);
2775  ereport(ERROR,
2776  (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
2777  errmsg("attempted to delete invisible tuple")));
2778  }
2779  else if (result == TM_BeingModified && wait)
2780  {
2781  TransactionId xwait;
2782  uint16 infomask;
2783 
2784  /* must copy state data before unlocking buffer */
2785  xwait = HeapTupleHeaderGetRawXmax(tp.t_data);
2786  infomask = tp.t_data->t_infomask;
2787 
2788  /*
2789  * Sleep until concurrent transaction ends -- except when there's a
2790  * single locker and it's our own transaction. Note we don't care
2791  * which lock mode the locker has, because we need the strongest one.
2792  *
2793  * Before sleeping, we need to acquire tuple lock to establish our
2794  * priority for the tuple (see heap_lock_tuple). LockTuple will
2795  * release us when we are next-in-line for the tuple.
2796  *
2797  * If we are forced to "start over" below, we keep the tuple lock;
2798  * this arranges that we stay at the head of the line while rechecking
2799  * tuple state.
2800  */
2801  if (infomask & HEAP_XMAX_IS_MULTI)
2802  {
2803  bool current_is_member = false;
2804 
2805  if (DoesMultiXactIdConflict((MultiXactId) xwait, infomask,
2806  LockTupleExclusive, &current_is_member))
2807  {
2808  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
2809 
2810  /*
2811  * Acquire the lock, if necessary (but skip it when we're
2812  * requesting a lock and already have one; avoids deadlock).
2813  */
2814  if (!current_is_member)
2815  heap_acquire_tuplock(relation, &(tp.t_self), LockTupleExclusive,
2816  LockWaitBlock, &have_tuple_lock);
2817 
2818  /* wait for multixact */
2819  MultiXactIdWait((MultiXactId) xwait, MultiXactStatusUpdate, infomask,
2820  relation, &(tp.t_self), XLTW_Delete,
2821  NULL);
2822  LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
2823 
2824  /*
2825  * If xwait had just locked the tuple then some other xact
2826  * could update this tuple before we get to this point. Check
2827  * for xmax change, and start over if so.
2828  */
2829  if (xmax_infomask_changed(tp.t_data->t_infomask, infomask) ||
2830  !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tp.t_data),
2831  xwait))
2832  goto l1;
2833  }
2834 
2835  /*
2836  * You might think the multixact is necessarily done here, but not
2837  * so: it could have surviving members, namely our own xact or
2838  * other subxacts of this backend. It is legal for us to delete
2839  * the tuple in either case, however (the latter case is
2840  * essentially a situation of upgrading our former shared lock to
2841  * exclusive). We don't bother changing the on-disk hint bits
2842  * since we are about to overwrite the xmax altogether.
2843  */
2844  }
2845  else if (!TransactionIdIsCurrentTransactionId(xwait))
2846  {
2847  /*
2848  * Wait for regular transaction to end; but first, acquire tuple
2849  * lock.
2850  */
2851  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
2852  heap_acquire_tuplock(relation, &(tp.t_self), LockTupleExclusive,
2853  LockWaitBlock, &have_tuple_lock);
2854  XactLockTableWait(xwait, relation, &(tp.t_self), XLTW_Delete);
2855  LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
2856 
2857  /*
2858  * xwait is done, but if xwait had just locked the tuple then some
2859  * other xact could update this tuple before we get to this point.
2860  * Check for xmax change, and start over if so.
2861  */
2862  if (xmax_infomask_changed(tp.t_data->t_infomask, infomask) ||
2863  !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tp.t_data),
2864  xwait))
2865  goto l1;
2866 
2867  /* Otherwise check if it committed or aborted */
2868  UpdateXmaxHintBits(tp.t_data, buffer, xwait);
2869  }
2870 
2871  /*
2872  * We may overwrite if previous xmax aborted, or if it committed but
2873  * only locked the tuple without updating it.
2874  */
2875  if ((tp.t_data->t_infomask & HEAP_XMAX_INVALID) ||
2876  HEAP_XMAX_IS_LOCKED_ONLY(tp.t_data->t_infomask) ||
2877  HeapTupleHeaderIsOnlyLocked(tp.t_data))
2878  result = TM_Ok;
2879  else if (!ItemPointerEquals(&tp.t_self, &tp.t_data->t_ctid))
2880  result = TM_Updated;
2881  else
2882  result = TM_Deleted;
2883  }
2884 
2885  if (crosscheck != InvalidSnapshot && result == TM_Ok)
2886  {
2887  /* Perform additional check for transaction-snapshot mode RI updates */
2888  if (!HeapTupleSatisfiesVisibility(&tp, crosscheck, buffer))
2889  result = TM_Updated;
2890  }
2891 
2892  if (result != TM_Ok)
2893  {
2894  Assert(result == TM_SelfModified ||
2895  result == TM_Updated ||
2896  result == TM_Deleted ||
2897  result == TM_BeingModified);
2898  Assert(!(tp.t_data->t_infomask & HEAP_XMAX_INVALID));
2899  Assert(result != TM_Updated ||
2900  !ItemPointerEquals(&tp.t_self, &tp.t_data->t_ctid));
2901  tmfd->ctid = tp.t_data->t_ctid;
2902  tmfd->xmax = HeapTupleHeaderGetUpdateXid(tp.t_data);
2903  if (result == TM_SelfModified)
2904  tmfd->cmax = HeapTupleHeaderGetCmax(tp.t_data);
2905  else
2906  tmfd->cmax = InvalidCommandId;
2907  UnlockReleaseBuffer(buffer);
2908  if (have_tuple_lock)
2909  UnlockTupleTuplock(relation, &(tp.t_self), LockTupleExclusive);
2910  if (vmbuffer != InvalidBuffer)
2911  ReleaseBuffer(vmbuffer);
2912  return result;
2913  }
2914 
2915  /*
2916  * We're about to do the actual delete -- check for conflict first, to
2917  * avoid possibly having to roll back work we've just done.
2918  *
2919  * This is safe without a recheck as long as there is no possibility of
2920  * another process scanning the page between this check and the delete
2921  * being visible to the scan (i.e., an exclusive buffer content lock is
2922  * continuously held from this point until the tuple delete is visible).
2923  */
2924  CheckForSerializableConflictIn(relation, tid, BufferGetBlockNumber(buffer));
2925 
2926  /* replace cid with a combo CID if necessary */
2927  HeapTupleHeaderAdjustCmax(tp.t_data, &cid, &iscombo);
2928 
2929  /*
2930  * Compute replica identity tuple before entering the critical section so
2931  * we don't PANIC upon a memory allocation failure.
2932  */
2933  old_key_tuple = ExtractReplicaIdentity(relation, &tp, true, &old_key_copied);
2934 
2935  /*
2936  * If this is the first possibly-multixact-able operation in the current
2937  * transaction, set my per-backend OldestMemberMXactId setting. We can be
2938  * certain that the transaction will never become a member of any older
2939  * MultiXactIds than that. (We have to do this even if we end up just
2940  * using our own TransactionId below, since some other backend could
2941  * incorporate our XID into a MultiXact immediately afterwards.)
2942  */
2943  MultiXactIdSetOldestMember();
2944 
2945  compute_new_xmax_infomask(HeapTupleHeaderGetRawXmax(tp.t_data),
2946  tp.t_data->t_infomask, tp.t_data->t_infomask2,
2947  xid, LockTupleExclusive, true,
2948  &new_xmax, &new_infomask, &new_infomask2);
2949 
2950  START_CRIT_SECTION();
2951 
2952  /*
2953  * If this transaction commits, the tuple will become DEAD sooner or
2954  * later. Set flag that this page is a candidate for pruning once our xid
2955  * falls below the OldestXmin horizon. If the transaction finally aborts,
2956  * the subsequent page pruning will be a no-op and the hint will be
2957  * cleared.
2958  */
2959  PageSetPrunable(page, xid);
2960 
2961  if (PageIsAllVisible(page))
2962  {
2963  all_visible_cleared = true;
2964  PageClearAllVisible(page);
2965  visibilitymap_clear(relation, BufferGetBlockNumber(buffer),
2966  vmbuffer, VISIBILITYMAP_VALID_BITS);
2967  }
2968 
2969  /* store transaction information of xact deleting the tuple */
2970  tp.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
2971  tp.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
2972  tp.t_data->t_infomask |= new_infomask;
2973  tp.t_data->t_infomask2 |= new_infomask2;
2974  HeapTupleHeaderClearHotUpdated(tp.t_data);
2975  HeapTupleHeaderSetXmax(tp.t_data, new_xmax);
2976  HeapTupleHeaderSetCmax(tp.t_data, cid, iscombo);
2977  /* Make sure there is no forward chain link in t_ctid */
2978  tp.t_data->t_ctid = tp.t_self;
2979 
2980  /* Signal that this is actually a move into another partition */
2981  if (changingPart)
2982  HeapTupleHeaderSetMovedPartitions(tp.t_data);
2983 
2984  MarkBufferDirty(buffer);
2985 
2986  /*
2987  * XLOG stuff
2988  *
2989  * NB: heap_abort_speculative() uses the same xlog record and replay
2990  * routines.
2991  */
2992  if (RelationNeedsWAL(relation))
2993  {
2994  xl_heap_delete xlrec;
2995  xl_heap_header xlhdr;
2996  XLogRecPtr recptr;
2997 
2998  /*
2999  * For logical decode we need combo CIDs to properly decode the
3000  * catalog
3001  */
3002  if (RelationIsAccessibleInLogicalDecoding(relation))
3003  log_heap_new_cid(relation, &tp);
3004 
3005  xlrec.flags = 0;
3006  if (all_visible_cleared)
3007  xlrec.flags |= XLH_DELETE_ALL_VISIBLE_CLEARED;
3008  if (changingPart)
3009  xlrec.flags |= XLH_DELETE_IS_PARTITION_MOVE;
3010  xlrec.infobits_set = compute_infobits(tp.t_data->t_infomask,
3011  tp.t_data->t_infomask2);
3012  xlrec.offnum = ItemPointerGetOffsetNumber(&tp.t_self);
3013  xlrec.xmax = new_xmax;
3014 
3015  if (old_key_tuple != NULL)
3016  {
3017  if (relation->rd_rel->relreplident == REPLICA_IDENTITY_FULL)
3018  xlrec.flags |= XLH_DELETE_CONTAINS_OLD_TUPLE;
3019  else
3020  xlrec.flags |= XLH_DELETE_CONTAINS_OLD_KEY;
3021  }
3022 
3023  XLogBeginInsert();
3024  XLogRegisterData((char *) &xlrec, SizeOfHeapDelete);
3025 
3026  XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
3027 
3028  /*
3029  * Log replica identity of the deleted tuple if there is one
3030  */
3031  if (old_key_tuple != NULL)
3032  {
3033  xlhdr.t_infomask2 = old_key_tuple->t_data->t_infomask2;
3034  xlhdr.t_infomask = old_key_tuple->t_data->t_infomask;
3035  xlhdr.t_hoff = old_key_tuple->t_data->t_hoff;
3036 
3037  XLogRegisterData((char *) &xlhdr, SizeOfHeapHeader);
3038  XLogRegisterData((char *) old_key_tuple->t_data
3039  + SizeofHeapTupleHeader,
3040  old_key_tuple->t_len
3041  - SizeofHeapTupleHeader);
3042  }
3043 
3044  /* filtering by origin on a row level is much more efficient */
3045  XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN);
3046 
3047  recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_DELETE);
3048 
3049  PageSetLSN(page, recptr);
3050  }
3051 
3052  END_CRIT_SECTION();
3053 
3054  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3055 
3056  if (vmbuffer != InvalidBuffer)
3057  ReleaseBuffer(vmbuffer);
3058 
3059  /*
3060  * If the tuple has toasted out-of-line attributes, we need to delete
3061  * those items too. We have to do this before releasing the buffer
3062  * because we need to look at the contents of the tuple, but it's OK to
3063  * release the content lock on the buffer first.
3064  */
3065  if (relation->rd_rel->relkind != RELKIND_RELATION &&
3066  relation->rd_rel->relkind != RELKIND_MATVIEW)
3067  {
3068  /* toast table entries should never be recursively toasted */
3069  Assert(!HeapTupleHasExternal(&tp));
3070  }
3071  else if (HeapTupleHasExternal(&tp))
3072  heap_toast_delete(relation, &tp, false);
3073 
3074  /*
3075  * Mark tuple for invalidation from system caches at next command
3076  * boundary. We have to do this before releasing the buffer because we
3077  * need to look at the contents of the tuple.
3078  */
3079  CacheInvalidateHeapTuple(relation, &tp, NULL);
3080 
3081  /* Now we can release the buffer */
3082  ReleaseBuffer(buffer);
3083 
3084  /*
3085  * Release the lmgr tuple lock, if we had it.
3086  */
3087  if (have_tuple_lock)
3088  UnlockTupleTuplock(relation, &(tp.t_self), LockTupleExclusive);
3089 
3090  pgstat_count_heap_delete(relation);
3091 
3092  if (old_key_tuple != NULL && old_key_copied)
3093  heap_freetuple(old_key_tuple);
3094 
3095  return TM_Ok;
3096 }
3097 
3098 /*
3099  * simple_heap_delete - delete a tuple
3100  *
3101  * This routine may be used to delete a tuple when concurrent updates of
3102  * the target tuple are not expected (for example, because we have a lock
3103  * on the relation associated with the tuple). Any failure is reported
3104  * via ereport().
3105  */
3106 void
3107  simple_heap_delete(Relation relation, ItemPointer tid)
3108  {
3109  TM_Result result;
3110  TM_FailureData tmfd;
3111 
3112  result = heap_delete(relation, tid,
3113  GetCurrentCommandId(true), InvalidSnapshot,
3114  true /* wait for commit */ ,
3115  &tmfd, false /* changingPart */ );
3116  switch (result)
3117  {
3118  case TM_SelfModified:
3119  /* Tuple was already updated in current command? */
3120  elog(ERROR, "tuple already updated by self");
3121  break;
3122 
3123  case TM_Ok:
3124  /* done successfully */
3125  break;
3126 
3127  case TM_Updated:
3128  elog(ERROR, "tuple concurrently updated");
3129  break;
3130 
3131  case TM_Deleted:
3132  elog(ERROR, "tuple concurrently deleted");
3133  break;
3134 
3135  default:
3136  elog(ERROR, "unrecognized heap_delete status: %u", result);
3137  break;
3138  }
3139 }
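
/*
 * Illustrative sketch (not part of heapam.c): deleting rows with
 * simple_heap_delete() while scanning, the typical pattern for maintenance
 * code that holds a lock strong enough to rule out concurrent updates.
 * "rel" is assumed to be opened and suitably locked by the caller, with an
 * active snapshot set.
 */
static void
delete_all_visible_rows(Relation rel)
{
	TableScanDesc scan = table_beginscan(rel, GetActiveSnapshot(), 0, NULL);
	HeapTuple	tup;

	while ((tup = heap_getnext(scan, ForwardScanDirection)) != NULL)
		simple_heap_delete(rel, &tup->t_self);

	table_endscan(scan);
}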
3140 
3141 /*
3142  * heap_update - replace a tuple
3143  *
3144  * See table_tuple_update() for an explanation of the parameters, except that
3145  * this routine directly takes a tuple rather than a slot.
3146  *
3147  * In the failure cases, the routine fills *tmfd with the tuple's t_ctid,
3148  * t_xmax (resolving a possible MultiXact, if necessary), and t_cmax (the last
3149  * only for TM_SelfModified, since we cannot obtain cmax from a combo CID
3150  * generated by another transaction).
3151  */
3152 TM_Result
3153  heap_update(Relation relation, ItemPointer otid, HeapTuple newtup,
3154  CommandId cid, Snapshot crosscheck, bool wait,
3155  TM_FailureData *tmfd, LockTupleMode *lockmode)
3156 {
3157  TM_Result result;
3158  TransactionId xid = GetCurrentTransactionId();
3159  Bitmapset *hot_attrs;
3160  Bitmapset *key_attrs;
3161  Bitmapset *id_attrs;
3162  Bitmapset *interesting_attrs;
3163  Bitmapset *modified_attrs;
3164  ItemId lp;
3165  HeapTupleData oldtup;
3166  HeapTuple heaptup;
3167  HeapTuple old_key_tuple = NULL;
3168  bool old_key_copied = false;
3169  Page page;
3170  BlockNumber block;
3171  MultiXactStatus mxact_status;
3172  Buffer buffer,
3173  newbuf,
3174  vmbuffer = InvalidBuffer,
3175  vmbuffer_new = InvalidBuffer;
3176  bool need_toast;
3177  Size newtupsize,
3178  pagefree;
3179  bool have_tuple_lock = false;
3180  bool iscombo;
3181  bool use_hot_update = false;
3182  bool hot_attrs_checked = false;
3183  bool key_intact;
3184  bool all_visible_cleared = false;
3185  bool all_visible_cleared_new = false;
3186  bool checked_lockers;
3187  bool locker_remains;
3188  TransactionId xmax_new_tuple,
3189  xmax_old_tuple;
3190  uint16 infomask_old_tuple,
3191  infomask2_old_tuple,
3192  infomask_new_tuple,
3193  infomask2_new_tuple;
3194 
3195  Assert(ItemPointerIsValid(otid));
3196 
3197  /* Cheap, simplistic check that the tuple matches the rel's rowtype. */
3198  Assert(HeapTupleHeaderGetNatts(newtup->t_data) <=
3199  RelationGetNumberOfAttributes(relation));
3200 
3201  /*
3202  * Forbid this during a parallel operation, lest it allocate a combo CID.
3203  * Other workers might need that combo CID for visibility checks, and we
3204  * have no provision for broadcasting it to them.
3205  */
3206  if (IsInParallelMode())
3207  ereport(ERROR,
3208  (errcode(ERRCODE_INVALID_TRANSACTION_STATE),
3209  errmsg("cannot update tuples during a parallel operation")));
3210 
3211  /*
3212  * Fetch the list of attributes to be checked for various operations.
3213  *
3214  * For HOT considerations, this is wasted effort if we fail to update or
3215  * have to put the new tuple on a different page. But we must compute the
3216  * list before obtaining buffer lock --- in the worst case, if we are
3217  * doing an update on one of the relevant system catalogs, we could
3218  * deadlock if we try to fetch the list later. In any case, the relcache
3219  * caches the data so this is usually pretty cheap.
3220  *
3221  * We also need columns used by the replica identity and columns that are
3222  * considered the "key" of rows in the table.
3223  *
3224  * Note that we get copies of each bitmap, so we need not worry about
3225  * relcache flush happening midway through.
3226  */
3227  hot_attrs = RelationGetIndexAttrBitmap(relation, INDEX_ATTR_BITMAP_ALL);
3228  key_attrs = RelationGetIndexAttrBitmap(relation, INDEX_ATTR_BITMAP_KEY);
3229  id_attrs = RelationGetIndexAttrBitmap(relation,
3230  INDEX_ATTR_BITMAP_IDENTITY_KEY);
3231 
3232 
3233  block = ItemPointerGetBlockNumber(otid);
3234  buffer = ReadBuffer(relation, block);
3235  page = BufferGetPage(buffer);
3236 
3237  interesting_attrs = NULL;
3238 
3239  /*
3240  * If the page is already full, there is hardly any chance of doing a HOT
3241  * update on this page. It might be wasteful effort to look for index
3242  * column updates only to later reject HOT updates for lack of space in
3243  * the same page. So we are conservative and only fetch hot_attrs if the
3244  * page is not already full. Since we are already holding a pin on the
3245  * buffer, there is no chance that the buffer can get cleaned up
3246  * concurrently and even if that was possible, in the worst case we lose a
3247  * chance to do a HOT update.
3248  */
3249  if (!PageIsFull(page))
3250  {
3251  interesting_attrs = bms_add_members(interesting_attrs, hot_attrs);
3252  hot_attrs_checked = true;
3253  }
3254  interesting_attrs = bms_add_members(interesting_attrs, key_attrs);
3255  interesting_attrs = bms_add_members(interesting_attrs, id_attrs);
3256 
3257  /*
3258  * Before locking the buffer, pin the visibility map page if it appears to
3259  * be necessary. Since we haven't got the lock yet, someone else might be
3260  * in the middle of changing this, so we'll need to recheck after we have
3261  * the lock.
3262  */
3263  if (PageIsAllVisible(page))
3264  visibilitymap_pin(relation, block, &vmbuffer);
3265 
3265 
3266  LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3267 
3268  lp = PageGetItemId(page, ItemPointerGetOffsetNumber(otid));
3269  Assert(ItemIdIsNormal(lp));
3270 
3271  /*
3272  * Fill in enough data in oldtup for HeapDetermineModifiedColumns to work
3273  * properly.
3274  */
3275  oldtup.t_tableOid = RelationGetRelid(relation);
3276  oldtup.t_data = (HeapTupleHeader) PageGetItem(page, lp);
3277  oldtup.t_len = ItemIdGetLength(lp);
3278  oldtup.t_self = *otid;
3279 
3280  /* the new tuple is ready, except for this: */
3281  newtup->t_tableOid = RelationGetRelid(relation);
3282 
3283  /* Determine columns modified by the update. */
3284  modified_attrs = HeapDetermineModifiedColumns(relation, interesting_attrs,
3285  &oldtup, newtup);
3286 
3287  /*
3288  * If we're not updating any "key" column, we can grab a weaker lock type.
3289  * This allows for more concurrency when we are running simultaneously
3290  * with foreign key checks.
3291  *
3292  * Note that if a column gets detoasted while executing the update, but
3293  * the value ends up being the same, this test will fail and we will use
3294  * the stronger lock. This is acceptable; the important case to optimize
3295  * is updates that don't manipulate key columns, not those that
3296  * serendipitously arrive at the same key values.
3297  */
3298  if (!bms_overlap(modified_attrs, key_attrs))
3299  {
3300  *lockmode = LockTupleNoKeyExclusive;
3301  mxact_status = MultiXactStatusNoKeyUpdate;
3302  key_intact = true;
3303 
3304  /*
3305  * If this is the first possibly-multixact-able operation in the
3306  * current transaction, set my per-backend OldestMemberMXactId
3307  * setting. We can be certain that the transaction will never become a
3308  * member of any older MultiXactIds than that. (We have to do this
3309  * even if we end up just using our own TransactionId below, since
3310  * some other backend could incorporate our XID into a MultiXact
3311  * immediately afterwards.)
3312  */
3313  MultiXactIdSetOldestMember();
3314  }
3315  else
3316  {
3317  *lockmode = LockTupleExclusive;
3318  mxact_status = MultiXactStatusUpdate;
3319  key_intact = false;
3320  }
3321 
3322  /*
3323  * Note: beyond this point, use oldtup not otid to refer to old tuple.
3324  * otid may very well point at newtup->t_self, which we will overwrite
3325  * with the new tuple's location, so there's great risk of confusion if we
3326  * use otid anymore.
3327  */
3328 
3329 l2:
3330  checked_lockers = false;
3331  locker_remains = false;
3332  result = HeapTupleSatisfiesUpdate(&oldtup, cid, buffer);
3333 
3334  /* see below about the "no wait" case */
3335  Assert(result != TM_BeingModified || wait);
3336 
3337  if (result == TM_Invisible)
3338  {
3339  UnlockReleaseBuffer(buffer);
3340  ereport(ERROR,
3341  (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
3342  errmsg("attempted to update invisible tuple")));
3343  }
3344  else if (result == TM_BeingModified && wait)
3345  {
3346  TransactionId xwait;
3347  uint16 infomask;
3348  bool can_continue = false;
3349 
3350  /*
3351  * XXX note that we don't consider the "no wait" case here. This
3352  * isn't a problem currently because no caller uses that case, but it
3353  * should be fixed if such a caller is introduced. It wasn't a
3354  * problem previously because this code would always wait, but now
3355  * that some tuple locks do not conflict with one of the lock modes we
3356  * use, it is possible that this case is interesting to handle
3357  * specially.
3358  *
3359  * This may cause failures with third-party code that calls
3360  * heap_update directly.
3361  */
3362 
3363  /* must copy state data before unlocking buffer */
3364  xwait = HeapTupleHeaderGetRawXmax(oldtup.t_data);
3365  infomask = oldtup.t_data->t_infomask;
3366 
3367  /*
3368  * Now we have to do something about the existing locker. If it's a
3369  * multi, sleep on it; we might be awakened before it is completely
3370  * gone (or even not sleep at all in some cases); we need to preserve
3371  * it as locker, unless it is gone completely.
3372  *
3373  * If it's not a multi, we need to check for sleeping conditions
3374  * before actually going to sleep. If the update doesn't conflict
3375  * with the locks, we just continue without sleeping (but making sure
3376  * it is preserved).
3377  *
3378  * Before sleeping, we need to acquire tuple lock to establish our
3379  * priority for the tuple (see heap_lock_tuple). LockTuple will
3380  * release us when we are next-in-line for the tuple. Note we must
3381  * not acquire the tuple lock until we're sure we're going to sleep;
3382  * otherwise we're open for race conditions with other transactions
3383  * holding the tuple lock which sleep on us.
3384  *
3385  * If we are forced to "start over" below, we keep the tuple lock;
3386  * this arranges that we stay at the head of the line while rechecking
3387  * tuple state.
3388  */
3389  if (infomask & HEAP_XMAX_IS_MULTI)
3390  {
3391  TransactionId update_xact;
3392  int remain;
3393  bool current_is_member = false;
3394 
3395  if (DoesMultiXactIdConflict((MultiXactId) xwait, infomask,
3396  *lockmode, &current_is_member))
3397  {
3398  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3399 
3400  /*
3401  * Acquire the lock, if necessary (but skip it when we're
3402  * requesting a lock and already have one; avoids deadlock).
3403  */
3404  if (!current_is_member)
3405  heap_acquire_tuplock(relation, &(oldtup.t_self), *lockmode,
3406  LockWaitBlock, &have_tuple_lock);
3407 
3408  /* wait for multixact */
3409  MultiXactIdWait((MultiXactId) xwait, mxact_status, infomask,
3410  relation, &oldtup.t_self, XLTW_Update,
3411  &remain);
3412  checked_lockers = true;
3413  locker_remains = remain != 0;
3414  LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3415 
3416  /*
3417  * If xwait had just locked the tuple then some other xact
3418  * could update this tuple before we get to this point. Check
3419  * for xmax change, and start over if so.
3420  */
3421  if (xmax_infomask_changed(oldtup.t_data->t_infomask,
3422  infomask) ||
3423  !TransactionIdEquals(HeapTupleHeaderGetRawXmax(oldtup.t_data),
3424  xwait))
3425  goto l2;
3426  }
3427 
3428  /*
3429  * Note that the multixact may not be done by now. It could have
3430  * surviving members; our own xact or other subxacts of this
3431  * backend, and also any other concurrent transaction that locked
3432  * the tuple with LockTupleKeyShare if we only got
3433  * LockTupleNoKeyExclusive. If this is the case, we have to be
3434  * careful to mark the updated tuple with the surviving members in
3435  * Xmax.
3436  *
3437  * Note that there could have been another update in the
3438  * MultiXact. In that case, we need to check whether it committed
3439  * or aborted. If it aborted we are safe to update it again;
3440  * otherwise there is an update conflict, and we have to return
3441  * TableTuple{Deleted, Updated} below.
3442  *
3443  * In the LockTupleExclusive case, we still need to preserve the
3444  * surviving members: those would include the tuple locks we had
3445  * before this one, which are important to keep in case this
3446  * subxact aborts.
3447  */
3448  if (!HEAP_XMAX_IS_LOCKED_ONLY(oldtup.t_data->t_infomask))
3449  update_xact = HeapTupleGetUpdateXid(oldtup.t_data);
3450  else
3451  update_xact = InvalidTransactionId;
3452 
3453  /*
3454  * There was no UPDATE in the MultiXact; or it aborted. No
3455  * TransactionIdIsInProgress() call needed here, since we called
3456  * MultiXactIdWait() above.
3457  */
3458  if (!TransactionIdIsValid(update_xact) ||
3459  TransactionIdDidAbort(update_xact))
3460  can_continue = true;
3461  }
3462  else if (TransactionIdIsCurrentTransactionId(xwait))
3463  {
3464  /*
3465  * The only locker is ourselves; we can avoid grabbing the tuple
3466  * lock here, but must preserve our locking information.
3467  */
3468  checked_lockers = true;
3469  locker_remains = true;
3470  can_continue = true;
3471  }
3472  else if (HEAP_XMAX_IS_KEYSHR_LOCKED(infomask) && key_intact)
3473  {
3474  /*
3475  * If it's just a key-share locker, and we're not changing the key
3476  * columns, we don't need to wait for it to end; but we need to
3477  * preserve it as locker.
3478  */
3479  checked_lockers = true;
3480  locker_remains = true;
3481  can_continue = true;
3482  }
3483  else
3484  {
3485  /*
3486  * Wait for regular transaction to end; but first, acquire tuple
3487  * lock.
3488  */
3489  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3490  heap_acquire_tuplock(relation, &(oldtup.t_self), *lockmode,
3491  LockWaitBlock, &have_tuple_lock);
3492  XactLockTableWait(xwait, relation, &oldtup.t_self,
3493  XLTW_Update);
3494  checked_lockers = true;
3495  LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3496 
3497  /*
3498  * xwait is done, but if xwait had just locked the tuple then some
3499  * other xact could update this tuple before we get to this point.
3500  * Check for xmax change, and start over if so.
3501  */
3502  if (xmax_infomask_changed(oldtup.t_data->t_infomask, infomask) ||
3503  !TransactionIdEquals(xwait,
3504  HeapTupleHeaderGetRawXmax(oldtup.t_data)))
3505  goto l2;
3506 
3507  /* Otherwise check if it committed or aborted */
3508  UpdateXmaxHintBits(oldtup.t_data, buffer, xwait);
3509  if (oldtup.t_data->t_infomask & HEAP_XMAX_INVALID)
3510  can_continue = true;
3511  }
3512 
3513  if (can_continue)
3514  result = TM_Ok;
3515  else if (!ItemPointerEquals(&oldtup.t_self, &oldtup.t_data->t_ctid))
3516  result = TM_Updated;
3517  else
3518  result = TM_Deleted;
3519  }
3520 
3521  if (crosscheck != InvalidSnapshot && result == TM_Ok)
3522  {
3523  /* Perform additional check for transaction-snapshot mode RI updates */
3524  if (!HeapTupleSatisfiesVisibility(&oldtup, crosscheck, buffer))
3525  {
3526  result = TM_Updated;
3527  Assert(!ItemPointerEquals(&oldtup.t_self, &oldtup.t_data->t_ctid));
3528  }
3529  }
3530 
3531  if (result != TM_Ok)
3532  {
3533  Assert(result == TM_SelfModified ||
3534  result == TM_Updated ||
3535  result == TM_Deleted ||
3536  result == TM_BeingModified);
3537  Assert(!(oldtup.t_data->t_infomask & HEAP_XMAX_INVALID));
3538  Assert(result != TM_Updated ||
3539  !ItemPointerEquals(&oldtup.t_self, &oldtup.t_data->t_ctid));
3540  tmfd->ctid = oldtup.t_data->t_ctid;
3541  tmfd->xmax = HeapTupleHeaderGetUpdateXid(oldtup.t_data);
3542  if (result == TM_SelfModified)
3543  tmfd->cmax = HeapTupleHeaderGetCmax(oldtup.t_data);
3544  else
3545  tmfd->cmax = InvalidCommandId;
3546  UnlockReleaseBuffer(buffer);
3547  if (have_tuple_lock)
3548  UnlockTupleTuplock(relation, &(oldtup.t_self), *lockmode);
3549  if (vmbuffer != InvalidBuffer)
3550  ReleaseBuffer(vmbuffer);
3551  bms_free(hot_attrs);
3552  bms_free(key_attrs);
3553  bms_free(id_attrs);
3554  bms_free(modified_attrs);
3555  bms_free(interesting_attrs);
3556  return result;
3557  }
3558 
3559  /*
3560  * If we didn't pin the visibility map page and the page has become all
3561  * visible while we were busy locking the buffer, or during some
3562  * subsequent window during which we had it unlocked, we'll have to unlock
3563  * and re-lock, to avoid holding the buffer lock across an I/O. That's a
3564  * bit unfortunate, especially since we'll now have to recheck whether the
3565  * tuple has been locked or updated under us, but hopefully it won't
3566  * happen very often.
3567  */
3568  if (vmbuffer == InvalidBuffer && PageIsAllVisible(page))
3569  {
3570  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3571  visibilitymap_pin(relation, block, &vmbuffer);
3572  LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3573  goto l2;
3574  }
3575 
3576  /* Fill in transaction status data */
3577 
3578  /*
3579  * If the tuple we're updating is locked, we need to preserve the locking
3580  * info in the old tuple's Xmax. Prepare a new Xmax value for this.
3581  */
3582  compute_new_xmax_infomask(HeapTupleHeaderGetRawXmax(oldtup.t_data),
3583  oldtup.t_data->t_infomask,
3584  oldtup.t_data->t_infomask2,
3585  xid, *lockmode, true,
3586  &xmax_old_tuple, &infomask_old_tuple,
3587  &infomask2_old_tuple);
3588 
3589  /*
3590  * And also prepare an Xmax value for the new copy of the tuple. If there
3591  * was no xmax previously, or there was one but all lockers are now gone,
3592  * then use InvalidXid; otherwise, get the xmax from the old tuple. (In
3593  * rare cases that might also be InvalidXid and yet not have the
3594  * HEAP_XMAX_INVALID bit set; that's fine.)
3595  */
3596  if ((oldtup.t_data->t_infomask & HEAP_XMAX_INVALID) ||
3597  HEAP_LOCKED_UPGRADED(oldtup.t_data->t_infomask) ||
3598  (checked_lockers && !locker_remains))
3599  xmax_new_tuple = InvalidTransactionId;
3600  else
3601  xmax_new_tuple = HeapTupleHeaderGetRawXmax(oldtup.t_data);
3602 
3603  if (!TransactionIdIsValid(xmax_new_tuple))
3604  {
3605  infomask_new_tuple = HEAP_XMAX_INVALID;
3606  infomask2_new_tuple = 0;
3607  }
3608  else
3609  {
3610  /*
3611  * If we found a valid Xmax for the new tuple, then the infomask bits
3612  * to use on the new tuple depend on what was there on the old one.
3613  * Note that since we're doing an update, the only possibility is that
3614  * the lockers had FOR KEY SHARE lock.
3615  */
3616  if (oldtup.t_data->t_infomask & HEAP_XMAX_IS_MULTI)
3617  {
3618  GetMultiXactIdHintBits(xmax_new_tuple, &infomask_new_tuple,
3619  &infomask2_new_tuple);
3620  }
3621  else
3622  {
3623  infomask_new_tuple = HEAP_XMAX_KEYSHR_LOCK | HEAP_XMAX_LOCK_ONLY;
3624  infomask2_new_tuple = 0;
3625  }
3626  }
3627 
3628  /*
3629  * Prepare the new tuple with the appropriate initial values of Xmin and
3630  * Xmax, as well as initial infomask bits as computed above.
3631  */
3632  newtup->t_data->t_infomask &= ~(HEAP_XACT_MASK);
3633  newtup->t_data->t_infomask2 &= ~(HEAP2_XACT_MASK);
3634  HeapTupleHeaderSetXmin(newtup->t_data, xid);
3635  HeapTupleHeaderSetCmin(newtup->t_data, cid);
3636  newtup->t_data->t_infomask |= HEAP_UPDATED | infomask_new_tuple;
3637  newtup->t_data->t_infomask2 |= infomask2_new_tuple;
3638  HeapTupleHeaderSetXmax(newtup->t_data, xmax_new_tuple);
3639 
3640  /*
3641  * Replace cid with a combo CID if necessary. Note that we already put
3642  * the plain cid into the new tuple.
3643  */
3644  HeapTupleHeaderAdjustCmax(oldtup.t_data, &cid, &iscombo);
3645 
3646  /*
3647  * If the toaster needs to be activated, OR if the new tuple will not fit
3648  * on the same page as the old, then we need to release the content lock
3649  * (but not the pin!) on the old tuple's buffer while we are off doing
3650  * TOAST and/or table-file-extension work. We must mark the old tuple to
3651  * show that it's locked, else other processes may try to update it
3652  * themselves.
3653  *
3654  * We need to invoke the toaster if there are already any out-of-line
3655  * toasted values present, or if the new tuple is over-threshold.
3656  */
3657  if (relation->rd_rel->relkind != RELKIND_RELATION &&
3658  relation->rd_rel->relkind != RELKIND_MATVIEW)
3659  {
3660  /* toast table entries should never be recursively toasted */
3661  Assert(!HeapTupleHasExternal(&oldtup));
3662  Assert(!HeapTupleHasExternal(newtup));
3663  need_toast = false;
3664  }
3665  else
3666  need_toast = (HeapTupleHasExternal(&oldtup) ||
3667  HeapTupleHasExternal(newtup) ||
3668  newtup->t_len > TOAST_TUPLE_THRESHOLD);
3669 
3670  pagefree = PageGetHeapFreeSpace(page);
3671 
3672  newtupsize = MAXALIGN(newtup->t_len);
3673 
3674  if (need_toast || newtupsize > pagefree)
3675  {
3676  TransactionId xmax_lock_old_tuple;
3677  uint16 infomask_lock_old_tuple,
3678  infomask2_lock_old_tuple;
3679  bool cleared_all_frozen = false;
3680 
3681  /*
3682  * To prevent concurrent sessions from updating the tuple, we have to
3683  * temporarily mark it locked, while we release the page-level lock.
3684  *
3685  * To satisfy the rule that any xid potentially appearing in a buffer
3686  * written out to disk must first be covered by WAL, we unfortunately
3687  * have to WAL-log this temporary modification. We can reuse xl_heap_lock for this
3688  * purpose. If we crash/error before following through with the
3689  * actual update, xmax will be of an aborted transaction, allowing
3690  * other sessions to proceed.
3691  */
3692 
3693  /*
3694  * Compute xmax / infomask appropriate for locking the tuple. This has
3695  * to be done separately from the combo that's going to be used for
3696  * updating, because the potentially created multixact would otherwise
3697  * be wrong.
3698  */
3699  compute_new_xmax_infomask(HeapTupleHeaderGetRawXmax(oldtup.t_data),
3700  oldtup.t_data->t_infomask,
3701  oldtup.t_data->t_infomask2,
3702  xid, *lockmode, false,
3703  &xmax_lock_old_tuple, &infomask_lock_old_tuple,
3704  &infomask2_lock_old_tuple);
3705 
3706  Assert(HEAP_XMAX_IS_LOCKED_ONLY(infomask_lock_old_tuple));
3707 
3708  START_CRIT_SECTION();
3709 
3710  /* Clear obsolete visibility flags ... */
3711  oldtup.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
3712  oldtup.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
3713  HeapTupleClearHotUpdated(&oldtup);
3714  /* ... and store info about transaction updating this tuple */
3715  Assert(TransactionIdIsValid(xmax_lock_old_tuple));
3716  HeapTupleHeaderSetXmax(oldtup.t_data, xmax_lock_old_tuple);
3717  oldtup.t_data->t_infomask |= infomask_lock_old_tuple;
3718  oldtup.t_data->t_infomask2 |= infomask2_lock_old_tuple;
3719  HeapTupleHeaderSetCmax(oldtup.t_data, cid, iscombo);
3720 
3721  /* temporarily make it look not-updated, but locked */
3722  oldtup.t_data->t_ctid = oldtup.t_self;
3723 
3724  /*
3725  * Clear all-frozen bit on visibility map if needed. We could
3726  * immediately reset ALL_VISIBLE, but given that the WAL logging
3727  * overhead would be unchanged, that doesn't seem necessarily
3728  * worthwhile.
3729  */
3730  if (PageIsAllVisible(page) &&
3731  visibilitymap_clear(relation, block, vmbuffer,
3732  VISIBILITYMAP_ALL_FROZEN))
3733  cleared_all_frozen = true;
3734 
3735  MarkBufferDirty(buffer);
3736 
3737  if (RelationNeedsWAL(relation))
3738  {
3739  xl_heap_lock xlrec;
3740  XLogRecPtr recptr;
3741 
3742  XLogBeginInsert();
3743  XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
3744 
3745  xlrec.offnum = ItemPointerGetOffsetNumber(&oldtup.t_self);
3746  xlrec.locking_xid = xmax_lock_old_tuple;
3747  xlrec.infobits_set = compute_infobits(oldtup.t_data->t_infomask,
3748  oldtup.t_data->t_infomask2);
3749  xlrec.flags =
3750  cleared_all_frozen ? XLH_LOCK_ALL_FROZEN_CLEARED : 0;
3751  XLogRegisterData((char *) &xlrec, SizeOfHeapLock);
3752  recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_LOCK);
3753  PageSetLSN(page, recptr);
3754  }
3755 
3756  END_CRIT_SECTION();
3757 
3758  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3759 
3760  /*
3761  * Let the toaster do its thing, if needed.
3762  *
3763  * Note: below this point, heaptup is the data we actually intend to
3764  * store into the relation; newtup is the caller's original untoasted
3765  * data.
3766  */
3767  if (need_toast)
3768  {
3769  /* Note we always use WAL and FSM during updates */
3770  heaptup = heap_toast_insert_or_update(relation, newtup, &oldtup, 0);
3771  newtupsize = MAXALIGN(heaptup->t_len);
3772  }
3773  else
3774  heaptup = newtup;
3775 
3776  /*
3777  * Now, do we need a new page for the tuple, or not? This is a bit
3778  * tricky since someone else could have added tuples to the page while
3779  * we weren't looking. We have to recheck the available space after
3780  * reacquiring the buffer lock. But don't bother to do that if the
3781  * former amount of free space is still not enough; it's unlikely
3782  * there's more free now than before.
3783  *
3784  * What's more, if we need to get a new page, we will need to acquire
3785  * buffer locks on both old and new pages. To avoid deadlock against
3786  * some other backend trying to get the same two locks in the other
3787  * order, we must be consistent about the order we get the locks in.
3788  * We use the rule "lock the lower-numbered page of the relation
3789  * first". To implement this, we must do RelationGetBufferForTuple
3790  * while not holding the lock on the old page, and we must rely on it
3791  * to get the locks on both pages in the correct order.
3792  *
3793  * Another consideration is that we need visibility map page pin(s) if
3794  * we will have to clear the all-visible flag on either page. If we
3795  * call RelationGetBufferForTuple, we rely on it to acquire any such
3796  * pins; but if we don't, we have to handle that here. Hence we need
3797  * a loop.
3798  */
3799  for (;;)
3800  {
3801  if (newtupsize > pagefree)
3802  {
3803  /* It doesn't fit, must use RelationGetBufferForTuple. */
3804  newbuf = RelationGetBufferForTuple(relation, heaptup->t_len,
3805  buffer, 0, NULL,
3806  &vmbuffer_new, &vmbuffer);
3807  /* We're all done. */
3808  break;
3809  }
3810  /* Acquire VM page pin if needed and we don't have it. */
3811  if (vmbuffer == InvalidBuffer && PageIsAllVisible(page))
3812  visibilitymap_pin(relation, block, &vmbuffer);
3813  /* Re-acquire the lock on the old tuple's page. */
3814  LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3815  /* Re-check using the up-to-date free space */
3816  pagefree = PageGetHeapFreeSpace(page);
3817  if (newtupsize > pagefree ||
3818  (vmbuffer == InvalidBuffer && PageIsAllVisible(page)))
3819  {
3820  /*
3821  * Rats, it doesn't fit anymore, or somebody just now set the
3822  * all-visible flag. We must now unlock and loop to avoid
3823  * deadlock. Fortunately, this path should seldom be taken.
3824  */
3825  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3826  }
3827  else
3828  {
3829  /* We're all done. */
3830  newbuf = buffer;
3831  break;
3832  }
3833  }
3834  }
3835  else
3836  {
3837  /* No TOAST work needed, and it'll fit on same page */
3838  newbuf = buffer;
3839  heaptup = newtup;
3840  }
3841 
3842  /*
3843  * We're about to do the actual update -- check for conflict first, to
3844  * avoid possibly having to roll back work we've just done.
3845  *
3846  * This is safe without a recheck as long as there is no possibility of
3847  * another process scanning the pages between this check and the update
3848  * being visible to the scan (i.e., exclusive buffer content lock(s) are
3849  * continuously held from this point until the tuple update is visible).
3850  *
3851  * For the new tuple the only check needed is at the relation level, but
3852  * since both tuples are in the same relation and the check for oldtup
3853  * will include checking the relation level, there is no benefit to a
3854  * separate check for the new tuple.
3855  */
3856  CheckForSerializableConflictIn(relation, &oldtup.t_self,
3857  BufferGetBlockNumber(buffer));
3858 
3859  /*
3860  * At this point newbuf and buffer are both pinned and locked, and newbuf
3861  * has enough space for the new tuple. If they are the same buffer, only
3862  * one pin is held.
3863  */
3864 
3865  if (newbuf == buffer)
3866  {
3867  /*
3868  * Since the new tuple is going into the same page, we might be able
3869  * to do a HOT update. Check if any of the index columns have been
3870  * changed. If the page was already full, we may have skipped checking
3871  * for index columns, and also can't do a HOT update.
3872  */
3873  if (hot_attrs_checked && !bms_overlap(modified_attrs, hot_attrs))
3874  use_hot_update = true;
3875  }
3876  else
3877  {
3878  /* Set a hint that the old page could use prune/defrag */
3879  PageSetFull(page);
3880  }
3881 
3882  /*
3883  * Compute replica identity tuple before entering the critical section so
3884  * we don't PANIC upon a memory allocation failure.
3885  * ExtractReplicaIdentity() will return NULL if nothing needs to be
3886  * logged.
3887  */
3888  old_key_tuple = ExtractReplicaIdentity(relation, &oldtup,
3889  bms_overlap(modified_attrs, id_attrs),
3890  &old_key_copied);
3891 
3892  /* NO EREPORT(ERROR) from here till changes are logged */
3893  START_CRIT_SECTION();
3894 
3895  /*
3896  * If this transaction commits, the old tuple will become DEAD sooner or
3897  * later. Set flag that this page is a candidate for pruning once our xid
3898  * falls below the OldestXmin horizon. If the transaction finally aborts,
3899  * the subsequent page pruning will be a no-op and the hint will be
3900  * cleared.
3901  *
3902  * XXX Should we set hint on newbuf as well? If the transaction aborts,
3903  * there would be a prunable tuple in the newbuf; but for now we choose
3904  * not to optimize for aborts. Note that heap_xlog_update must be kept in
3905  * sync if this decision changes.
3906  */
3907  PageSetPrunable(page, xid);
3908 
3909  if (use_hot_update)
3910  {
3911  /* Mark the old tuple as HOT-updated */
3912  HeapTupleSetHotUpdated(&oldtup);
3913  /* And mark the new tuple as heap-only */
3914  HeapTupleSetHeapOnly(heaptup);
3915  /* Mark the caller's copy too, in case different from heaptup */
3916  HeapTupleSetHeapOnly(newtup);
3917  }
3918  else
3919  {
3920  /* Make sure tuples are correctly marked as not-HOT */
3921  HeapTupleClearHotUpdated(&oldtup);
3922  HeapTupleClearHeapOnly(heaptup);
3923  HeapTupleClearHeapOnly(newtup);
3924  }
3925 
3926  RelationPutHeapTuple(relation, newbuf, heaptup, false); /* insert new tuple */
3927 
3928 
3929  /* Clear obsolete visibility flags, possibly set by ourselves above... */
3930  oldtup.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
3931  oldtup.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
3932  /* ... and store info about transaction updating this tuple */
3933  Assert(TransactionIdIsValid(xmax_old_tuple));
3934  HeapTupleHeaderSetXmax(oldtup.t_data, xmax_old_tuple);
3935  oldtup.t_data->t_infomask |= infomask_old_tuple;
3936  oldtup.t_data->t_infomask2 |= infomask2_old_tuple;
3937  HeapTupleHeaderSetCmax(oldtup.t_data, cid, iscombo);
3938 
3939  /* record address of new tuple in t_ctid of old one */
3940  oldtup.t_data->t_ctid = heaptup->t_self;
3941 
3942  /* clear PD_ALL_VISIBLE flags, reset all visibilitymap bits */
3943  if (PageIsAllVisible(BufferGetPage(buffer)))
3944  {
3945  all_visible_cleared = true;
3946  PageClearAllVisible(BufferGetPage(buffer));
3947  visibilitymap_clear(relation, BufferGetBlockNumber(buffer),
3948  vmbuffer, VISIBILITYMAP_VALID_BITS);
3949  }
3950  if (newbuf != buffer && PageIsAllVisible(BufferGetPage(newbuf)))
3951  {
3952  all_visible_cleared_new = true;
3953  PageClearAllVisible(BufferGetPage(newbuf));
3954  visibilitymap_clear(relation, BufferGetBlockNumber(newbuf),
3955  vmbuffer_new, VISIBILITYMAP_VALID_BITS);
3956  }
3957 
3958  if (newbuf != buffer)
3959  MarkBufferDirty(newbuf);
3960  MarkBufferDirty(buffer);
3961 
3962  /* XLOG stuff */
3963  if (RelationNeedsWAL(relation))
3964  {
3965  XLogRecPtr recptr;
3966 
3967  /*
3968  * For logical decoding we need combo CIDs to properly decode the
3969  * catalog.
3970  */
3971  if (RelationIsAccessibleInLogicalDecoding(relation))
3972  {
3973  log_heap_new_cid(relation, &oldtup);
3974  log_heap_new_cid(relation, heaptup);
3975  }
3976 
3977  recptr = log_heap_update(relation, buffer,
3978  newbuf, &oldtup, heaptup,
3979  old_key_tuple,
3980  all_visible_cleared,
3981  all_visible_cleared_new);
3982  if (newbuf != buffer)
3983  {
3984  PageSetLSN(BufferGetPage(newbuf), recptr);
3985  }
3986  PageSetLSN(BufferGetPage(buffer), recptr);
3987  }
3988 
3989  END_CRIT_SECTION();
3990 
3991  if (newbuf != buffer)
3992  LockBuffer(newbuf, BUFFER_LOCK_UNLOCK);
3993  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3994 
3995  /*
3996  * Mark old tuple for invalidation from system caches at next command
3997  * boundary, and mark the new tuple for invalidation in case we abort. We
3998  * have to do this before releasing the buffer because oldtup is in the
3999  * buffer. (heaptup is all in local memory, but it's necessary to process
4000  * both tuple versions in one call to inval.c so we can avoid redundant
4001  * sinval messages.)
4002  */
4003  CacheInvalidateHeapTuple(relation, &oldtup, heaptup);
4004 
4005  /* Now we can release the buffer(s) */
4006  if (newbuf != buffer)
4007  ReleaseBuffer(newbuf);
4008  ReleaseBuffer(buffer);
4009  if (BufferIsValid(vmbuffer_new))
4010  ReleaseBuffer(vmbuffer_new);
4011  if (BufferIsValid(vmbuffer))
4012  ReleaseBuffer(vmbuffer);
4013 
4014  /*
4015  * Release the lmgr tuple lock, if we had it.
4016  */
4017  if (have_tuple_lock)
4018  UnlockTupleTuplock(relation, &(oldtup.t_self), *lockmode);
4019 
4020  pgstat_count_heap_update(relation, use_hot_update);
4021 
4022  /*
4023  * If heaptup is a private copy, release it. Don't forget to copy t_self
4024  * back to the caller's image, too.
4025  */
4026  if (heaptup != newtup)
4027  {
4028  newtup->t_self = heaptup->t_self;
4029  heap_freetuple(heaptup);
4030  }
4031 
4032  if (old_key_tuple != NULL && old_key_copied)
4033  heap_freetuple(old_key_tuple);
4034 
4035  bms_free(hot_attrs);
4036  bms_free(key_attrs);
4037  bms_free(id_attrs);
4038  bms_free(modified_attrs);
4039  bms_free(interesting_attrs);
4040 
4041  return TM_Ok;
4042 }
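/*
 * [Editor's illustrative sketch -- not part of heapam.c.]  A minimal example
 * of how a caller might drive heap_update() and react to its TM_Result,
 * using only the failure data filled in above (tmfd.ctid, tmfd.xmax).  The
 * retry/EvalPlanQual-style handling a real executor performs on TM_Updated
 * and TM_Deleted is deliberately reduced to an error report here.
 */
static void
example_update_or_report(Relation rel, ItemPointer otid, HeapTuple newtup)
{
	TM_Result	result;
	TM_FailureData tmfd;
	LockTupleMode lockmode;

	result = heap_update(rel, otid, newtup,
						 GetCurrentCommandId(true), InvalidSnapshot,
						 true /* wait for commit */ ,
						 &tmfd, &lockmode);

	switch (result)
	{
		case TM_Ok:
			/* success; newtup->t_self now points at the new version */
			break;
		case TM_SelfModified:
			elog(ERROR, "tuple already updated by self");
			break;
		case TM_Updated:
		case TM_Deleted:
			/* a real caller would re-fetch tmfd.ctid and retry, or give up */
			elog(ERROR, "tuple concurrently modified; newest version at (%u,%u)",
				 ItemPointerGetBlockNumber(&tmfd.ctid),
				 ItemPointerGetOffsetNumber(&tmfd.ctid));
			break;
		default:
			elog(ERROR, "unexpected heap_update status: %u", result);
			break;
	}
}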
4043 
4044 /*
4045  * Check if the specified attribute's value is same in both given tuples.
4046  * Subroutine for HeapDetermineModifiedColumns.
4047  */
4048 static bool
4049 heap_tuple_attr_equals(TupleDesc tupdesc, int attrnum,
4050  HeapTuple tup1, HeapTuple tup2)
4051 {
4052  Datum value1,
4053  value2;
4054  bool isnull1,
4055  isnull2;
4056  Form_pg_attribute att;
4057 
4058  /*
4059  * If it's a whole-tuple reference, say "not equal". It's not really
4060  * worth supporting this case, since it could only succeed after a no-op
4061  * update, which is hardly a case worth optimizing for.
4062  */
4063  if (attrnum == 0)
4064  return false;
4065 
4066  /*
4067  * Likewise, automatically say "not equal" for any system attribute other
4068  * than tableOID; we cannot expect these to be consistent in a HOT chain,
4069  * or even to be set correctly yet in the new tuple.
4070  */
4071  if (attrnum < 0)
4072  {
4073  if (attrnum != TableOidAttributeNumber)
4074  return false;
4075  }
4076 
4077  /*
4078  * Extract the corresponding values. XXX this is pretty inefficient if
4079  * there are many indexed columns. Should HeapDetermineModifiedColumns do
4080  * a single heap_deform_tuple call on each tuple, instead? But that
4081  * doesn't work for system columns ...
4082  */
4083  value1 = heap_getattr(tup1, attrnum, tupdesc, &isnull1);
4084  value2 = heap_getattr(tup2, attrnum, tupdesc, &isnull2);
4085 
4086  /*
4087  * If one value is NULL and other is not, then they are certainly not
4088  * equal
4089  */
4090  if (isnull1 != isnull2)
4091  return false;
4092 
4093  /*
4094  * If both are NULL, they can be considered equal.
4095  */
4096  if (isnull1)
4097  return true;
4098 
4099  /*
4100  * We do simple binary comparison of the two datums. This may be overly
4101  * strict because there can be multiple binary representations for the
4102  * same logical value. But we should be OK as long as there are no false
4103  * positives. Using a type-specific equality operator is messy because
4104  * there could be multiple notions of equality in different operator
4105  * classes; furthermore, we cannot safely invoke user-defined functions
4106  * while holding exclusive buffer lock.
4107  */
4108  if (attrnum <= 0)
4109  {
4110  /* The only allowed system columns are OIDs, so do this */
4111  return (DatumGetObjectId(value1) == DatumGetObjectId(value2));
4112  }
4113  else
4114  {
4115  Assert(attrnum <= tupdesc->natts);
4116  att = TupleDescAttr(tupdesc, attrnum - 1);
4117  return datumIsEqual(value1, value2, att->attbyval, att->attlen);
4118  }
4119 }
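/*
 * [Editor's illustrative sketch -- not part of heapam.c.]  datumIsEqual()
 * above performs a raw binary comparison, so logically-equal values with
 * different representations can compare as "not equal" (which is safe here,
 * since only false negatives are possible).  A tiny example of the
 * pass-by-value case, assuming an int4-style attribute:
 */
static bool
example_int4_binary_equal(int32 a, int32 b)
{
	/* typbyval = true, typlen = sizeof(int32); equivalent to a == b */
	return datumIsEqual(Int32GetDatum(a), Int32GetDatum(b),
						true, sizeof(int32));
}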
4120 
4121 /*
4122  * Check which columns are being updated.
4123  *
4124  * Given an updated tuple, determine (and return into the output bitmapset),
4125  * from those listed as interesting, the set of columns that changed.
4126  *
4127  * The input bitmapset is destructively modified; that is OK since this is
4128  * invoked at most once in heap_update.
4129  */
4130 static Bitmapset *
4131 HeapDetermineModifiedColumns(Relation relation, Bitmapset *interesting_cols,
4132  HeapTuple oldtup, HeapTuple newtup)
4133 {
4134  int attnum;
4135  Bitmapset *modified = NULL;
4136 
4137  while ((attnum = bms_first_member(interesting_cols)) >= 0)
4138  {
4139  attnum += FirstLowInvalidHeapAttributeNumber;
4140 
4141  if (!heap_tuple_attr_equals(RelationGetDescr(relation),
4142  attnum, oldtup, newtup))
4143  modified = bms_add_member(modified,
4144  attnum - FirstLowInvalidHeapAttributeNumber);
4145  }
4146 
4147  return modified;
4148 }
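/*
 * [Editor's illustrative sketch -- not part of heapam.c.]  Attribute numbers
 * in these bitmapsets are offset by FirstLowInvalidHeapAttributeNumber so
 * that system attributes (which are negative) fit into a Bitmapset.  For
 * example, testing whether a given user column belongs to an "interesting"
 * set built with that convention:
 */
static bool
example_column_is_interesting(Bitmapset *interesting_cols, AttrNumber attno)
{
	return bms_is_member(attno - FirstLowInvalidHeapAttributeNumber,
						 interesting_cols);
}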
4149 
4150 /*
4151  * simple_heap_update - replace a tuple
4152  *
4153  * This routine may be used to update a tuple when concurrent updates of
4154  * the target tuple are not expected (for example, because we have a lock
4155  * on the relation associated with the tuple). Any failure is reported
4156  * via ereport().
4157  */
4158 void
4159 simple_heap_update(Relation relation, ItemPointer otid, HeapTuple tup)
4160 {
4161  TM_Result result;
4162  TM_FailureData tmfd;
4163  LockTupleMode lockmode;
4164 
4165  result = heap_update(relation, otid, tup,
4166  GetCurrentCommandId(true), InvalidSnapshot,
4167  true /* wait for commit */ ,
4168  &tmfd, &lockmode);
4169  switch (result)
4170  {
4171  case TM_SelfModified:
4172  /* Tuple was already updated in current command? */
4173  elog(ERROR, "tuple already updated by self");
4174  break;
4175 
4176  case TM_Ok:
4177  /* done successfully */
4178  break;
4179 
4180  case TM_Updated:
4181  elog(ERROR, "tuple concurrently updated");
4182  break;
4183 
4184  case TM_Deleted:
4185  elog(ERROR, "tuple concurrently deleted");
4186  break;
4187 
4188  default:
4189  elog(ERROR, "unrecognized heap_update status: %u", result);
4190  break;
4191  }
4192 }
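/*
 * [Editor's illustrative sketch -- not part of heapam.c.]  Typical use of
 * simple_heap_update(): build a modified copy of an existing tuple and
 * replace it in place, assuming the caller holds a lock strong enough to
 * rule out concurrent updates.  The column number is a placeholder, and
 * heap_modify_tuple_by_cols() is assumed available; note that, unlike
 * CatalogTupleUpdate(), this does not maintain any indexes.
 */
static void
example_simple_update(Relation rel, HeapTuple oldtuple, Datum newval)
{
	TupleDesc	tupdesc = RelationGetDescr(rel);
	int			replCols[1] = {1};	/* hypothetical: replace column 1 */
	Datum		replValues[1] = {newval};
	bool		replIsnull[1] = {false};
	HeapTuple	newtuple;

	newtuple = heap_modify_tuple_by_cols(oldtuple, tupdesc,
										 1, replCols, replValues, replIsnull);
	simple_heap_update(rel, &oldtuple->t_self, newtuple);
	CommandCounterIncrement();
	heap_freetuple(newtuple);
}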
4193 
4194 
4195 /*
4196  * Return the MultiXactStatus corresponding to the given tuple lock mode.
4197  */
4198 static MultiXactStatus
4199 get_mxact_status_for_lock(LockTupleMode mode, bool is_update)
4200 {
4201  int retval;
4202 
4203  if (is_update)
4204  retval = tupleLockExtraInfo[mode].updstatus;
4205  else
4206  retval = tupleLockExtraInfo[mode].lockstatus;
4207 
4208  if (retval == -1)
4209  elog(ERROR, "invalid lock tuple mode %d/%s", mode,
4210  is_update ? "true" : "false");
4211 
4212  return (MultiXactStatus) retval;
4213 }
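/*
 * [Editor's note -- not part of heapam.c.]  For reference, the mapping
 * implemented by tupleLockExtraInfo is expected to be roughly:
 *
 *   LockTupleKeyShare       -> MultiXactStatusForKeyShare      (lock only)
 *   LockTupleShare          -> MultiXactStatusForShare         (lock only)
 *   LockTupleNoKeyExclusive -> MultiXactStatusForNoKeyUpdate,
 *                              or MultiXactStatusNoKeyUpdate when is_update
 *   LockTupleExclusive      -> MultiXactStatusForUpdate,
 *                              or MultiXactStatusUpdate when is_update
 *
 * The two weakest modes have no update variant, which is what triggers the
 * elog(ERROR) above.
 */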
4214 
4215 /*
4216  * heap_lock_tuple - lock a tuple in shared or exclusive mode
4217  *
4218  * Note that this acquires a buffer pin, which the caller must release.
4219  *
4220  * Input parameters:
4221  * relation: relation containing tuple (caller must hold suitable lock)
4222  * tid: TID of tuple to lock
4223  * cid: current command ID (used for visibility test, and stored into
4224  * tuple's cmax if lock is successful)
4225  * mode: indicates if shared or exclusive tuple lock is desired
4226  * wait_policy: what to do if tuple lock is not available
4227  * follow_updates: if true, follow the update chain to also lock descendant
4228  * tuples.
4229  *
4230  * Output parameters:
4231  * *tuple: all fields filled in
4232  * *buffer: set to buffer holding tuple (pinned but not locked at exit)
4233  * *tmfd: filled in failure cases (see below)
4234  *
4235  * Function results are the same as the ones for table_tuple_lock().
4236  *
4237  * In the failure cases other than TM_Invisible, the routine fills
4238  * *tmfd with the tuple's t_ctid, t_xmax (resolving a possible MultiXact,
4239  * if necessary), and t_cmax (the last only for TM_SelfModified,
4240  * since we cannot obtain cmax from a combo CID generated by another
4241  * transaction).
4242  * See comments for struct TM_FailureData for additional info.
4243  *
4244  * See README.tuplock for a thorough explanation of this mechanism.
4245  */
4246 TM_Result
4247 heap_lock_tuple(Relation relation, HeapTuple tuple,
4248  CommandId cid, LockTupleMode mode, LockWaitPolicy wait_policy,
4249  bool follow_updates,
4250  Buffer *buffer, TM_FailureData *tmfd)
4251 {
4252  TM_Result result;
4253  ItemPointer tid = &(tuple->t_self);
4254  ItemId lp;
4255  Page page;
4256  Buffer vmbuffer = InvalidBuffer;
4257  BlockNumber block;
4258  TransactionId xid,
4259  xmax;
4260  uint16 old_infomask,
4261  new_infomask,
4262  new_infomask2;
4263  bool first_time = true;
4264  bool skip_tuple_lock = false;
4265  bool have_tuple_lock = false;
4266  bool cleared_all_frozen = false;
4267 
4268  *buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid));
4269  block = ItemPointerGetBlockNumber(tid);
4270 
4271  /*
4272  * Before locking the buffer, pin the visibility map page if it appears to
4273  * be necessary. Since we haven't got the lock yet, someone else might be
4274  * in the middle of changing this, so we'll need to recheck after we have
4275  * the lock.
4276  */
4277  if (PageIsAllVisible(BufferGetPage(*buffer)))
4278  visibilitymap_pin(relation, block, &vmbuffer);
4279 
4280  LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4281 
4282  page = BufferGetPage(*buffer);
4283  lp = PageGetItemId(page, ItemPointerGetOffsetNumber(tid));
4284  Assert(ItemIdIsNormal(lp));
4285 
4286  tuple->t_data = (HeapTupleHeader) PageGetItem(page, lp);
4287  tuple->t_len = ItemIdGetLength(lp);
4288  tuple->t_tableOid = RelationGetRelid(relation);
4289 
4290 l3:
4291  result = HeapTupleSatisfiesUpdate(tuple, cid, *buffer);
4292 
4293  if (result == TM_Invisible)
4294  {
4295  /*
4296  * This is possible, but only when locking a tuple for ON CONFLICT
4297  * UPDATE. We return this value here rather than throwing an error in
4298  * order to give that case the opportunity to throw a more specific
4299  * error.
4300  */
4301  result = TM_Invisible;
4302  goto out_locked;
4303  }
4304  else if (result == TM_BeingModified ||
4305  result == TM_Updated ||
4306  result == TM_Deleted)
4307  {
4308  TransactionId xwait;
4309  uint16 infomask;
4310  uint16 infomask2;
4311  bool require_sleep;
4312  ItemPointerData t_ctid;
4313 
4314  /* must copy state data before unlocking buffer */
4315  xwait = HeapTupleHeaderGetRawXmax(tuple->t_data);
4316  infomask = tuple->t_data->t_infomask;
4317  infomask2 = tuple->t_data->t_infomask2;
4318  ItemPointerCopy(&tuple->t_data->t_ctid, &t_ctid);
4319 
4320  LockBuffer(*buffer, BUFFER_LOCK_UNLOCK);
4321 
4322  /*
4323  * If any subtransaction of the current top transaction already holds
4324  * a lock as strong as or stronger than what we're requesting, we
4325  * effectively hold the desired lock already. We *must* succeed
4326  * without trying to take the tuple lock, else we will deadlock
4327  * against anyone wanting to acquire a stronger lock.
4328  *
4329  * Note we only do this the first time we loop on the HTSU result;
4330  * there is no point in testing in subsequent passes, because
4331  * evidently our own transaction cannot have acquired a new lock after
4332  * the first time we checked.
4333  */
4334  if (first_time)
4335  {
4336  first_time = false;
4337 
4338  if (infomask & HEAP_XMAX_IS_MULTI)
4339  {
4340  int i;
4341  int nmembers;
4342  MultiXactMember *members;
4343 
4344  /*
4345  * We don't need to allow old multixacts here; if that had
4346  * been the case, HeapTupleSatisfiesUpdate would have returned
4347  * MayBeUpdated and we wouldn't be here.
4348  */
4349  nmembers =
4350  GetMultiXactIdMembers(xwait, &members, false,
4351  HEAP_XMAX_IS_LOCKED_ONLY(infomask));
4352 
4353  for (i = 0; i < nmembers; i++)
4354  {
4355  /* only consider members of our own transaction */
4356  if (!TransactionIdIsCurrentTransactionId(members[i].xid))
4357  continue;
4358 
4359  if (TUPLOCK_from_mxstatus(members[i].status) >= mode)
4360  {
4361  pfree(members);
4362  result = TM_Ok;
4363  goto out_unlocked;
4364  }
4365  else
4366  {
4367  /*
4368  * Disable acquisition of the heavyweight tuple lock.
4369  * Otherwise, when promoting a weaker lock, we might
4370  * deadlock with another locker that has acquired the
4371  * heavyweight tuple lock and is waiting for our
4372  * transaction to finish.
4373  *
4374  * Note that in this case we still need to wait for
4375  * the multixact if required, to avoid acquiring
4376  * conflicting locks.
4377  */
4378  skip_tuple_lock = true;
4379  }
4380  }
4381 
4382  if (members)
4383  pfree(members);
4384  }
4385  else if (TransactionIdIsCurrentTransactionId(xwait))
4386  {
4387  switch (mode)
4388  {
4389  case LockTupleKeyShare:
4390  Assert(HEAP_XMAX_IS_KEYSHR_LOCKED(infomask) ||
4391  HEAP_XMAX_IS_SHR_LOCKED(infomask) ||
4392  HEAP_XMAX_IS_EXCL_LOCKED(infomask));
4393  result = TM_Ok;
4394  goto out_unlocked;
4395  case LockTupleShare:
4396  if (HEAP_XMAX_IS_SHR_LOCKED(infomask) ||
4397  HEAP_XMAX_IS_EXCL_LOCKED(infomask))
4398  {
4399  result = TM_Ok;
4400  goto out_unlocked;
4401  }
4402  break;
4403  case LockTupleNoKeyExclusive:
4404  if (HEAP_XMAX_IS_EXCL_LOCKED(infomask))
4405  {
4406  result = TM_Ok;
4407  goto out_unlocked;
4408  }
4409  break;
4410  case LockTupleExclusive:
4411  if (HEAP_XMAX_IS_EXCL_LOCKED(infomask) &&
4412  infomask2 & HEAP_KEYS_UPDATED)
4413  {
4414  result = TM_Ok;
4415  goto out_unlocked;
4416  }
4417  break;
4418  }
4419  }
4420  }
4421 
4422  /*
4423  * Initially assume that we will have to wait for the locking
4424  * transaction(s) to finish. We check various cases below in which
4425  * this can be turned off.
4426  */
4427  require_sleep = true;
4428  if (mode == LockTupleKeyShare)
4429  {
4430  /*
4431  * If we're requesting KeyShare, and there's no update present, we
4432  * don't need to wait. Even if there is an update, we can still
4433  * continue if the key hasn't been modified.
4434  *
4435  * However, if there are updates, we need to walk the update chain
4436  * to mark future versions of the row as locked, too. That way,
4437  * if somebody deletes that future version, we're protected
4438  * against the key going away. This locking of future versions
4439  * could block momentarily, if a concurrent transaction is
4440  * deleting a key; or it could return a value to the effect that
4441  * the transaction deleting the key has already committed. So we
4442  * do this before re-locking the buffer; otherwise this would be
4443  * prone to deadlocks.
4444  *
4445  * Note that the TID we're locking was grabbed before we unlocked
4446  * the buffer. For it to change while we're not looking, the
4447  * other properties we're testing for below after re-locking the
4448  * buffer would also change, in which case we would restart this
4449  * loop above.
4450  */
4451  if (!(infomask2 & HEAP_KEYS_UPDATED))
4452  {
4453  bool updated;
4454 
4455  updated = !HEAP_XMAX_IS_LOCKED_ONLY(infomask);
4456 
4457  /*
4458  * If there are updates, follow the update chain; bail out if
4459  * that cannot be done.
4460  */
4461  if (follow_updates && updated)
4462  {
4463  TM_Result res;
4464 
4465  res = heap_lock_updated_tuple(relation, tuple, &t_ctid,
4466  GetCurrentTransactionId(),
4467  mode);
4468  if (res != TM_Ok)
4469  {
4470  result = res;
4471  /* recovery code expects to have buffer lock held */
4472  LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4473  goto failed;
4474  }
4475  }
4476 
4477  LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4478 
4479  /*
4480  * Make sure it's still an appropriate lock, else start over.
4481  * Also, if it wasn't updated before we released the lock, but
4482  * is updated now, we start over too; the reason is that we
4483  * now need to follow the update chain to lock the new
4484  * versions.
4485  */
4486  if (!HeapTupleHeaderIsOnlyLocked(tuple->t_data) &&
4487  ((tuple->t_data->t_infomask2 & HEAP_KEYS_UPDATED) ||
4488  !updated))
4489  goto l3;
4490 
4491  /* Things look okay, so we can skip sleeping */
4492  require_sleep = false;
4493 
4494  /*
4495  * Note we allow Xmax to change here; other updaters/lockers
4496  * could have modified it before we grabbed the buffer lock.
4497  * However, this is not a problem, because with the recheck we
4498  * just did we ensure that they still don't conflict with the
4499  * lock we want.
4500  */
4501  }
4502  }
4503  else if (mode == LockTupleShare)
4504  {
4505  /*
4506  * If we're requesting Share, we can similarly avoid sleeping if
4507  * there's no update and no exclusive lock present.
4508  */
4509  if (HEAP_XMAX_IS_LOCKED_ONLY(infomask) &&
4510  !HEAP_XMAX_IS_EXCL_LOCKED(infomask))
4511  {
4512  LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4513 
4514  /*
4515  * Make sure it's still an appropriate lock, else start over.
4516  * See above about allowing xmax to change.
4517  */
4518  if (!HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_data->t_infomask) ||
4519  HEAP_XMAX_IS_EXCL_LOCKED(tuple->t_data->t_infomask))
4520  goto l3;
4521  require_sleep = false;
4522  }
4523  }
4524  else if (mode == LockTupleNoKeyExclusive)
4525  {
4526  /*
4527  * If we're requesting NoKeyExclusive, we might also be able to
4528  * avoid sleeping; just ensure that there is no conflicting lock
4529  * already acquired.
4530  */
4531  if (infomask & HEAP_XMAX_IS_MULTI)
4532  {
4533  if (!DoesMultiXactIdConflict((MultiXactId) xwait, infomask,
4534  mode, NULL))
4535  {
4536  /*
4537  * No conflict, but if the xmax changed under us in the
4538  * meantime, start over.
4539  */
4540  LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4541  if (xmax_infomask_changed(tuple->t_data->t_infomask, infomask) ||
4542  !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple->t_data),
4543  xwait))
4544  goto l3;
4545 
4546  /* otherwise, we're good */
4547  require_sleep = false;
4548  }
4549  }
4550  else if (HEAP_XMAX_IS_KEYSHR_LOCKED(infomask))
4551  {
4552  LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4553 
4554  /* if the xmax changed in the meantime, start over */
4555  if (xmax_infomask_changed(tuple->t_data->t_infomask, infomask) ||
4556  !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple->t_data),
4557  xwait))
4558  goto l3;
4559  /* otherwise, we're good */
4560  require_sleep = false;
4561  }
4562  }
4563 
4564  /*
4565  * As a check independent from those above, we can also avoid sleeping
4566  * if the current transaction is the sole locker of the tuple. Note
4567  * that the strength of the lock already held is irrelevant; this is
4568  * not about recording the lock in Xmax (which will be done regardless
4569  * of this optimization, below). Also, note that the cases where we
4570  * hold a lock stronger than we are requesting are already handled
4571  * above by not doing anything.
4572  *
4573  * Note we only deal with the non-multixact case here; MultiXactIdWait
4574  * is well equipped to deal with this situation on its own.
4575  */
4576  if (require_sleep && !(infomask & HEAP_XMAX_IS_MULTI) &&
4577  TransactionIdIsCurrentTransactionId(xwait))
4578  {
4579  /* ... but if the xmax changed in the meantime, start over */
4580  LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4581  if (xmax_infomask_changed(tuple->t_data->t_infomask, infomask) ||
4582  !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple->t_data),
4583  xwait))
4584  goto l3;
4586  require_sleep = false;
4587  }
4588 
4589  /*
4590  * Time to sleep on the other transaction/multixact, if necessary.
4591  *
4592  * If the other transaction is an update/delete that's already
4593  * committed, then sleeping cannot possibly do any good: if we're
4594  * required to sleep, get out to raise an error instead.
4595  *
4596  * By here, we either have already acquired the buffer exclusive lock,
4597  * or we must wait for the locking transaction or multixact; so below
4598  * we ensure that we grab buffer lock after the sleep.
4599  */
4600  if (require_sleep && (result == TM_Updated || result == TM_Deleted))
4601  {
4602  LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4603  goto failed;
4604  }
4605  else if (require_sleep)
4606  {
4607  /*
4608  * Acquire tuple lock to establish our priority for the tuple, or
4609  * die trying. LockTuple will release us when we are next-in-line
4610  * for the tuple. We must do this even if we are share-locking,
4611  * but not if we already have a weaker lock on the tuple.
4612  *
4613  * If we are forced to "start over" below, we keep the tuple lock;
4614  * this arranges that we stay at the head of the line while
4615  * rechecking tuple state.
4616  */
4617  if (!skip_tuple_lock &&
4618  !heap_acquire_tuplock(relation, tid, mode, wait_policy,
4619  &have_tuple_lock))
4620  {
4621  /*
4622  * This can only happen if wait_policy is Skip and the lock
4623  * couldn't be obtained.
4624  */
4625  result = TM_WouldBlock;
4626  /* recovery code expects to have buffer lock held */
4627  LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4628  goto failed;
4629  }
4630 
4631  if (infomask & HEAP_XMAX_IS_MULTI)
4632  {
4633  MultiXactStatus status = get_mxact_status_for_lock(mode, false);
4634 
4635  /* We only ever lock tuples, never update them */
4636  if (status >= MultiXactStatusNoKeyUpdate)
4637  elog(ERROR, "invalid lock mode in heap_lock_tuple");
4638 
4639  /* wait for multixact to end, or die trying */
4640  switch (wait_policy)
4641  {
4642  case LockWaitBlock:
4643  MultiXactIdWait((MultiXactId) xwait, status, infomask,
4644  relation, &tuple->t_self, XLTW_Lock, NULL);
4645  break;
4646  case LockWaitSkip:
4647  if (!ConditionalMultiXactIdWait((MultiXactId) xwait,
4648  status, infomask, relation,
4649  NULL))
4650  {
4651  result = TM_WouldBlock;
4652  /* recovery code expects to have buffer lock held */
4653  LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4654  goto failed;
4655  }
4656  break;
4657  case LockWaitError:
4658  if (!ConditionalMultiXactIdWait((MultiXactId) xwait,
4659  status, infomask, relation,
4660  NULL))
4661  ereport(ERROR,
4662  (errcode(ERRCODE_LOCK_NOT_AVAILABLE),
4663  errmsg("could not obtain lock on row in relation \"%s\"",
4664  RelationGetRelationName(relation))));
4665 
4666  break;
4667  }
4668 
4669  /*
4670  * Of course, the multixact might not be done here: if we're
4671  * requesting a light lock mode, other transactions with light
4672  * locks could still be alive, as well as locks owned by our
4673  * own xact or other subxacts of this backend. We need to
4674  * preserve the surviving MultiXact members. Note that it
4675  * isn't absolutely necessary in the latter case, but doing so
4676  * is simpler.
4677  */
4678  }
4679  else
4680  {
4681  /* wait for regular transaction to end, or die trying */
4682  switch (wait_policy)
4683  {
4684  case LockWaitBlock:
4685  XactLockTableWait(xwait, relation, &tuple->t_self,
4686  XLTW_Lock);
4687  break;
4688  case LockWaitSkip:
4689  if (!ConditionalXactLockTableWait(xwait))
4690  {
4691  result = TM_WouldBlock;
4692  /* recovery code expects to have buffer lock held */
4693  LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4694  goto failed;
4695  }
4696  break;
4697  case LockWaitError:
4698  if (!ConditionalXactLockTableWait(xwait))
4699  ereport(ERROR,
4700  (errcode(ERRCODE_LOCK_NOT_AVAILABLE),
4701  errmsg("could not obtain lock on row in relation \"%s\"",
4702  RelationGetRelationName(relation))));
4703  break;
4704  }
4705  }
4706 
4707  /* if there are updates, follow the update chain */
4708  if (follow_updates && !HEAP_XMAX_IS_LOCKED_ONLY(infomask))
4709  {
4710  TM_Result res;
4711 
4712  res = heap_lock_updated_tuple(relation, tuple, &t_ctid,
4713  GetCurrentTransactionId(),
4714  mode);
4715  if (res != TM_Ok)
4716  {
4717  result = res;
4718  /* recovery code expects to have buffer lock held */
4719  LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4720  goto failed;
4721  }
4722  }
4723 
4724  LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4725 
4726  /*
4727  * xwait is done, but if xwait had just locked the tuple then some
4728  * other xact could update this tuple before we get to this point.
4729  * Check for xmax change, and start over if so.
4730  */
4731  if (xmax_infomask_changed(tuple->t_data->t_infomask, infomask) ||
4732  !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple->t_data),
4733  xwait))
4734  goto l3;
4735 
4736  if (!(infomask & HEAP_XMAX_IS_MULTI))
4737  {
4738  /*
4739  * Otherwise check if it committed or aborted. Note we cannot
4740  * be here if the tuple was only locked by somebody who didn't
4741  * conflict with us; that would have been handled above. So
4742  * that transaction must necessarily be gone by now. But
4743  * don't check for this in the multixact case, because some
4744  * locker transactions might still be running.
4745  */
4746  UpdateXmaxHintBits(tuple->t_data, *buffer, xwait);
4747  }
4748  }
4749 
4750  /* By here, we're certain that we hold buffer exclusive lock again */
4751 
4752  /*
4753  * We may lock if previous xmax aborted, or if it committed but only
4754  * locked the tuple without updating it; or if we didn't have to wait
4755  * at all for whatever reason.
4756  */
4757  if (!require_sleep ||
4758  (tuple->t_data->t_infomask & HEAP_XMAX_INVALID) ||
4759  HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_data->t_infomask) ||
4760  HeapTupleHeaderIsOnlyLocked(tuple->t_data))
4761  result = TM_Ok;
4762  else if (!ItemPointerEquals(&tuple->t_self, &tuple->t_data->t_ctid))
4763  result = TM_Updated;
4764  else
4765  result = TM_Deleted;
4766  }
4767 
4768 failed:
4769  if (result != TM_Ok)
4770  {
4771  Assert(result == TM_SelfModified || result == TM_Updated ||
4772  result == TM_Deleted || result == TM_WouldBlock);
4773  Assert(!(tuple->t_data->t_infomask & HEAP_XMAX_INVALID));
4774  Assert(result != TM_Updated ||
4775  !ItemPointerEquals(&tuple->t_self, &tuple->t_data->t_ctid));
4776  tmfd->ctid = tuple->t_data->t_ctid;
4777  tmfd->xmax = HeapTupleHeaderGetUpdateXid(tuple->t_data);
4778  if (result == TM_SelfModified)
4779  tmfd->cmax = HeapTupleHeaderGetCmax(tuple->t_data);
4780  else
4781  tmfd->cmax = InvalidCommandId;
4782  goto out_locked;
4783  }
4784 
4785  /*
4786  * If we didn't pin the visibility map page and the page has become all
4787  * visible while we were busy locking the buffer, or during some
4788  * subsequent window during which we had it unlocked, we'll have to unlock
4789  * and re-lock, to avoid holding the buffer lock across I/O. That's a bit
4790  * unfortunate, especially since we'll now have to recheck whether the
4791  * tuple has been locked or updated under us, but hopefully it won't
4792  * happen very often.
4793  */
4794  if (vmbuffer == InvalidBuffer && PageIsAllVisible(page))
4795  {
4796  LockBuffer(*buffer, BUFFER_LOCK_UNLOCK);
4797  visibilitymap_pin(relation, block, &vmbuffer);
4798  LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4799  goto l3;
4800  }
4801 
4802  xmax = HeapTupleHeaderGetRawXmax(tuple->t_data);
4803  old_infomask = tuple->t_data->t_infomask;
4804 
4805  /*
4806  * If this is the first possibly-multixact-able operation in the current
4807  * transaction, set my per-backend OldestMemberMXactId setting. We can be
4808  * certain that the transaction will never become a member of any older
4809  * MultiXactIds than that. (We have to do this even if we end up just
4810  * using our own TransactionId below, since some other backend could
4811  * incorporate our XID into a MultiXact immediately afterwards.)
4812  */
4813  MultiXactIdSetOldestMember();
4814 
4815  /*
4816  * Compute the new xmax and infomask to store into the tuple. Note we do
4817  * not modify the tuple just yet, because that would leave it in the wrong
4818  * state if multixact.c elogs.
4819  */
4820  compute_new_xmax_infomask(xmax, old_infomask, tuple->t_data->t_infomask2,
4821  GetCurrentTransactionId(), mode, false,
4822  &xid, &new_infomask, &new_infomask2);
4823 
4824  START_CRIT_SECTION();
4825 
4826  /*
4827  * Store transaction information of xact locking the tuple.
4828  *
4829  * Note: Cmax is meaningless in this context, so don't set it; this avoids
4830  * possibly generating a useless combo CID. Moreover, if we're locking a
4831  * previously updated tuple, it's important to preserve the Cmax.
4832  *
4833  * Also reset the HOT UPDATE bit, but only if there's no update; otherwise
4834  * we would break the HOT chain.
4835  */
4836  tuple->t_data->t_infomask &= ~HEAP_XMAX_BITS;
4837  tuple->t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
4838  tuple->t_data->t_infomask |= new_infomask;
4839  tuple->t_data->t_infomask2 |= new_infomask2;
4840  if (HEAP_XMAX_IS_LOCKED_ONLY(new_infomask))
4841  HeapTupleClearHotUpdated(tuple);
4842  HeapTupleHeaderSetXmax(tuple->t_data, xid);
4843 
4844  /*
4845  * Make sure there is no forward chain link in t_ctid. Note that in the
4846  * cases where the tuple has been updated, we must not overwrite t_ctid,
4847  * because it was set by the updater. Moreover, if the tuple has been
4848  * updated, we need to follow the update chain to lock the new versions of
4849  * the tuple as well.
4850  */
4851  if (HEAP_XMAX_IS_LOCKED_ONLY(new_infomask))
4852  tuple->t_data->t_ctid = *tid;
4853 
4854  /* Clear only the all-frozen bit on visibility map if needed */
4855  if (PageIsAllVisible(page) &&
4856  visibilitymap_clear(relation, block, vmbuffer,
4857  VISIBILITYMAP_ALL_FROZEN))
4858  cleared_all_frozen = true;
4859 
4860 
4861  MarkBufferDirty(*buffer);
4862 
4863  /*
4864  * XLOG stuff. You might think that we don't need an XLOG record because
4865  * there is no state change worth restoring after a crash. You would be
4866  * wrong however: we have just written either a TransactionId or a
4867  * MultiXactId that may never have been seen on disk before, and we need
4868  * to make sure that there are XLOG entries covering those ID numbers.
4869  * Else the same IDs might be re-used after a crash, which would be
4870  * disastrous if this page made it to disk before the crash. Essentially
4871  * we have to enforce the WAL log-before-data rule even in this case.
4872  * (Also, in a PITR log-shipping or 2PC environment, we have to have XLOG
4873  * entries for everything anyway.)
4874  */
4875  if (RelationNeedsWAL(relation))
4876  {
4877  xl_heap_lock xlrec;
4878  XLogRecPtr recptr;
4879 
4880  XLogBeginInsert();
4881  XLogRegisterBuffer(0, *buffer, REGBUF_STANDARD);
4882 
4883  xlrec.offnum = ItemPointerGetOffsetNumber(&tuple->t_self);
4884  xlrec.locking_xid = xid;
4885  xlrec.infobits_set = compute_infobits(new_infomask,
4886  tuple->t_data->t_infomask2);
4887  xlrec.flags = cleared_all_frozen ? XLH_LOCK_ALL_FROZEN_CLEARED : 0;
4888  XLogRegisterData((char *) &xlrec, SizeOfHeapLock);
4889 
4890  /* we don't decode row locks atm, so no need to log the origin */
4891 
4892  recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_LOCK);
4893 
4894  PageSetLSN(page, recptr);
4895  }
4896 
4897  END_CRIT_SECTION();
4898 
4899  result = TM_Ok;
4900 
4901 out_locked:
4902  LockBuffer(*buffer, BUFFER_LOCK_UNLOCK);
4903 
4904 out_unlocked:
4905  if (BufferIsValid(vmbuffer))
4906  ReleaseBuffer(vmbuffer);
4907 
4908  /*
4909  * Don't update the visibility map here. Locking a tuple doesn't change
4910  * visibility info.
4911  */
4912 
4913  /*
4914  * Now that we have successfully marked the tuple as locked, we can
4915  * release the lmgr tuple lock, if we had it.
4916  */
4917  if (have_tuple_lock)
4918  UnlockTupleTuplock(relation, tid, mode);
4919 
4920  return result;
4921 }
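/*
 * [Editor's illustrative sketch -- not part of heapam.c.]  A minimal caller
 * of heap_lock_tuple(): lock the row version at a known TID in exclusive
 * mode and release the pin the function leaves behind.  The retry handling
 * a real executor performs on TM_Updated/TM_Deleted is omitted.
 */
static bool
example_lock_row_exclusive(Relation rel, ItemPointer tid)
{
	HeapTupleData tuple;
	Buffer		buffer;
	TM_FailureData tmfd;
	TM_Result	result;

	tuple.t_self = *tid;
	result = heap_lock_tuple(rel, &tuple,
							 GetCurrentCommandId(true),
							 LockTupleExclusive, LockWaitBlock,
							 false /* don't follow the update chain */ ,
							 &buffer, &tmfd);

	ReleaseBuffer(buffer);		/* heap_lock_tuple returns a pinned buffer */
	return (result == TM_Ok);
}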
4922 
4923 /*
4924  * Acquire heavyweight lock on the given tuple, in preparation for acquiring
4925  * its normal, Xmax-based tuple lock.
4926  *
4927  * have_tuple_lock is an input and output parameter: on input, it indicates
4928  * whether the lock has previously been acquired (and this function does
4929  * nothing in that case). If this function returns success, have_tuple_lock
4930  * has been flipped to true.
4931  *
4932  * Returns false if it was unable to obtain the lock; this can only happen if
4933  * wait_policy is Skip.
4934  */
4935 static bool
4936 heap_acquire_tuplock(Relation relation, ItemPointer tid, LockTupleMode mode,
4937  LockWaitPolicy wait_policy, bool *have_tuple_lock)
4938 {
4939  if (*have_tuple_lock)
4940  return true;
4941 
4942  switch (wait_policy)
4943  {
4944  case LockWaitBlock:
4945  LockTupleTuplock(relation, tid, mode);
4946  break;
4947 
4948  case LockWaitSkip:
4949  if (!ConditionalLockTupleTuplock(relation, tid, mode))
4950  return false;
4951  break;
4952 
4953  case LockWaitError:
4954  if (!ConditionalLockTupleTuplock(relation, tid, mode))
4955  ereport(ERROR,
4956  (errcode(ERRCODE_LOCK_NOT_AVAILABLE),
4957  errmsg("could not obtain lock on row in relation \"%s\"",
4958  RelationGetRelationName(relation))));
4959  break;
4960  }
4961  *have_tuple_lock = true;
4962 
4963  return true;
4964 }
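/*
 * [Editor's note -- not part of heapam.c.]  The lock taken here is the
 * heavyweight per-tuple lock (LockTupleTuplock).  Callers that see
 * *have_tuple_lock set to true are responsible for pairing it with
 * UnlockTupleTuplock() once the tuple's Xmax has been updated, as
 * heap_update() and heap_lock_tuple() do above.
 */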
4965 
4966 /*
4967  * Given an original set of Xmax and infomask, and a transaction (identified by
4968  * add_to_xmax) acquiring a new lock of some mode, compute the new Xmax and
4969  * corresponding infomasks to use on the tuple.
4970  *
4971  * Note that this might have side effects such as creating a new MultiXactId.
4972  *
4973  * Most callers will have called HeapTupleSatisfiesUpdate before this function;
4974  * that will have set the HEAP_XMAX_INVALID bit if the xmax was a MultiXactId
4975  * but it was not running anymore. There is a race condition, which is that the
4976  * MultiXactId may have finished since then, but that uncommon case is handled
4977  * either here, or within MultiXactIdExpand.
4978  *
4979  * There is a similar race condition possible when the old xmax was a regular
4980  * TransactionId. We test TransactionIdIsInProgress again just to narrow the
4981  * window, but it's still possible to end up creating an unnecessary
4982  * MultiXactId. Fortunately this is harmless.
4983  */
4984 static void
4985 compute_new_xmax_infomask(TransactionId xmax, uint16 old_infomask,
4986  uint16 old_infomask2, TransactionId add_to_xmax,
4987  LockTupleMode mode, bool is_update,
4988  TransactionId *result_xmax, uint16 *result_infomask,
4989  uint16 *result_infomask2)
4990 {
4991  TransactionId new_xmax;
4992  uint16 new_infomask,
4993  new_infomask2;
4994 
4995  Assert(TransactionIdIsValid(add_to_xmax));
4996 
4997 l5:
4998  new_infomask = 0;
4999  new_infomask2 = 0;
5000  if (old_infomask & HEAP_XMAX_INVALID)
5001  {
5002  /*
5003  * No previous locker; we just insert our own TransactionId.
5004  *
5005  * Note that it's critical that this case be the first one checked,
5006  * because there are several blocks below that come back to this one
5007  * to implement certain optimizations; old_infomask might contain
5008  * other dirty bits in those cases, but we don't really care.
5009  */
5010  if (is_update)
5011  {
5012  new_xmax = add_to_xmax;
5013  if (mode == LockTupleExclusive)
5014  new_infomask2 |= HEAP_KEYS_UPDATED;
5015  }
5016  else
5017  {
5018  new_infomask |= HEAP_XMAX_LOCK_ONLY;
5019  switch (mode)
5020  {
5021  case LockTupleKeyShare:
5022  new_xmax = add_to_xmax;
5023  new_infomask |= HEAP_XMAX_KEYSHR_LOCK;
5024  break;
5025  case LockTupleShare:
5026  new_xmax = add_to_xmax;
5027  new_infomask |= HEAP_XMAX_SHR_LOCK;
5028  break;
5029  case LockTupleNoKeyExclusive:
5030  new_xmax = add_to_xmax;
5031  new_infomask |= HEAP_XMAX_EXCL_LOCK;
5032  break;
5033  case LockTupleExclusive:
5034  new_xmax = add_to_xmax;
5035  new_infomask |= HEAP_XMAX_EXCL_LOCK;
5036  new_infomask2 |= HEAP_KEYS_UPDATED;
5037  break;
5038  default:
5039  new_xmax = InvalidTransactionId; /* silence compiler */
5040  elog(ERROR, "invalid lock mode");
5041  }
5042  }
5043  }
5044  else if (old_infomask & HEAP_XMAX_IS_MULTI)
5045  {
5046  MultiXactStatus new_status;
5047 
5048  /*
5049  * Currently we don't allow XMAX_COMMITTED to be set for multis, so
5050  * cross-check.
5051  */
5052  Assert(!(old_infomask & HEAP_XMAX_COMMITTED));
5053 
5054  /*
5055  * A multixact together with LOCK_ONLY set but neither lock bit set
5056  * (i.e. a pg_upgraded share locked tuple) cannot possibly be running
5057  * anymore. This check is critical for databases upgraded by
5058  * pg_upgrade; both MultiXactIdIsRunning and MultiXactIdExpand assume
5059  * that such multis are never passed.
5060  */
5061  if (HEAP_LOCKED_UPGRADED(old_infomask))
5062  {
5063  old_infomask &= ~HEAP_XMAX_IS_MULTI;
5064  old_infomask |= HEAP_XMAX_INVALID;
5065  goto l5;
5066  }
5067 
5068  /*
5069  * If the XMAX is already a MultiXactId, then we need to expand it to
5070  * include add_to_xmax; but if all the members were lockers and are
5071  * all gone, we can do away with the IS_MULTI bit and just set
5072  * add_to_xmax as the only locker/updater. If all lockers are gone
5073  * and we have an updater that aborted, we can also do without a
5074  * multi.
5075  *
5076  * The cost of doing GetMultiXactIdMembers would be paid by
5077  * MultiXactIdExpand if we weren't to do this, so this check is not
5078  * incurring extra work anyhow.
5079  */
5080  if (!MultiXactIdIsRunning(xmax, HEAP_XMAX_IS_LOCKED_ONLY(old_infomask)))
5081  {
5082  if (HEAP_XMAX_IS_LOCKED_ONLY(old_infomask) ||
5083  !TransactionIdDidCommit(MultiXactIdGetUpdateXid(xmax,
5084  old_infomask)))
5085  {
5086  /*
5087  * Reset these bits and restart; otherwise fall through to
5088  * create a new multi below.
5089  */
5090  old_infomask &= ~HEAP_XMAX_IS_MULTI;
5091  old_infomask |= HEAP_XMAX_INVALID;
5092  goto l5;
5093  }
5094  }
5095 
5096  new_status = get_mxact_status_for_lock(mode, is_update);
5097 
5098  new_xmax = MultiXactIdExpand((MultiXactId) xmax, add_to_xmax,
5099  new_status);
5100  GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2);
5101  }
5102  else if (old_infomask & HEAP_XMAX_COMMITTED)
5103  {
5104  /*
5105  * It's a committed update, so we need to preserve him as updater of
5106  * the tuple.
5107  */
5108  MultiXactStatus status;
5109  MultiXactStatus new_status;
5110 
5111  if (old_infomask2 & HEAP_KEYS_UPDATED)
5112  status = MultiXactStatusUpdate;
5113  else
5114  status = MultiXactStatusNoKeyUpdate;
5115 
5116  new_status = get_mxact_status_for_lock(mode, is_update);
5117 
5118  /*
5119  * since it's not running, it's obviously impossible for the old
5120  * updater to be identical to the current one, so we need not check
5121  * for that case as we do in the block above.
5122  */
5123  new_xmax = MultiXactIdCreate(xmax, status, add_to_xmax, new_status);
5124  GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2);
5125  }
5126  else if (TransactionIdIsInProgress(xmax))
5127  {
5128  /*
5129  * If the XMAX is a valid, in-progress TransactionId, then we need to
5130  * create a new MultiXactId that includes both the old locker or
5131  * updater and our own TransactionId.
5132  */
5133  MultiXactStatus new_status;
5134  MultiXactStatus old_status;
5135  LockTupleMode old_mode;
5136 
5137  if (HEAP_XMAX_IS_LOCKED_ONLY(old_infomask))
5138  {
5139  if (HEAP_XMAX_IS_KEYSHR_LOCKED(old_infomask))
5140  old_status = MultiXactStatusForKeyShare;
5141  else if (HEAP_XMAX_IS_SHR_LOCKED(old_infomask))
5142  old_status = MultiXactStatusForShare;
5143  else if (HEAP_XMAX_IS_EXCL_LOCKED(old_infomask))
5144  {
5145  if (old_infomask2 & HEAP_KEYS_UPDATED)
5146  old_status = MultiXactStatusForUpdate;
5147  else
5148  old_status = MultiXactStatusForNoKeyUpdate;
5149  }
5150  else
5151  {
5152  /*
5153  * LOCK_ONLY can be present alone only when a page has been
5154  * upgraded by pg_upgrade. But in that case,
5155  * TransactionIdIsInProgress() should have returned false. We
5156  * assume it's no longer locked in this case.
5157  */
5158  elog(WARNING, "LOCK_ONLY found for Xid in progress %u", xmax);
5159  old_infomask |= HEAP_XMAX_INVALID;
5160  old_infomask &= ~HEAP_XMAX_LOCK_ONLY;
5161  goto l5;
5162  }
5163  }
5164  else
5165  {
5166  /* it's an update, but which kind? */
5167  if (old_infomask2 & HEAP_KEYS_UPDATED)
5168  old_status = MultiXactStatusUpdate;
5169  else
5170  old_status = MultiXactStatusNoKeyUpdate;
5171  }
5172 
5173  old_mode = TUPLOCK_from_mxstatus(old_status);
5174 
5175  /*
5176  * If the lock to be acquired is for the same TransactionId as the
5177  * existing lock, there's an optimization possible: consider only the
5178  * strongest of both locks as the only one present, and restart.
5179  */
5180  if (xmax == add_to_xmax)
5181  {
5182  /*
5183  * Note that it's not possible for the original tuple to be
5184  * updated: we wouldn't be here because the tuple would have been
5185  * invisible and we wouldn't try to update it. As a subtlety,
5186  * this code can also run when traversing an update chain to lock
5187  * future versions of a tuple. But we wouldn't be here either,
5188  * because the add_to_xmax would be different from the original
5189  * updater.
5190  */
5191  Assert(HEAP_XMAX_IS_LOCKED_ONLY(old_infomask));
5192 
5193  /* acquire the strongest of both */
5194  if (mode < old_mode)
5195  mode = old_mode;
5196  /* mustn't touch is_update */
5197 
5198  old_infomask |= HEAP_XMAX_INVALID;
5199  goto l5;
5200  }
5201 
5202  /* otherwise, just fall back to creating a new multixact */
5203  new_status = get_mxact_status_for_lock(mode, is_update);
5204  new_xmax = MultiXactIdCreate(xmax, old_status,
5205  add_to_xmax, new_status);
5206  GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2);
5207  }
5208  else if (!HEAP_XMAX_IS_LOCKED_ONLY(old_infomask) &&
5209  TransactionIdDidCommit(xmax))
5210  {
5211  /*
5212  * It's a committed update, so we gotta preserve him as updater of the
5213  * tuple.
5214  */
5215  MultiXactStatus status;
5216  MultiXactStatus new_status;
5217 
5218  if (old_infomask2 & HEAP_KEYS_UPDATED)
5219  status = MultiXactStatusUpdate;
5220  else
5221  status = MultiXactStatusNoKeyUpdate;
5222 
5223  new_status = get_mxact_status_for_lock(mode, is_update);
5224 
5225  /*
5226  * since it's not running, it's obviously impossible for the old
5227  * updater to be identical to the current one, so we need not check
5228  * for that case as we do in the block above.
5229  */
5230  new_xmax = MultiXactIdCreate(xmax, status, add_to_xmax, new_status);
5231  GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2);
5232  }
5233  else
5234  {
5235  /*
5236  * Can get here iff the locking/updating transaction was running when
5237  * the infomask was extracted from the tuple, but finished before
5238  * TransactionIdIsInProgress got to run. Deal with it as if there was
5239  * no locker at all in the first place.
5240  */
5241  old_infomask |= HEAP_XMAX_INVALID;
5242  goto l5;
5243  }
5244 
5245  *result_infomask = new_infomask;
5246  *result_infomask2 = new_infomask2;
5247  *result_xmax = new_xmax;
5248 }
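/*
 * [Editor's illustrative sketch -- not part of heapam.c.]  Worked example of
 * the simplest path above: no prior locker (HEAP_XMAX_INVALID set) and a
 * non-update FOR SHARE request.  The resulting xmax is simply add_to_xmax
 * and the infomask gains HEAP_XMAX_LOCK_ONLY | HEAP_XMAX_SHR_LOCK.
 * my_xid must be a valid TransactionId.
 */
static void
example_share_lock_infomask(TransactionId my_xid)
{
	TransactionId new_xmax;
	uint16		new_infomask,
				new_infomask2;

	compute_new_xmax_infomask(InvalidTransactionId, HEAP_XMAX_INVALID, 0,
							  my_xid, LockTupleShare, false,
							  &new_xmax, &new_infomask, &new_infomask2);

	Assert(new_xmax == my_xid);
	Assert(new_infomask & HEAP_XMAX_LOCK_ONLY);
	Assert(new_infomask & HEAP_XMAX_SHR_LOCK);
}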
5249 
5250 /*
5251  * Subroutine for heap_lock_updated_tuple_rec.
5252  *
5253  * Given a hypothetical multixact status held by the transaction identified
5254  * with the given xid, does the current transaction need to wait, fail, or can
5255  * it continue if it wanted to acquire a lock of the given mode? "needwait"
5256  * is set to true if waiting is necessary; if it can continue, then TM_Ok is
5257  * returned. If the lock is already held by the current transaction, return
5258  * TM_SelfModified. In case of a conflict with another transaction, a
5259  * different HeapTupleSatisfiesUpdate return code is returned.
5260  *
5261  * The held status is said to be hypothetical because it might correspond to a
5262  * lock held by a single Xid, i.e. not a real MultiXactId; we express it this
5263  * way for simplicity of API.
5264  */
5265 static TM_Result
5266 test_lockmode_for_conflict(MultiXactStatus status, TransactionId xid,
5267  LockTupleMode mode, HeapTuple tup,
5268  bool *needwait)
5269 {
5270  MultiXactStatus wantedstatus;
5271 
5272  *needwait = false;
5273  wantedstatus = get_mxact_status_for_lock(mode, false);
5274 
5275  /*
5276  * Note: we *must* check TransactionIdIsInProgress before
5277  * TransactionIdDidAbort/Commit; see comment at top of heapam_visibility.c
5278  * for an explanation.
5279  */
5280  if (TransactionIdIsCurrentTransactionId(xid))
5281  {
5282  /*
5283  * The tuple has already been locked by our own transaction. This is
5284  * very rare but can happen if multiple transactions are trying to
5285  * lock an ancient version of the same tuple.
5286  */
5287  return TM_SelfModified;
5288  }
5289  else if (TransactionIdIsInProgress(xid))
5290  {
5291  /*
5292  * If the locking transaction is running, what we do depends on
5293  * whether the lock modes conflict: if they do, then we must wait for
5294  * it to finish; otherwise we can fall through to lock this tuple
5295  * version without waiting.
5296  */
5297  if (DoLockModesConflict(LOCKMODE_from_mxstatus(status),
5298  LOCKMODE_from_mxstatus(wantedstatus)))
5299  {
5300  *needwait = true;
5301  }
5302 
5303  /*
5304  * If we set needwait above, then this value doesn't matter;
5305  * otherwise, this value signals to caller that it's okay to proceed.
5306  */
5307  return TM_Ok;
5308  }
5309  else if (TransactionIdDidAbort(xid))
5310  return TM_Ok;
5311  else if (TransactionIdDidCommit(xid))
5312  {
5313  /*
5314  * The other transaction committed. If it was only a locker, then the
5315  * lock is completely gone now and we can return success; but if it
5316  * was an update, then what we do depends on whether the two lock
5317  * modes conflict. If they conflict, then we must report error to
5318  * caller. But if they don't, we can fall through to allow the current
5319  * transaction to lock the tuple.
5320  *
5321  * Note: the reason we worry about ISUPDATE here is because as soon as
5322  * a transaction ends, all its locks are gone and meaningless, and
5323  * thus we can ignore them; whereas its updates persist. In the
5324  * TransactionIdIsInProgress case, above, we don't need to check
5325  * because we know the lock is still "alive" and thus a conflict always
5326  * needs to be checked.
5327  */
5328  if (!ISUPDATE_from_mxstatus(status))
5329  return TM_Ok;
5330 
5331  if (DoLockModesConflict(LOCKMODE_from_mxstatus(status),
5332  LOCKMODE_from_mxstatus(wantedstatus)))
5333  {
5334  /* bummer */
5335  if (!ItemPointerEquals(&tup->t_self, &tup->t_data->t_ctid))
5336  return TM_Updated;
5337  else
5338  return TM_Deleted;
5339  }
5340 
5341  return TM_Ok;
5342  }
5343 
5344  /* Not in progress, not aborted, not committed -- must have crashed */
5345  return TM_Ok;
5346 }
5347 
5348 
5349 /*
5350  * Recursive part of heap_lock_updated_tuple
5351  *
5352  * Fetch the tuple pointed to by tid in rel, and mark it as locked by the given
5353  * xid with the given mode; if this tuple is updated, recurse to lock the new
5354  * version as well.
5355  */
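/*
 * Rough shape of the loop below (informal sketch):
 *
 *   for each version, starting at tid:
 *       heap_fetch() it with SnapshotAny; if it is gone, we are done
 *       pin the visibility map page if needed, then lock the buffer
 *       stop if the chain is broken (xmin does not match the prior xmax,
 *           or the creating transaction aborted)
 *       if xmax is set, run test_lockmode_for_conflict() for each
 *           locker/updater, waiting or bailing out as it dictates
 *       compute and install the new xmax/infomask, and WAL-log the change
 *       follow t_ctid to the next version unless this one ends the chain
 */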
5356 static TM_Result
5357 heap_lock_updated_tuple_rec(Relation rel, ItemPointer tid, TransactionId xid,
5358  LockTupleMode mode)
5359 {
5360  TM_Result result;
5361  ItemPointerData tupid;
5362  HeapTupleData mytup;
5363  Buffer buf;
5364  uint16 new_infomask,
5365  new_infomask2,
5366  old_infomask,
5367  old_infomask2;
5368  TransactionId xmax,
5369  new_xmax;
5370  TransactionId priorXmax = InvalidTransactionId;
5371  bool cleared_all_frozen = false;
5372  bool pinned_desired_page;
5373  Buffer vmbuffer = InvalidBuffer;
5374  BlockNumber block;
5375 
5376  ItemPointerCopy(tid, &tupid);
5377 
5378  for (;;)
5379  {
5380  new_infomask = 0;
5381  new_xmax = InvalidTransactionId;
5382  block = ItemPointerGetBlockNumber(&tupid);
5383  ItemPointerCopy(&tupid, &(mytup.t_self));
5384 
5385  if (!heap_fetch(rel, SnapshotAny, &mytup, &buf))
5386  {
5387  /*
5388  * if we fail to find the updated version of the tuple, it's
5389  * because it was vacuumed/pruned away after its creator
5390  * transaction aborted. So behave as if we got to the end of the
5391  * chain, and there's no further tuple to lock: return success to
5392  * caller.
5393  */
5394  result = TM_Ok;
5395  goto out_unlocked;
5396  }
5397 
5398 l4:
5399  CHECK_FOR_INTERRUPTS();
5400 
5401  /*
5402  * Before locking the buffer, pin the visibility map page if it
5403  * appears to be necessary. Since we haven't got the lock yet,
5404  * someone else might be in the middle of changing this, so we'll need
5405  * to recheck after we have the lock.
5406  */
5407  if (PageIsAllVisible(BufferGetPage(buf)))
5408  {
5409  visibilitymap_pin(rel, block, &vmbuffer);
5410  pinned_desired_page = true;
5411  }
5412  else
5413  pinned_desired_page = false;
5414 
5415  LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
5416 
5417  /*
5418  * If we didn't pin the visibility map page and the page has become
5419  * all visible while we were busy locking the buffer, we'll have to
5420  * unlock and re-lock, to avoid holding the buffer lock across I/O.
5421  * That's a bit unfortunate, but hopefully shouldn't happen often.
5422  *
5423  * Note: in some paths through this function, we will reach here
5424  * holding a pin on a vm page that may or may not be the one matching
5425  * this page. If this page isn't all-visible, we won't use the vm
5426  * page, but we hold onto such a pin till the end of the function.
5427  */
5428  if (!pinned_desired_page && PageIsAllVisible(BufferGetPage(buf)))
5429  {
5430  LockBuffer(buf, BUFFER_LOCK_UNLOCK);
5431  visibilitymap_pin(rel, block, &vmbuffer);
5432  LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
5433  }
5434 
5435  /*
5436  * Check the tuple XMIN against prior XMAX, if any. If we reached the
5437  * end of the chain, we're done, so return success.
5438  */
5439  if (TransactionIdIsValid(priorXmax) &&
5440  !TransactionIdEquals(HeapTupleHeaderGetXmin(mytup.t_data),
5441  priorXmax))
5442  {
5443  result = TM_Ok;
5444  goto out_locked;
5445  }
5446 
5447  /*
5448  * Also check Xmin: if this tuple was created by an aborted
5449  * (sub)transaction, then we already locked the last live one in the
5450  * chain, thus we're done, so return success.
5451  */
5452  if (TransactionIdDidAbort(HeapTupleHeaderGetXmin(mytup.t_data)))
5453  {
5454  result = TM_Ok;
5455  goto out_locked;
5456  }
5457 
5458  old_infomask = mytup.t_data->t_infomask;
5459  old_infomask2 = mytup.t_data->t_infomask2;
5460  xmax = HeapTupleHeaderGetRawXmax(mytup.t_data);
5461 
5462  /*
5463  * If this tuple version has been updated or locked by some concurrent
5464  * transaction(s), what we do depends on whether our lock mode
5465  * conflicts with what those other transactions hold, and also on the
5466  * status of them.
5467  */
5468  if (!(old_infomask & HEAP_XMAX_INVALID))
5469  {
5470  TransactionId rawxmax;
5471  bool needwait;
5472 
5473  rawxmax = HeapTupleHeaderGetRawXmax(mytup.t_data);
5474  if (old_infomask & HEAP_XMAX_IS_MULTI)
5475  {
5476  int nmembers;
5477  int i;
5478  MultiXactMember *members;
5479 
5480  /*
5481  * We don't need a test for pg_upgrade'd tuples: this is only
5482  * applied to tuples after the first in an update chain. Said
5483  * first tuple in the chain may well be locked-in-9.2-and-
5484  * pg_upgraded, but that one was already locked by our caller,
5485  * not us; and any subsequent ones cannot be because our
5486  * caller must necessarily have obtained a snapshot later than
5487  * the pg_upgrade itself.
5488  */
5489  Assert(!HEAP_LOCKED_UPGRADED(mytup.t_data->t_infomask));
5490 
5491  nmembers = GetMultiXactIdMembers(rawxmax, &members, false,
5492  HEAP_XMAX_IS_LOCKED_ONLY(old_infomask));
5493  for (i = 0; i < nmembers; i++)
5494  {
5495  result = test_lockmode_for_conflict(members[i].status,
5496  members[i].xid,
5497  mode,
5498  &mytup,
5499  &needwait);
5500 
5501  /*
5502  * If the tuple was already locked by ourselves in a
5503  * previous iteration of this (say heap_lock_tuple was
5504  * forced to restart the locking loop because of a change
5505  * in xmax), then we hold the lock already on this tuple
5506  * version and we don't need to do anything; and this is
5507  * not an error condition either. We just need to skip
5508  * this tuple and continue locking the next version in the
5509  * update chain.
5510  */
5511  if (result == TM_SelfModified)
5512  {
5513  pfree(members);
5514  goto next;
5515  }
5516 
5517  if (needwait)
5518  {
5519  LockBuffer(buf, BUFFER_LOCK_UNLOCK);
5520  XactLockTableWait(members[i].xid, rel,
5521  &mytup.t_self,
5522  XLTW_LockUpdated);
5523  pfree(members);
5524  goto l4;
5525  }
5526  if (result != TM_Ok)
5527  {
5528  pfree(members);
5529  goto out_locked;
5530  }
5531  }
5532  if (members)
5533  pfree(members);
5534  }
5535  else
5536  {
5537  MultiXactStatus status;
5538 
5539  /*
5540  * For a non-multi Xmax, we first need to compute the
5541  * corresponding MultiXactStatus by using the infomask bits.
5542  */
5543  if (HEAP_XMAX_IS_LOCKED_ONLY(old_infomask))
5544  {
5545  if (HEAP_XMAX_IS_KEYSHR_LOCKED(old_infomask))
5546  status = MultiXactStatusForKeyShare;
5547  else if (HEAP_XMAX_IS_SHR_LOCKED(old_infomask))
5548  status = MultiXactStatusForShare;
5549  else if (HEAP_XMAX_IS_EXCL_LOCKED(old_infomask))
5550  {
5551  if (old_infomask2 & HEAP_KEYS_UPDATED)
5552  status = MultiXactStatusForUpdate;
5553  else
5554  status = MultiXactStatusForNoKeyUpdate;
5555  }
5556  else
5557  {
5558  /*
5559  * LOCK_ONLY present alone (a pg_upgraded tuple marked
5560  * as share-locked in the old cluster) shouldn't be
5561  * seen in the middle of an update chain.
5562  */
5563  elog(ERROR, "invalid lock status in tuple");
5564  }
5565  }
5566  else
5567  {
5568  /* it's an update, but which kind? */
5569  if (old_infomask2 & HEAP_KEYS_UPDATED)
5570  status = MultiXactStatusUpdate;
5571  else
5572  status = MultiXactStatusNoKeyUpdate;
5573  }
5574 
5575  result = test_lockmode_for_conflict(status, rawxmax, mode,
5576  &mytup, &needwait);
5577 
5578  /*
5579  * If the tuple was already locked by ourselves in a previous
5580  * iteration of this (say heap_lock_tuple was forced to
5581  * restart the locking loop because of a change in xmax), then
5582  * we hold the lock already on this tuple version and we don't
5583  * need to do anything; and this is not an error condition
5584  * either. We just need to skip this tuple and continue
5585  * locking the next version in the update chain.
5586  */
5587  if (result == TM_SelfModified)
5588  goto next;
5589 
5590  if (needwait)
5591  {
5592  LockBuffer(buf, BUFFER_LOCK_UNLOCK);
5593  XactLockTableWait(rawxmax, rel, &mytup.t_self,
5594  XLTW_LockUpdated);
5595  goto l4;
5596  }
5597  if (result != TM_Ok)
5598  {
5599  goto out_locked;
5600  }
5601  }
5602  }
5603 
5604  /* compute the new Xmax and infomask values for the tuple ... */
5605  compute_new_xmax_infomask(xmax, old_infomask, mytup.t_data->t_infomask2,
5606  xid, mode, false,
5607  &new_xmax, &new_infomask, &new_infomask2);
5608 
5609  if (PageIsAllVisible(BufferGetPage(buf)) &&
5610  visibilitymap_clear(rel, block, vmbuffer,
5611  VISIBILITYMAP_ALL_FROZEN))
5612  cleared_all_frozen = true;
5613 
5614  START_CRIT_SECTION();
5615 
5616  /* ... and set them */
5617  HeapTupleHeaderSetXmax(mytup.t_data, new_xmax);
5618  mytup.t_data->t_infomask &= ~HEAP_XMAX_BITS;
5619  mytup.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
5620  mytup.t_data->t_infomask |= new_infomask;
5621  mytup.t_data->t_infomask2 |= new_infomask2;
5622 
5623  MarkBufferDirty(buf);
5624 
5625  /* XLOG stuff */
5626  if (RelationNeedsWAL(rel))
5627  {
5628  xl_heap_lock_updated xlrec;
5629  XLogRecPtr recptr;
5630  Page page = BufferGetPage(buf);
5631 
5632  XLogBeginInsert();
5633  XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
5634 
5635  xlrec.offnum = ItemPointerGetOffsetNumber(&mytup.t_self);
5636  xlrec.xmax = new_xmax;
5637  xlrec.infobits_set = compute_infobits(new_infomask, new_infomask2);
5638  xlrec.flags =
5639  cleared_all_frozen ? XLH_LOCK_ALL_FROZEN_CLEARED : 0;
5640 
5641  XLogRegisterData((char *) &xlrec, SizeOfHeapLockUpdated);
5642 
5643  recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_LOCK_UPDATED);
5644 
5645  PageSetLSN(page, recptr);
5646  }
5647 
5648  END_CRIT_SECTION();
5649 
5650 next:
5651  /* if we find the end of the update chain, we're done. */
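/*
 * Informally: the chain ends here either because nothing updated this
 * version (invalid or locker-only xmax, or a t_ctid pointing at itself)
 * or because the next version was moved into another partition, which is
 * treated as a delete (see heap_lock_updated_tuple below).
 */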
5652  if (mytup.t_data->t_infomask & HEAP_XMAX_INVALID ||
5653  HeapTupleHeaderIndicatesMovedPartitions(mytup.t_data) ||
5654  ItemPointerEquals(&mytup.t_self, &mytup.t_data->t_ctid) ||
5655  HeapTupleHeaderIsOnlyLocked(mytup.t_data))
5656  {
5657  result = TM_Ok;
5658  goto out_locked;
5659  }
5660 
5661  /* tail recursion */
5662  priorXmax = HeapTupleHeaderGetUpdateXid(mytup.t_data);
5663  ItemPointerCopy(&(mytup.t_data->t_ctid), &tupid);
5664  UnlockReleaseBuffer(buf);
5665  }
5666 
5667  result = TM_Ok;
5668 
5669 out_locked:
5670  UnlockReleaseBuffer(buf);
5671 
5672 out_unlocked:
5673  if (vmbuffer != InvalidBuffer)
5674  ReleaseBuffer(vmbuffer);
5675 
5676  return result;
5677 }
5678 
5679 /*
5680  * heap_lock_updated_tuple
5681  * Follow update chain when locking an updated tuple, acquiring locks (row
5682  * marks) on the updated versions.
5683  *
5684  * The initial tuple is assumed to be already locked.
5685  *
5686  * This function doesn't check visibility; it just unconditionally marks the
5687  * tuple(s) as locked. If any tuple in the updated chain is being deleted
5688  * concurrently (or updated with the key being modified), sleep until the
5689  * transaction doing it is finished.
5690  *
5691  * Note that we don't acquire heavyweight tuple locks on the tuples we walk
5692  * when we have to wait for other transactions to release them, as opposed to
5693  * what heap_lock_tuple does. The reason is that having more than one
5694  * transaction walking the chain is probably uncommon enough that the risk of
5695  * starvation is low: one of the preconditions for being here is that
5696  * the snapshot in use predates the update that created this tuple (because we
5697  * started at an earlier version of the tuple), but at the same time such a
5698  * transaction cannot be using repeatable read or serializable isolation
5699  * levels, because that would lead to a serializability failure.
5700  */
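/*
 * Informal context: this is reached from heap_lock_tuple() when the version
 * being locked turns out to have been updated by a concurrent transaction
 * and the caller asked for the lock to follow updates, so that the row mark
 * also covers the newer versions of the row.
 */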
5701 static TM_Result
5702 heap_lock_updated_tuple(Relation rel, HeapTuple tuple, ItemPointer ctid,
5703  TransactionId xid, LockTupleMode mode)
5704 {
5705  /*
5706  * If the tuple has not been updated, or has moved into another partition
5707  * (effectively a delete) stop here.
5708  */
5709  if (!HeapTupleHeaderIndicatesMovedPartitions(tuple->t_data) &&
5710  !ItemPointerEquals(&tuple->t_self, ctid))
5711  {
5712  /*
5713  * If this is the first possibly-multixact-able operation in the
5714  * current transaction, set my per-backend OldestMemberMXactId
5715  * setting. We can be certain that the transaction will never become a
5716  * member of any older MultiXactIds than that. (We have to do this
5717  * even if we end up just using our own TransactionId below, since
5718  * some other backend could incorporate our XID into a MultiXact
5719  * immediately afterwards.)
5720  */
5721  MultiXactIdSetOldestMember();
5722 
5723  return heap_lock_updated_tuple_rec(rel, ctid, xid, mode);
5724  }
5725 
5726  /* nothing to lock */
5727  return TM_Ok;
5728 }
5729 
5730 /*
5731  * heap_finish_speculative - mark speculative insertion as successful
5732  *
5733  * To successfully finish a speculative insertion we have to clear the
5734  * speculative token from the tuple. To do so, the t_ctid field, which will
5735  * contain a speculative token value, is modified in place to point to the
5736  * tuple itself, as is characteristic of a newly inserted ordinary tuple.
5737  *
5738  * NB: It is not ok to commit without either finishing or aborting a
5739  * speculative insertion. We could treat speculative tuples of committed
5740  * transactions implicitly as completed, but then we would have to be prepared
5741  * to deal with speculative tokens on committed tuples. That wouldn't be
5742  * difficult - no-one looks at the ctid field of a tuple with invalid xmax -
5743  * but clearing the token at completion isn't very expensive either.
5744  * An explicit confirmation WAL record also makes logical decoding simpler.
5745  */
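/*
 * For context, a rough sketch of the surrounding protocol as driven by
 * INSERT ... ON CONFLICT through the table-AM layer: the tuple is first
 * inserted with a speculative token stored in t_ctid; the executor then
 * checks for a conflicting row, and if none appeared it confirms the
 * insertion by calling this function, otherwise it calls
 * heap_abort_speculative() to kill the tuple before retrying or skipping.
 */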
5746 void
5747 heap_finish_speculative(Relation relation, ItemPointer tid)
5748 {
5749  Buffer buffer;
5750  Page page;
5751  OffsetNumber offnum;
5752  ItemId lp = NULL;
5753  HeapTupleHeader htup;
5754 
5755  buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid));
5756  LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
5757  page = (Page) BufferGetPage(buffer);
5758 
5759  offnum = ItemPointerGetOffsetNumber(tid);
5760  if (PageGetMaxOffsetNumber(page) >= offnum)
5761  lp = PageGetItemId(page, offnum);
5762 
5763  if (PageGetMaxOffsetNumber(page) < offnum || !ItemIdIsNormal(lp))
5764  elog(ERROR, "invalid lp");
5765 
5766  htup = (HeapTupleHeader) PageGetItem(page, lp);
5767 
5768  /* SpecTokenOffsetNumber should be distinguishable from any real offset */
5769  StaticAssertStmt(MaxOffsetNumber < SpecTokenOffsetNumber,
5770  "invalid speculative token constant");
5771 
5772  /* NO EREPORT(ERROR) from here till changes are logged */
5773  START_CRIT_SECTION();
5774 
5775  Assert(HeapTupleHeaderIsSpeculative(htup));
5776 
5777  MarkBufferDirty(buffer);
5778 
5779  /*
5780  * Replace the speculative insertion token with a real t_ctid, pointing to
5781  * itself like it does on regular tuples.
5782  */
5783  htup->t_ctid = *tid;
5784 
5785  /* XLOG stuff */
5786  if (RelationNeedsWAL(relation))
5787  {
5788  xl_heap_confirm xlrec;
5789  XLogRecPtr recptr;
5790 
5791  xlrec.offnum = ItemPointerGetOffsetNumber(tid);
5792 
5793  XLogBeginInsert();
5794 
5795  /* We want the same filtering on this as on a plain insert */
5796  XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN);
5797 
5798  XLogRegisterData((char *) &xlrec, SizeOfHeapConfirm);
5799  XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
5800 
5801  recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_CONFIRM);
5802 
5803  PageSetLSN(page, recptr);
5804  }
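/*
 * Informal note: on WAL replay, the corresponding redo routine
 * (heap_xlog_confirm, elsewhere in this file) performs the same fix-up,
 * resetting t_ctid of the tuple at the logged offset to point at itself.
 */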
5805 
5806  END_CRIT_SECTION();