1 /*-------------------------------------------------------------------------
2  *
3  * heapam.c
4  * heap access method code
5  *
6  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  *
10  * IDENTIFICATION
11  * src/backend/access/heap/heapam.c
12  *
13  *
14  * INTERFACE ROUTINES
15  * relation_open - open any relation by relation OID
16  * relation_openrv - open any relation specified by a RangeVar
17  * relation_close - close any relation
18  * heap_open - open a heap relation by relation OID
19  * heap_openrv - open a heap relation specified by a RangeVar
20  * heap_close - (now just a macro for relation_close)
21  * heap_beginscan - begin relation scan
22  * heap_rescan - restart a relation scan
23  * heap_endscan - end relation scan
24  * heap_getnext - retrieve next tuple in scan
25  * heap_fetch - retrieve tuple with given tid
26  * heap_insert - insert tuple into a relation
27  * heap_multi_insert - insert multiple tuples into a relation
28  * heap_delete - delete a tuple from a relation
29  * heap_update - replace a tuple in a relation with another tuple
30  * heap_sync - sync heap, for when no WAL has been written
31  *
32  * NOTES
33  * This file contains the heap_ routines which implement
34  * the POSTGRES heap access method used for all POSTGRES
35  * relations.
36  *
37  *-------------------------------------------------------------------------
38  */
39 #include "postgres.h"
40 
41 #include "access/bufmask.h"
42 #include "access/heapam.h"
43 #include "access/heapam_xlog.h"
44 #include "access/hio.h"
45 #include "access/multixact.h"
46 #include "access/parallel.h"
47 #include "access/relscan.h"
48 #include "access/sysattr.h"
49 #include "access/transam.h"
50 #include "access/tuptoaster.h"
51 #include "access/valid.h"
52 #include "access/visibilitymap.h"
53 #include "access/xact.h"
54 #include "access/xlog.h"
55 #include "access/xloginsert.h"
56 #include "access/xlogutils.h"
57 #include "catalog/catalog.h"
58 #include "catalog/namespace.h"
59 #include "miscadmin.h"
60 #include "pgstat.h"
61 #include "port/atomics.h"
62 #include "storage/bufmgr.h"
63 #include "storage/freespace.h"
64 #include "storage/lmgr.h"
65 #include "storage/predicate.h"
66 #include "storage/procarray.h"
67 #include "storage/smgr.h"
68 #include "storage/spin.h"
69 #include "storage/standby.h"
70 #include "utils/datum.h"
71 #include "utils/inval.h"
72 #include "utils/lsyscache.h"
73 #include "utils/relcache.h"
74 #include "utils/snapmgr.h"
75 #include "utils/syscache.h"
76 #include "utils/tqual.h"
77 
78 
79 /* GUC variable */
80 bool synchronize_seqscans = true;
81 
82 
83 static HeapScanDesc heap_beginscan_internal(Relation relation,
84  Snapshot snapshot,
85  int nkeys, ScanKey key,
86  ParallelHeapScanDesc parallel_scan,
87  bool allow_strat,
88  bool allow_sync,
89  bool allow_pagemode,
90  bool is_bitmapscan,
91  bool is_samplescan,
92  bool temp_snap);
93 static void heap_parallelscan_startblock_init(HeapScanDesc scan);
94 static BlockNumber heap_parallelscan_nextpage(HeapScanDesc scan);
95 static HeapTuple heap_prepare_insert(Relation relation, HeapTuple tup,
96  TransactionId xid, CommandId cid, int options);
97 static XLogRecPtr log_heap_update(Relation reln, Buffer oldbuf,
98  Buffer newbuf, HeapTuple oldtup,
99  HeapTuple newtup, HeapTuple old_key_tup,
100  bool all_visible_cleared, bool new_all_visible_cleared);
101 static Bitmapset *HeapDetermineModifiedColumns(Relation relation,
102  Bitmapset *interesting_cols,
103  HeapTuple oldtup, HeapTuple newtup);
104 static bool heap_acquire_tuplock(Relation relation, ItemPointer tid,
105  LockTupleMode mode, LockWaitPolicy wait_policy,
106  bool *have_tuple_lock);
107 static void compute_new_xmax_infomask(TransactionId xmax, uint16 old_infomask,
108  uint16 old_infomask2, TransactionId add_to_xmax,
109  LockTupleMode mode, bool is_update,
110  TransactionId *result_xmax, uint16 *result_infomask,
111  uint16 *result_infomask2);
112 static HTSU_Result heap_lock_updated_tuple(Relation rel, HeapTuple tuple,
113  ItemPointer ctid, TransactionId xid,
114  LockTupleMode mode);
115 static void GetMultiXactIdHintBits(MultiXactId multi, uint16 *new_infomask,
116  uint16 *new_infomask2);
117 static TransactionId MultiXactIdGetUpdateXid(TransactionId xmax,
118  uint16 t_infomask);
119 static bool DoesMultiXactIdConflict(MultiXactId multi, uint16 infomask,
120  LockTupleMode lockmode);
121 static void MultiXactIdWait(MultiXactId multi, MultiXactStatus status, uint16 infomask,
122  Relation rel, ItemPointer ctid, XLTW_Oper oper,
123  int *remaining);
124 static bool ConditionalMultiXactIdWait(MultiXactId multi, MultiXactStatus status,
125  uint16 infomask, Relation rel, int *remaining);
126 static XLogRecPtr log_heap_new_cid(Relation relation, HeapTuple tup);
127 static HeapTuple ExtractReplicaIdentity(Relation rel, HeapTuple tup, bool key_modified,
128  bool *copy);
129 
130 
131 /*
132  * Each tuple lock mode has a corresponding heavyweight lock, and one or two
133  * corresponding MultiXactStatuses (one to merely lock tuples, another one to
134  * update them). This table (and the macros below) helps us determine the
135  * heavyweight lock mode and MultiXactStatus values to use for any particular
136  * tuple lock strength.
137  *
138  * Don't look at lockstatus/updstatus directly! Use get_mxact_status_for_lock
139  * instead.
140  */
141 static const struct
142 {
143  LOCKMODE hwlock;
144  int lockstatus;
145  int updstatus;
146 }
147 
148  tupleLockExtraInfo[MaxLockTupleMode + 1] =
149 {
150  { /* LockTupleKeyShare */
151  AccessShareLock,
152  MultiXactStatusForKeyShare,
153  -1 /* KeyShare does not allow updating tuples */
154  },
155  { /* LockTupleShare */
156  RowShareLock,
157  MultiXactStatusForShare,
158  -1 /* Share does not allow updating tuples */
159  },
160  { /* LockTupleNoKeyExclusive */
161  ExclusiveLock,
162  MultiXactStatusForNoKeyUpdate,
163  MultiXactStatusNoKeyUpdate
164  },
165  { /* LockTupleExclusive */
166  AccessExclusiveLock,
167  MultiXactStatusForUpdate,
168  MultiXactStatusUpdate
169  }
170 };
171 
172 /* Get the LOCKMODE for a given MultiXactStatus */
173 #define LOCKMODE_from_mxstatus(status) \
174  (tupleLockExtraInfo[TUPLOCK_from_mxstatus((status))].hwlock)
175 
176 /*
177  * Acquire heavyweight locks on tuples, using a LockTupleMode strength value.
178  * This is more readable than having every caller translate it to lock.h's
179  * LOCKMODE.
180  */
181 #define LockTupleTuplock(rel, tup, mode) \
182  LockTuple((rel), (tup), tupleLockExtraInfo[mode].hwlock)
183 #define UnlockTupleTuplock(rel, tup, mode) \
184  UnlockTuple((rel), (tup), tupleLockExtraInfo[mode].hwlock)
185 #define ConditionalLockTupleTuplock(rel, tup, mode) \
186  ConditionalLockTuple((rel), (tup), tupleLockExtraInfo[mode].hwlock)
187 
188 /*
189  * This table maps tuple lock strength values for each particular
190  * MultiXactStatus value.
191  */
192 static const int MultiXactStatusLock[MaxMultiXactStatus + 1] =
193 {
194  LockTupleKeyShare, /* ForKeyShare */
195  LockTupleShare, /* ForShare */
196  LockTupleNoKeyExclusive, /* ForNoKeyUpdate */
197  LockTupleExclusive, /* ForUpdate */
198  LockTupleNoKeyExclusive, /* NoKeyUpdate */
199  LockTupleExclusive /* Update */
200 };
201 
202 /* Get the LockTupleMode for a given MultiXactStatus */
203 #define TUPLOCK_from_mxstatus(status) \
204  (MultiXactStatusLock[(status)])
205 
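/*
 * A minimal illustrative sketch of how the tables and macros above compose
 * (based only on the definitions in this file; variable names are for
 * illustration): a tuple lock strength indexes tupleLockExtraInfo directly,
 * and a MultiXactStatus can be mapped back to both a tuple lock strength and
 * a heavyweight lock mode:
 *
 *		LOCKMODE	hwlock = tupleLockExtraInfo[LockTupleShare].hwlock;
 *									// RowShareLock, per the table above
 *		LockTupleMode mode = TUPLOCK_from_mxstatus(MultiXactStatusForShare);
 *									// LockTupleShare, per MultiXactStatusLock[]
 *		LOCKMODE	hwlock2 = LOCKMODE_from_mxstatus(MultiXactStatusForShare);
 *									// composes both tables, yielding RowShareLock
 */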
206 /* ----------------------------------------------------------------
207  * heap support routines
208  * ----------------------------------------------------------------
209  */
210 
211 /* ----------------
212  * initscan - scan code common to heap_beginscan and heap_rescan
213  * ----------------
214  */
215 static void
216 initscan(HeapScanDesc scan, ScanKey key, bool keep_startblock)
217 {
218  bool allow_strat;
219  bool allow_sync;
220 
221  /*
222  * Determine the number of blocks we have to scan.
223  *
224  * It is sufficient to do this once at scan start, since any tuples added
225  * while the scan is in progress will be invisible to my snapshot anyway.
226  * (That is not true when using a non-MVCC snapshot. However, we couldn't
227  * guarantee to return tuples added after scan start anyway, since they
228  * might go into pages we already scanned. To guarantee consistent
229  * results for a non-MVCC snapshot, the caller must hold some higher-level
230  * lock that ensures the interesting tuple(s) won't change.)
231  */
232  if (scan->rs_parallel != NULL)
233  scan->rs_nblocks = scan->rs_parallel->phs_nblocks;
234  else
235  scan->rs_nblocks = RelationGetNumberOfBlocks(scan->rs_rd);
236 
237  /*
238  * If the table is large relative to NBuffers, use a bulk-read access
239  * strategy and enable synchronized scanning (see syncscan.c). Although
240  * the thresholds for these features could be different, we make them the
241  * same so that there are only two behaviors to tune rather than four.
242  * (However, some callers need to be able to disable one or both of these
243  * behaviors, independently of the size of the table; also there is a GUC
244  * variable that can disable synchronized scanning.)
245  *
246  * Note that heap_parallelscan_initialize has a very similar test; if you
247  * change this, consider changing that one, too.
248  */
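 /*
  * Worked example of the threshold above (a sketch, assuming the default
  * BLCKSZ of 8kB): with shared_buffers = 128MB, NBuffers is 16384, so a
  * non-local relation larger than NBuffers / 4 = 4096 blocks (32MB) becomes
  * eligible for the bulk-read strategy and synchronized scanning.
  */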
249  if (!RelationUsesLocalBuffers(scan->rs_rd) &&
250  scan->rs_nblocks > NBuffers / 4)
251  {
252  allow_strat = scan->rs_allow_strat;
253  allow_sync = scan->rs_allow_sync;
254  }
255  else
256  allow_strat = allow_sync = false;
257 
258  if (allow_strat)
259  {
260  /* During a rescan, keep the previous strategy object. */
261  if (scan->rs_strategy == NULL)
262  scan->rs_strategy = GetAccessStrategy(BAS_BULKREAD);
263  }
264  else
265  {
266  if (scan->rs_strategy != NULL)
267  FreeAccessStrategy(scan->rs_strategy);
268  scan->rs_strategy = NULL;
269  }
270 
271  if (scan->rs_parallel != NULL)
272  {
273  /* For parallel scan, believe whatever ParallelHeapScanDesc says. */
274  scan->rs_syncscan = scan->rs_parallel->phs_syncscan;
275  }
276  else if (keep_startblock)
277  {
278  /*
279  * When rescanning, we want to keep the previous startblock setting,
280  * so that rewinding a cursor doesn't generate surprising results.
281  * Reset the active syncscan setting, though.
282  */
283  scan->rs_syncscan = (allow_sync && synchronize_seqscans);
284  }
285  else if (allow_sync && synchronize_seqscans)
286  {
287  scan->rs_syncscan = true;
288  scan->rs_startblock = ss_get_location(scan->rs_rd, scan->rs_nblocks);
289  }
290  else
291  {
292  scan->rs_syncscan = false;
293  scan->rs_startblock = 0;
294  }
295 
296  scan->rs_numblocks = InvalidBlockNumber;
297  scan->rs_inited = false;
298  scan->rs_ctup.t_data = NULL;
299  ItemPointerSetInvalid(&scan->rs_ctup.t_self);
300  scan->rs_cbuf = InvalidBuffer;
301  scan->rs_cblock = InvalidBlockNumber;
302 
303  /* page-at-a-time fields are always invalid when not rs_inited */
304 
305  /*
306  * copy the scan key, if appropriate
307  */
308  if (key != NULL)
309  memcpy(scan->rs_key, key, scan->rs_nkeys * sizeof(ScanKeyData));
310 
311  /*
312  * Currently, we don't have a stats counter for bitmap heap scans (but the
313  * underlying bitmap index scans will be counted) or sample scans (we only
314  * update stats for tuple fetches there)
315  */
316  if (!scan->rs_bitmapscan && !scan->rs_samplescan)
317  pgstat_count_heap_scan(scan->rs_rd);
318 }
319 
320 /*
321  * heap_setscanlimits - restrict range of a heapscan
322  *
323  * startBlk is the page to start at
324  * numBlks is number of pages to scan (InvalidBlockNumber means "all")
325  */
326 void
327 heap_setscanlimits(HeapScanDesc scan, BlockNumber startBlk, BlockNumber numBlks)
328 {
329  Assert(!scan->rs_inited); /* else too late to change */
330  Assert(!scan->rs_syncscan); /* else rs_startblock is significant */
331 
332  /* Check startBlk is valid (but allow case of zero blocks...) */
333  Assert(startBlk == 0 || startBlk < scan->rs_nblocks);
334 
335  scan->rs_startblock = startBlk;
336  scan->rs_numblocks = numBlks;
337 }
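/*
 * Usage sketch (hypothetical caller; rel, snapshot, start_block and
 * nblocks_to_scan are placeholders): the limits must be installed before the
 * first fetch, and syncscan must be disabled so rs_startblock is meaningful:
 *
 *		scan = heap_beginscan_strat(rel, snapshot, 0, NULL,
 *									true, false);	// allow_strat, no syncscan
 *		heap_setscanlimits(scan, start_block, nblocks_to_scan);
 *		while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
 *			... process tuple ...
 *		heap_endscan(scan);
 */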
338 
339 /*
340  * heapgetpage - subroutine for heapgettup()
341  *
342  * This routine reads and pins the specified page of the relation.
343  * In page-at-a-time mode it performs additional work, namely determining
344  * which tuples on the page are visible.
345  */
346 void
347 heapgetpage(HeapScanDesc scan, BlockNumber page)
348 {
349  Buffer buffer;
350  Snapshot snapshot;
351  Page dp;
352  int lines;
353  int ntup;
354  OffsetNumber lineoff;
355  ItemId lpp;
356  bool all_visible;
357 
358  Assert(page < scan->rs_nblocks);
359 
360  /* release previous scan buffer, if any */
361  if (BufferIsValid(scan->rs_cbuf))
362  {
363  ReleaseBuffer(scan->rs_cbuf);
364  scan->rs_cbuf = InvalidBuffer;
365  }
366 
367  /*
368  * Be sure to check for interrupts at least once per page. Checks at
369  * higher code levels won't be able to stop a seqscan that encounters many
370  * pages' worth of consecutive dead tuples.
371  */
372  CHECK_FOR_INTERRUPTS();
373 
374  /* read page using selected strategy */
375  scan->rs_cbuf = ReadBufferExtended(scan->rs_rd, MAIN_FORKNUM, page,
376  RBM_NORMAL, scan->rs_strategy);
377  scan->rs_cblock = page;
378 
379  if (!scan->rs_pageatatime)
380  return;
381 
382  buffer = scan->rs_cbuf;
383  snapshot = scan->rs_snapshot;
384 
385  /*
386  * Prune and repair fragmentation for the whole page, if possible.
387  */
388  heap_page_prune_opt(scan->rs_rd, buffer);
389 
390  /*
391  * We must hold share lock on the buffer content while examining tuple
392  * visibility. Afterwards, however, the tuples we have found to be
393  * visible are guaranteed good as long as we hold the buffer pin.
394  */
395  LockBuffer(buffer, BUFFER_LOCK_SHARE);
396 
397  dp = BufferGetPage(buffer);
398  TestForOldSnapshot(snapshot, scan->rs_rd, dp);
399  lines = PageGetMaxOffsetNumber(dp);
400  ntup = 0;
401 
402  /*
403  * If the all-visible flag indicates that all tuples on the page are
404  * visible to everyone, we can skip the per-tuple visibility tests.
405  *
406  * Note: In hot standby, a tuple that's already visible to all
407  * transactions in the master might still be invisible to a read-only
408  * transaction in the standby. We partly handle this problem by tracking
409  * the minimum xmin of visible tuples as the cut-off XID while marking a
410  * page all-visible on master and WAL log that along with the visibility
411  * map SET operation. In hot standby, we wait for (or abort) all
412  * transactions that potentially may not see one or more tuples on the
413  * page. That's how index-only scans work fine in hot standby. A crucial
414  * difference between index-only scans and heap scans is that the
415  * index-only scan completely relies on the visibility map, whereas a heap
416  * scan looks at the page-level PD_ALL_VISIBLE flag. We are not sure if
417  * the page-level flag can be trusted in the same way, because it might
418  * get propagated somehow without being explicitly WAL-logged, e.g. via a
419  * full page write. Until we can prove that beyond doubt, let's check each
420  * tuple for visibility the hard way.
421  */
422  all_visible = PageIsAllVisible(dp) && !snapshot->takenDuringRecovery;
423 
424  for (lineoff = FirstOffsetNumber, lpp = PageGetItemId(dp, lineoff);
425  lineoff <= lines;
426  lineoff++, lpp++)
427  {
428  if (ItemIdIsNormal(lpp))
429  {
430  HeapTupleData loctup;
431  bool valid;
432 
433  loctup.t_tableOid = RelationGetRelid(scan->rs_rd);
434  loctup.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
435  loctup.t_len = ItemIdGetLength(lpp);
436  ItemPointerSet(&(loctup.t_self), page, lineoff);
437 
438  if (all_visible)
439  valid = true;
440  else
441  valid = HeapTupleSatisfiesVisibility(&loctup, snapshot, buffer);
442 
443  CheckForSerializableConflictOut(valid, scan->rs_rd, &loctup,
444  buffer, snapshot);
445 
446  if (valid)
447  scan->rs_vistuples[ntup++] = lineoff;
448  }
449  }
450 
451  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
452 
453  Assert(ntup <= MaxHeapTuplesPerPage);
454  scan->rs_ntuples = ntup;
455 }
456 
457 /* ----------------
458  * heapgettup - fetch next heap tuple
459  *
460  * Initialize the scan if not already done; then advance to the next
461  * tuple as indicated by "dir"; return the next tuple in scan->rs_ctup,
462  * or set scan->rs_ctup.t_data = NULL if no more tuples.
463  *
464  * dir == NoMovementScanDirection means "re-fetch the tuple indicated
465  * by scan->rs_ctup".
466  *
467  * Note: the reason nkeys/key are passed separately, even though they are
468  * kept in the scan descriptor, is that the caller may not want us to check
469  * the scankeys.
470  *
471  * Note: when we fall off the end of the scan in either direction, we
472  * reset rs_inited. This means that a further request with the same
473  * scan direction will restart the scan, which is a bit odd, but a
474  * request with the opposite scan direction will start a fresh scan
475  * in the proper direction. The latter is required behavior for cursors,
476  * while the former case is generally undefined behavior in Postgres
477  * so we don't care too much.
478  * ----------------
479  */
480 static void
481 heapgettup(HeapScanDesc scan,
482  ScanDirection dir,
483  int nkeys,
484  ScanKey key)
485 {
486  HeapTuple tuple = &(scan->rs_ctup);
487  Snapshot snapshot = scan->rs_snapshot;
488  bool backward = ScanDirectionIsBackward(dir);
489  BlockNumber page;
490  bool finished;
491  Page dp;
492  int lines;
493  OffsetNumber lineoff;
494  int linesleft;
495  ItemId lpp;
496 
497  /*
498  * calculate next starting lineoff, given scan direction
499  */
500  if (ScanDirectionIsForward(dir))
501  {
502  if (!scan->rs_inited)
503  {
504  /*
505  * return null immediately if relation is empty
506  */
507  if (scan->rs_nblocks == 0 || scan->rs_numblocks == 0)
508  {
509  Assert(!BufferIsValid(scan->rs_cbuf));
510  tuple->t_data = NULL;
511  return;
512  }
513  if (scan->rs_parallel != NULL)
514  {
515  heap_parallelscan_startblock_init(scan);
516 
517  page = heap_parallelscan_nextpage(scan);
518 
519  /* Other processes might have already finished the scan. */
520  if (page == InvalidBlockNumber)
521  {
522  Assert(!BufferIsValid(scan->rs_cbuf));
523  tuple->t_data = NULL;
524  return;
525  }
526  }
527  else
528  page = scan->rs_startblock; /* first page */
529  heapgetpage(scan, page);
530  lineoff = FirstOffsetNumber; /* first offnum */
531  scan->rs_inited = true;
532  }
533  else
534  {
535  /* continue from previously returned page/tuple */
536  page = scan->rs_cblock; /* current page */
537  lineoff = /* next offnum */
538  OffsetNumberNext(ItemPointerGetOffsetNumber(&(tuple->t_self)));
539  }
540 
541  LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);
542 
543  dp = BufferGetPage(scan->rs_cbuf);
544  TestForOldSnapshot(snapshot, scan->rs_rd, dp);
545  lines = PageGetMaxOffsetNumber(dp);
546  /* page and lineoff now reference the physically next tid */
547 
548  linesleft = lines - lineoff + 1;
549  }
550  else if (backward)
551  {
552  /* backward parallel scan not supported */
553  Assert(scan->rs_parallel == NULL);
554 
555  if (!scan->rs_inited)
556  {
557  /*
558  * return null immediately if relation is empty
559  */
560  if (scan->rs_nblocks == 0 || scan->rs_numblocks == 0)
561  {
562  Assert(!BufferIsValid(scan->rs_cbuf));
563  tuple->t_data = NULL;
564  return;
565  }
566 
567  /*
568  * Disable reporting to syncscan logic in a backwards scan; it's
569  * not very likely anyone else is doing the same thing at the same
570  * time, and much more likely that we'll just bollix things for
571  * forward scanners.
572  */
573  scan->rs_syncscan = false;
574  /* start from last page of the scan */
575  if (scan->rs_startblock > 0)
576  page = scan->rs_startblock - 1;
577  else
578  page = scan->rs_nblocks - 1;
579  heapgetpage(scan, page);
580  }
581  else
582  {
583  /* continue from previously returned page/tuple */
584  page = scan->rs_cblock; /* current page */
585  }
586 
587  LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);
588 
589  dp = BufferGetPage(scan->rs_cbuf);
590  TestForOldSnapshot(snapshot, scan->rs_rd, dp);
591  lines = PageGetMaxOffsetNumber(dp);
592 
593  if (!scan->rs_inited)
594  {
595  lineoff = lines; /* final offnum */
596  scan->rs_inited = true;
597  }
598  else
599  {
600  lineoff = /* previous offnum */
601  OffsetNumberPrev(ItemPointerGetOffsetNumber(&(tuple->t_self)));
602  }
603  /* page and lineoff now reference the physically previous tid */
604 
605  linesleft = lineoff;
606  }
607  else
608  {
609  /*
610  * ``no movement'' scan direction: refetch prior tuple
611  */
612  if (!scan->rs_inited)
613  {
614  Assert(!BufferIsValid(scan->rs_cbuf));
615  tuple->t_data = NULL;
616  return;
617  }
618 
619  page = ItemPointerGetBlockNumber(&(tuple->t_self));
620  if (page != scan->rs_cblock)
621  heapgetpage(scan, page);
622 
623  /* Since the tuple was previously fetched, needn't lock page here */
624  dp = BufferGetPage(scan->rs_cbuf);
625  TestForOldSnapshot(snapshot, scan->rs_rd, dp);
626  lineoff = ItemPointerGetOffsetNumber(&(tuple->t_self));
627  lpp = PageGetItemId(dp, lineoff);
628  Assert(ItemIdIsNormal(lpp));
629 
630  tuple->t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
631  tuple->t_len = ItemIdGetLength(lpp);
632 
633  return;
634  }
635 
636  /*
637  * advance the scan until we find a qualifying tuple or run out of stuff
638  * to scan
639  */
640  lpp = PageGetItemId(dp, lineoff);
641  for (;;)
642  {
643  while (linesleft > 0)
644  {
645  if (ItemIdIsNormal(lpp))
646  {
647  bool valid;
648 
649  tuple->t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
650  tuple->t_len = ItemIdGetLength(lpp);
651  ItemPointerSet(&(tuple->t_self), page, lineoff);
652 
653  /*
654  * if current tuple qualifies, return it.
655  */
656  valid = HeapTupleSatisfiesVisibility(tuple,
657  snapshot,
658  scan->rs_cbuf);
659 
660  CheckForSerializableConflictOut(valid, scan->rs_rd, tuple,
661  scan->rs_cbuf, snapshot);
662 
663  if (valid && key != NULL)
664  HeapKeyTest(tuple, RelationGetDescr(scan->rs_rd),
665  nkeys, key, valid);
666 
667  if (valid)
668  {
669  LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK);
670  return;
671  }
672  }
673 
674  /*
675  * otherwise move to the next item on the page
676  */
677  --linesleft;
678  if (backward)
679  {
680  --lpp; /* move back in this page's ItemId array */
681  --lineoff;
682  }
683  else
684  {
685  ++lpp; /* move forward in this page's ItemId array */
686  ++lineoff;
687  }
688  }
689 
690  /*
691  * if we get here, it means we've exhausted the items on this page and
692  * it's time to move to the next.
693  */
694  LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK);
695 
696  /*
697  * advance to next/prior page and detect end of scan
698  */
699  if (backward)
700  {
701  finished = (page == scan->rs_startblock) ||
702  (scan->rs_numblocks != InvalidBlockNumber ? --scan->rs_numblocks == 0 : false);
703  if (page == 0)
704  page = scan->rs_nblocks;
705  page--;
706  }
707  else if (scan->rs_parallel != NULL)
708  {
709  page = heap_parallelscan_nextpage(scan);
710  finished = (page == InvalidBlockNumber);
711  }
712  else
713  {
714  page++;
715  if (page >= scan->rs_nblocks)
716  page = 0;
717  finished = (page == scan->rs_startblock) ||
718  (scan->rs_numblocks != InvalidBlockNumber ? --scan->rs_numblocks == 0 : false);
719 
720  /*
721  * Report our new scan position for synchronization purposes. We
722  * don't do that when moving backwards, however. That would just
723  * mess up any other forward-moving scanners.
724  *
725  * Note: we do this before checking for end of scan so that the
726  * final state of the position hint is back at the start of the
727  * rel. That's not strictly necessary, but otherwise when you run
728  * the same query multiple times the starting position would shift
729  * a little bit backwards on every invocation, which is confusing.
730  * We don't guarantee any specific ordering in general, though.
731  */
732  if (scan->rs_syncscan)
733  ss_report_location(scan->rs_rd, page);
734  }
735 
736  /*
737  * return NULL if we've exhausted all the pages
738  */
739  if (finished)
740  {
741  if (BufferIsValid(scan->rs_cbuf))
742  ReleaseBuffer(scan->rs_cbuf);
743  scan->rs_cbuf = InvalidBuffer;
744  scan->rs_cblock = InvalidBlockNumber;
745  tuple->t_data = NULL;
746  scan->rs_inited = false;
747  return;
748  }
749 
750  heapgetpage(scan, page);
751 
752  LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);
753 
754  dp = BufferGetPage(scan->rs_cbuf);
755  TestForOldSnapshot(snapshot, scan->rs_rd, dp);
756  lines = PageGetMaxOffsetNumber((Page) dp);
757  linesleft = lines;
758  if (backward)
759  {
760  lineoff = lines;
761  lpp = PageGetItemId(dp, lines);
762  }
763  else
764  {
765  lineoff = FirstOffsetNumber;
766  lpp = PageGetItemId(dp, FirstOffsetNumber);
767  }
768  }
769 }
770 
771 /* ----------------
772  * heapgettup_pagemode - fetch next heap tuple in page-at-a-time mode
773  *
774  * Same API as heapgettup, but used in page-at-a-time mode
775  *
776  * The internal logic is much the same as heapgettup's too, but there are some
777  * differences: we do not take the buffer content lock (that only needs to
778  * happen inside heapgetpage), and we iterate through just the tuples listed
779  * in rs_vistuples[] rather than all tuples on the page. Notice that
780  * lineindex is 0-based, where the corresponding loop variable lineoff in
781  * heapgettup is 1-based.
782  * ----------------
783  */
784 static void
785 heapgettup_pagemode(HeapScanDesc scan,
786  ScanDirection dir,
787  int nkeys,
788  ScanKey key)
789 {
790  HeapTuple tuple = &(scan->rs_ctup);
791  bool backward = ScanDirectionIsBackward(dir);
792  BlockNumber page;
793  bool finished;
794  Page dp;
795  int lines;
796  int lineindex;
797  OffsetNumber lineoff;
798  int linesleft;
799  ItemId lpp;
800 
801  /*
802  * calculate next starting lineindex, given scan direction
803  */
804  if (ScanDirectionIsForward(dir))
805  {
806  if (!scan->rs_inited)
807  {
808  /*
809  * return null immediately if relation is empty
810  */
811  if (scan->rs_nblocks == 0 || scan->rs_numblocks == 0)
812  {
813  Assert(!BufferIsValid(scan->rs_cbuf));
814  tuple->t_data = NULL;
815  return;
816  }
817  if (scan->rs_parallel != NULL)
818  {
819  heap_parallelscan_startblock_init(scan);
820 
821  page = heap_parallelscan_nextpage(scan);
822 
823  /* Other processes might have already finished the scan. */
824  if (page == InvalidBlockNumber)
825  {
826  Assert(!BufferIsValid(scan->rs_cbuf));
827  tuple->t_data = NULL;
828  return;
829  }
830  }
831  else
832  page = scan->rs_startblock; /* first page */
833  heapgetpage(scan, page);
834  lineindex = 0;
835  scan->rs_inited = true;
836  }
837  else
838  {
839  /* continue from previously returned page/tuple */
840  page = scan->rs_cblock; /* current page */
841  lineindex = scan->rs_cindex + 1;
842  }
843 
844  dp = BufferGetPage(scan->rs_cbuf);
845  TestForOldSnapshot(scan->rs_snapshot, scan->rs_rd, dp);
846  lines = scan->rs_ntuples;
847  /* page and lineindex now reference the next visible tid */
848 
849  linesleft = lines - lineindex;
850  }
851  else if (backward)
852  {
853  /* backward parallel scan not supported */
854  Assert(scan->rs_parallel == NULL);
855 
856  if (!scan->rs_inited)
857  {
858  /*
859  * return null immediately if relation is empty
860  */
861  if (scan->rs_nblocks == 0 || scan->rs_numblocks == 0)
862  {
863  Assert(!BufferIsValid(scan->rs_cbuf));
864  tuple->t_data = NULL;
865  return;
866  }
867 
868  /*
869  * Disable reporting to syncscan logic in a backwards scan; it's
870  * not very likely anyone else is doing the same thing at the same
871  * time, and much more likely that we'll just bollix things for
872  * forward scanners.
873  */
874  scan->rs_syncscan = false;
875  /* start from last page of the scan */
876  if (scan->rs_startblock > 0)
877  page = scan->rs_startblock - 1;
878  else
879  page = scan->rs_nblocks - 1;
880  heapgetpage(scan, page);
881  }
882  else
883  {
884  /* continue from previously returned page/tuple */
885  page = scan->rs_cblock; /* current page */
886  }
887 
888  dp = BufferGetPage(scan->rs_cbuf);
889  TestForOldSnapshot(scan->rs_snapshot, scan->rs_rd, dp);
890  lines = scan->rs_ntuples;
891 
892  if (!scan->rs_inited)
893  {
894  lineindex = lines - 1;
895  scan->rs_inited = true;
896  }
897  else
898  {
899  lineindex = scan->rs_cindex - 1;
900  }
901  /* page and lineindex now reference the previous visible tid */
902 
903  linesleft = lineindex + 1;
904  }
905  else
906  {
907  /*
908  * ``no movement'' scan direction: refetch prior tuple
909  */
910  if (!scan->rs_inited)
911  {
912  Assert(!BufferIsValid(scan->rs_cbuf));
913  tuple->t_data = NULL;
914  return;
915  }
916 
917  page = ItemPointerGetBlockNumber(&(tuple->t_self));
918  if (page != scan->rs_cblock)
919  heapgetpage(scan, page);
920 
921  /* Since the tuple was previously fetched, needn't lock page here */
922  dp = BufferGetPage(scan->rs_cbuf);
923  TestForOldSnapshot(scan->rs_snapshot, scan->rs_rd, dp);
924  lineoff = ItemPointerGetOffsetNumber(&(tuple->t_self));
925  lpp = PageGetItemId(dp, lineoff);
926  Assert(ItemIdIsNormal(lpp));
927 
928  tuple->t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
929  tuple->t_len = ItemIdGetLength(lpp);
930 
931  /* check that rs_cindex is in sync */
932  Assert(scan->rs_cindex < scan->rs_ntuples);
933  Assert(lineoff == scan->rs_vistuples[scan->rs_cindex]);
934 
935  return;
936  }
937 
938  /*
939  * advance the scan until we find a qualifying tuple or run out of stuff
940  * to scan
941  */
942  for (;;)
943  {
944  while (linesleft > 0)
945  {
946  lineoff = scan->rs_vistuples[lineindex];
947  lpp = PageGetItemId(dp, lineoff);
948  Assert(ItemIdIsNormal(lpp));
949 
950  tuple->t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
951  tuple->t_len = ItemIdGetLength(lpp);
952  ItemPointerSet(&(tuple->t_self), page, lineoff);
953 
954  /*
955  * if current tuple qualifies, return it.
956  */
957  if (key != NULL)
958  {
959  bool valid;
960 
961  HeapKeyTest(tuple, RelationGetDescr(scan->rs_rd),
962  nkeys, key, valid);
963  if (valid)
964  {
965  scan->rs_cindex = lineindex;
966  return;
967  }
968  }
969  else
970  {
971  scan->rs_cindex = lineindex;
972  return;
973  }
974 
975  /*
976  * otherwise move to the next item on the page
977  */
978  --linesleft;
979  if (backward)
980  --lineindex;
981  else
982  ++lineindex;
983  }
984 
985  /*
986  * if we get here, it means we've exhausted the items on this page and
987  * it's time to move to the next.
988  */
989  if (backward)
990  {
991  finished = (page == scan->rs_startblock) ||
992  (scan->rs_numblocks != InvalidBlockNumber ? --scan->rs_numblocks == 0 : false);
993  if (page == 0)
994  page = scan->rs_nblocks;
995  page--;
996  }
997  else if (scan->rs_parallel != NULL)
998  {
999  page = heap_parallelscan_nextpage(scan);
1000  finished = (page == InvalidBlockNumber);
1001  }
1002  else
1003  {
1004  page++;
1005  if (page >= scan->rs_nblocks)
1006  page = 0;
1007  finished = (page == scan->rs_startblock) ||
1008  (scan->rs_numblocks != InvalidBlockNumber ? --scan->rs_numblocks == 0 : false);
1009 
1010  /*
1011  * Report our new scan position for synchronization purposes. We
1012  * don't do that when moving backwards, however. That would just
1013  * mess up any other forward-moving scanners.
1014  *
1015  * Note: we do this before checking for end of scan so that the
1016  * final state of the position hint is back at the start of the
1017  * rel. That's not strictly necessary, but otherwise when you run
1018  * the same query multiple times the starting position would shift
1019  * a little bit backwards on every invocation, which is confusing.
1020  * We don't guarantee any specific ordering in general, though.
1021  */
1022  if (scan->rs_syncscan)
1023  ss_report_location(scan->rs_rd, page);
1024  }
1025 
1026  /*
1027  * return NULL if we've exhausted all the pages
1028  */
1029  if (finished)
1030  {
1031  if (BufferIsValid(scan->rs_cbuf))
1032  ReleaseBuffer(scan->rs_cbuf);
1033  scan->rs_cbuf = InvalidBuffer;
1034  scan->rs_cblock = InvalidBlockNumber;
1035  tuple->t_data = NULL;
1036  scan->rs_inited = false;
1037  return;
1038  }
1039 
1040  heapgetpage(scan, page);
1041 
1042  dp = BufferGetPage(scan->rs_cbuf);
1043  TestForOldSnapshot(scan->rs_snapshot, scan->rs_rd, dp);
1044  lines = scan->rs_ntuples;
1045  linesleft = lines;
1046  if (backward)
1047  lineindex = lines - 1;
1048  else
1049  lineindex = 0;
1050  }
1051 }
1052 
1053 
1054 #if defined(DISABLE_COMPLEX_MACRO)
1055 /*
1056  * This is formatted so oddly so that the correspondence to the macro
1057  * definition in access/htup_details.h is maintained.
1058  */
1059 Datum
1060 fastgetattr(HeapTuple tup, int attnum, TupleDesc tupleDesc,
1061  bool *isnull)
1062 {
1063  return (
1064  (attnum) > 0 ?
1065  (
1066  (*(isnull) = false),
1067  HeapTupleNoNulls(tup) ?
1068  (
1069  TupleDescAttr((tupleDesc), (attnum) - 1)->attcacheoff >= 0 ?
1070  (
1071  fetchatt(TupleDescAttr((tupleDesc), (attnum) - 1),
1072  (char *) (tup)->t_data + (tup)->t_data->t_hoff +
1073  TupleDescAttr((tupleDesc), (attnum) - 1)->attcacheoff)
1074  )
1075  :
1076  nocachegetattr((tup), (attnum), (tupleDesc))
1077  )
1078  :
1079  (
1080  att_isnull((attnum) - 1, (tup)->t_data->t_bits) ?
1081  (
1082  (*(isnull) = true),
1083  (Datum) NULL
1084  )
1085  :
1086  (
1087  nocachegetattr((tup), (attnum), (tupleDesc))
1088  )
1089  )
1090  )
1091  :
1092  (
1093  (Datum) NULL
1094  )
1095  );
1096 }
1097 #endif /* defined(DISABLE_COMPLEX_MACRO) */
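/*
 * Usage sketch: fastgetattr is normally the macro from htup_details.h (the
 * function above is only the DISABLE_COMPLEX_MACRO fallback), but either
 * form is called the same way. "rel" is an assumed already-open relation:
 *
 *		bool	isnull;
 *		Datum	d = fastgetattr(tuple, 1, RelationGetDescr(rel), &isnull);
 *
 *		if (!isnull)
 *			... interpret d with DatumGetInt32(), DatumGetTextPP(), etc.,
 *			    according to the attribute's declared type ...
 */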
1098 
1099 
1100 /* ----------------------------------------------------------------
1101  * heap access method interface
1102  * ----------------------------------------------------------------
1103  */
1104 
1105 /* ----------------
1106  * relation_open - open any relation by relation OID
1107  *
1108  * If lockmode is not "NoLock", the specified kind of lock is
1109  * obtained on the relation. (Generally, NoLock should only be
1110  * used if the caller knows it has some appropriate lock on the
1111  * relation already.)
1112  *
1113  * An error is raised if the relation does not exist.
1114  *
1115  * NB: a "relation" is anything with a pg_class entry. The caller is
1116  * expected to check whether the relkind is something it can handle.
1117  * ----------------
1118  */
1119 Relation
1120 relation_open(Oid relationId, LOCKMODE lockmode)
1121 {
1122  Relation r;
1123 
1124  Assert(lockmode >= NoLock && lockmode < MAX_LOCKMODES);
1125 
1126  /* Get the lock before trying to open the relcache entry */
1127  if (lockmode != NoLock)
1128  LockRelationOid(relationId, lockmode);
1129 
1130  /* The relcache does all the real work... */
1131  r = RelationIdGetRelation(relationId);
1132 
1133  if (!RelationIsValid(r))
1134  elog(ERROR, "could not open relation with OID %u", relationId);
1135 
1136  /* Make note that we've accessed a temporary relation */
1137  if (RelationUsesLocalBuffers(r))
1138  MyXactFlags |= XACT_FLAGS_ACCESSEDTEMPREL;
1139 
1140  pgstat_initstats(r);
1141 
1142  return r;
1143 }
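/*
 * Typical calling pattern (sketch): take a lock that is appropriate for the
 * intended access, then either release it at close time or keep it until
 * transaction end:
 *
 *		Relation	rel = relation_open(relid, AccessShareLock);
 *
 *		... inspect rel->rd_rel, read data, etc. ...
 *
 *		relation_close(rel, AccessShareLock);	// or NoLock to keep the lock
 */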
1144 
1145 /* ----------------
1146  * try_relation_open - open any relation by relation OID
1147  *
1148  * Same as relation_open, except return NULL instead of failing
1149  * if the relation does not exist.
1150  * ----------------
1151  */
1152 Relation
1153 try_relation_open(Oid relationId, LOCKMODE lockmode)
1154 {
1155  Relation r;
1156 
1157  Assert(lockmode >= NoLock && lockmode < MAX_LOCKMODES);
1158 
1159  /* Get the lock first */
1160  if (lockmode != NoLock)
1161  LockRelationOid(relationId, lockmode);
1162 
1163  /*
1164  * Now that we have the lock, probe to see if the relation really exists
1165  * or not.
1166  */
1167  if (!SearchSysCacheExists1(RELOID, ObjectIdGetDatum(relationId)))
1168  {
1169  /* Release useless lock */
1170  if (lockmode != NoLock)
1171  UnlockRelationOid(relationId, lockmode);
1172 
1173  return NULL;
1174  }
1175 
1176  /* Should be safe to do a relcache load */
1177  r = RelationIdGetRelation(relationId);
1178 
1179  if (!RelationIsValid(r))
1180  elog(ERROR, "could not open relation with OID %u", relationId);
1181 
1182  /* Make note that we've accessed a temporary relation */
1183  if (RelationUsesLocalBuffers(r))
1184  MyXactFlags |= XACT_FLAGS_ACCESSEDTEMPREL;
1185 
1186  pgstat_initstats(r);
1187 
1188  return r;
1189 }
1190 
1191 /* ----------------
1192  * relation_openrv - open any relation specified by a RangeVar
1193  *
1194  * Same as relation_open, but the relation is specified by a RangeVar.
1195  * ----------------
1196  */
1197 Relation
1198 relation_openrv(const RangeVar *relation, LOCKMODE lockmode)
1199 {
1200  Oid relOid;
1201 
1202  /*
1203  * Check for shared-cache-inval messages before trying to open the
1204  * relation. This is needed even if we already hold a lock on the
1205  * relation, because GRANT/REVOKE are executed without taking any lock on
1206  * the target relation, and we want to be sure we see current ACL
1207  * information. We can skip this if asked for NoLock, on the assumption
1208  * that such a call is not the first one in the current command, and so we
1209  * should be reasonably up-to-date already. (XXX this all could stand to
1210  * be redesigned, but for the moment we'll keep doing this like it's been
1211  * done historically.)
1212  */
1213  if (lockmode != NoLock)
1214  AcceptInvalidationMessages();
1215 
1216  /* Look up and lock the appropriate relation using namespace search */
1217  relOid = RangeVarGetRelid(relation, lockmode, false);
1218 
1219  /* Let relation_open do the rest */
1220  return relation_open(relOid, NoLock);
1221 }
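/*
 * Usage sketch (assuming makeRangeVar() from nodes/makefuncs.h; "my_table"
 * is a placeholder name, not a real catalog object): a NULL schema lets
 * namespace search resolve the name:
 *
 *		RangeVar   *rv = makeRangeVar(NULL, "my_table", -1);
 *		Relation	rel = relation_openrv(rv, AccessShareLock);
 *
 *		... work with rel ...
 *
 *		relation_close(rel, AccessShareLock);
 */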
1222 
1223 /* ----------------
1224  * relation_openrv_extended - open any relation specified by a RangeVar
1225  *
1226  * Same as relation_openrv, but with an additional missing_ok argument
1227  * allowing a NULL return rather than an error if the relation is not
1228  * found. (Note that some other causes, such as permissions problems,
1229  * will still result in an ereport.)
1230  * ----------------
1231  */
1232 Relation
1233 relation_openrv_extended(const RangeVar *relation, LOCKMODE lockmode,
1234  bool missing_ok)
1235 {
1236  Oid relOid;
1237 
1238  /*
1239  * Check for shared-cache-inval messages before trying to open the
1240  * relation. See comments in relation_openrv().
1241  */
1242  if (lockmode != NoLock)
1243  AcceptInvalidationMessages();
1244 
1245  /* Look up and lock the appropriate relation using namespace search */
1246  relOid = RangeVarGetRelid(relation, lockmode, missing_ok);
1247 
1248  /* Return NULL on not-found */
1249  if (!OidIsValid(relOid))
1250  return NULL;
1251 
1252  /* Let relation_open do the rest */
1253  return relation_open(relOid, NoLock);
1254 }
1255 
1256 /* ----------------
1257  * relation_close - close any relation
1258  *
1259  * If lockmode is not "NoLock", we then release the specified lock.
1260  *
1261  * Note that it is often sensible to hold a lock beyond relation_close;
1262  * in that case, the lock is released automatically at xact end.
1263  * ----------------
1264  */
1265 void
1266 relation_close(Relation relation, LOCKMODE lockmode)
1267 {
1268  LockRelId relid = relation->rd_lockInfo.lockRelId;
1269 
1270  Assert(lockmode >= NoLock && lockmode < MAX_LOCKMODES);
1271 
1272  /* The relcache does the real work... */
1273  RelationClose(relation);
1274 
1275  if (lockmode != NoLock)
1276  UnlockRelationId(&relid, lockmode);
1277 }
1278 
1279 
1280 /* ----------------
1281  * heap_open - open a heap relation by relation OID
1282  *
1283  * This is essentially relation_open plus check that the relation
1284  * is not an index nor a composite type. (The caller should also
1285  * check that it's not a view or foreign table before assuming it has
1286  * storage.)
1287  * ----------------
1288  */
1289 Relation
1290 heap_open(Oid relationId, LOCKMODE lockmode)
1291 {
1292  Relation r;
1293 
1294  r = relation_open(relationId, lockmode);
1295 
1296  if (r->rd_rel->relkind == RELKIND_INDEX)
1297  ereport(ERROR,
1298  (errcode(ERRCODE_WRONG_OBJECT_TYPE),
1299  errmsg("\"%s\" is an index",
1300  RelationGetRelationName(r))));
1301  else if (r->rd_rel->relkind == RELKIND_COMPOSITE_TYPE)
1302  ereport(ERROR,
1303  (errcode(ERRCODE_WRONG_OBJECT_TYPE),
1304  errmsg("\"%s\" is a composite type",
1305  RelationGetRelationName(r))));
1306 
1307  return r;
1308 }
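/*
 * Usage sketch: heap_open/heap_close mirror relation_open/relation_close but
 * reject indexes and composite types, so callers expecting row storage
 * typically write (relid is an assumed pg_class OID):
 *
 *		Relation	rel = heap_open(relid, RowExclusiveLock);
 *
 *		... modify the table via heap_insert(), heap_update(), etc. ...
 *
 *		heap_close(rel, RowExclusiveLock);
 */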
1309 
1310 /* ----------------
1311  * heap_openrv - open a heap relation specified
1312  * by a RangeVar node
1313  *
1314  * As above, but relation is specified by a RangeVar.
1315  * ----------------
1316  */
1317 Relation
1318 heap_openrv(const RangeVar *relation, LOCKMODE lockmode)
1319 {
1320  Relation r;
1321 
1322  r = relation_openrv(relation, lockmode);
1323 
1324  if (r->rd_rel->relkind == RELKIND_INDEX)
1325  ereport(ERROR,
1326  (errcode(ERRCODE_WRONG_OBJECT_TYPE),
1327  errmsg("\"%s\" is an index",
1328  RelationGetRelationName(r))));
1329  else if (r->rd_rel->relkind == RELKIND_COMPOSITE_TYPE)
1330  ereport(ERROR,
1331  (errcode(ERRCODE_WRONG_OBJECT_TYPE),
1332  errmsg("\"%s\" is a composite type",
1333  RelationGetRelationName(r))));
1334 
1335  return r;
1336 }
1337 
1338 /* ----------------
1339  * heap_openrv_extended - open a heap relation specified
1340  * by a RangeVar node
1341  *
1342  * As above, but optionally return NULL instead of failing for
1343  * relation-not-found.
1344  * ----------------
1345  */
1346 Relation
1347 heap_openrv_extended(const RangeVar *relation, LOCKMODE lockmode,
1348  bool missing_ok)
1349 {
1350  Relation r;
1351 
1352  r = relation_openrv_extended(relation, lockmode, missing_ok);
1353 
1354  if (r)
1355  {
1356  if (r->rd_rel->relkind == RELKIND_INDEX)
1357  ereport(ERROR,
1358  (errcode(ERRCODE_WRONG_OBJECT_TYPE),
1359  errmsg("\"%s\" is an index",
1360  RelationGetRelationName(r))));
1361  else if (r->rd_rel->relkind == RELKIND_COMPOSITE_TYPE)
1362  ereport(ERROR,
1363  (errcode(ERRCODE_WRONG_OBJECT_TYPE),
1364  errmsg("\"%s\" is a composite type",
1365  RelationGetRelationName(r))));
1366  }
1367 
1368  return r;
1369 }
1370 
1371 
1372 /* ----------------
1373  * heap_beginscan - begin relation scan
1374  *
1375  * heap_beginscan is the "standard" case.
1376  *
1377  * heap_beginscan_catalog differs in setting up its own temporary snapshot.
1378  *
1379  * heap_beginscan_strat offers an extended API that lets the caller control
1380  * whether a nondefault buffer access strategy can be used, and whether
1381  * syncscan can be chosen (possibly resulting in the scan not starting from
1382  * block zero). Both of these default to true with plain heap_beginscan.
1383  *
1384  * heap_beginscan_bm is an alternative entry point for setting up a
1385  * HeapScanDesc for a bitmap heap scan. Although that scan technology is
1386  * really quite unlike a standard seqscan, there is just enough commonality
1387  * to make it worth using the same data structure.
1388  *
1389  * heap_beginscan_sampling is an alternative entry point for setting up a
1390  * HeapScanDesc for a TABLESAMPLE scan. As with bitmap scans, it's worth
1391  * using the same data structure although the behavior is rather different.
1392  * In addition to the options offered by heap_beginscan_strat, this call
1393  * also allows control of whether page-mode visibility checking is used.
1394  * ----------------
1395  */
1396 HeapScanDesc
1397 heap_beginscan(Relation relation, Snapshot snapshot,
1398  int nkeys, ScanKey key)
1399 {
1400  return heap_beginscan_internal(relation, snapshot, nkeys, key, NULL,
1401  true, true, true, false, false, false);
1402 }
1403 
1404 HeapScanDesc
1405 heap_beginscan_catalog(Relation relation, int nkeys, ScanKey key)
1406 {
1407  Oid relid = RelationGetRelid(relation);
1408  Snapshot snapshot = RegisterSnapshot(GetCatalogSnapshot(relid));
1409 
1410  return heap_beginscan_internal(relation, snapshot, nkeys, key, NULL,
1411  true, true, true, false, false, true);
1412 }
1413 
1414 HeapScanDesc
1415 heap_beginscan_strat(Relation relation, Snapshot snapshot,
1416  int nkeys, ScanKey key,
1417  bool allow_strat, bool allow_sync)
1418 {
1419  return heap_beginscan_internal(relation, snapshot, nkeys, key, NULL,
1420  allow_strat, allow_sync, true,
1421  false, false, false);
1422 }
1423 
1424 HeapScanDesc
1425 heap_beginscan_bm(Relation relation, Snapshot snapshot,
1426  int nkeys, ScanKey key)
1427 {
1428  return heap_beginscan_internal(relation, snapshot, nkeys, key, NULL,
1429  false, false, true, true, false, false);
1430 }
1431 
1432 HeapScanDesc
1433 heap_beginscan_sampling(Relation relation, Snapshot snapshot,
1434  int nkeys, ScanKey key,
1435  bool allow_strat, bool allow_sync, bool allow_pagemode)
1436 {
1437  return heap_beginscan_internal(relation, snapshot, nkeys, key, NULL,
1438  allow_strat, allow_sync, allow_pagemode,
1439  false, true, false);
1440 }
1441 
1442 static HeapScanDesc
1443 heap_beginscan_internal(Relation relation, Snapshot snapshot,
1444  int nkeys, ScanKey key,
1445  ParallelHeapScanDesc parallel_scan,
1446  bool allow_strat,
1447  bool allow_sync,
1448  bool allow_pagemode,
1449  bool is_bitmapscan,
1450  bool is_samplescan,
1451  bool temp_snap)
1452 {
1453  HeapScanDesc scan;
1454 
1455  /*
1456  * increment relation ref count while scanning relation
1457  *
1458  * This is just to make really sure the relcache entry won't go away while
1459  * the scan has a pointer to it. Caller should be holding the rel open
1460  * anyway, so this is redundant in all normal scenarios...
1461  */
1462  RelationIncrementReferenceCount(relation);
1463 
1464  /*
1465  * allocate and initialize scan descriptor
1466  */
1467  scan = (HeapScanDesc) palloc(sizeof(HeapScanDescData));
1468 
1469  scan->rs_rd = relation;
1470  scan->rs_snapshot = snapshot;
1471  scan->rs_nkeys = nkeys;
1472  scan->rs_bitmapscan = is_bitmapscan;
1473  scan->rs_samplescan = is_samplescan;
1474  scan->rs_strategy = NULL; /* set in initscan */
1475  scan->rs_allow_strat = allow_strat;
1476  scan->rs_allow_sync = allow_sync;
1477  scan->rs_temp_snap = temp_snap;
1478  scan->rs_parallel = parallel_scan;
1479 
1480  /*
1481  * we can use page-at-a-time mode if it's an MVCC-safe snapshot
1482  */
1483  scan->rs_pageatatime = allow_pagemode && IsMVCCSnapshot(snapshot);
1484 
1485  /*
1486  * For a seqscan in a serializable transaction, acquire a predicate lock
1487  * on the entire relation. This is required not only to lock all the
1488  * matching tuples, but also to conflict with new insertions into the
1489  * table. In an indexscan, we take page locks on the index pages covering
1490  * the range specified in the scan qual, but in a heap scan there is
1491  * nothing more fine-grained to lock. A bitmap scan is a different story,
1492  * there we have already scanned the index and locked the index pages
1493  * covering the predicate. But in that case we still have to lock any
1494  * matching heap tuples.
1495  */
1496  if (!is_bitmapscan)
1497  PredicateLockRelation(relation, snapshot);
1498 
1499  /* we only need to set this up once */
1500  scan->rs_ctup.t_tableOid = RelationGetRelid(relation);
1501 
1502  /*
1503  * we do this here instead of in initscan() because heap_rescan also calls
1504  * initscan() and we don't want to allocate memory again
1505  */
1506  if (nkeys > 0)
1507  scan->rs_key = (ScanKey) palloc(sizeof(ScanKeyData) * nkeys);
1508  else
1509  scan->rs_key = NULL;
1510 
1511  initscan(scan, key, false);
1512 
1513  return scan;
1514 }
1515 
1516 /* ----------------
1517  * heap_rescan - restart a relation scan
1518  * ----------------
1519  */
1520 void
1521 heap_rescan(HeapScanDesc scan,
1522  ScanKey key)
1523 {
1524  /*
1525  * unpin scan buffers
1526  */
1527  if (BufferIsValid(scan->rs_cbuf))
1528  ReleaseBuffer(scan->rs_cbuf);
1529 
1530  /*
1531  * reinitialize scan descriptor
1532  */
1533  initscan(scan, key, true);
1534 }
1535 
1536 /* ----------------
1537  * heap_rescan_set_params - restart a relation scan after changing params
1538  *
1539  * This call allows changing the buffer strategy, syncscan, and pagemode
1540  * options before starting a fresh scan. Note that although the actual use
1541  * of syncscan might change (effectively, enabling or disabling reporting),
1542  * the previously selected startblock will be kept.
1543  * ----------------
1544  */
1545 void
1546 heap_rescan_set_params(HeapScanDesc scan, ScanKey key,
1547  bool allow_strat, bool allow_sync, bool allow_pagemode)
1548 {
1549  /* adjust parameters */
1550  scan->rs_allow_strat = allow_strat;
1551  scan->rs_allow_sync = allow_sync;
1552  scan->rs_pageatatime = allow_pagemode && IsMVCCSnapshot(scan->rs_snapshot);
1553  /* ... and rescan */
1554  heap_rescan(scan, key);
1555 }
1556 
1557 /* ----------------
1558  * heap_endscan - end relation scan
1559  *
1560  * See how to integrate with index scans.
1561  * Check handling of reldesc caching.
1562  * ----------------
1563  */
1564 void
1565 heap_endscan(HeapScanDesc scan)
1566 {
1567  /* Note: no locking manipulations needed */
1568 
1569  /*
1570  * unpin scan buffers
1571  */
1572  if (BufferIsValid(scan->rs_cbuf))
1573  ReleaseBuffer(scan->rs_cbuf);
1574 
1575  /*
1576  * decrement relation reference count and free scan descriptor storage
1577  */
1578  RelationDecrementReferenceCount(scan->rs_rd);
1579 
1580  if (scan->rs_key)
1581  pfree(scan->rs_key);
1582 
1583  if (scan->rs_strategy != NULL)
1584  FreeAccessStrategy(scan->rs_strategy);
1585 
1586  if (scan->rs_temp_snap)
1587  UnregisterSnapshot(scan->rs_snapshot);
1588 
1589  pfree(scan);
1590 }
1591 
1592 /* ----------------
1593  * heap_parallelscan_estimate - estimate storage for ParallelHeapScanDesc
1594  *
1595  * Sadly, this doesn't reduce to a constant, because the size required
1596  * to serialize the snapshot can vary.
1597  * ----------------
1598  */
1599 Size
1600 heap_parallelscan_estimate(Snapshot snapshot)
1601 {
1602  return add_size(offsetof(ParallelHeapScanDescData, phs_snapshot_data),
1603  EstimateSnapshotSpace(snapshot));
1604 }
1605 
1606 /* ----------------
1607  * heap_parallelscan_initialize - initialize ParallelHeapScanDesc
1608  *
1609  * Must allow as many bytes of shared memory as returned by
1610  * heap_parallelscan_estimate. Call this just once in the leader
1611  * process; then, individual workers attach via heap_beginscan_parallel.
1612  * ----------------
1613  */
1614 void
1615 heap_parallelscan_initialize(ParallelHeapScanDesc target, Relation relation,
1616  Snapshot snapshot)
1617 {
1618  target->phs_relid = RelationGetRelid(relation);
1619  target->phs_nblocks = RelationGetNumberOfBlocks(relation);
1620  /* compare phs_syncscan initialization to similar logic in initscan */
1621  target->phs_syncscan = synchronize_seqscans &&
1622  !RelationUsesLocalBuffers(relation) &&
1623  target->phs_nblocks > NBuffers / 4;
1624  SpinLockInit(&target->phs_mutex);
1625  target->phs_startblock = InvalidBlockNumber;
1626  pg_atomic_init_u64(&target->phs_nallocated, 0);
1627  SerializeSnapshot(snapshot, target->phs_snapshot_data);
1628 }
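/*
 * Leader/worker flow sketch (hypothetical, using the shm_toc machinery from
 * storage/shm_toc.h; MY_PSCAN_KEY and the toc variable are illustrative
 * placeholders, not keys defined by this file):
 *
 *	In the leader:
 *		Size		sz = heap_parallelscan_estimate(snapshot);
 *		ParallelHeapScanDesc pscan = shm_toc_allocate(toc, sz);
 *
 *		heap_parallelscan_initialize(pscan, rel, snapshot);
 *		shm_toc_insert(toc, MY_PSCAN_KEY, pscan);
 *
 *	In each worker:
 *		ParallelHeapScanDesc pscan = shm_toc_lookup(toc, MY_PSCAN_KEY, false);
 *		HeapScanDesc scan = heap_beginscan_parallel(rel, pscan);
 */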
1629 
1630 /* ----------------
1631  * heap_parallelscan_reinitialize - reset a parallel scan
1632  *
1633  * Call this in the leader process. Caller is responsible for
1634  * making sure that all workers have finished the scan beforehand.
1635  * ----------------
1636  */
1637 void
1638 heap_parallelscan_reinitialize(ParallelHeapScanDesc parallel_scan)
1639 {
1640  pg_atomic_write_u64(&parallel_scan->phs_nallocated, 0);
1641 }
1642 
1643 /* ----------------
1644  * heap_beginscan_parallel - join a parallel scan
1645  *
1646  * Caller must hold a suitable lock on the correct relation.
1647  * ----------------
1648  */
1649 HeapScanDesc
1650 heap_beginscan_parallel(Relation relation, ParallelHeapScanDesc parallel_scan)
1651 {
1652  Snapshot snapshot;
1653 
1654  Assert(RelationGetRelid(relation) == parallel_scan->phs_relid);
1655  snapshot = RestoreSnapshot(parallel_scan->phs_snapshot_data);
1656  RegisterSnapshot(snapshot);
1657 
1658  return heap_beginscan_internal(relation, snapshot, 0, NULL, parallel_scan,
1659  true, true, true, false, false, true);
1660 }
1661 
1662 /* ----------------
1663  * heap_parallelscan_startblock_init - find and set the scan's startblock
1664  *
1665  * Determine where the parallel seq scan should start. This function may
1666  * be called many times, once by each parallel worker. We must be careful
1667  * only to set the startblock once.
1668  * ----------------
1669  */
1670 static void
1671 heap_parallelscan_startblock_init(HeapScanDesc scan)
1672 {
1673  BlockNumber sync_startpage = InvalidBlockNumber;
1674  ParallelHeapScanDesc parallel_scan;
1675 
1676  Assert(scan->rs_parallel);
1677  parallel_scan = scan->rs_parallel;
1678 
1679 retry:
1680  /* Grab the spinlock. */
1681  SpinLockAcquire(&parallel_scan->phs_mutex);
1682 
1683  /*
1684  * If the scan's startblock has not yet been initialized, we must do so
1685  * now. If this is not a synchronized scan, we just start at block 0, but
1686  * if it is a synchronized scan, we must get the starting position from
1687  * the synchronized scan machinery. We can't hold the spinlock while
1688  * doing that, though, so release the spinlock, get the information we
1689  * need, and retry. If nobody else has initialized the scan in the
1690  * meantime, we'll fill in the value we fetched on the second time
1691  * through.
1692  */
1693  if (parallel_scan->phs_startblock == InvalidBlockNumber)
1694  {
1695  if (!parallel_scan->phs_syncscan)
1696  parallel_scan->phs_startblock = 0;
1697  else if (sync_startpage != InvalidBlockNumber)
1698  parallel_scan->phs_startblock = sync_startpage;
1699  else
1700  {
1701  SpinLockRelease(&parallel_scan->phs_mutex);
1702  sync_startpage = ss_get_location(scan->rs_rd, scan->rs_nblocks);
1703  goto retry;
1704  }
1705  }
1706  SpinLockRelease(&parallel_scan->phs_mutex);
1707 }
1708 
1709 /* ----------------
1710  * heap_parallelscan_nextpage - get the next page to scan
1711  *
1712  * Get the next page to scan. Even if there are no pages left to scan,
1713  * another backend could have grabbed a page to scan and not yet finished
1714  * looking at it, so it doesn't follow that the scan is done when the
1715  * first backend gets an InvalidBlockNumber return.
1716  * ----------------
1717  */
1718 static BlockNumber
1719 heap_parallelscan_nextpage(HeapScanDesc scan)
1720 {
1721  BlockNumber page;
1722  ParallelHeapScanDesc parallel_scan;
1723  uint64 nallocated;
1724 
1725  Assert(scan->rs_parallel);
1726  parallel_scan = scan->rs_parallel;
1727 
1728  /*
1729  * phs_nallocated tracks how many pages have been allocated to workers
1730  * already. When phs_nallocated >= rs_nblocks, all blocks have been
1731  * allocated.
1732  *
1733  * Because we use an atomic fetch-and-add to fetch the current value, the
1734  * phs_nallocated counter will exceed rs_nblocks, because workers will
1735  * still increment the value when they try to allocate the next block but
1736  * all blocks have been allocated already. The counter must be 64 bits
1737  * wide because of that, to avoid wrapping around when rs_nblocks is close
1738  * to 2^32.
1739  *
1740  * The actual page to return is calculated by adding the counter to the
1741  * starting block number, modulo nblocks.
1742  */
1743  nallocated = pg_atomic_fetch_add_u64(&parallel_scan->phs_nallocated, 1);
1744  if (nallocated >= scan->rs_nblocks)
1745  page = InvalidBlockNumber; /* all blocks have been allocated */
1746  else
1747  page = (nallocated + parallel_scan->phs_startblock) % scan->rs_nblocks;
1748 
1749  /*
1750  * Report scan location. Normally, we report the current page number.
1751  * When we reach the end of the scan, though, we report the starting page,
1752  * not the ending page, just so the starting positions for later scans
1753  * don't slew backwards. We only report the position at the end of the
1754  * scan once, though: subsequent callers will report nothing.
1755  */
1756  if (scan->rs_syncscan)
1757  {
1758  if (page != InvalidBlockNumber)
1759  ss_report_location(scan->rs_rd, page);
1760  else if (nallocated == scan->rs_nblocks)
1761  ss_report_location(scan->rs_rd, parallel_scan->phs_startblock);
1762  }
1763 
1764  return page;
1765 }
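/*
 * Worked example of the wraparound arithmetic above: with
 * phs_startblock = 100 and rs_nblocks = 128, successive nallocated values
 * 0, 1, ..., 27, 28, ... map to pages 100, 101, ..., 127, 0, ... and the
 * allocation with nallocated = 128 (>= rs_nblocks) returns
 * InvalidBlockNumber, at which point the start block is reported to the
 * syncscan machinery exactly once.
 */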
1766 
1767 /* ----------------
1768  * heap_update_snapshot
1769  *
1770  * Update snapshot info in heap scan descriptor.
1771  * ----------------
1772  */
1773 void
1774 heap_update_snapshot(HeapScanDesc scan, Snapshot snapshot)
1775 {
1776  Assert(IsMVCCSnapshot(snapshot));
1777 
1778  RegisterSnapshot(snapshot);
1779  scan->rs_snapshot = snapshot;
1780  scan->rs_temp_snap = true;
1781 }
1782 
1783 /* ----------------
1784  * heap_getnext - retrieve next tuple in scan
1785  *
1786  * Fix to work with index relations.
1787  * We don't return the buffer anymore, but you can get it from the
1788  * returned HeapTuple.
1789  * ----------------
1790  */
1791 
1792 #ifdef HEAPDEBUGALL
1793 #define HEAPDEBUG_1 \
1794  elog(DEBUG2, "heap_getnext([%s,nkeys=%d],dir=%d) called", \
1795  RelationGetRelationName(scan->rs_rd), scan->rs_nkeys, (int) direction)
1796 #define HEAPDEBUG_2 \
1797  elog(DEBUG2, "heap_getnext returning EOS")
1798 #define HEAPDEBUG_3 \
1799  elog(DEBUG2, "heap_getnext returning tuple")
1800 #else
1801 #define HEAPDEBUG_1
1802 #define HEAPDEBUG_2
1803 #define HEAPDEBUG_3
1804 #endif /* !defined(HEAPDEBUGALL) */
1805 
1806 
1807 HeapTuple
1808 heap_getnext(HeapScanDesc scan, ScanDirection direction)
1809 {
1810  /* Note: no locking manipulations needed */
1811 
1812  HEAPDEBUG_1; /* heap_getnext( info ) */
1813 
1814  if (scan->rs_pageatatime)
1815  heapgettup_pagemode(scan, direction,
1816  scan->rs_nkeys, scan->rs_key);
1817  else
1818  heapgettup(scan, direction, scan->rs_nkeys, scan->rs_key);
1819 
1820  if (scan->rs_ctup.t_data == NULL)
1821  {
1822  HEAPDEBUG_2; /* heap_getnext returning EOS */
1823  return NULL;
1824  }
1825 
1826  /*
1827  * if we get here it means we have a new current scan tuple, so point to
1828  * the proper return buffer and return the tuple.
1829  */
1830  HEAPDEBUG_3; /* heap_getnext returning tuple */
1831 
1832  pgstat_count_heap_getnext(scan->rs_rd);
1833 
1834  return &(scan->rs_ctup);
1835 }
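/*
 * Putting the scan API together (sketch of a typical caller; assumes rel is
 * already opened and locked, and that snapshot management is handled by the
 * surrounding code):
 *
 *		HeapScanDesc scan = heap_beginscan(rel, GetActiveSnapshot(), 0, NULL);
 *		HeapTuple	tuple;
 *
 *		while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
 *		{
 *			// tuple points into the scan's current buffer; copy it with
 *			// heap_copytuple() if it must outlive the next heap_getnext call
 *		}
 *		heap_endscan(scan);
 */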
1836 
1837 /*
1838  * heap_fetch - retrieve tuple with given tid
1839  *
1840  * On entry, tuple->t_self is the TID to fetch. We pin the buffer holding
1841  * the tuple, fill in the remaining fields of *tuple, and check the tuple
1842  * against the specified snapshot.
1843  *
1844  * If successful (tuple found and passes snapshot time qual), then *userbuf
1845  * is set to the buffer holding the tuple and true is returned. The caller
1846  * must unpin the buffer when done with the tuple.
1847  *
1848  * If the tuple is not found (ie, item number references a deleted slot),
1849  * then tuple->t_data is set to NULL and false is returned.
1850  *
1851  * If the tuple is found but fails the time qual check, then false is returned
1852  * but tuple->t_data is left pointing to the tuple.
1853  *
1854  * keep_buf determines what is done with the buffer in the false-result cases.
1855  * When the caller specifies keep_buf = true, we retain the pin on the buffer
1856  * and return it in *userbuf (so the caller must eventually unpin it); when
1857  * keep_buf = false, the pin is released and *userbuf is set to InvalidBuffer.
1858  *
1859  * stats_relation is the relation to charge the heap_fetch operation against
1860  * for statistical purposes. (This could be the heap rel itself, an
1861  * associated index, or NULL to not count the fetch at all.)
1862  *
1863  * heap_fetch does not follow HOT chains: only the exact TID requested will
1864  * be fetched.
1865  *
1866  * It is somewhat inconsistent that we ereport() on invalid block number but
1867  * return false on invalid item number. There are a couple of reasons though.
1868  * One is that the caller can relatively easily check the block number for
1869  * validity, but cannot check the item number without reading the page
1870  * himself. Another is that when we are following a t_ctid link, we can be
1871  * reasonably confident that the page number is valid (since VACUUM shouldn't
1872  * truncate off the destination page without having killed the referencing
1873  * tuple first), but the item number might well not be good.
1874  */
1875 bool
1876 heap_fetch(Relation relation,
1877  Snapshot snapshot,
1878  HeapTuple tuple,
1879  Buffer *userbuf,
1880  bool keep_buf,
1881  Relation stats_relation)
1882 {
1883  ItemPointer tid = &(tuple->t_self);
1884  ItemId lp;
1885  Buffer buffer;
1886  Page page;
1887  OffsetNumber offnum;
1888  bool valid;
1889 
1890  /*
1891  * Fetch and pin the appropriate page of the relation.
1892  */
1893  buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid));
1894 
1895  /*
1896  * Need share lock on buffer to examine tuple commit status.
1897  */
1898  LockBuffer(buffer, BUFFER_LOCK_SHARE);
1899  page = BufferGetPage(buffer);
1900  TestForOldSnapshot(snapshot, relation, page);
1901 
1902  /*
1903  * We'd better check for an out-of-range offnum, in case the page has
1904  * been vacuumed since the TID was obtained.
1905  */
1906  offnum = ItemPointerGetOffsetNumber(tid);
1907  if (offnum < FirstOffsetNumber || offnum > PageGetMaxOffsetNumber(page))
1908  {
1909  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
1910  if (keep_buf)
1911  *userbuf = buffer;
1912  else
1913  {
1914  ReleaseBuffer(buffer);
1915  *userbuf = InvalidBuffer;
1916  }
1917  tuple->t_data = NULL;
1918  return false;
1919  }
1920 
1921  /*
1922  * get the item line pointer corresponding to the requested tid
1923  */
1924  lp = PageGetItemId(page, offnum);
1925 
1926  /*
1927  * Must check for deleted tuple.
1928  */
1929  if (!ItemIdIsNormal(lp))
1930  {
1931  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
1932  if (keep_buf)
1933  *userbuf = buffer;
1934  else
1935  {
1936  ReleaseBuffer(buffer);
1937  *userbuf = InvalidBuffer;
1938  }
1939  tuple->t_data = NULL;
1940  return false;
1941  }
1942 
1943  /*
1944  * fill in *tuple fields
1945  */
1946  tuple->t_data = (HeapTupleHeader) PageGetItem(page, lp);
1947  tuple->t_len = ItemIdGetLength(lp);
1948  tuple->t_tableOid = RelationGetRelid(relation);
1949 
1950  /*
1951  * check time qualification of tuple, then release lock
1952  */
1953  valid = HeapTupleSatisfiesVisibility(tuple, snapshot, buffer);
1954 
1955  if (valid)
1956  PredicateLockTuple(relation, tuple, snapshot);
1957 
1958  CheckForSerializableConflictOut(valid, relation, tuple, buffer, snapshot);
1959 
1960  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
1961 
1962  if (valid)
1963  {
1964  /*
1965  * All checks passed, so return the tuple as valid. Caller is now
1966  * responsible for releasing the buffer.
1967  */
1968  *userbuf = buffer;
1969 
1970  /* Count the successful fetch against appropriate rel, if any */
1971  if (stats_relation != NULL)
1972  pgstat_count_heap_fetch(stats_relation);
1973 
1974  return true;
1975  }
1976 
1977  /* Tuple failed time qual, but maybe caller wants to see it anyway. */
1978  if (keep_buf)
1979  *userbuf = buffer;
1980  else
1981  {
1982  ReleaseBuffer(buffer);
1983  *userbuf = InvalidBuffer;
1984  }
1985 
1986  return false;
1987 }
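/*
 * Editor's note: illustrative-only sketch of a heap_fetch caller; it is not
 * part of heapam.c.  The block number and offset stand in for a TID obtained
 * elsewhere (for example from an index entry).
 */
#ifdef HEAPAM_USAGE_SKETCHES
static bool
fetch_tuple_at_tid(Relation rel, Snapshot snapshot,
				   BlockNumber blkno, OffsetNumber offnum)
{
	HeapTupleData tuple;
	Buffer		buffer;

	ItemPointerSet(&tuple.t_self, blkno, offnum);

	/* keep_buf = false: on failure the pin is released for us */
	if (!heap_fetch(rel, snapshot, &tuple, &buffer, false, NULL))
		return false;

	/* ... inspect tuple.t_data while the buffer pin is held ... */

	ReleaseBuffer(buffer);		/* on success the caller must unpin */
	return true;
}
#endif							/* HEAPAM_USAGE_SKETCHES */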
1988 
1989 /*
1990  * heap_hot_search_buffer - search HOT chain for tuple satisfying snapshot
1991  *
1992  * On entry, *tid is the TID of a tuple (either a simple tuple, or the root
1993  * of a HOT chain), and buffer is the buffer holding this tuple. We search
1994  * for the first chain member satisfying the given snapshot. If one is
1995  * found, we update *tid to reference that tuple's offset number, and
1996  * return true. If no match, return false without modifying *tid.
1997  *
1998  * heapTuple is a caller-supplied buffer. When a match is found, we return
1999  * the tuple here, in addition to updating *tid. If no match is found, the
2000  * contents of this buffer on return are undefined.
2001  *
2002  * If all_dead is not NULL, we check non-visible tuples to see if they are
2003  * globally dead; *all_dead is set true if all members of the HOT chain
2004  * are vacuumable, false if not.
2005  *
2006  * Unlike heap_fetch, the caller must already have pin and (at least) share
2007  * lock on the buffer; it is still pinned/locked at exit. Also unlike
2008  * heap_fetch, we do not report any pgstats count; caller may do so if wanted.
2009  */
2010 bool
2011 heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer,
2012  Snapshot snapshot, HeapTuple heapTuple,
2013  bool *all_dead, bool first_call)
2014 {
2015  Page dp = (Page) BufferGetPage(buffer);
2016  TransactionId prev_xmax = InvalidTransactionId;
2017  OffsetNumber offnum;
2018  bool at_chain_start;
2019  bool valid;
2020  bool skip;
2021 
2022  /* If this is not the first call, previous call returned a (live!) tuple */
2023  if (all_dead)
2024  *all_dead = first_call;
2025 
2026  Assert(TransactionIdIsValid(RecentGlobalXmin));
2027 
2028  Assert(ItemPointerGetBlockNumber(tid) == BufferGetBlockNumber(buffer));
2029  offnum = ItemPointerGetOffsetNumber(tid);
2030  at_chain_start = first_call;
2031  skip = !first_call;
2032 
2033  heapTuple->t_self = *tid;
2034 
2035  /* Scan through possible multiple members of HOT-chain */
2036  for (;;)
2037  {
2038  ItemId lp;
2039 
2040  /* check for bogus TID */
2041  if (offnum < FirstOffsetNumber || offnum > PageGetMaxOffsetNumber(dp))
2042  break;
2043 
2044  lp = PageGetItemId(dp, offnum);
2045 
2046  /* check for unused, dead, or redirected items */
2047  if (!ItemIdIsNormal(lp))
2048  {
2049  /* We should only see a redirect at start of chain */
2050  if (ItemIdIsRedirected(lp) && at_chain_start)
2051  {
2052  /* Follow the redirect */
2053  offnum = ItemIdGetRedirect(lp);
2054  at_chain_start = false;
2055  continue;
2056  }
2057  /* else must be end of chain */
2058  break;
2059  }
2060 
2061  heapTuple->t_data = (HeapTupleHeader) PageGetItem(dp, lp);
2062  heapTuple->t_len = ItemIdGetLength(lp);
2063  heapTuple->t_tableOid = RelationGetRelid(relation);
2064  ItemPointerSetOffsetNumber(&heapTuple->t_self, offnum);
2065 
2066  /*
2067  * Shouldn't see a HEAP_ONLY tuple at chain start.
2068  */
2069  if (at_chain_start && HeapTupleIsHeapOnly(heapTuple))
2070  break;
2071 
2072  /*
2073  * The xmin should match the previous xmax value, else chain is
2074  * broken.
2075  */
2076  if (TransactionIdIsValid(prev_xmax) &&
2077  !TransactionIdEquals(prev_xmax,
2078  HeapTupleHeaderGetXmin(heapTuple->t_data)))
2079  break;
2080 
2081  /*
2082  * When first_call is true (and thus, skip is initially false) we'll
2083  * return the first tuple we find. But on later passes, heapTuple
2084  * will initially be pointing to the tuple we returned last time.
2085  * Returning it again would be incorrect (and would loop forever), so
2086  * we skip it and return the next match we find.
2087  */
2088  if (!skip)
2089  {
2090  /*
2091  * For the benefit of logical decoding, have t_self point at the
2092  * element of the HOT chain we're currently investigating instead
2093  * of the root tuple of the HOT chain. This is important because
2094  * the *Satisfies routine for historical mvcc snapshots needs the
2095  * correct tid to decide about the visibility in some cases.
2096  */
2097  ItemPointerSet(&(heapTuple->t_self), BufferGetBlockNumber(buffer), offnum);
2098 
2099  /* If it's visible per the snapshot, we must return it */
2100  valid = HeapTupleSatisfiesVisibility(heapTuple, snapshot, buffer);
2101  CheckForSerializableConflictOut(valid, relation, heapTuple,
2102  buffer, snapshot);
2103  /* reset to original, non-redirected, tid */
2104  heapTuple->t_self = *tid;
2105 
2106  if (valid)
2107  {
2108  ItemPointerSetOffsetNumber(tid, offnum);
2109  PredicateLockTuple(relation, heapTuple, snapshot);
2110  if (all_dead)
2111  *all_dead = false;
2112  return true;
2113  }
2114  }
2115  skip = false;
2116 
2117  /*
2118  * If we can't see it, maybe no one else can either. At caller
2119  * request, check whether all chain members are dead to all
2120  * transactions.
2121  *
2122  * Note: if you change the criterion here for what is "dead", fix the
2123  * planner's get_actual_variable_range() function to match.
2124  */
2125  if (all_dead && *all_dead &&
2126  !HeapTupleIsSurelyDead(heapTuple, RecentGlobalXmin))
2127  *all_dead = false;
2128 
2129  /*
2130  * Check to see if HOT chain continues past this tuple; if so fetch
2131  * the next offnum and loop around.
2132  */
2133  if (HeapTupleIsHotUpdated(heapTuple))
2134  {
2135  Assert(ItemPointerGetBlockNumber(&heapTuple->t_data->t_ctid) ==
2136  ItemPointerGetBlockNumber(tid));
2137  offnum = ItemPointerGetOffsetNumber(&heapTuple->t_data->t_ctid);
2138  at_chain_start = false;
2139  prev_xmax = HeapTupleHeaderGetUpdateXid(heapTuple->t_data);
2140  }
2141  else
2142  break; /* end of chain */
2143  }
2144 
2145  return false;
2146 }
2147 
2148 /*
2149  * heap_hot_search - search HOT chain for tuple satisfying snapshot
2150  *
2151  * This has the same API as heap_hot_search_buffer, except that the caller
2152  * does not provide the buffer containing the page, rather we access it
2153  * locally.
2154  */
2155 bool
2156 heap_hot_search(ItemPointer tid, Relation relation, Snapshot snapshot,
2157  bool *all_dead)
2158 {
2159  bool result;
2160  Buffer buffer;
2161  HeapTupleData heapTuple;
2162 
2163  buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid));
2164  LockBuffer(buffer, BUFFER_LOCK_SHARE);
2165  result = heap_hot_search_buffer(tid, relation, buffer, snapshot,
2166  &heapTuple, all_dead, true);
2167  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
2168  ReleaseBuffer(buffer);
2169  return result;
2170 }
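/*
 * Editor's note: illustrative-only sketch (not part of heapam.c) of using
 * heap_hot_search from an index lookup: the root TID recorded in the index
 * entry is resolved to the visible HOT-chain member, and *all_dead reports
 * whether the whole chain could be vacuumed.
 */
#ifdef HEAPAM_USAGE_SKETCHES
static bool
root_tid_is_visible(Relation rel, Snapshot snapshot, ItemPointer tid)
{
	bool		all_dead;

	if (heap_hot_search(tid, rel, snapshot, &all_dead))
		return true;			/* *tid now names the visible chain member */

	if (all_dead)
	{
		/*
		 * Every chain member is dead to all transactions; the caller could
		 * mark the referencing index entry as killed.
		 */
	}
	return false;
}
#endif							/* HEAPAM_USAGE_SKETCHES */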
2171 
2172 /*
2173  * heap_get_latest_tid - get the latest tid of a specified tuple
2174  *
2175  * Actually, this gets the latest version that is visible according to
2176  * the passed snapshot. You can pass SnapshotDirty to get the very latest,
2177  * possibly uncommitted version.
2178  *
2179  * *tid is both an input and an output parameter: it is updated to
2180  * show the latest version of the row. Note that it will not be changed
2181  * if no version of the row passes the snapshot test.
2182  */
2183 void
2184 heap_get_latest_tid(Relation relation,
2185  Snapshot snapshot,
2186  ItemPointer tid)
2187 {
2188  BlockNumber blk;
2189  ItemPointerData ctid;
2190  TransactionId priorXmax;
2191 
2192  /* this is to avoid Assert failures on bad input */
2193  if (!ItemPointerIsValid(tid))
2194  return;
2195 
2196  /*
2197  * Since this can be called with user-supplied TID, don't trust the input
2198  * too much. (RelationGetNumberOfBlocks is an expensive check, so we
2199  * don't check t_ctid links again this way. Note that it would not do to
2200  * call it just once and save the result, either.)
2201  */
2202  blk = ItemPointerGetBlockNumber(tid);
2203  if (blk >= RelationGetNumberOfBlocks(relation))
2204  elog(ERROR, "block number %u is out of range for relation \"%s\"",
2205  blk, RelationGetRelationName(relation));
2206 
2207  /*
2208  * Loop to chase down t_ctid links. At top of loop, ctid is the tuple we
2209  * need to examine, and *tid is the TID we will return if ctid turns out
2210  * to be bogus.
2211  *
2212  * Note that we will loop until we reach the end of the t_ctid chain.
2213  * Depending on the snapshot passed, there might be at most one visible
2214  * version of the row, but we don't try to optimize for that.
2215  */
2216  ctid = *tid;
2217  priorXmax = InvalidTransactionId; /* cannot check first XMIN */
2218  for (;;)
2219  {
2220  Buffer buffer;
2221  Page page;
2222  OffsetNumber offnum;
2223  ItemId lp;
2224  HeapTupleData tp;
2225  bool valid;
2226 
2227  /*
2228  * Read, pin, and lock the page.
2229  */
2230  buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(&ctid));
2231  LockBuffer(buffer, BUFFER_LOCK_SHARE);
2232  page = BufferGetPage(buffer);
2233  TestForOldSnapshot(snapshot, relation, page);
2234 
2235  /*
2236  * Check for bogus item number. This is not treated as an error
2237  * condition because it can happen while following a t_ctid link. We
2238  * just assume that the prior tid is OK and return it unchanged.
2239  */
2240  offnum = ItemPointerGetOffsetNumber(&ctid);
2241  if (offnum < FirstOffsetNumber || offnum > PageGetMaxOffsetNumber(page))
2242  {
2243  UnlockReleaseBuffer(buffer);
2244  break;
2245  }
2246  lp = PageGetItemId(page, offnum);
2247  if (!ItemIdIsNormal(lp))
2248  {
2249  UnlockReleaseBuffer(buffer);
2250  break;
2251  }
2252 
2253  /* OK to access the tuple */
2254  tp.t_self = ctid;
2255  tp.t_data = (HeapTupleHeader) PageGetItem(page, lp);
2256  tp.t_len = ItemIdGetLength(lp);
2257  tp.t_tableOid = RelationGetRelid(relation);
2258 
2259  /*
2260  * After following a t_ctid link, we might arrive at an unrelated
2261  * tuple. Check for XMIN match.
2262  */
2263  if (TransactionIdIsValid(priorXmax) &&
2264  !TransactionIdEquals(priorXmax, HeapTupleHeaderGetXmin(tp.t_data)))
2265  {
2266  UnlockReleaseBuffer(buffer);
2267  break;
2268  }
2269 
2270  /*
2271  * Check time qualification of tuple; if visible, set it as the new
2272  * result candidate.
2273  */
2274  valid = HeapTupleSatisfiesVisibility(&tp, snapshot, buffer);
2275  CheckForSerializableConflictOut(valid, relation, &tp, buffer, snapshot);
2276  if (valid)
2277  *tid = ctid;
2278 
2279  /*
2280  * If there's a valid t_ctid link, follow it, else we're done.
2281  */
2282  if ((tp.t_data->t_infomask & HEAP_XMAX_INVALID) ||
2283  HeapTupleHeaderIsOnlyLocked(tp.t_data) ||
2284  ItemPointerEquals(&tp.t_self, &tp.t_data->t_ctid))
2285  {
2286  UnlockReleaseBuffer(buffer);
2287  break;
2288  }
2289 
2290  ctid = tp.t_data->t_ctid;
2291  priorXmax = HeapTupleHeaderGetUpdateXid(tp.t_data);
2292  UnlockReleaseBuffer(buffer);
2293  } /* end of loop */
2294 }
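/*
 * Editor's note: illustrative-only sketch (not part of heapam.c) showing how
 * a caller can chase a row to its newest version.  A dirty snapshot is used,
 * as the comment above suggests, so even an uncommitted successor is found.
 */
#ifdef HEAPAM_USAGE_SKETCHES
static ItemPointerData
latest_version_of(Relation rel, ItemPointerData tid)
{
	SnapshotData snapshot;

	InitDirtySnapshot(snapshot);
	heap_get_latest_tid(rel, &snapshot, &tid);

	/* tid is unchanged if no later visible version exists */
	return tid;
}
#endif							/* HEAPAM_USAGE_SKETCHES */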
2295 
2296 
2297 /*
2298  * UpdateXmaxHintBits - update tuple hint bits after xmax transaction ends
2299  *
2300  * This is called after we have waited for the XMAX transaction to terminate.
2301  * If the transaction aborted, we guarantee the XMAX_INVALID hint bit will
2302  * be set on exit. If the transaction committed, we set the XMAX_COMMITTED
2303  * hint bit if possible --- but beware that that may not yet be possible,
2304  * if the transaction committed asynchronously.
2305  *
2306  * Note that if the transaction was a locker only, we set HEAP_XMAX_INVALID
2307  * even if it commits.
2308  *
2309  * Hence callers should look only at XMAX_INVALID.
2310  *
2311  * Note this is not allowed for tuples whose xmax is a multixact.
2312  */
2313 static void
2314 UpdateXmaxHintBits(HeapTupleHeader tuple, Buffer buffer, TransactionId xid)
2315 {
2316  Assert(TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple), xid));
2317  Assert(!(tuple->t_infomask & HEAP_XMAX_IS_MULTI));
2318 
2319  if (!(tuple->t_infomask & (HEAP_XMAX_COMMITTED | HEAP_XMAX_INVALID)))
2320  {
2321  if (!HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask) &&
2322  TransactionIdDidCommit(xid))
2323  HeapTupleSetHintBits(tuple, buffer, HEAP_XMAX_COMMITTED,
2324  xid);
2325  else
2326  HeapTupleSetHintBits(tuple, buffer, HEAP_XMAX_INVALID,
2327  InvalidTransactionId);
2328  }
2329 }
2330 
2331 
2332 /*
2333  * GetBulkInsertState - prepare status object for a bulk insert
2334  */
2335 BulkInsertState
2336 GetBulkInsertState(void)
2337 {
2338  BulkInsertState bistate;
2339 
2340  bistate = (BulkInsertState) palloc(sizeof(BulkInsertStateData));
2341  bistate->strategy = GetAccessStrategy(BAS_BULKWRITE);
2342  bistate->current_buf = InvalidBuffer;
2343  return bistate;
2344 }
2345 
2346 /*
2347  * FreeBulkInsertState - clean up after finishing a bulk insert
2348  */
2349 void
2350 FreeBulkInsertState(BulkInsertState bistate)
2351 {
2352  if (bistate->current_buf != InvalidBuffer)
2353  ReleaseBuffer(bistate->current_buf);
2354  FreeAccessStrategy(bistate->strategy);
2355  pfree(bistate);
2356 }
2357 
2358 /*
2359  * ReleaseBulkInsertStatePin - release a buffer currently held in bistate
2360  */
2361 void
2362 ReleaseBulkInsertStatePin(BulkInsertState bistate)
2363 {
2364  if (bistate->current_buf != InvalidBuffer)
2365  ReleaseBuffer(bistate->current_buf);
2366  bistate->current_buf = InvalidBuffer;
2367 }
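/*
 * Editor's note: illustrative-only sketch (not part of heapam.c) of the
 * BulkInsertState lifecycle: create it once, pass it to every heap_insert of
 * the batch, and free it afterwards.  The pre-built tuple array is an
 * assumption of the example.
 */
#ifdef HEAPAM_USAGE_SKETCHES
static void
bulk_load_tuples(Relation rel, HeapTuple *tuples, int ntuples)
{
	BulkInsertState bistate = GetBulkInsertState();
	CommandId	cid = GetCurrentCommandId(true);
	int			i;

	for (i = 0; i < ntuples; i++)
		heap_insert(rel, tuples[i], cid, 0, bistate);

	FreeBulkInsertState(bistate);	/* releases the kept pin and the strategy */
}
#endif							/* HEAPAM_USAGE_SKETCHES */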
2368 
2369 
2370 /*
2371  * heap_insert - insert tuple into a heap
2372  *
2373  * The new tuple is stamped with current transaction ID and the specified
2374  * command ID.
2375  *
2376  * If the HEAP_INSERT_SKIP_WAL option is specified, the new tuple is not
2377  * logged in WAL, even for a non-temp relation. Safe usage of this behavior
2378  * requires that we arrange that all new tuples go into new pages not
2379  * containing any tuples from other transactions, and that the relation gets
2380  * fsync'd before commit. (See also heap_sync() comments)
2381  *
2382  * The HEAP_INSERT_SKIP_FSM option is passed directly to
2383  * RelationGetBufferForTuple, which see for more info.
2384  *
2385  * HEAP_INSERT_FROZEN should only be specified for inserts into
2386  * relfilenodes created during the current subtransaction and when
2387  * there are no prior snapshots or pre-existing portals open.
2388  * This causes rows to be frozen, which is an MVCC violation and
2389  * requires explicit options chosen by user.
2390  *
2391  * HEAP_INSERT_IS_SPECULATIVE is used on so-called "speculative insertions",
2392  * which can be backed out afterwards without aborting the whole transaction.
2393  * Other sessions can wait for the speculative insertion to be confirmed,
2394  * turning it into a regular tuple, or aborted, as if it never existed.
2395  * Speculatively inserted tuples behave as "value locks" of short duration,
2396  * used to implement INSERT .. ON CONFLICT.
2397  *
2398  * Note that most of these options will be applied when inserting into the
2399  * heap's TOAST table, too, if the tuple requires any out-of-line data. Only
2400  * HEAP_INSERT_IS_SPECULATIVE is explicitly ignored, as the toast data does
2401  * not partake in speculative insertion.
2402  *
2403  * The BulkInsertState object (if any; bistate can be NULL for default
2404  * behavior) is also just passed through to RelationGetBufferForTuple.
2405  *
2406  * The return value is the OID assigned to the tuple (either here or by the
2407  * caller), or InvalidOid if no OID. The header fields of *tup are updated
2408  * to match the stored tuple; in particular tup->t_self receives the actual
2409  * TID where the tuple was stored. But note that any toasting of fields
2410  * within the tuple data is NOT reflected into *tup.
2411  */
2412 Oid
2413 heap_insert(Relation relation, HeapTuple tup, CommandId cid,
2414  int options, BulkInsertState bistate)
2415 {
2416  TransactionId xid = GetCurrentTransactionId();
2417  HeapTuple heaptup;
2418  Buffer buffer;
2419  Buffer vmbuffer = InvalidBuffer;
2420  bool all_visible_cleared = false;
2421 
2422  /*
2423  * Fill in tuple header fields, assign an OID, and toast the tuple if
2424  * necessary.
2425  *
2426  * Note: below this point, heaptup is the data we actually intend to store
2427  * into the relation; tup is the caller's original untoasted data.
2428  */
2429  heaptup = heap_prepare_insert(relation, tup, xid, cid, options);
2430 
2431  /*
2432  * Find buffer to insert this tuple into. If the page is all visible,
2433  * this will also pin the requisite visibility map page.
2434  */
2435  buffer = RelationGetBufferForTuple(relation, heaptup->t_len,
2436  InvalidBuffer, options, bistate,
2437  &vmbuffer, NULL);
2438 
2439  /*
2440  * We're about to do the actual insert -- but check for conflict first, to
2441  * avoid possibly having to roll back work we've just done.
2442  *
2443  * This is safe without a recheck as long as there is no possibility of
2444  * another process scanning the page between this check and the insert
2445  * being visible to the scan (i.e., an exclusive buffer content lock is
2446  * continuously held from this point until the tuple insert is visible).
2447  *
2448  * For a heap insert, we only need to check for table-level SSI locks. Our
2449  * new tuple can't possibly conflict with existing tuple locks, and heap
2450  * page locks are only consolidated versions of tuple locks; they do not
2451  * lock "gaps" as index page locks do. So we don't need to specify a
2452  * buffer when making the call, which makes for a faster check.
2453  */
2454  CheckForSerializableConflictIn(relation, NULL, InvalidBuffer);
2455 
2456  /* NO EREPORT(ERROR) from here till changes are logged */
2457  START_CRIT_SECTION();
2458 
2459  RelationPutHeapTuple(relation, buffer, heaptup,
2460  (options & HEAP_INSERT_SPECULATIVE) != 0);
2461 
2462  if (PageIsAllVisible(BufferGetPage(buffer)))
2463  {
2464  all_visible_cleared = true;
2465  PageClearAllVisible(BufferGetPage(buffer));
2466  visibilitymap_clear(relation,
2467  ItemPointerGetBlockNumber(&(heaptup->t_self)),
2468  vmbuffer, VISIBILITYMAP_VALID_BITS);
2469  }
2470 
2471  /*
2472  * XXX Should we set PageSetPrunable on this page ?
2473  *
2474  * The inserting transaction may eventually abort thus making this tuple
2475  * DEAD and hence available for pruning. Though we don't want to optimize
2476  * for aborts, if no other tuple in this page is UPDATEd/DELETEd, the
2477  * aborted tuple will never be pruned until next vacuum is triggered.
2478  *
2479  * If you do add PageSetPrunable here, add it in heap_xlog_insert too.
2480  */
2481 
2482  MarkBufferDirty(buffer);
2483 
2484  /* XLOG stuff */
2485  if (!(options & HEAP_INSERT_SKIP_WAL) && RelationNeedsWAL(relation))
2486  {
2487  xl_heap_insert xlrec;
2488  xl_heap_header xlhdr;
2489  XLogRecPtr recptr;
2490  Page page = BufferGetPage(buffer);
2491  uint8 info = XLOG_HEAP_INSERT;
2492  int bufflags = 0;
2493 
2494  /*
2495  * If this is a catalog, we need to transmit combocids to properly
2496  * decode, so log that as well.
2497  */
2498  if (RelationIsAccessibleInLogicalDecoding(relation))
2499  log_heap_new_cid(relation, heaptup);
2500 
2501  /*
2502  * If this is the single and first tuple on page, we can reinit the
2503  * page instead of restoring the whole thing. Set flag, and hide
2504  * buffer references from XLogInsert.
2505  */
2506  if (ItemPointerGetOffsetNumber(&(heaptup->t_self)) == FirstOffsetNumber &&
2507  PageGetMaxOffsetNumber(page) == FirstOffsetNumber)
2508  {
2509  info |= XLOG_HEAP_INIT_PAGE;
2510  bufflags |= REGBUF_WILL_INIT;
2511  }
2512 
2513  xlrec.offnum = ItemPointerGetOffsetNumber(&heaptup->t_self);
2514  xlrec.flags = 0;
2515  if (all_visible_cleared)
2516  xlrec.flags |= XLH_INSERT_ALL_VISIBLE_CLEARED;
2517  if (options & HEAP_INSERT_SPECULATIVE)
2518  xlrec.flags |= XLH_INSERT_IS_SPECULATIVE;
2520 
2521  /*
2522  * For logical decoding, we need the tuple even if we're doing a full
2523  * page write, so make sure it's included even if we take a full-page
2524  * image. (XXX We could alternatively store a pointer into the FPW).
2525  */
2526  if (RelationIsLogicallyLogged(relation))
2527  {
2528  xlrec.flags |= XLH_INSERT_CONTAINS_NEW_TUPLE;
2529  bufflags |= REGBUF_KEEP_DATA;
2530  }
2531 
2532  XLogBeginInsert();
2533  XLogRegisterData((char *) &xlrec, SizeOfHeapInsert);
2534 
2535  xlhdr.t_infomask2 = heaptup->t_data->t_infomask2;
2536  xlhdr.t_infomask = heaptup->t_data->t_infomask;
2537  xlhdr.t_hoff = heaptup->t_data->t_hoff;
2538 
2539  /*
2540  * note we mark xlhdr as belonging to buffer; if XLogInsert decides to
2541  * write the whole page to the xlog, we don't need to store
2542  * xl_heap_header in the xlog.
2543  */
2544  XLogRegisterBuffer(0, buffer, REGBUF_STANDARD | bufflags);
2545  XLogRegisterBufData(0, (char *) &xlhdr, SizeOfHeapHeader);
2546  /* PG73FORMAT: write bitmap [+ padding] [+ oid] + data */
2547  XLogRegisterBufData(0,
2548  (char *) heaptup->t_data + SizeofHeapTupleHeader,
2549  heaptup->t_len - SizeofHeapTupleHeader);
2550 
2551  /* filtering by origin on a row level is much more efficient */
2552  XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN);
2553 
2554  recptr = XLogInsert(RM_HEAP_ID, info);
2555 
2556  PageSetLSN(page, recptr);
2557  }
2558 
2559  END_CRIT_SECTION();
2560 
2561  UnlockReleaseBuffer(buffer);
2562  if (vmbuffer != InvalidBuffer)
2563  ReleaseBuffer(vmbuffer);
2564 
2565  /*
2566  * If tuple is cachable, mark it for invalidation from the caches in case
2567  * we abort. Note it is OK to do this after releasing the buffer, because
2568  * the heaptup data structure is all in local memory, not in the shared
2569  * buffer.
2570  */
2571  CacheInvalidateHeapTuple(relation, heaptup, NULL);
2572 
2573  /* Note: speculative insertions are counted too, even if aborted later */
2574  pgstat_count_heap_insert(relation, 1);
2575 
2576  /*
2577  * If heaptup is a private copy, release it. Don't forget to copy t_self
2578  * back to the caller's image, too.
2579  */
2580  if (heaptup != tup)
2581  {
2582  tup->t_self = heaptup->t_self;
2583  heap_freetuple(heaptup);
2584  }
2585 
2586  return HeapTupleGetOid(tup);
2587 }
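/*
 * Editor's note: illustrative-only sketch (not part of heapam.c) of a plain
 * single-row heap_insert with default options.  The Datum/isnull arrays are
 * assumed to match the relation's tuple descriptor, and heap_form_tuple comes
 * from the generic tuple-building API.
 */
#ifdef HEAPAM_USAGE_SKETCHES
static void
insert_one_row(Relation rel, Datum *values, bool *isnull)
{
	HeapTuple	tup = heap_form_tuple(RelationGetDescr(rel), values, isnull);

	heap_insert(rel, tup, GetCurrentCommandId(true), 0, NULL);

	/*
	 * tup->t_self now carries the TID the row was stored at.  heap_insert
	 * does not touch indexes; the caller must insert index entries itself.
	 */
	heap_freetuple(tup);
}
#endif							/* HEAPAM_USAGE_SKETCHES */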
2588 
2589 /*
2590  * Subroutine for heap_insert(). Prepares a tuple for insertion. This sets the
2591  * tuple header fields, assigns an OID, and toasts the tuple if necessary.
2592  * Returns a toasted version of the tuple if it was toasted, or the original
2593  * tuple if not. Note that in any case, the header fields are also set in
2594  * the original tuple.
2595  */
2596 static HeapTuple
2597 heap_prepare_insert(Relation relation, HeapTuple tup, TransactionId xid,
2598  CommandId cid, int options)
2599 {
2600  /*
2601  * Parallel operations are required to be strictly read-only in a parallel
2602  * worker. Parallel inserts are not safe even in the leader in the
2603  * general case, because group locking means that heavyweight locks for
2604  * relation extension or GIN page locks will not conflict between members
2605  * of a lock group, but we don't prohibit that case here because there are
2606  * useful special cases that we can safely allow, such as CREATE TABLE AS.
2607  */
2608  if (IsParallelWorker())
2609  ereport(ERROR,
2610  (errcode(ERRCODE_INVALID_TRANSACTION_STATE),
2611  errmsg("cannot insert tuples in a parallel worker")));
2612 
2613  if (relation->rd_rel->relhasoids)
2614  {
2615 #ifdef NOT_USED
2616  /* this is redundant with an Assert in HeapTupleSetOid */
2617  Assert(tup->t_data->t_infomask & HEAP_HASOID);
2618 #endif
2619 
2620  /*
2621  * If the object id of this tuple has already been assigned, trust the
2622  * caller. There are a couple of ways this can happen. At initial db
2623  * creation, the backend program sets oids for tuples. When we define
2624  * an index, we set the oid. Finally, in the future, we may allow
2625  * users to set their own object ids in order to support a persistent
2626  * object store (objects need to contain pointers to one another).
2627  */
2628  if (!OidIsValid(HeapTupleGetOid(tup)))
2629  HeapTupleSetOid(tup, GetNewOid(relation));
2630  }
2631  else
2632  {
2633  /* check there is no space for an OID */
2634  Assert(!(tup->t_data->t_infomask & HEAP_HASOID));
2635  }
2636 
2637  tup->t_data->t_infomask &= ~(HEAP_XACT_MASK);
2638  tup->t_data->t_infomask2 &= ~(HEAP2_XACT_MASK);
2639  tup->t_data->t_infomask |= HEAP_XMAX_INVALID;
2640  HeapTupleHeaderSetXmin(tup->t_data, xid);
2641  if (options & HEAP_INSERT_FROZEN)
2642  HeapTupleHeaderSetXminFrozen(tup->t_data);
2643 
2644  HeapTupleHeaderSetCmin(tup->t_data, cid);
2645  HeapTupleHeaderSetXmax(tup->t_data, 0); /* for cleanliness */
2646  tup->t_tableOid = RelationGetRelid(relation);
2647 
2648  /*
2649  * If the new tuple is too big for storage or contains already toasted
2650  * out-of-line attributes from some other relation, invoke the toaster.
2651  */
2652  if (relation->rd_rel->relkind != RELKIND_RELATION &&
2653  relation->rd_rel->relkind != RELKIND_MATVIEW)
2654  {
2655  /* toast table entries should never be recursively toasted */
2656  Assert(!HeapTupleHasExternal(tup));
2657  return tup;
2658  }
2659  else if (HeapTupleHasExternal(tup) || tup->t_len > TOAST_TUPLE_THRESHOLD)
2660  return toast_insert_or_update(relation, tup, NULL, options);
2661  else
2662  return tup;
2663 }
2664 
2665 /*
2666  * heap_multi_insert - insert multiple tuples into a heap
2667  *
2668  * This is like heap_insert(), but inserts multiple tuples in one operation.
2669  * That's faster than calling heap_insert() in a loop, because when multiple
2670  * tuples can be inserted on a single page, we can write just a single WAL
2671  * record covering all of them, and only need to lock/unlock the page once.
2672  *
2673  * Note: this leaks memory into the current memory context. You can create a
2674  * temporary context before calling this, if that's a problem.
2675  */
2676 void
2677 heap_multi_insert(Relation relation, HeapTuple *tuples, int ntuples,
2678  CommandId cid, int options, BulkInsertState bistate)
2679 {
2680  TransactionId xid = GetCurrentTransactionId();
2681  HeapTuple *heaptuples;
2682  int i;
2683  int ndone;
2684  char *scratch = NULL;
2685  Page page;
2686  bool needwal;
2687  Size saveFreeSpace;
2688  bool need_tuple_data = RelationIsLogicallyLogged(relation);
2689  bool need_cids = RelationIsAccessibleInLogicalDecoding(relation);
2690 
2691  needwal = !(options & HEAP_INSERT_SKIP_WAL) && RelationNeedsWAL(relation);
2692  saveFreeSpace = RelationGetTargetPageFreeSpace(relation,
2693  HEAP_DEFAULT_FILLFACTOR);
2694 
2695  /* Toast and set header data in all the tuples */
2696  heaptuples = palloc(ntuples * sizeof(HeapTuple));
2697  for (i = 0; i < ntuples; i++)
2698  heaptuples[i] = heap_prepare_insert(relation, tuples[i],
2699  xid, cid, options);
2700 
2701  /*
2702  * Allocate some memory to use for constructing the WAL record. Using
2703  * palloc() within a critical section is not safe, so we allocate this
2704  * beforehand.
2705  */
2706  if (needwal)
2707  scratch = palloc(BLCKSZ);
2708 
2709  /*
2710  * We're about to do the actual inserts -- but check for conflict first,
2711  * to minimize the possibility of having to roll back work we've just
2712  * done.
2713  *
2714  * A check here does not definitively prevent a serialization anomaly;
2715  * that check MUST be done at least past the point of acquiring an
2716  * exclusive buffer content lock on every buffer that will be affected,
2717  * and MAY be done after all inserts are reflected in the buffers and
2718  * those locks are released; otherwise there is a race condition. Since
2719  * multiple buffers can be locked and unlocked in the loop below, and it
2720  * would not be feasible to identify and lock all of those buffers before
2721  * the loop, we must do a final check at the end.
2722  *
2723  * The check here could be omitted with no loss of correctness; it is
2724  * present strictly as an optimization.
2725  *
2726  * For heap inserts, we only need to check for table-level SSI locks. Our
2727  * new tuples can't possibly conflict with existing tuple locks, and heap
2728  * page locks are only consolidated versions of tuple locks; they do not
2729  * lock "gaps" as index page locks do. So we don't need to specify a
2730  * buffer when making the call, which makes for a faster check.
2731  */
2732  CheckForSerializableConflictIn(relation, NULL, InvalidBuffer);
2733 
2734  ndone = 0;
2735  while (ndone < ntuples)
2736  {
2737  Buffer buffer;
2738  Buffer vmbuffer = InvalidBuffer;
2739  bool all_visible_cleared = false;
2740  int nthispage;
2741 
2742  CHECK_FOR_INTERRUPTS();
2743 
2744  /*
2745  * Find buffer where at least the next tuple will fit. If the page is
2746  * all-visible, this will also pin the requisite visibility map page.
2747  */
2748  buffer = RelationGetBufferForTuple(relation, heaptuples[ndone]->t_len,
2749  InvalidBuffer, options, bistate,
2750  &vmbuffer, NULL);
2751  page = BufferGetPage(buffer);
2752 
2753  /* NO EREPORT(ERROR) from here till changes are logged */
2754  START_CRIT_SECTION();
2755 
2756  /*
2757  * RelationGetBufferForTuple has ensured that the first tuple fits.
2758  * Put that on the page, and then as many other tuples as fit.
2759  */
2760  RelationPutHeapTuple(relation, buffer, heaptuples[ndone], false);
2761  for (nthispage = 1; ndone + nthispage < ntuples; nthispage++)
2762  {
2763  HeapTuple heaptup = heaptuples[ndone + nthispage];
2764 
2765  if (PageGetHeapFreeSpace(page) < MAXALIGN(heaptup->t_len) + saveFreeSpace)
2766  break;
2767 
2768  RelationPutHeapTuple(relation, buffer, heaptup, false);
2769 
2770  /*
2771  * We don't use heap_multi_insert for catalog tuples yet, but
2772  * better be prepared...
2773  */
2774  if (needwal && need_cids)
2775  log_heap_new_cid(relation, heaptup);
2776  }
2777 
2778  if (PageIsAllVisible(page))
2779  {
2780  all_visible_cleared = true;
2781  PageClearAllVisible(page);
2782  visibilitymap_clear(relation,
2783  BufferGetBlockNumber(buffer),
2784  vmbuffer, VISIBILITYMAP_VALID_BITS);
2785  }
2786 
2787  /*
2788  * XXX Should we set PageSetPrunable on this page ? See heap_insert()
2789  */
2790 
2791  MarkBufferDirty(buffer);
2792 
2793  /* XLOG stuff */
2794  if (needwal)
2795  {
2796  XLogRecPtr recptr;
2797  xl_heap_multi_insert *xlrec;
2798  uint8 info = XLOG_HEAP2_MULTI_INSERT;
2799  char *tupledata;
2800  int totaldatalen;
2801  char *scratchptr = scratch;
2802  bool init;
2803  int bufflags = 0;
2804 
2805  /*
2806  * If the page was previously empty, we can reinit the page
2807  * instead of restoring the whole thing.
2808  */
2809  init = (ItemPointerGetOffsetNumber(&(heaptuples[ndone]->t_self)) == FirstOffsetNumber &&
2810  PageGetMaxOffsetNumber(page) == FirstOffsetNumber + nthispage - 1);
2811 
2812  /* allocate xl_heap_multi_insert struct from the scratch area */
2813  xlrec = (xl_heap_multi_insert *) scratchptr;
2814  scratchptr += SizeOfHeapMultiInsert;
2815 
2816  /*
2817  * Allocate the offsets array, unless we're reinitializing the page:
2818  * in that case the tuples are stored in order starting at
2819  * FirstOffsetNumber, so we don't need to store the offsets
2820  * explicitly.
2821  */
2822  if (!init)
2823  scratchptr += nthispage * sizeof(OffsetNumber);
2824 
2825  /* the rest of the scratch space is used for tuple data */
2826  tupledata = scratchptr;
2827 
2828  xlrec->flags = all_visible_cleared ? XLH_INSERT_ALL_VISIBLE_CLEARED : 0;
2829  xlrec->ntuples = nthispage;
2830 
2831  /*
2832  * Write out an xl_multi_insert_tuple and the tuple data itself
2833  * for each tuple.
2834  */
2835  for (i = 0; i < nthispage; i++)
2836  {
2837  HeapTuple heaptup = heaptuples[ndone + i];
2838  xl_multi_insert_tuple *tuphdr;
2839  int datalen;
2840 
2841  if (!init)
2842  xlrec->offsets[i] = ItemPointerGetOffsetNumber(&heaptup->t_self);
2843  /* xl_multi_insert_tuple needs two-byte alignment. */
2844  tuphdr = (xl_multi_insert_tuple *) SHORTALIGN(scratchptr);
2845  scratchptr = ((char *) tuphdr) + SizeOfMultiInsertTuple;
2846 
2847  tuphdr->t_infomask2 = heaptup->t_data->t_infomask2;
2848  tuphdr->t_infomask = heaptup->t_data->t_infomask;
2849  tuphdr->t_hoff = heaptup->t_data->t_hoff;
2850 
2851  /* write bitmap [+ padding] [+ oid] + data */
2852  datalen = heaptup->t_len - SizeofHeapTupleHeader;
2853  memcpy(scratchptr,
2854  (char *) heaptup->t_data + SizeofHeapTupleHeader,
2855  datalen);
2856  tuphdr->datalen = datalen;
2857  scratchptr += datalen;
2858  }
2859  totaldatalen = scratchptr - tupledata;
2860  Assert((scratchptr - scratch) < BLCKSZ);
2861 
2862  if (need_tuple_data)
2863  xlrec->flags |= XLH_INSERT_CONTAINS_NEW_TUPLE;
2864 
2865  /*
2866  * Signal that this is the last xl_heap_multi_insert record
2867  * emitted by this call to heap_multi_insert(). Needed for logical
2868  * decoding so it knows when to cleanup temporary data.
2869  */
2870  if (ndone + nthispage == ntuples)
2871  xlrec->flags |= XLH_INSERT_LAST_IN_MULTI;
2872 
2873  if (init)
2874  {
2875  info |= XLOG_HEAP_INIT_PAGE;
2876  bufflags |= REGBUF_WILL_INIT;
2877  }
2878 
2879  /*
2880  * If we're doing logical decoding, include the new tuple data
2881  * even if we take a full-page image of the page.
2882  */
2883  if (need_tuple_data)
2884  bufflags |= REGBUF_KEEP_DATA;
2885 
2886  XLogBeginInsert();
2887  XLogRegisterData((char *) xlrec, tupledata - scratch);
2888  XLogRegisterBuffer(0, buffer, REGBUF_STANDARD | bufflags);
2889 
2890  XLogRegisterBufData(0, tupledata, totaldatalen);
2891 
2892  /* filtering by origin on a row level is much more efficient */
2893  XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN);
2894 
2895  recptr = XLogInsert(RM_HEAP2_ID, info);
2896 
2897  PageSetLSN(page, recptr);
2898  }
2899 
2900  END_CRIT_SECTION();
2901 
2902  UnlockReleaseBuffer(buffer);
2903  if (vmbuffer != InvalidBuffer)
2904  ReleaseBuffer(vmbuffer);
2905 
2906  ndone += nthispage;
2907  }
2908 
2909  /*
2910  * We're done with the actual inserts. Check for conflicts again, to
2911  * ensure that all rw-conflicts in to these inserts are detected. Without
2912  * this final check, a sequential scan of the heap may have locked the
2913  * table after the "before" check, missing one opportunity to detect the
2914  * conflict, and then scanned the table before the new tuples were there,
2915  * missing the other chance to detect the conflict.
2916  *
2917  * For heap inserts, we only need to check for table-level SSI locks. Our
2918  * new tuples can't possibly conflict with existing tuple locks, and heap
2919  * page locks are only consolidated versions of tuple locks; they do not
2920  * lock "gaps" as index page locks do. So we don't need to specify a
2921  * buffer when making the call.
2922  */
2923  CheckForSerializableConflictIn(relation, NULL, InvalidBuffer);
2924 
2925  /*
2926  * If tuples are cachable, mark them for invalidation from the caches in
2927  * case we abort. Note it is OK to do this after releasing the buffer,
2928  * because the heaptuples data structure is all in local memory, not in
2929  * the shared buffer.
2930  */
2931  if (IsCatalogRelation(relation))
2932  {
2933  for (i = 0; i < ntuples; i++)
2934  CacheInvalidateHeapTuple(relation, heaptuples[i], NULL);
2935  }
2936 
2937  /*
2938  * Copy t_self fields back to the caller's original tuples. This does
2939  * nothing for untoasted tuples (tuples[i] == heaptuples[i]), but it's
2940  * probably faster to always copy than check.
2941  */
2942  for (i = 0; i < ntuples; i++)
2943  tuples[i]->t_self = heaptuples[i]->t_self;
2944 
2945  pgstat_count_heap_insert(relation, ntuples);
2946 }
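/*
 * Editor's note: illustrative-only sketch (not part of heapam.c) of calling
 * heap_multi_insert for a batch.  Per the note above, the call may leak
 * memory into the current context, so a long-running caller would typically
 * switch to a temporary memory context around it.
 */
#ifdef HEAPAM_USAGE_SKETCHES
static void
multi_insert_batch(Relation rel, HeapTuple *tuples, int ntuples)
{
	BulkInsertState bistate = GetBulkInsertState();

	/* one call handles the whole array, writing one WAL record per filled page */
	heap_multi_insert(rel, tuples, ntuples,
					  GetCurrentCommandId(true), 0, bistate);

	FreeBulkInsertState(bistate);
}
#endif							/* HEAPAM_USAGE_SKETCHES */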
2947 
2948 /*
2949  * simple_heap_insert - insert a tuple
2950  *
2951  * Currently, this routine differs from heap_insert only in supplying
2952  * a default command ID and not allowing access to the speedup options.
2953  *
2954  * This should be used rather than using heap_insert directly in most places
2955  * where we are modifying system catalogs.
2956  */
2957 Oid
2958 simple_heap_insert(Relation relation, HeapTuple tup)
2959 {
2960  return heap_insert(relation, tup, GetCurrentCommandId(true), 0, NULL);
2961 }
2962 
2963 /*
2964  * Given infomask/infomask2, compute the bits that must be saved in the
2965  * "infobits" field of xl_heap_delete, xl_heap_update, xl_heap_lock,
2966  * xl_heap_lock_updated WAL records.
2967  *
2968  * See fix_infomask_from_infobits.
2969  */
2970 static uint8
2971 compute_infobits(uint16 infomask, uint16 infomask2)
2972 {
2973  return
2974  ((infomask & HEAP_XMAX_IS_MULTI) != 0 ? XLHL_XMAX_IS_MULTI : 0) |
2975  ((infomask & HEAP_XMAX_LOCK_ONLY) != 0 ? XLHL_XMAX_LOCK_ONLY : 0) |
2976  ((infomask & HEAP_XMAX_EXCL_LOCK) != 0 ? XLHL_XMAX_EXCL_LOCK : 0) |
2977  /* note we ignore HEAP_XMAX_SHR_LOCK here */
2978  ((infomask & HEAP_XMAX_KEYSHR_LOCK) != 0 ? XLHL_XMAX_KEYSHR_LOCK : 0) |
2979  ((infomask2 & HEAP_KEYS_UPDATED) != 0 ?
2980  XLHL_KEYS_UPDATED : 0);
2981 }
2982 
2983 /*
2984  * Given two versions of the same t_infomask for a tuple, compare them and
2985  * return whether the relevant status for a tuple Xmax has changed. This is
2986  * used after a buffer lock has been released and reacquired: we want to ensure
2987  * that the tuple state continues to be the same it was when we previously
2988  * examined it.
2989  *
2990  * Note the Xmax field itself must be compared separately.
2991  */
2992 static inline bool
2993 xmax_infomask_changed(uint16 new_infomask, uint16 old_infomask)
2994 {
2995  const uint16 interesting =
2997 
2998  if ((new_infomask & interesting) != (old_infomask & interesting))
2999  return true;
3000 
3001  return false;
3002 }
3003 
3004 /*
3005  * heap_delete - delete a tuple
3006  *
3007  * NB: do not call this directly unless you are prepared to deal with
3008  * concurrent-update conditions. Use simple_heap_delete instead.
3009  *
3010  * relation - table to be modified (caller must hold suitable lock)
3011  * tid - TID of tuple to be deleted
3012  * cid - delete command ID (used for visibility test, and stored into
3013  * cmax if successful)
3014  * crosscheck - if not InvalidSnapshot, also check tuple against this
3015  * wait - true if should wait for any conflicting update to commit/abort
3016  * hufd - output parameter, filled in failure cases (see below)
3017  *
3018  * Normal, successful return value is HeapTupleMayBeUpdated, which
3019  * actually means we did delete it. Failure return codes are
3020  * HeapTupleSelfUpdated, HeapTupleUpdated, or HeapTupleBeingUpdated
3021  * (the last only possible if wait == false).
3022  *
3023  * In the failure cases, the routine fills *hufd with the tuple's t_ctid,
3024  * t_xmax (resolving a possible MultiXact, if necessary), and t_cmax
3025  * (the last only for HeapTupleSelfUpdated, since we
3026  * cannot obtain cmax from a combocid generated by another transaction).
3027  * See comments for struct HeapUpdateFailureData for additional info.
3028  */
3029 HTSU_Result
3030 heap_delete(Relation relation, ItemPointer tid,
3031  CommandId cid, Snapshot crosscheck, bool wait,
3032  HeapUpdateFailureData *hufd)
3033 {
3034  HTSU_Result result;
3035  TransactionId xid = GetCurrentTransactionId();
3036  ItemId lp;
3037  HeapTupleData tp;
3038  Page page;
3039  BlockNumber block;
3040  Buffer buffer;
3041  Buffer vmbuffer = InvalidBuffer;
3042  TransactionId new_xmax;
3043  uint16 new_infomask,
3044  new_infomask2;
3045  bool have_tuple_lock = false;
3046  bool iscombo;
3047  bool all_visible_cleared = false;
3048  HeapTuple old_key_tuple = NULL; /* replica identity of the tuple */
3049  bool old_key_copied = false;
3050 
3051  Assert(ItemPointerIsValid(tid));
3052 
3053  /*
3054  * Forbid this during a parallel operation, lest it allocate a combocid.
3055  * Other workers might need that combocid for visibility checks, and we
3056  * have no provision for broadcasting it to them.
3057  */
3058  if (IsInParallelMode())
3059  ereport(ERROR,
3060  (errcode(ERRCODE_INVALID_TRANSACTION_STATE),
3061  errmsg("cannot delete tuples during a parallel operation")));
3062 
3063  block = ItemPointerGetBlockNumber(tid);
3064  buffer = ReadBuffer(relation, block);
3065  page = BufferGetPage(buffer);
3066 
3067  /*
3068  * Before locking the buffer, pin the visibility map page if it appears to
3069  * be necessary. Since we haven't got the lock yet, someone else might be
3070  * in the middle of changing this, so we'll need to recheck after we have
3071  * the lock.
3072  */
3073  if (PageIsAllVisible(page))
3074  visibilitymap_pin(relation, block, &vmbuffer);
3075 
3076  LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3077 
3078  /*
3079  * If we didn't pin the visibility map page and the page has become all
3080  * visible while we were busy locking the buffer, we'll have to unlock and
3081  * re-lock, to avoid holding the buffer lock across an I/O. That's a bit
3082  * unfortunate, but hopefully shouldn't happen often.
3083  */
3084  if (vmbuffer == InvalidBuffer && PageIsAllVisible(page))
3085  {
3086  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3087  visibilitymap_pin(relation, block, &vmbuffer);
3088  LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3089  }
3090 
3091  lp = PageGetItemId(page, ItemPointerGetOffsetNumber(tid));
3092  Assert(ItemIdIsNormal(lp));
3093 
3094  tp.t_tableOid = RelationGetRelid(relation);
3095  tp.t_data = (HeapTupleHeader) PageGetItem(page, lp);
3096  tp.t_len = ItemIdGetLength(lp);
3097  tp.t_self = *tid;
3098 
3099 l1:
3100  result = HeapTupleSatisfiesUpdate(&tp, cid, buffer);
3101 
3102  if (result == HeapTupleInvisible)
3103  {
3104  UnlockReleaseBuffer(buffer);
3105  ereport(ERROR,
3106  (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
3107  errmsg("attempted to delete invisible tuple")));
3108  }
3109  else if (result == HeapTupleBeingUpdated && wait)
3110  {
3111  TransactionId xwait;
3112  uint16 infomask;
3113 
3114  /* must copy state data before unlocking buffer */
3115  xwait = HeapTupleHeaderGetRawXmax(tp.t_data);
3116  infomask = tp.t_data->t_infomask;
3117 
3118  /*
3119  * Sleep until concurrent transaction ends -- except when there's a
3120  * single locker and it's our own transaction. Note we don't care
3121  * which lock mode the locker has, because we need the strongest one.
3122  *
3123  * Before sleeping, we need to acquire tuple lock to establish our
3124  * priority for the tuple (see heap_lock_tuple). LockTuple will
3125  * release us when we are next-in-line for the tuple.
3126  *
3127  * If we are forced to "start over" below, we keep the tuple lock;
3128  * this arranges that we stay at the head of the line while rechecking
3129  * tuple state.
3130  */
3131  if (infomask & HEAP_XMAX_IS_MULTI)
3132  {
3133  /* wait for multixact */
3134  if (DoesMultiXactIdConflict((MultiXactId) xwait, infomask,
3136  {
3137  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3138 
3139  /* acquire tuple lock, if necessary */
3140  heap_acquire_tuplock(relation, &(tp.t_self), LockTupleExclusive,
3141  LockWaitBlock, &have_tuple_lock);
3142 
3143  /* wait for multixact */
3145  relation, &(tp.t_self), XLTW_Delete,
3146  NULL);
3148 
3149  /*
3150  * If xwait had just locked the tuple then some other xact
3151  * could update this tuple before we get to this point. Check
3152  * for xmax change, and start over if so.
3153  */
3154  if (xmax_infomask_changed(tp.t_data->t_infomask, infomask) ||
3156  xwait))
3157  goto l1;
3158  }
3159 
3160  /*
3161  * You might think the multixact is necessarily done here, but not
3162  * so: it could have surviving members, namely our own xact or
3163  * other subxacts of this backend. It is legal for us to delete
3164  * the tuple in either case, however (the latter case is
3165  * essentially a situation of upgrading our former shared lock to
3166  * exclusive). We don't bother changing the on-disk hint bits
3167  * since we are about to overwrite the xmax altogether.
3168  */
3169  }
3170  else if (!TransactionIdIsCurrentTransactionId(xwait))
3171  {
3172  /*
3173  * Wait for regular transaction to end; but first, acquire tuple
3174  * lock.
3175  */
3176  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3177  heap_acquire_tuplock(relation, &(tp.t_self), LockTupleExclusive,
3178  LockWaitBlock, &have_tuple_lock);
3179  XactLockTableWait(xwait, relation, &(tp.t_self), XLTW_Delete);
3180  LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3181 
3182  /*
3183  * xwait is done, but if xwait had just locked the tuple then some
3184  * other xact could update this tuple before we get to this point.
3185  * Check for xmax change, and start over if so.
3186  */
3187  if (xmax_infomask_changed(tp.t_data->t_infomask, infomask) ||
3189  xwait))
3190  goto l1;
3191 
3192  /* Otherwise check if it committed or aborted */
3193  UpdateXmaxHintBits(tp.t_data, buffer, xwait);
3194  }
3195 
3196  /*
3197  * We may overwrite if previous xmax aborted, or if it committed but
3198  * only locked the tuple without updating it.
3199  */
3200  if ((tp.t_data->t_infomask & HEAP_XMAX_INVALID) ||
3203  result = HeapTupleMayBeUpdated;
3204  else
3205  result = HeapTupleUpdated;
3206  }
3207 
3208  if (crosscheck != InvalidSnapshot && result == HeapTupleMayBeUpdated)
3209  {
3210  /* Perform additional check for transaction-snapshot mode RI updates */
3211  if (!HeapTupleSatisfiesVisibility(&tp, crosscheck, buffer))
3212  result = HeapTupleUpdated;
3213  }
3214 
3215  if (result != HeapTupleMayBeUpdated)
3216  {
3217  Assert(result == HeapTupleSelfUpdated ||
3218  result == HeapTupleUpdated ||
3219  result == HeapTupleBeingUpdated);
3221  hufd->ctid = tp.t_data->t_ctid;
3223  if (result == HeapTupleSelfUpdated)
3224  hufd->cmax = HeapTupleHeaderGetCmax(tp.t_data);
3225  else
3226  hufd->cmax = InvalidCommandId;
3227  UnlockReleaseBuffer(buffer);
3228  if (have_tuple_lock)
3229  UnlockTupleTuplock(relation, &(tp.t_self), LockTupleExclusive);
3230  if (vmbuffer != InvalidBuffer)
3231  ReleaseBuffer(vmbuffer);
3232  return result;
3233  }
3234 
3235  /*
3236  * We're about to do the actual delete -- check for conflict first, to
3237  * avoid possibly having to roll back work we've just done.
3238  *
3239  * This is safe without a recheck as long as there is no possibility of
3240  * another process scanning the page between this check and the delete
3241  * being visible to the scan (i.e., an exclusive buffer content lock is
3242  * continuously held from this point until the tuple delete is visible).
3243  */
3244  CheckForSerializableConflictIn(relation, &tp, buffer);
3245 
3246  /* replace cid with a combo cid if necessary */
3247  HeapTupleHeaderAdjustCmax(tp.t_data, &cid, &iscombo);
3248 
3249  /*
3250  * Compute replica identity tuple before entering the critical section so
3251  * we don't PANIC upon a memory allocation failure.
3252  */
3253  old_key_tuple = ExtractReplicaIdentity(relation, &tp, true, &old_key_copied);
3254 
3255  /*
3256  * If this is the first possibly-multixact-able operation in the current
3257  * transaction, set my per-backend OldestMemberMXactId setting. We can be
3258  * certain that the transaction will never become a member of any older
3259  * MultiXactIds than that. (We have to do this even if we end up just
3260  * using our own TransactionId below, since some other backend could
3261  * incorporate our XID into a MultiXact immediately afterwards.)
3262  */
3263  MultiXactIdSetOldestMember();
3264 
3267  xid, LockTupleExclusive, true,
3268  &new_xmax, &new_infomask, &new_infomask2);
3269 
3270  START_CRIT_SECTION();
3271 
3272  /*
3273  * If this transaction commits, the tuple will become DEAD sooner or
3274  * later. Set flag that this page is a candidate for pruning once our xid
3275  * falls below the OldestXmin horizon. If the transaction finally aborts,
3276  * the subsequent page pruning will be a no-op and the hint will be
3277  * cleared.
3278  */
3279  PageSetPrunable(page, xid);
3280 
3281  if (PageIsAllVisible(page))
3282  {
3283  all_visible_cleared = true;
3284  PageClearAllVisible(page);
3285  visibilitymap_clear(relation, BufferGetBlockNumber(buffer),
3286  vmbuffer, VISIBILITYMAP_VALID_BITS);
3287  }
3288 
3289  /* store transaction information of xact deleting the tuple */
3292  tp.t_data->t_infomask |= new_infomask;
3293  tp.t_data->t_infomask2 |= new_infomask2;
3294  HeapTupleHeaderClearHotUpdated(tp.t_data);
3295  HeapTupleHeaderSetXmax(tp.t_data, new_xmax);
3296  HeapTupleHeaderSetCmax(tp.t_data, cid, iscombo);
3297  /* Make sure there is no forward chain link in t_ctid */
3298  tp.t_data->t_ctid = tp.t_self;
3299 
3300  MarkBufferDirty(buffer);
3301 
3302  /*
3303  * XLOG stuff
3304  *
3305  * NB: heap_abort_speculative() uses the same xlog record and replay
3306  * routines.
3307  */
3308  if (RelationNeedsWAL(relation))
3309  {
3310  xl_heap_delete xlrec;
3311  XLogRecPtr recptr;
3312 
3313  /* For logical decode we need combocids to properly decode the catalog */
3314  if (RelationIsAccessibleInLogicalDecoding(relation))
3315  log_heap_new_cid(relation, &tp);
3316 
3317  xlrec.flags = all_visible_cleared ? XLH_DELETE_ALL_VISIBLE_CLEARED : 0;
3319  tp.t_data->t_infomask2);
3321  xlrec.xmax = new_xmax;
3322 
3323  if (old_key_tuple != NULL)
3324  {
3325  if (relation->rd_rel->relreplident == REPLICA_IDENTITY_FULL)
3326  xlrec.flags |= XLH_DELETE_CONTAINS_OLD_TUPLE;
3327  else
3328  xlrec.flags |= XLH_DELETE_CONTAINS_OLD_KEY;
3329  }
3330 
3331  XLogBeginInsert();
3332  XLogRegisterData((char *) &xlrec, SizeOfHeapDelete);
3333 
3334  XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
3335 
3336  /*
3337  * Log replica identity of the deleted tuple if there is one
3338  */
3339  if (old_key_tuple != NULL)
3340  {
3341  xl_heap_header xlhdr;
3342 
3343  xlhdr.t_infomask2 = old_key_tuple->t_data->t_infomask2;
3344  xlhdr.t_infomask = old_key_tuple->t_data->t_infomask;
3345  xlhdr.t_hoff = old_key_tuple->t_data->t_hoff;
3346 
3347  XLogRegisterData((char *) &xlhdr, SizeOfHeapHeader);
3348  XLogRegisterData((char *) old_key_tuple->t_data
3350  old_key_tuple->t_len
3352  }
3353 
3354  /* filtering by origin on a row level is much more efficient */
3355  XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN);
3356 
3357  recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_DELETE);
3358 
3359  PageSetLSN(page, recptr);
3360  }
3361 
3362  END_CRIT_SECTION();
3363 
3364  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3365 
3366  if (vmbuffer != InvalidBuffer)
3367  ReleaseBuffer(vmbuffer);
3368 
3369  /*
3370  * If the tuple has toasted out-of-line attributes, we need to delete
3371  * those items too. We have to do this before releasing the buffer
3372  * because we need to look at the contents of the tuple, but it's OK to
3373  * release the content lock on the buffer first.
3374  */
3375  if (relation->rd_rel->relkind != RELKIND_RELATION &&
3376  relation->rd_rel->relkind != RELKIND_MATVIEW)
3377  {
3378  /* toast table entries should never be recursively toasted */
3379  Assert(!HeapTupleHasExternal(&tp));
3380  }
3381  else if (HeapTupleHasExternal(&tp))
3382  toast_delete(relation, &tp, false);
3383 
3384  /*
3385  * Mark tuple for invalidation from system caches at next command
3386  * boundary. We have to do this before releasing the buffer because we
3387  * need to look at the contents of the tuple.
3388  */
3389  CacheInvalidateHeapTuple(relation, &tp, NULL);
3390 
3391  /* Now we can release the buffer */
3392  ReleaseBuffer(buffer);
3393 
3394  /*
3395  * Release the lmgr tuple lock, if we had it.
3396  */
3397  if (have_tuple_lock)
3398  UnlockTupleTuplock(relation, &(tp.t_self), LockTupleExclusive);
3399 
3400  pgstat_count_heap_delete(relation);
3401 
3402  if (old_key_tuple != NULL && old_key_copied)
3403  heap_freetuple(old_key_tuple);
3404 
3405  return HeapTupleMayBeUpdated;
3406 }
3407 
3408 /*
3409  * simple_heap_delete - delete a tuple
3410  *
3411  * This routine may be used to delete a tuple when concurrent updates of
3412  * the target tuple are not expected (for example, because we have a lock
3413  * on the relation associated with the tuple). Any failure is reported
3414  * via ereport().
3415  */
3416 void
3417 simple_heap_delete(Relation relation, ItemPointer tid)
3418 {
3419  HTSU_Result result;
3420  HeapUpdateFailureData hufd;
3421 
3422  result = heap_delete(relation, tid,
3423  GetCurrentCommandId(true), InvalidSnapshot,
3424  true /* wait for commit */ ,
3425  &hufd);
3426  switch (result)
3427  {
3428  case HeapTupleSelfUpdated:
3429  /* Tuple was already updated in current command? */
3430  elog(ERROR, "tuple already updated by self");
3431  break;
3432 
3433  case HeapTupleMayBeUpdated:
3434  /* done successfully */
3435  break;
3436 
3437  case HeapTupleUpdated:
3438  elog(ERROR, "tuple concurrently updated");
3439  break;
3440 
3441  default:
3442  elog(ERROR, "unrecognized heap_delete status: %u", result);
3443  break;
3444  }
3445 }
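/*
 * Editor's note: illustrative-only sketch (not part of heapam.c) combining a
 * sequential scan with simple_heap_delete, the pattern used when the caller
 * holds a lock strong enough to rule out concurrent updates.  The snapshot
 * choice is an assumption of the example.
 */
#ifdef HEAPAM_USAGE_SKETCHES
static void
delete_every_visible_row(Relation rel)
{
	HeapScanDesc scan = heap_beginscan(rel, GetActiveSnapshot(), 0, NULL);
	HeapTuple	tuple;

	while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
		simple_heap_delete(rel, &tuple->t_self);

	heap_endscan(scan);
}
#endif							/* HEAPAM_USAGE_SKETCHES */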
3446 
3447 /*
3448  * heap_update - replace a tuple
3449  *
3450  * NB: do not call this directly unless you are prepared to deal with
3451  * concurrent-update conditions. Use simple_heap_update instead.
3452  *
3453  * relation - table to be modified (caller must hold suitable lock)
3454  * otid - TID of old tuple to be replaced
3455  * newtup - newly constructed tuple data to store
3456  * cid - update command ID (used for visibility test, and stored into
3457  * cmax/cmin if successful)
3458  * crosscheck - if not InvalidSnapshot, also check old tuple against this
3459  * wait - true if should wait for any conflicting update to commit/abort
3460  * hufd - output parameter, filled in failure cases (see below)
3461  * lockmode - output parameter, filled with lock mode acquired on tuple
3462  *
3463  * Normal, successful return value is HeapTupleMayBeUpdated, which
3464  * actually means we *did* update it. Failure return codes are
3465  * HeapTupleSelfUpdated, HeapTupleUpdated, or HeapTupleBeingUpdated
3466  * (the last only possible if wait == false).
3467  *
3468  * On success, the header fields of *newtup are updated to match the new
3469  * stored tuple; in particular, newtup->t_self is set to the TID where the
3470  * new tuple was inserted, and its HEAP_ONLY_TUPLE flag is set iff a HOT
3471  * update was done. However, any TOAST changes in the new tuple's
3472  * data are not reflected into *newtup.
3473  *
3474  * In the failure cases, the routine fills *hufd with the tuple's t_ctid,
3475  * t_xmax (resolving a possible MultiXact, if necessary), and t_cmax
3476  * (the last only for HeapTupleSelfUpdated, since we
3477  * cannot obtain cmax from a combocid generated by another transaction).
3478  * See comments for struct HeapUpdateFailureData for additional info.
3479  */
3480 HTSU_Result
3481 heap_update(Relation relation, ItemPointer otid, HeapTuple newtup,
3482  CommandId cid, Snapshot crosscheck, bool wait,
3483  HeapUpdateFailureData *hufd, LockTupleMode *lockmode)
3484 {
3485  HTSU_Result result;
3486  TransactionId xid = GetCurrentTransactionId();
3487  Bitmapset *hot_attrs;
3488  Bitmapset *key_attrs;
3489  Bitmapset *id_attrs;
3490  Bitmapset *interesting_attrs;
3491  Bitmapset *modified_attrs;
3492  ItemId lp;
3493  HeapTupleData oldtup;
3494  HeapTuple heaptup;
3495  HeapTuple old_key_tuple = NULL;
3496  bool old_key_copied = false;
3497  Page page;
3498  BlockNumber block;
3499  MultiXactStatus mxact_status;
3500  Buffer buffer,
3501  newbuf,
3502  vmbuffer = InvalidBuffer,
3503  vmbuffer_new = InvalidBuffer;
3504  bool need_toast;
3505  Size newtupsize,
3506  pagefree;
3507  bool have_tuple_lock = false;
3508  bool iscombo;
3509  bool use_hot_update = false;
3510  bool hot_attrs_checked = false;
3511  bool key_intact;
3512  bool all_visible_cleared = false;
3513  bool all_visible_cleared_new = false;
3514  bool checked_lockers;
3515  bool locker_remains;
3516  TransactionId xmax_new_tuple,
3517  xmax_old_tuple;
3518  uint16 infomask_old_tuple,
3519  infomask2_old_tuple,
3520  infomask_new_tuple,
3521  infomask2_new_tuple;
3522 
3523  Assert(ItemPointerIsValid(otid));
3524 
3525  /*
3526  * Forbid this during a parallel operation, lest it allocate a combocid.
3527  * Other workers might need that combocid for visibility checks, and we
3528  * have no provision for broadcasting it to them.
3529  */
3530  if (IsInParallelMode())
3531  ereport(ERROR,
3532  (errcode(ERRCODE_INVALID_TRANSACTION_STATE),
3533  errmsg("cannot update tuples during a parallel operation")));
3534 
3535  /*
3536  * Fetch the list of attributes to be checked for various operations.
3537  *
3538  * For HOT considerations, this is wasted effort if we fail to update or
3539  * have to put the new tuple on a different page. But we must compute the
3540  * list before obtaining buffer lock --- in the worst case, if we are
3541  * doing an update on one of the relevant system catalogs, we could
3542  * deadlock if we try to fetch the list later. In any case, the relcache
3543  * caches the data so this is usually pretty cheap.
3544  *
3545  * We also need columns used by the replica identity and columns that are
3546  * considered the "key" of rows in the table.
3547  *
3548  * Note that we get copies of each bitmap, so we need not worry about
3549  * relcache flush happening midway through.
3550  */
3551  hot_attrs = RelationGetIndexAttrBitmap(relation, INDEX_ATTR_BITMAP_ALL);
3552  key_attrs = RelationGetIndexAttrBitmap(relation, INDEX_ATTR_BITMAP_KEY);
3553  id_attrs = RelationGetIndexAttrBitmap(relation,
3554  INDEX_ATTR_BITMAP_IDENTITY_KEY);
3555 
3556 
3557  block = ItemPointerGetBlockNumber(otid);
3558  buffer = ReadBuffer(relation, block);
3559  page = BufferGetPage(buffer);
3560 
3561  interesting_attrs = NULL;
3562 
3563  /*
3564  * If the page is already full, there is hardly any chance of doing a HOT
3565  * update on this page. It might be wasteful effort to look for index
3566  * column updates only to later reject HOT updates for lack of space in
3567  * the same page. So we be conservative and only fetch hot_attrs if the
3568  * page is not already full. Since we are already holding a pin on the
3569  * buffer, there is no chance that the buffer can get cleaned up
3570  * concurrently and even if that was possible, in the worst case we lose a
3571  * chance to do a HOT update.
3572  */
3573  if (!PageIsFull(page))
3574  {
3575  interesting_attrs = bms_add_members(interesting_attrs, hot_attrs);
3576  hot_attrs_checked = true;
3577  }
3578  interesting_attrs = bms_add_members(interesting_attrs, key_attrs);
3579  interesting_attrs = bms_add_members(interesting_attrs, id_attrs);
3580 
3581  /*
3582  * Before locking the buffer, pin the visibility map page if it appears to
3583  * be necessary. Since we haven't got the lock yet, someone else might be
3584  * in the middle of changing this, so we'll need to recheck after we have
3585  * the lock.
3586  */
3587  if (PageIsAllVisible(page))
3588  visibilitymap_pin(relation, block, &vmbuffer);
3589 
3591 
3592  lp = PageGetItemId(page, ItemPointerGetOffsetNumber(otid));
3593  Assert(ItemIdIsNormal(lp));
3594 
3595  /*
3596  * Fill in enough data in oldtup for HeapDetermineModifiedColumns to work
3597  * properly.
3598  */
3599  oldtup.t_tableOid = RelationGetRelid(relation);
3600  oldtup.t_data = (HeapTupleHeader) PageGetItem(page, lp);
3601  oldtup.t_len = ItemIdGetLength(lp);
3602  oldtup.t_self = *otid;
3603 
3604  /* the new tuple is ready, except for this: */
3605  newtup->t_tableOid = RelationGetRelid(relation);
3606 
3607  /* Fill in OID for newtup */
3608  if (relation->rd_rel->relhasoids)
3609  {
3610 #ifdef NOT_USED
3611  /* this is redundant with an Assert in HeapTupleSetOid */
3612  Assert(newtup->t_data->t_infomask & HEAP_HASOID);
3613 #endif
3614  HeapTupleSetOid(newtup, HeapTupleGetOid(&oldtup));
3615  }
3616  else
3617  {
3618  /* check there is no space for an OID */
3619  Assert(!(newtup->t_data->t_infomask & HEAP_HASOID));
3620  }
3621 
3622  /* Determine columns modified by the update. */
3623  modified_attrs = HeapDetermineModifiedColumns(relation, interesting_attrs,
3624  &oldtup, newtup);
3625 
3626  /*
3627  * If we're not updating any "key" column, we can grab a weaker lock type.
3628  * This allows for more concurrency when we are running simultaneously
3629  * with foreign key checks.
3630  *
3631  * Note that if a column gets detoasted while executing the update, but
3632  * the value ends up being the same, this test will fail and we will use
3633  * the stronger lock. This is acceptable; the important case to optimize
3634  * is updates that don't manipulate key columns, not those that
3635  * serendipitously arrive at the same key values.
3636  */
3637  if (!bms_overlap(modified_attrs, key_attrs))
3638  {
3639  *lockmode = LockTupleNoKeyExclusive;
3640  mxact_status = MultiXactStatusNoKeyUpdate;
3641  key_intact = true;
3642 
3643  /*
3644  * If this is the first possibly-multixact-able operation in the
3645  * current transaction, set my per-backend OldestMemberMXactId
3646  * setting. We can be certain that the transaction will never become a
3647  * member of any older MultiXactIds than that. (We have to do this
3648  * even if we end up just using our own TransactionId below, since
3649  * some other backend could incorporate our XID into a MultiXact
3650  * immediately afterwards.)
3651  */
3652  MultiXactIdSetOldestMember();
3653  }
3654  else
3655  {
3656  *lockmode = LockTupleExclusive;
3657  mxact_status = MultiXactStatusUpdate;
3658  key_intact = false;
3659  }
3660 
3661  /*
3662  * Note: beyond this point, use oldtup not otid to refer to old tuple.
3663  * otid may very well point at newtup->t_self, which we will overwrite
3664  * with the new tuple's location, so there's great risk of confusion if we
3665  * use otid anymore.
3666  */
3667 
3668 l2:
3669  checked_lockers = false;
3670  locker_remains = false;
3671  result = HeapTupleSatisfiesUpdate(&oldtup, cid, buffer);
3672 
3673  /* see below about the "no wait" case */
3674  Assert(result != HeapTupleBeingUpdated || wait);
3675 
3676  if (result == HeapTupleInvisible)
3677  {
3678  UnlockReleaseBuffer(buffer);
3679  ereport(ERROR,
3680  (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
3681  errmsg("attempted to update invisible tuple")));
3682  }
3683  else if (result == HeapTupleBeingUpdated && wait)
3684  {
3685  TransactionId xwait;
3686  uint16 infomask;
3687  bool can_continue = false;
3688 
3689  /*
3690  * XXX note that we don't consider the "no wait" case here. This
3691  * isn't a problem currently because no caller uses that case, but it
3692  * should be fixed if such a caller is introduced. It wasn't a
3693  * problem previously because this code would always wait, but now
3694  * that some tuple locks do not conflict with one of the lock modes we
3695  * use, it is possible that this case is interesting to handle
3696  * specially.
3697  *
3698  * This may cause failures with third-party code that calls
3699  * heap_update directly.
3700  */
3701 
3702  /* must copy state data before unlocking buffer */
3703  xwait = HeapTupleHeaderGetRawXmax(oldtup.t_data);
3704  infomask = oldtup.t_data->t_infomask;
3705 
3706  /*
3707  * Now we have to do something about the existing locker. If it's a
3708  * multi, sleep on it; we might be awakened before it is completely
3709  * gone (or even not sleep at all in some cases); we need to preserve
3710  * it as locker, unless it is gone completely.
3711  *
3712  * If it's not a multi, we need to check for sleeping conditions
3713  * before actually going to sleep. If the update doesn't conflict
3714  * with the locks, we just continue without sleeping (but making sure
3715  * it is preserved).
3716  *
3717  * Before sleeping, we need to acquire tuple lock to establish our
3718  * priority for the tuple (see heap_lock_tuple). LockTuple will
3719  * release us when we are next-in-line for the tuple. Note we must
3720  * not acquire the tuple lock until we're sure we're going to sleep;
3721  * otherwise we're open for race conditions with other transactions
3722  * holding the tuple lock which sleep on us.
3723  *
3724  * If we are forced to "start over" below, we keep the tuple lock;
3725  * this arranges that we stay at the head of the line while rechecking
3726  * tuple state.
3727  */
3728  if (infomask & HEAP_XMAX_IS_MULTI)
3729  {
3730  TransactionId update_xact;
3731  int remain;
3732 
3733  if (DoesMultiXactIdConflict((MultiXactId) xwait, infomask,
3734  *lockmode))
3735  {
3736  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3737 
3738  /* acquire tuple lock, if necessary */
3739  heap_acquire_tuplock(relation, &(oldtup.t_self), *lockmode,
3740  LockWaitBlock, &have_tuple_lock);
3741 
3742  /* wait for multixact */
3743  MultiXactIdWait((MultiXactId) xwait, mxact_status, infomask,
3744  relation, &oldtup.t_self, XLTW_Update,
3745  &remain);
3746  checked_lockers = true;
3747  locker_remains = remain != 0;
3748  LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3749 
3750  /*
3751  * If xwait had just locked the tuple then some other xact
3752  * could update this tuple before we get to this point. Check
3753  * for xmax change, and start over if so.
3754  */
3755  if (xmax_infomask_changed(oldtup.t_data->t_infomask,
3756  infomask) ||
3757  !TransactionIdEquals(HeapTupleHeaderGetRawXmax(oldtup.t_data),
3758  xwait))
3759  goto l2;
3760  }
3761 
3762  /*
3763  * Note that the multixact may not be done by now. It could have
3764  * surviving members; our own xact or other subxacts of this
3765  * backend, and also any other concurrent transaction that locked
3766  * the tuple with KeyShare if we only got TupleLockUpdate. If
3767  * this is the case, we have to be careful to mark the updated
3768  * tuple with the surviving members in Xmax.
3769  *
3770  * Note that there could have been another update in the
3771  * MultiXact. In that case, we need to check whether it committed
3772  * or aborted. If it aborted we are safe to update it again;
3773  * otherwise there is an update conflict, and we have to return
3774  * HeapTupleUpdated below.
3775  *
3776  * In the LockTupleExclusive case, we still need to preserve the
3777  * surviving members: those would include the tuple locks we had
3778  * before this one, which are important to keep in case this
3779  * subxact aborts.
3780  */
3781  if (!HEAP_XMAX_IS_LOCKED_ONLY(oldtup.t_data->t_infomask))
3782  update_xact = HeapTupleGetUpdateXid(oldtup.t_data);
3783  else
3784  update_xact = InvalidTransactionId;
3785 
3786  /*
3787  * There was no UPDATE in the MultiXact; or it aborted. No
3788  * TransactionIdIsInProgress() call needed here, since we called
3789  * MultiXactIdWait() above.
3790  */
3791  if (!TransactionIdIsValid(update_xact) ||
3792  TransactionIdDidAbort(update_xact))
3793  can_continue = true;
3794  }
3795  else if (TransactionIdIsCurrentTransactionId(xwait))
3796  {
3797  /*
3798  * The only locker is ourselves; we can avoid grabbing the tuple
3799  * lock here, but must preserve our locking information.
3800  */
3801  checked_lockers = true;
3802  locker_remains = true;
3803  can_continue = true;
3804  }
3805  else if (HEAP_XMAX_IS_KEYSHR_LOCKED(infomask) && key_intact)
3806  {
3807  /*
3808  * If it's just a key-share locker, and we're not changing the key
3809  * columns, we don't need to wait for it to end; but we need to
3810  * preserve it as locker.
3811  */
3812  checked_lockers = true;
3813  locker_remains = true;
3814  can_continue = true;
3815  }
3816  else
3817  {
3818  /*
3819  * Wait for regular transaction to end; but first, acquire tuple
3820  * lock.
3821  */
3822  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3823  heap_acquire_tuplock(relation, &(oldtup.t_self), *lockmode,
3824  LockWaitBlock, &have_tuple_lock);
3825  XactLockTableWait(xwait, relation, &oldtup.t_self,
3826  XLTW_Update);
3827  checked_lockers = true;
3828  LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3829 
3830  /*
3831  * xwait is done, but if xwait had just locked the tuple then some
3832  * other xact could update this tuple before we get to this point.
3833  * Check for xmax change, and start over if so.
3834  */
3835  if (xmax_infomask_changed(oldtup.t_data->t_infomask, infomask) ||
3836  !TransactionIdEquals(xwait,
3837  HeapTupleHeaderGetRawXmax(oldtup.t_data)))
3838  goto l2;
3839 
3840  /* Otherwise check if it committed or aborted */
3841  UpdateXmaxHintBits(oldtup.t_data, buffer, xwait);
3842  if (oldtup.t_data->t_infomask & HEAP_XMAX_INVALID)
3843  can_continue = true;
3844  }
3845 
3846  result = can_continue ? HeapTupleMayBeUpdated : HeapTupleUpdated;
3847  }
3848 
3849  if (crosscheck != InvalidSnapshot && result == HeapTupleMayBeUpdated)
3850  {
3851  /* Perform additional check for transaction-snapshot mode RI updates */
3852  if (!HeapTupleSatisfiesVisibility(&oldtup, crosscheck, buffer))
3853  result = HeapTupleUpdated;
3854  }
3855 
3856  if (result != HeapTupleMayBeUpdated)
3857  {
3858  Assert(result == HeapTupleSelfUpdated ||
3859  result == HeapTupleUpdated ||
3860  result == HeapTupleBeingUpdated);
3861  Assert(!(oldtup.t_data->t_infomask & HEAP_XMAX_INVALID));
3862  hufd->ctid = oldtup.t_data->t_ctid;
3863  hufd->xmax = HeapTupleHeaderGetUpdateXid(oldtup.t_data);
3864  if (result == HeapTupleSelfUpdated)
3865  hufd->cmax = HeapTupleHeaderGetCmax(oldtup.t_data);
3866  else
3867  hufd->cmax = InvalidCommandId;
3868  UnlockReleaseBuffer(buffer);
3869  if (have_tuple_lock)
3870  UnlockTupleTuplock(relation, &(oldtup.t_self), *lockmode);
3871  if (vmbuffer != InvalidBuffer)
3872  ReleaseBuffer(vmbuffer);
3873  bms_free(hot_attrs);
3874  bms_free(key_attrs);
3875  bms_free(id_attrs);
3876  bms_free(modified_attrs);
3877  bms_free(interesting_attrs);
3878  return result;
3879  }
3880 
3881  /*
3882  * If we didn't pin the visibility map page and the page has become all
3883  * visible while we were busy locking the buffer, or during some
3884  * subsequent window during which we had it unlocked, we'll have to unlock
3885  * and re-lock, to avoid holding the buffer lock across an I/O. That's a
3886  * bit unfortunate, especially since we'll now have to recheck whether the
3887  * tuple has been locked or updated under us, but hopefully it won't
3888  * happen very often.
3889  */
3890  if (vmbuffer == InvalidBuffer && PageIsAllVisible(page))
3891  {
3892  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3893  visibilitymap_pin(relation, block, &vmbuffer);
3894  LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3895  goto l2;
3896  }
3897 
3898  /* Fill in transaction status data */
3899 
3900  /*
3901  * If the tuple we're updating is locked, we need to preserve the locking
3902  * info in the old tuple's Xmax. Prepare a new Xmax value for this.
3903  */
3904  compute_new_xmax_infomask(HeapTupleHeaderGetRawXmax(oldtup.t_data),
3905  oldtup.t_data->t_infomask,
3906  oldtup.t_data->t_infomask2,
3907  xid, *lockmode, true,
3908  &xmax_old_tuple, &infomask_old_tuple,
3909  &infomask2_old_tuple);
3910 
3911  /*
3912  * And also prepare an Xmax value for the new copy of the tuple. If there
3913  * was no xmax previously, or there was one but all lockers are now gone,
3914  * then use InvalidXid; otherwise, get the xmax from the old tuple. (In
3915  * rare cases that might also be InvalidXid and yet not have the
3916  * HEAP_XMAX_INVALID bit set; that's fine.)
3917  */
3918  if ((oldtup.t_data->t_infomask & HEAP_XMAX_INVALID) ||
3919  HEAP_XMAX_IS_LOCKED_ONLY(oldtup.t_data->t_infomask) ||
3920  (checked_lockers && !locker_remains))
3921  xmax_new_tuple = InvalidTransactionId;
3922  else
3923  xmax_new_tuple = HeapTupleHeaderGetRawXmax(oldtup.t_data);
3924 
3925  if (!TransactionIdIsValid(xmax_new_tuple))
3926  {
3927  infomask_new_tuple = HEAP_XMAX_INVALID;
3928  infomask2_new_tuple = 0;
3929  }
3930  else
3931  {
3932  /*
3933  * If we found a valid Xmax for the new tuple, then the infomask bits
3934  * to use on the new tuple depend on what was there on the old one.
3935  * Note that since we're doing an update, the only possibility is that
3936  * the lockers had FOR KEY SHARE lock.
3937  */
3938  if (oldtup.t_data->t_infomask & HEAP_XMAX_IS_MULTI)
3939  {
3940  GetMultiXactIdHintBits(xmax_new_tuple, &infomask_new_tuple,
3941  &infomask2_new_tuple);
3942  }
3943  else
3944  {
3945  infomask_new_tuple = HEAP_XMAX_KEYSHR_LOCK | HEAP_XMAX_LOCK_ONLY;
3946  infomask2_new_tuple = 0;
3947  }
3948  }
3949 
3950  /*
3951  * Prepare the new tuple with the appropriate initial values of Xmin and
3952  * Xmax, as well as initial infomask bits as computed above.
3953  */
3954  newtup->t_data->t_infomask &= ~(HEAP_XACT_MASK);
3955  newtup->t_data->t_infomask2 &= ~(HEAP2_XACT_MASK);
3956  HeapTupleHeaderSetXmin(newtup->t_data, xid);
3957  HeapTupleHeaderSetCmin(newtup->t_data, cid);
3958  newtup->t_data->t_infomask |= HEAP_UPDATED | infomask_new_tuple;
3959  newtup->t_data->t_infomask2 |= infomask2_new_tuple;
3960  HeapTupleHeaderSetXmax(newtup->t_data, xmax_new_tuple);
3961 
3962  /*
3963  * Replace cid with a combo cid if necessary. Note that we already put
3964  * the plain cid into the new tuple.
3965  */
3966  HeapTupleHeaderAdjustCmax(oldtup.t_data, &cid, &iscombo);
3967 
3968  /*
3969  * If the toaster needs to be activated, OR if the new tuple will not fit
3970  * on the same page as the old, then we need to release the content lock
3971  * (but not the pin!) on the old tuple's buffer while we are off doing
3972  * TOAST and/or table-file-extension work. We must mark the old tuple to
3973  * show that it's locked, else other processes may try to update it
3974  * themselves.
3975  *
3976  * We need to invoke the toaster if there are already any out-of-line
3977  * toasted values present, or if the new tuple is over-threshold.
3978  */
3979  if (relation->rd_rel->relkind != RELKIND_RELATION &&
3980  relation->rd_rel->relkind != RELKIND_MATVIEW)
3981  {
3982  /* toast table entries should never be recursively toasted */
3983  Assert(!HeapTupleHasExternal(&oldtup));
3984  Assert(!HeapTupleHasExternal(newtup));
3985  need_toast = false;
3986  }
3987  else
3988  need_toast = (HeapTupleHasExternal(&oldtup) ||
3989  HeapTupleHasExternal(newtup) ||
3990  newtup->t_len > TOAST_TUPLE_THRESHOLD);
3991 
3992  pagefree = PageGetHeapFreeSpace(page);
3993 
3994  newtupsize = MAXALIGN(newtup->t_len);
3995 
3996  if (need_toast || newtupsize > pagefree)
3997  {
3998  TransactionId xmax_lock_old_tuple;
3999  uint16 infomask_lock_old_tuple,
4000  infomask2_lock_old_tuple;
4001  bool cleared_all_frozen = false;
4002 
4003  /*
4004  * To prevent concurrent sessions from updating the tuple, we have to
4005  * temporarily mark it locked, while we release the page-level lock.
4006  *
4007  * To satisfy the rule that any xid potentially appearing in a buffer
4008  * written out to disk must first be covered by WAL, we have to WAL log this
4009  * temporary modification. We can reuse xl_heap_lock for this
4010  * purpose. If we crash/error before following through with the
4011  * actual update, xmax will be of an aborted transaction, allowing
4012  * other sessions to proceed.
4013  */
4014 
4015  /*
4016  * Compute xmax / infomask appropriate for locking the tuple. This has
4017  * to be done separately from the combo that's going to be used for
4018  * updating, because the potentially created multixact would otherwise
4019  * be wrong.
4020  */
4021  compute_new_xmax_infomask(HeapTupleHeaderGetRawXmax(oldtup.t_data),
4022  oldtup.t_data->t_infomask,
4023  oldtup.t_data->t_infomask2,
4024  xid, *lockmode, false,
4025  &xmax_lock_old_tuple, &infomask_lock_old_tuple,
4026  &infomask2_lock_old_tuple);
4027 
4028  Assert(HEAP_XMAX_IS_LOCKED_ONLY(infomask_lock_old_tuple));
4029 
4031 
4032  /* Clear obsolete visibility flags ... */
4033  oldtup.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
4034  oldtup.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
4035  HeapTupleClearHotUpdated(&oldtup);
4036  /* ... and store info about transaction updating this tuple */
4037  Assert(TransactionIdIsValid(xmax_lock_old_tuple));
4038  HeapTupleHeaderSetXmax(oldtup.t_data, xmax_lock_old_tuple);
4039  oldtup.t_data->t_infomask |= infomask_lock_old_tuple;
4040  oldtup.t_data->t_infomask2 |= infomask2_lock_old_tuple;
4041  HeapTupleHeaderSetCmax(oldtup.t_data, cid, iscombo);
4042 
4043  /* temporarily make it look not-updated, but locked */
4044  oldtup.t_data->t_ctid = oldtup.t_self;
4045 
4046  /*
4047  * Clear all-frozen bit on visibility map if needed. We could
4048  * immediately reset ALL_VISIBLE, but given that the WAL logging
4049  * overhead would be unchanged, that doesn't seem necessarily
4050  * worthwhile.
4051  */
4052  if (PageIsAllVisible(BufferGetPage(buffer)) &&
4053  visibilitymap_clear(relation, block, vmbuffer,
4054  VISIBILITYMAP_ALL_FROZEN))
4055  cleared_all_frozen = true;
4056 
4057  MarkBufferDirty(buffer);
4058 
4059  if (RelationNeedsWAL(relation))
4060  {
4061  xl_heap_lock xlrec;
4062  XLogRecPtr recptr;
4063 
4064  XLogBeginInsert();
4065  XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
4066 
4067  xlrec.offnum = ItemPointerGetOffsetNumber(&oldtup.t_self);
4068  xlrec.locking_xid = xmax_lock_old_tuple;
4069  xlrec.infobits_set = compute_infobits(oldtup.t_data->t_infomask,
4070  oldtup.t_data->t_infomask2);
4071  xlrec.flags =
4072  cleared_all_frozen ? XLH_LOCK_ALL_FROZEN_CLEARED : 0;
4073  XLogRegisterData((char *) &xlrec, SizeOfHeapLock);
4074  recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_LOCK);
4075  PageSetLSN(page, recptr);
4076  }
4077 
4078  END_CRIT_SECTION();
4079 
4080  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
4081 
4082  /*
4083  * Let the toaster do its thing, if needed.
4084  *
4085  * Note: below this point, heaptup is the data we actually intend to
4086  * store into the relation; newtup is the caller's original untoasted
4087  * data.
4088  */
4089  if (need_toast)
4090  {
4091  /* Note we always use WAL and FSM during updates */
4092  heaptup = toast_insert_or_update(relation, newtup, &oldtup, 0);
4093  newtupsize = MAXALIGN(heaptup->t_len);
4094  }
4095  else
4096  heaptup = newtup;
4097 
4098  /*
4099  * Now, do we need a new page for the tuple, or not? This is a bit
4100  * tricky since someone else could have added tuples to the page while
4101  * we weren't looking. We have to recheck the available space after
4102  * reacquiring the buffer lock. But don't bother to do that if the
4103  * former amount of free space is still not enough; it's unlikely
4104  * there's more free now than before.
4105  *
4106  * What's more, if we need to get a new page, we will need to acquire
4107  * buffer locks on both old and new pages. To avoid deadlock against
4108  * some other backend trying to get the same two locks in the other
4109  * order, we must be consistent about the order we get the locks in.
4110  * We use the rule "lock the lower-numbered page of the relation
4111  * first". To implement this, we must do RelationGetBufferForTuple
4112  * while not holding the lock on the old page, and we must rely on it
4113  * to get the locks on both pages in the correct order.
4114  */
4115  if (newtupsize > pagefree)
4116  {
4117  /* Assume there's no chance to put heaptup on same page. */
4118  newbuf = RelationGetBufferForTuple(relation, heaptup->t_len,
4119  buffer, 0, NULL,
4120  &vmbuffer_new, &vmbuffer);
4121  }
4122  else
4123  {
4124  /* Re-acquire the lock on the old tuple's page. */
4125  LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
4126  /* Re-check using the up-to-date free space */
4127  pagefree = PageGetHeapFreeSpace(page);
4128  if (newtupsize > pagefree)
4129  {
4130  /*
4131  * Rats, it doesn't fit anymore. We must now unlock and
4132  * relock to avoid deadlock. Fortunately, this path should
4133  * seldom be taken.
4134  */
4135  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
4136  newbuf = RelationGetBufferForTuple(relation, heaptup->t_len,
4137  buffer, 0, NULL,
4138  &vmbuffer_new, &vmbuffer);
4139  }
4140  else
4141  {
4142  /* OK, it fits here, so we're done. */
4143  newbuf = buffer;
4144  }
4145  }
4146  }
4147  else
4148  {
4149  /* No TOAST work needed, and it'll fit on same page */
4150  newbuf = buffer;
4151  heaptup = newtup;
4152  }
4153 
4154  /*
4155  * We're about to do the actual update -- check for conflict first, to
4156  * avoid possibly having to roll back work we've just done.
4157  *
4158  * This is safe without a recheck as long as there is no possibility of
4159  * another process scanning the pages between this check and the update
4160  * being visible to the scan (i.e., exclusive buffer content lock(s) are
4161  * continuously held from this point until the tuple update is visible).
4162  *
4163  * For the new tuple the only check needed is at the relation level, but
4164  * since both tuples are in the same relation and the check for oldtup
4165  * will include checking the relation level, there is no benefit to a
4166  * separate check for the new tuple.
4167  */
4168  CheckForSerializableConflictIn(relation, &oldtup, buffer);
4169 
4170  /*
4171  * At this point newbuf and buffer are both pinned and locked, and newbuf
4172  * has enough space for the new tuple. If they are the same buffer, only
4173  * one pin is held.
4174  */
4175 
4176  if (newbuf == buffer)
4177  {
4178  /*
4179  * Since the new tuple is going into the same page, we might be able
4180  * to do a HOT update. Check if any of the index columns have been
4181  * changed. If the page was already full, we may have skipped checking
4182  * for index columns, and if so we cannot do a HOT update.
4183  */
4184  if (hot_attrs_checked && !bms_overlap(modified_attrs, hot_attrs))
4185  use_hot_update = true;
4186  }
4187  else
4188  {
4189  /* Set a hint that the old page could use prune/defrag */
4190  PageSetFull(page);
4191  }
4192 
4193  /*
4194  * Compute replica identity tuple before entering the critical section so
4195  * we don't PANIC upon a memory allocation failure.
4196  * ExtractReplicaIdentity() will return NULL if nothing needs to be
4197  * logged.
4198  */
4199  old_key_tuple = ExtractReplicaIdentity(relation, &oldtup,
4200  bms_overlap(modified_attrs, id_attrs),
4201  &old_key_copied);
4202 
4203  /* NO EREPORT(ERROR) from here till changes are logged */
4204  START_CRIT_SECTION();
4205 
4206  /*
4207  * If this transaction commits, the old tuple will become DEAD sooner or
4208  * later. Set flag that this page is a candidate for pruning once our xid
4209  * falls below the OldestXmin horizon. If the transaction finally aborts,
4210  * the subsequent page pruning will be a no-op and the hint will be
4211  * cleared.
4212  *
4213  * XXX Should we set hint on newbuf as well? If the transaction aborts,
4214  * there would be a prunable tuple in the newbuf; but for now we choose
4215  * not to optimize for aborts. Note that heap_xlog_update must be kept in
4216  * sync if this decision changes.
4217  */
4218  PageSetPrunable(page, xid);
4219 
4220  if (use_hot_update)
4221  {
4222  /* Mark the old tuple as HOT-updated */
4223  HeapTupleSetHotUpdated(&oldtup);
4224  /* And mark the new tuple as heap-only */
4225  HeapTupleSetHeapOnly(heaptup);
4226  /* Mark the caller's copy too, in case different from heaptup */
4227  HeapTupleSetHeapOnly(newtup);
4228  }
4229  else
4230  {
4231  /* Make sure tuples are correctly marked as not-HOT */
4232  HeapTupleClearHotUpdated(&oldtup);
4233  HeapTupleClearHeapOnly(heaptup);
4234  HeapTupleClearHeapOnly(newtup);
4235  }
4236 
4237  RelationPutHeapTuple(relation, newbuf, heaptup, false); /* insert new tuple */
4238 
4239 
4240  /* Clear obsolete visibility flags, possibly set by ourselves above... */
4241  oldtup.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
4242  oldtup.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
4243  /* ... and store info about transaction updating this tuple */
4244  Assert(TransactionIdIsValid(xmax_old_tuple));
4245  HeapTupleHeaderSetXmax(oldtup.t_data, xmax_old_tuple);
4246  oldtup.t_data->t_infomask |= infomask_old_tuple;
4247  oldtup.t_data->t_infomask2 |= infomask2_old_tuple;
4248  HeapTupleHeaderSetCmax(oldtup.t_data, cid, iscombo);
4249 
4250  /* record address of new tuple in t_ctid of old one */
4251  oldtup.t_data->t_ctid = heaptup->t_self;
4252 
4253  /* clear PD_ALL_VISIBLE flags, reset all visibilitymap bits */
4254  if (PageIsAllVisible(BufferGetPage(buffer)))
4255  {
4256  all_visible_cleared = true;
4257  PageClearAllVisible(BufferGetPage(buffer));
4258  visibilitymap_clear(relation, BufferGetBlockNumber(buffer),
4259  vmbuffer, VISIBILITYMAP_VALID_BITS);
4260  }
4261  if (newbuf != buffer && PageIsAllVisible(BufferGetPage(newbuf)))
4262  {
4263  all_visible_cleared_new = true;
4264  PageClearAllVisible(BufferGetPage(newbuf));
4265  visibilitymap_clear(relation, BufferGetBlockNumber(newbuf),
4266  vmbuffer_new, VISIBILITYMAP_VALID_BITS);
4267  }
4268 
4269  if (newbuf != buffer)
4270  MarkBufferDirty(newbuf);
4271  MarkBufferDirty(buffer);
4272 
4273  /* XLOG stuff */
4274  if (RelationNeedsWAL(relation))
4275  {
4276  XLogRecPtr recptr;
4277 
4278  /*
4279  * For logical decoding we need combocids to properly decode the
4280  * catalog.
4281  */
4282  if (RelationIsAccessibleInLogicalDecoding(relation))
4283  {
4284  log_heap_new_cid(relation, &oldtup);
4285  log_heap_new_cid(relation, heaptup);
4286  }
4287 
4288  recptr = log_heap_update(relation, buffer,
4289  newbuf, &oldtup, heaptup,
4290  old_key_tuple,
4291  all_visible_cleared,
4292  all_visible_cleared_new);
4293  if (newbuf != buffer)
4294  {
4295  PageSetLSN(BufferGetPage(newbuf), recptr);
4296  }
4297  PageSetLSN(BufferGetPage(buffer), recptr);
4298  }
4299 
4300  END_CRIT_SECTION();
4301 
4302  if (newbuf != buffer)
4303  LockBuffer(newbuf, BUFFER_LOCK_UNLOCK);
4304  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
4305 
4306  /*
4307  * Mark old tuple for invalidation from system caches at next command
4308  * boundary, and mark the new tuple for invalidation in case we abort. We
4309  * have to do this before releasing the buffer because oldtup is in the
4310  * buffer. (heaptup is all in local memory, but it's necessary to process
4311  * both tuple versions in one call to inval.c so we can avoid redundant
4312  * sinval messages.)
4313  */
4314  CacheInvalidateHeapTuple(relation, &oldtup, heaptup);
4315 
4316  /* Now we can release the buffer(s) */
4317  if (newbuf != buffer)
4318  ReleaseBuffer(newbuf);
4319  ReleaseBuffer(buffer);
4320  if (BufferIsValid(vmbuffer_new))
4321  ReleaseBuffer(vmbuffer_new);
4322  if (BufferIsValid(vmbuffer))
4323  ReleaseBuffer(vmbuffer);
4324 
4325  /*
4326  * Release the lmgr tuple lock, if we had it.
4327  */
4328  if (have_tuple_lock)
4329  UnlockTupleTuplock(relation, &(oldtup.t_self), *lockmode);
4330 
4331  pgstat_count_heap_update(relation, use_hot_update);
4332 
4333  /*
4334  * If heaptup is a private copy, release it. Don't forget to copy t_self
4335  * back to the caller's image, too.
4336  */
4337  if (heaptup != newtup)
4338  {
4339  newtup->t_self = heaptup->t_self;
4340  heap_freetuple(heaptup);
4341  }
4342 
4343  if (old_key_tuple != NULL && old_key_copied)
4344  heap_freetuple(old_key_tuple);
4345 
4346  bms_free(hot_attrs);
4347  bms_free(key_attrs);
4348  bms_free(id_attrs);
4349  bms_free(modified_attrs);
4350  bms_free(interesting_attrs);
4351 
4352  return HeapTupleMayBeUpdated;
4353 }
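A hedged caller-side sketch (not part of heapam.c) of the interface documented above: besides switching on the result, a caller can report the competing update's location from HeapUpdateFailureData when heap_update() returns HeapTupleUpdated. The relation rel, the target TID otid and the already-built replacement tuple newtup are assumed to be prepared by the caller; the helper name is hypothetical.

#include "postgres.h"

#include "access/heapam.h"
#include "access/xact.h"
#include "storage/itemptr.h"

/* Hypothetical helper, for illustration only. */
static void
update_reporting_conflicts(Relation rel, ItemPointer otid, HeapTuple newtup)
{
	HTSU_Result result;
	HeapUpdateFailureData hufd;
	LockTupleMode lockmode;

	result = heap_update(rel, otid, newtup,
						 GetCurrentCommandId(true), InvalidSnapshot,
						 true /* wait for commit */ ,
						 &hufd, &lockmode);
	switch (result)
	{
		case HeapTupleMayBeUpdated:
			/* success: newtup->t_self now points at the new version */
			break;
		case HeapTupleSelfUpdated:
			elog(ERROR, "tuple already updated by self");
			break;
		case HeapTupleUpdated:
			/* hufd.ctid and hufd.xmax identify the competing update */
			elog(ERROR, "tuple concurrently updated; newer version at (%u,%u)",
				 ItemPointerGetBlockNumber(&hufd.ctid),
				 ItemPointerGetOffsetNumber(&hufd.ctid));
			break;
		default:
			elog(ERROR, "unrecognized heap_update status: %u", result);
			break;
	}
}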
4354 
4355 /*
4356  * Check if the specified attribute's value is same in both given tuples.
4357  * Subroutine for HeapDetermineModifiedColumns.
4358  */
4359 static bool
4360 heap_tuple_attr_equals(TupleDesc tupdesc, int attrnum,
4361  HeapTuple tup1, HeapTuple tup2)
4362 {
4363  Datum value1,
4364  value2;
4365  bool isnull1,
4366  isnull2;
4367  Form_pg_attribute att;
4368 
4369  /*
4370  * If it's a whole-tuple reference, say "not equal". It's not really
4371  * worth supporting this case, since it could only succeed after a no-op
4372  * update, which is hardly a case worth optimizing for.
4373  */
4374  if (attrnum == 0)
4375  return false;
4376 
4377  /*
4378  * Likewise, automatically say "not equal" for any system attribute other
4379  * than OID and tableOID; we cannot expect these to be consistent in a HOT
4380  * chain, or even to be set correctly yet in the new tuple.
4381  */
4382  if (attrnum < 0)
4383  {
4384  if (attrnum != ObjectIdAttributeNumber &&
4385  attrnum != TableOidAttributeNumber)
4386  return false;
4387  }
4388 
4389  /*
4390  * Extract the corresponding values. XXX this is pretty inefficient if
4391  * there are many indexed columns. Should HeapDetermineModifiedColumns do
4392  * a single heap_deform_tuple call on each tuple, instead? But that
4393  * doesn't work for system columns ...
4394  */
4395  value1 = heap_getattr(tup1, attrnum, tupdesc, &isnull1);
4396  value2 = heap_getattr(tup2, attrnum, tupdesc, &isnull2);
4397 
4398  /*
4399  * If one value is NULL and the other is not, then they are certainly not
4400  * equal
4401  */
4402  if (isnull1 != isnull2)
4403  return false;
4404 
4405  /*
4406  * If both are NULL, they can be considered equal.
4407  */
4408  if (isnull1)
4409  return true;
4410 
4411  /*
4412  * We do simple binary comparison of the two datums. This may be overly
4413  * strict because there can be multiple binary representations for the
4414  * same logical value. But we should be OK as long as there are no false
4415  * positives. Using a type-specific equality operator is messy because
4416  * there could be multiple notions of equality in different operator
4417  * classes; furthermore, we cannot safely invoke user-defined functions
4418  * while holding exclusive buffer lock.
4419  */
4420  if (attrnum <= 0)
4421  {
4422  /* The only allowed system columns are OIDs, so do this */
4423  return (DatumGetObjectId(value1) == DatumGetObjectId(value2));
4424  }
4425  else
4426  {
4427  Assert(attrnum <= tupdesc->natts);
4428  att = TupleDescAttr(tupdesc, attrnum - 1);
4429  return datumIsEqual(value1, value2, att->attbyval, att->attlen);
4430  }
4431 }
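Because heap_tuple_attr_equals() is static, outside code cannot call it directly, but the same raw-binary comparison can be reproduced with heap_getattr() and datumIsEqual(). A minimal sketch (not part of heapam.c) for an ordinary user column follows; tupdesc, tup1, tup2 and the 1-based attnum are assumed to come from the caller, and a false "not equal" for binarily different but logically equal values is acceptable here for the same reason it is above.

#include "postgres.h"

#include "access/htup_details.h"
#include "access/tupdesc.h"
#include "utils/datum.h"

/* Hypothetical helper, for illustration only. */
static bool
user_column_binary_equal(TupleDesc tupdesc, int attnum,
						 HeapTuple tup1, HeapTuple tup2)
{
	Datum		value1,
				value2;
	bool		isnull1,
				isnull2;
	Form_pg_attribute att = TupleDescAttr(tupdesc, attnum - 1);

	value1 = heap_getattr(tup1, attnum, tupdesc, &isnull1);
	value2 = heap_getattr(tup2, attnum, tupdesc, &isnull2);

	if (isnull1 || isnull2)
		return isnull1 && isnull2;	/* two NULLs count as equal */

	/* byval/length come from the column definition, as in the code above */
	return datumIsEqual(value1, value2, att->attbyval, att->attlen);
}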
4432 
4433 /*
4434  * Check which columns are being updated.
4435  *
4436  * Given an updated tuple, determine (and return into the output bitmapset),
4437  * from those listed as interesting, the set of columns that changed.
4438  *
4439  * The input bitmapset is destructively modified; that is OK since this is
4440  * invoked at most once in heap_update.
4441  */
4442 static Bitmapset *
4443 HeapDetermineModifiedColumns(Relation relation, Bitmapset *interesting_cols,
4444  HeapTuple oldtup, HeapTuple newtup)
4445 {
4446  int attnum;
4447  Bitmapset *modified = NULL;
4448 
4449  while ((attnum = bms_first_member(interesting_cols)) >= 0)
4450  {
4451  attnum += FirstLowInvalidHeapAttributeNumber;
4452 
4453  if (!heap_tuple_attr_equals(RelationGetDescr(relation),
4454  attnum, oldtup, newtup))
4455  modified = bms_add_member(modified,
4456  attnum - FirstLowInvalidHeapAttributeNumber);
4457  }
4458 
4459  return modified;
4460 }
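The bitmapsets handled here follow the usual convention for attribute-number sets: members are stored as attnum - FirstLowInvalidHeapAttributeNumber so that the negative system attribute numbers fit into a non-negative bitmapset. A small sketch (not part of heapam.c) of the offsetting a reader of the result set would apply, with modified_attrs and attnum assumed to come from the caller:

#include "postgres.h"

#include "access/attnum.h"
#include "access/sysattr.h"
#include "nodes/bitmapset.h"

/* Hypothetical helper, for illustration only. */
static bool
column_was_modified(Bitmapset *modified_attrs, AttrNumber attnum)
{
	/* undo the same offset that HeapDetermineModifiedColumns applied */
	return bms_is_member(attnum - FirstLowInvalidHeapAttributeNumber,
						 modified_attrs);
}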
4461 
4462 /*
4463  * simple_heap_update - replace a tuple
4464  *
4465  * This routine may be used to update a tuple when concurrent updates of
4466  * the target tuple are not expected (for example, because we have a lock
4467  * on the relation associated with the tuple). Any failure is reported
4468  * via ereport().
4469  */
4470 void
4471 simple_heap_update(Relation relation, ItemPointer otid, HeapTuple tup)
4472 {
4473  HTSU_Result result;
4474  HeapUpdateFailureData hufd;
4475  LockTupleMode lockmode;
4476 
4477  result = heap_update(relation, otid, tup,
4478  GetCurrentCommandId(true), InvalidSnapshot,
4479  true /* wait for commit */ ,
4480  &hufd, &lockmode);
4481  switch (result)
4482  {
4483  case HeapTupleSelfUpdated:
4484  /* Tuple was already updated in current command? */
4485  elog(ERROR, "tuple already updated by self");
4486  break;
4487 
4488  case HeapTupleMayBeUpdated:
4489  /* done successfully */
4490  break;
4491 
4492  case HeapTupleUpdated:
4493  elog(ERROR, "tuple concurrently updated");
4494  break;
4495 
4496  default:
4497  elog(ERROR, "unrecognized heap_update status: %u", result);
4498  break;
4499  }
4500 }
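A hedged usage sketch (not part of heapam.c): simple_heap_update() overwrites the row in place but does not touch indexes, so callers must insert new index entries themselves; catalog code normally reaches it through CatalogTupleUpdate(), which takes care of that. Here rel is assumed to be opened with a lock strong enough to rule out concurrent updaters, and tup is a modified copy whose t_self still points at the existing version; the helper name is hypothetical.

#include "postgres.h"

#include "access/heapam.h"

/* Hypothetical helper, for illustration only. */
static void
overwrite_row_in_place(Relation rel, HeapTuple tup)
{
	/* any concurrent update is reported via elog(ERROR) inside */
	simple_heap_update(rel, &tup->t_self, tup);

	/* index maintenance is the caller's job, e.g. via CatalogTupleUpdate() */
}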
4501 
4502 
4503 /*
4504  * Return the MultiXactStatus corresponding to the given tuple lock mode.
4505  */
4506 static MultiXactStatus
4507 get_mxact_status_for_lock(LockTupleMode mode, bool is_update)
4508 {
4509  int retval;
4510 
4511  if (is_update)
4512  retval = tupleLockExtraInfo[mode].updstatus;
4513  else
4514  retval = tupleLockExtraInfo[mode].lockstatus;
4515 
4516  if (retval == -1)
4517  elog(ERROR, "invalid lock tuple mode %d/%s", mode,
4518  is_update ? "true" : "false");
4519 
4520  return (MultiXactStatus) retval;
4521 }
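For readers, the lock-only (is_update == false) half of the tupleLockExtraInfo[] mapping consulted above corresponds to the explicit switch below. This is a reading aid, not part of heapam.c, and assumes only the public LockTupleMode and MultiXactStatus enums.

#include "postgres.h"

#include "access/multixact.h"
#include "nodes/lockoptions.h"

/* Hypothetical helper, for illustration only: the locking case. */
static MultiXactStatus
lock_mode_to_mxact_status(LockTupleMode mode)
{
	switch (mode)
	{
		case LockTupleKeyShare:
			return MultiXactStatusForKeyShare;
		case LockTupleShare:
			return MultiXactStatusForShare;
		case LockTupleNoKeyExclusive:
			return MultiXactStatusForNoKeyUpdate;
		case LockTupleExclusive:
			return MultiXactStatusForUpdate;
	}
	return MultiXactStatusForKeyShare;	/* keep compiler quiet */
}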
4522 
4523 /*
4524  * heap_lock_tuple - lock a tuple in shared or exclusive mode
4525  *
4526  * Note that this acquires a buffer pin, which the caller must release.
4527  *
4528  * Input parameters:
4529  * relation: relation containing tuple (caller must hold suitable lock)
4530  * tuple->t_self: TID of tuple to lock (rest of struct need not be valid)
4531  * cid: current command ID (used for visibility test, and stored into
4532  * tuple's cmax if lock is successful)
4533  * mode: indicates if shared or exclusive tuple lock is desired
4534  * wait_policy: what to do if tuple lock is not available
4535  * follow_updates: if true, follow the update chain to also lock descendant
4536  * tuples.
4537  *
4538  * Output parameters:
4539  * *tuple: all fields filled in
4540  * *buffer: set to buffer holding tuple (pinned but not locked at exit)
4541  * *hufd: filled in failure cases (see below)
4542  *
4543  * Function result may be:
4544  * HeapTupleMayBeUpdated: lock was successfully acquired
4545  * HeapTupleInvisible: lock failed because tuple was never visible to us
4546  * HeapTupleSelfUpdated: lock failed because tuple updated by self
4547  * HeapTupleUpdated: lock failed because tuple updated by other xact
4548  * HeapTupleWouldBlock: lock couldn't be acquired and wait_policy is skip
4549  *
4550  * In the failure cases other than HeapTupleInvisible, the routine fills
4551  * *hufd with the tuple's t_ctid, t_xmax (resolving a possible MultiXact,
4552  * if necessary), and t_cmax (the last only for HeapTupleSelfUpdated,
4553  * since we cannot obtain cmax from a combocid generated by another
4554  * transaction).
4555  * See comments for struct HeapUpdateFailureData for additional info.
4556  *
4557  * See README.tuplock for a thorough explanation of this mechanism.
4558  */
4559 HTSU_Result
4560 heap_lock_tuple(Relation relation, HeapTuple tuple,
4561  CommandId cid, LockTupleMode mode, LockWaitPolicy wait_policy,
4562  bool follow_updates,
4563  Buffer *buffer, HeapUpdateFailureData *hufd)
4564 {
4565  HTSU_Result result;
4566  ItemPointer tid = &(tuple->t_self);
4567  ItemId lp;
4568  Page page;
4569  Buffer vmbuffer = InvalidBuffer;
4570  BlockNumber block;
4571  TransactionId xid,
4572  xmax;
4573  uint16 old_infomask,
4574  new_infomask,
4575  new_infomask2;
4576  bool first_time = true;
4577  bool have_tuple_lock = false;
4578  bool cleared_all_frozen = false;
4579 
4580  *buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid));
4581  block = ItemPointerGetBlockNumber(tid);
4582 
4583  /*
4584  * Before locking the buffer, pin the visibility map page if it appears to
4585  * be necessary. Since we haven't got the lock yet, someone else might be
4586  * in the middle of changing this, so we'll need to recheck after we have
4587  * the lock.
4588  */
4589  if (PageIsAllVisible(BufferGetPage(*buffer)))
4590  visibilitymap_pin(relation, block, &vmbuffer);
4591 
4592  LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4593 
4594  page = BufferGetPage(*buffer);
4595  lp = PageGetItemId(page, ItemPointerGetOffsetNumber(tid));
4596  Assert(ItemIdIsNormal(lp));
4597 
4598  tuple->t_data = (HeapTupleHeader) PageGetItem(page, lp);
4599  tuple->t_len = ItemIdGetLength(lp);
4600  tuple->t_tableOid = RelationGetRelid(relation);
4601 
4602 l3:
4603  result = HeapTupleSatisfiesUpdate(tuple, cid, *buffer);
4604 
4605  if (result == HeapTupleInvisible)
4606  {
4607  /*
4608  * This is possible, but only when locking a tuple for ON CONFLICT
4609  * UPDATE. We return this value here rather than throwing an error in
4610  * order to give that case the opportunity to throw a more specific
4611  * error.
4612  */
4613  result = HeapTupleInvisible;
4614  goto out_locked;
4615  }
4616  else if (result == HeapTupleBeingUpdated || result == HeapTupleUpdated)
4617  {
4618  TransactionId xwait;
4619  uint16 infomask;
4620  uint16 infomask2;
4621  bool require_sleep;
4622  ItemPointerData t_ctid;
4623 
4624  /* must copy state data before unlocking buffer */
4625  xwait = HeapTupleHeaderGetRawXmax(tuple->t_data);
4626  infomask = tuple->t_data->t_infomask;
4627  infomask2 = tuple->t_data->t_infomask2;
4628  ItemPointerCopy(&tuple->t_data->t_ctid, &t_ctid);
4629 
4630  LockBuffer(*buffer, BUFFER_LOCK_UNLOCK);
4631 
4632  /*
4633  * If any subtransaction of the current top transaction already holds
4634  * a lock as strong as or stronger than what we're requesting, we
4635  * effectively hold the desired lock already. We *must* succeed
4636  * without trying to take the tuple lock, else we will deadlock
4637  * against anyone wanting to acquire a stronger lock.
4638  *
4639  * Note we only do this the first time we loop on the HTSU result;
4640  * there is no point in testing in subsequent passes, because
4641  * evidently our own transaction cannot have acquired a new lock after
4642  * the first time we checked.
4643  */
4644  if (first_time)
4645  {
4646  first_time = false;
4647 
4648  if (infomask & HEAP_XMAX_IS_MULTI)
4649  {
4650  int i;
4651  int nmembers;
4652  MultiXactMember *members;
4653 
4654  /*
4655  * We don't need to allow old multixacts here; if that had
4656  * been the case, HeapTupleSatisfiesUpdate would have returned
4657  * MayBeUpdated and we wouldn't be here.
4658  */
4659  nmembers =
4660  GetMultiXactIdMembers(xwait, &members, false,
4661  HEAP_XMAX_IS_LOCKED_ONLY(infomask));
4662 
4663  for (i = 0; i < nmembers; i++)
4664  {
4665  /* only consider members of our own transaction */
4666  if (!TransactionIdIsCurrentTransactionId(members[i].xid))
4667  continue;
4668 
4669  if (TUPLOCK_from_mxstatus(members[i].status) >= mode)
4670  {
4671  pfree(members);
4672  result = HeapTupleMayBeUpdated;
4673  goto out_unlocked;
4674  }
4675  }
4676 
4677  if (members)
4678  pfree(members);
4679  }
4680  else if (TransactionIdIsCurrentTransactionId(xwait))
4681  {
4682  switch (mode)
4683  {
4684  case LockTupleKeyShare:
4685  Assert(HEAP_XMAX_IS_KEYSHR_LOCKED(infomask) ||
4686  HEAP_XMAX_IS_SHR_LOCKED(infomask) ||
4687  HEAP_XMAX_IS_EXCL_LOCKED(infomask));
4688  result = HeapTupleMayBeUpdated;
4689  goto out_unlocked;
4690  case LockTupleShare:
4691  if (HEAP_XMAX_IS_SHR_LOCKED(infomask) ||
4692  HEAP_XMAX_IS_EXCL_LOCKED(infomask))
4693  {
4694  result = HeapTupleMayBeUpdated;
4695  goto out_unlocked;
4696  }
4697  break;
4698  case LockTupleNoKeyExclusive:
4699  if (HEAP_XMAX_IS_EXCL_LOCKED(infomask))
4700  {
4701  result = HeapTupleMayBeUpdated;
4702  goto out_unlocked;
4703  }
4704  break;
4705  case LockTupleExclusive:
4706  if (HEAP_XMAX_IS_EXCL_LOCKED(infomask) &&
4707  infomask2 & HEAP_KEYS_UPDATED)
4708  {
4709  result = HeapTupleMayBeUpdated;
4710  goto out_unlocked;
4711  }
4712  break;
4713  }
4714  }
4715  }
4716 
4717  /*
4718  * Initially assume that we will have to wait for the locking
4719  * transaction(s) to finish. We check various cases below in which
4720  * this can be turned off.
4721  */
4722  require_sleep = true;
4723  if (mode == LockTupleKeyShare)
4724  {
4725  /*
4726  * If we're requesting KeyShare, and there's no update present, we
4727  * don't need to wait. Even if there is an update, we can still
4728  * continue if the key hasn't been modified.
4729  *
4730  * However, if there are updates, we need to walk the update chain
4731  * to mark future versions of the row as locked, too. That way,
4732  * if somebody deletes that future version, we're protected
4733  * against the key going away. This locking of future versions
4734  * could block momentarily, if a concurrent transaction is
4735  * deleting a key; or it could return a value to the effect that
4736  * the transaction deleting the key has already committed. So we
4737  * do this before re-locking the buffer; otherwise this would be
4738  * prone to deadlocks.
4739  *
4740  * Note that the TID we're locking was grabbed before we unlocked
4741  * the buffer. For it to change while we're not looking, the
4742  * other properties we're testing for below after re-locking the
4743  * buffer would also change, in which case we would restart this
4744  * loop above.
4745  */
4746  if (!(infomask2 & HEAP_KEYS_UPDATED))
4747  {
4748  bool updated;
4749 
4750  updated = !HEAP_XMAX_IS_LOCKED_ONLY(infomask);
4751 
4752  /*
4753  * If there are updates, follow the update chain; bail out if
4754  * that cannot be done.
4755  */
4756  if (follow_updates && updated)
4757  {
4758  HTSU_Result res;
4759 
4760  res = heap_lock_updated_tuple(relation, tuple, &t_ctid,
4761  GetCurrentTransactionId(),
4762  mode);
4763  if (res != HeapTupleMayBeUpdated)
4764  {
4765  result = res;
4766  /* recovery code expects to have buffer lock held */
4767  LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4768  goto failed;
4769  }
4770  }
4771 
4773 
4774  /*
4775  * Make sure it's still an appropriate lock, else start over.
4776  * Also, if it wasn't updated before we released the lock, but
4777  * is updated now, we start over too; the reason is that we
4778  * now need to follow the update chain to lock the new
4779  * versions.
4780  */
4781  if (!HeapTupleHeaderIsOnlyLocked(tuple->t_data) &&
4782  ((tuple->t_data->t_infomask2 & HEAP_KEYS_UPDATED) ||
4783  !updated))
4784  goto l3;
4785 
4786  /* Things look okay, so we can skip sleeping */
4787  require_sleep = false;
4788 
4789  /*
4790  * Note we allow Xmax to change here; other updaters/lockers
4791  * could have modified it before we grabbed the buffer lock.
4792  * However, this is not a problem, because with the recheck we
4793  * just did we ensure that they still don't conflict with the
4794  * lock we want.
4795  */
4796  }
4797  }
4798  else if (mode == LockTupleShare)
4799  {
4800  /*
4801  * If we're requesting Share, we can similarly avoid sleeping if
4802  * there's no update and no exclusive lock present.
4803  */
4804  if (HEAP_XMAX_IS_LOCKED_ONLY(infomask) &&
4805  !HEAP_XMAX_IS_EXCL_LOCKED(infomask))
4806  {
4807  LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4808 
4809  /*
4810  * Make sure it's still an appropriate lock, else start over.
4811  * See above about allowing xmax to change.
4812  */
4813  if (!HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_data->t_infomask) ||
4814  HEAP_XMAX_IS_EXCL_LOCKED(tuple->t_data->t_infomask))
4815  goto l3;
4816  require_sleep = false;
4817  }
4818  }
4819  else if (mode == LockTupleNoKeyExclusive)
4820  {
4821  /*
4822  * If we're requesting NoKeyExclusive, we might also be able to
4823  * avoid sleeping; just ensure that there is no conflicting lock
4824  * already acquired.
4825  */
4826  if (infomask & HEAP_XMAX_IS_MULTI)
4827  {
4828  if (!DoesMultiXactIdConflict((MultiXactId) xwait, infomask,
4829  mode))
4830  {
4831  /*
4832  * No conflict, but if the xmax changed under us in the
4833  * meantime, start over.
4834  */
4835  LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4836  if (xmax_infomask_changed(tuple->t_data->t_infomask, infomask) ||
4837  !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple->t_data),
4838  xwait))
4839  goto l3;
4840 
4841  /* otherwise, we're good */
4842  require_sleep = false;
4843  }
4844  }
4845  else if (HEAP_XMAX_IS_KEYSHR_LOCKED(infomask))
4846  {
4847  LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4848 
4849  /* if the xmax changed in the meantime, start over */
4850  if (xmax_infomask_changed(tuple->t_data->t_infomask, infomask) ||
4851  !TransactionIdEquals(
4852  HeapTupleHeaderGetRawXmax(tuple->t_data),
4853  xwait))
4854  goto l3;
4855  /* otherwise, we're good */
4856  require_sleep = false;
4857  }
4858  }
4859 
4860  /*
4861  * As a check independent from those above, we can also avoid sleeping
4862  * if the current transaction is the sole locker of the tuple. Note
4863  * that the strength of the lock already held is irrelevant; this is
4864  * not about recording the lock in Xmax (which will be done regardless
4865  * of this optimization, below). Also, note that the cases where we
4866  * hold a lock stronger than we are requesting are already handled
4867  * above by not doing anything.
4868  *
4869  * Note we only deal with the non-multixact case here; MultiXactIdWait
4870  * is well equipped to deal with this situation on its own.
4871  */
4872  if (require_sleep && !(infomask & HEAP_XMAX_IS_MULTI) &&
4873  TransactionIdIsCurrentTransactionId(xwait))
4874  {
4875  /* ... but if the xmax changed in the meantime, start over */
4876  LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4877  if (xmax_infomask_changed(tuple->t_data->t_infomask, infomask) ||
4878  !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple->t_data),
4879  xwait))
4880  goto l3;
4882  require_sleep = false;
4883  }
4884 
4885  /*
4886  * Time to sleep on the other transaction/multixact, if necessary.
4887  *
4888  * If the other transaction is an update that's already committed,
4889  * then sleeping cannot possibly do any good: if we're required to
4890  * sleep, get out to raise an error instead.
4891  *
4892  * By here, we either have already acquired the buffer exclusive lock,
4893  * or we must wait for the locking transaction or multixact; so below
4894  * we ensure that we grab buffer lock after the sleep.
4895  */
4896  if (require_sleep && result == HeapTupleUpdated)
4897  {
4898  LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4899  goto failed;
4900  }
4901  else if (require_sleep)
4902  {
4903  /*
4904  * Acquire tuple lock to establish our priority for the tuple, or
4905  * die trying. LockTuple will release us when we are next-in-line
4906  * for the tuple. We must do this even if we are share-locking.
4907  *
4908  * If we are forced to "start over" below, we keep the tuple lock;
4909  * this arranges that we stay at the head of the line while
4910  * rechecking tuple state.
4911  */
4912  if (!heap_acquire_tuplock(relation, tid, mode, wait_policy,
4913  &have_tuple_lock))
4914  {
4915  /*
4916  * This can only happen if wait_policy is Skip and the lock
4917  * couldn't be obtained.
4918  */
4919  result = HeapTupleWouldBlock;
4920  /* recovery code expects to have buffer lock held */
4921  LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4922  goto failed;
4923  }
4924 
4925  if (infomask & HEAP_XMAX_IS_MULTI)
4926  {
4927  MultiXactStatus status = get_mxact_status_for_lock(mode, false);
4928 
4929  /* We only ever lock tuples, never update them */
4930  if (status >= MultiXactStatusNoKeyUpdate)
4931  elog(ERROR, "invalid lock mode in heap_lock_tuple");
4932 
4933  /* wait for multixact to end, or die trying */
4934  switch (wait_policy)
4935  {
4936  case LockWaitBlock:
4937  MultiXactIdWait((MultiXactId) xwait, status, infomask,
4938  relation, &tuple->t_self, XLTW_Lock, NULL);
4939  break;
4940  case LockWaitSkip:
4941  if (!ConditionalMultiXactIdWait((MultiXactId) xwait,
4942  status, infomask, relation,
4943  NULL))
4944  {
4945  result = HeapTupleWouldBlock;
4946  /* recovery code expects to have buffer lock held */
4947  LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4948  goto failed;
4949  }
4950  break;
4951  case LockWaitError:
4952  if (!ConditionalMultiXactIdWait((MultiXactId) xwait,
4953  status, infomask, relation,
4954  NULL))
4955  ereport(ERROR,
4956  (errcode(ERRCODE_LOCK_NOT_AVAILABLE),
4957  errmsg("could not obtain lock on row in relation \"%s\"",
4958  RelationGetRelationName(relation))));
4959 
4960  break;
4961  }
4962 
4963  /*
4964  * Of course, the multixact might not be done here: if we're
4965  * requesting a light lock mode, other transactions with light
4966  * locks could still be alive, as well as locks owned by our
4967  * own xact or other subxacts of this backend. We need to
4968  * preserve the surviving MultiXact members. Note that it
4969  * isn't absolutely necessary in the latter case, but doing so
4970  * is simpler.
4971  */
4972  }
4973  else
4974  {
4975  /* wait for regular transaction to end, or die trying */
4976  switch (wait_policy)
4977  {
4978  case LockWaitBlock:
4979  XactLockTableWait(xwait, relation, &tuple->t_self,
4980  XLTW_Lock);
4981  break;
4982  case LockWaitSkip:
4983  if (!ConditionalXactLockTableWait(xwait))
4984  {
4985  result = HeapTupleWouldBlock;
4986  /* recovery code expects to have buffer lock held */
4987  LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4988  goto failed;
4989  }
4990  break;
4991  case LockWaitError:
4992  if (!ConditionalXactLockTableWait(xwait))
4993  ereport(ERROR,
4994  (errcode(ERRCODE_LOCK_NOT_AVAILABLE),
4995  errmsg("could not obtain lock on row in relation \"%s\"",
4996  RelationGetRelationName(relation))));
4997  break;
4998  }
4999  }
5000 
5001  /* if there are updates, follow the update chain */
5002  if (follow_updates && !HEAP_XMAX_IS_LOCKED_ONLY(infomask))
5003  {
5004  HTSU_Result res;
5005 
5006  res = heap_lock_updated_tuple(relation, tuple, &t_ctid,
5007  GetCurrentTransactionId(),
5008  mode);
5009  if (res != HeapTupleMayBeUpdated)
5010  {
5011  result = res;
5012  /* recovery code expects to have buffer lock held */
5013  LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
5014  goto failed;
5015  }
5016  }
5017 
5019 
5020  /*
5021  * xwait is done, but if xwait had just locked the tuple then some
5022  * other xact could update this tuple before we get to this point.
5023  * Check for xmax change, and start over if so.
5024  */
5025  if (xmax_infomask_changed(tuple->t_data->t_infomask, infomask) ||
5026  !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple->t_data),
5027  xwait))
5028  goto l3;
5029 
5030  if (!(infomask & HEAP_XMAX_IS_MULTI))
5031  {
5032  /*
5033  * Otherwise check if it committed or aborted. Note we cannot
5034  * be here if the tuple was only locked by somebody who didn't
5035  * conflict with us; that would have been handled above. So
5036  * that transaction must necessarily be gone by now. But
5037  * don't check for this in the multixact case, because some
5038  * locker transactions might still be running.
5039  */
5040  UpdateXmaxHintBits(tuple->t_data, *buffer, xwait);
5041  }
5042  }
5043 
5044  /* By here, we're certain that we hold buffer exclusive lock again */
5045 
5046  /*
5047  * We may lock if previous xmax aborted, or if it committed but only
5048  * locked the tuple without updating it; or if we didn't have to wait
5049  * at all for whatever reason.
5050  */
5051  if (!require_sleep ||
5052  (tuple->t_data->t_infomask & HEAP_XMAX_INVALID) ||
5053  HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_data->t_infomask) ||
5054  HeapTupleHeaderIsOnlyLocked(tuple->t_data))
5055  result = HeapTupleMayBeUpdated;
5056  else
5057  result = HeapTupleUpdated;
5058  }
5059 
5060 failed:
5061  if (result != HeapTupleMayBeUpdated)
5062  {
5063  Assert(result == HeapTupleSelfUpdated || result == HeapTupleUpdated ||
5064  result == HeapTupleWouldBlock);
5065  Assert(!(tuple->t_data->t_infomask & HEAP_XMAX_INVALID));
5066  hufd->ctid = tuple->t_data->t_ctid;
5067  hufd->xmax = HeapTupleHeaderGetUpdateXid(tuple->t_data);
5068  if (result == HeapTupleSelfUpdated)
5069  hufd->cmax = HeapTupleHeaderGetCmax(tuple->t_data);
5070  else
5071  hufd->cmax = InvalidCommandId;
5072  goto out_locked;
5073  }
5074 
5075  /*
5076  * If we didn't pin the visibility map page and the page has become all
5077  * visible while we were busy locking the buffer, or during some
5078  * subsequent window during which we had it unlocked, we'll have to unlock
5079  * and re-lock, to avoid holding the buffer lock across I/O. That's a bit
5080  * unfortunate, especially since we'll now have to recheck whether the
5081  * tuple has been locked or updated under us, but hopefully it won't
5082  * happen very often.
5083  */
5084  if (vmbuffer == InvalidBuffer && PageIsAllVisible(page))
5085  {
5086  LockBuffer(*buffer, BUFFER_LOCK_UNLOCK);
5087  visibilitymap_pin(relation, block, &vmbuffer);
5088  LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
5089  goto l3;
5090  }
5091 
5092  xmax = HeapTupleHeaderGetRawXmax(tuple->t_data);
5093  old_infomask = tuple->t_data->t_infomask;
5094 
5095  /*
5096  * If this is the first possibly-multixact-able operation in the current
5097  * transaction, set my per-backend OldestMemberMXactId setting. We can be
5098  * certain that the transaction will never become a member of any older
5099  * MultiXactIds than that. (We have to do this even if we end up just
5100  * using our own TransactionId below, since some other backend could
5101  * incorporate our XID into a MultiXact immediately afterwards.)
5102  */
5103  MultiXactIdSetOldestMember();
5104 
5105  /*
5106  * Compute the new xmax and infomask to store into the tuple. Note we do
5107  * not modify the tuple just yet, because that would leave it in the wrong
5108  * state if multixact.c elogs.
5109  */
5110  compute_new_xmax_infomask(xmax, old_infomask, tuple->t_data->t_infomask2,
5111  GetCurrentTransactionId(), mode, false,
5112  &xid, &new_infomask, &new_infomask2);
5113 
5115 
5116  /*
5117  * Store transaction information of xact locking the tuple.
5118  *
5119  * Note: Cmax is meaningless in this context, so don't set it; this avoids
5120  * possibly generating a useless combo CID. Moreover, if we're locking a
5121  * previously updated tuple, it's important to preserve the Cmax.
5122  *
5123  * Also reset the HOT UPDATE bit, but only if there's no update; otherwise
5124  * we would break the HOT chain.
5125  */
5126  tuple->t_data->t_infomask &= ~HEAP_XMAX_BITS;
5127  tuple->t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
5128  tuple->t_data->t_infomask |= new_infomask;
5129  tuple->t_data->t_infomask2 |= new_infomask2;
5130  if (HEAP_XMAX_IS_LOCKED_ONLY(new_infomask))
5131  HeapTupleHeaderClearHotUpdated(tuple->t_data);
5132  HeapTupleHeaderSetXmax(tuple->t_data, xid);
5133 
5134  /*
5135  * Make sure there is no forward chain link in t_ctid. Note that in the
5136  * cases where the tuple has been updated, we must not overwrite t_ctid,
5137  * because it was set by the updater. Moreover, if the tuple has been
5138  * updated, we need to follow the update chain to lock the new versions of
5139  * the tuple as well.
5140  */
5141  if (HEAP_XMAX_IS_LOCKED_ONLY(new_infomask))
5142  tuple->t_data->t_ctid = *tid;
5143 
5144  /* Clear only the all-frozen bit on visibility map if needed */
5145  if (PageIsAllVisible(page) &&
5146  visibilitymap_clear(relation, block, vmbuffer,
5147  VISIBILITYMAP_ALL_FROZEN))
5148  cleared_all_frozen = true;
5149 
5150 
5151  MarkBufferDirty(*buffer);
5152 
5153  /*
5154  * XLOG stuff. You might think that we don't need an XLOG record because
5155  * there is no state change worth restoring after a crash. You would be
5156  * wrong however: we have just written either a TransactionId or a
5157  * MultiXactId that may never have been seen on disk before, and we need
5158  * to make sure that there are XLOG entries covering those ID numbers.
5159  * Else the same IDs might be re-used after a crash, which would be
5160  * disastrous if this page made it to disk before the crash. Essentially
5161  * we have to enforce the WAL log-before-data rule even in this case.
5162  * (Also, in a PITR log-shipping or 2PC environment, we have to have XLOG
5163  * entries for everything anyway.)
5164  */
5165  if (RelationNeedsWAL(relation))
5166  {
5167  xl_heap_lock xlrec;
5168  XLogRecPtr recptr;
5169 
5170  XLogBeginInsert();
5171  XLogRegisterBuffer(0, *buffer, REGBUF_STANDARD);
5172 
5173  xlrec.offnum = ItemPointerGetOffsetNumber(&tuple->t_self);
5174  xlrec.locking_xid = xid;
5175  xlrec.infobits_set = compute_infobits(new_infomask,
5176  tuple->t_data->t_infomask2);
5177  xlrec.flags = cleared_all_frozen ? XLH_LOCK_ALL_FROZEN_CLEARED : 0;
5178  XLogRegisterData((char *) &xlrec, SizeOfHeapLock);
5179 
5180  /* we don't decode row locks atm, so no need to log the origin */
5181 
5182  recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_LOCK);
5183 
5184  PageSetLSN(page, recptr);
5185  }
5186 
5187  END_CRIT_SECTION();
5188 
5189  result = HeapTupleMayBeUpdated;
5190 
5191 out_locked:
5192  LockBuffer(*buffer, BUFFER_LOCK_UNLOCK);
5193 
5194 out_unlocked:
5195  if (BufferIsValid(vmbuffer))
5196  ReleaseBuffer(vmbuffer);
5197 
5198  /*
5199  * Don't update the visibility map here. Locking a tuple doesn't change
5200  * visibility info.
5201  */
5202 
5203  /*
5204  * Now that we have successfully marked the tuple as locked, we can
5205  * release the lmgr tuple lock, if we had it.
5206  */
5207  if (have_tuple_lock)
5208  UnlockTupleTuplock(relation, tid, mode);
5209 
5210  return result;
5211 }
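A hedged caller-side sketch (not part of heapam.c) of the interface documented above: locking one row exclusively with a SKIP LOCKED style wait policy. rel and tid are assumed to be supplied by the caller, and the helper name is hypothetical; note that the buffer pin is returned to us even on failure and must always be released.

#include "postgres.h"

#include "access/heapam.h"
#include "access/xact.h"
#include "storage/bufmgr.h"

/* Hypothetical helper, for illustration only. */
static bool
try_lock_row_skip_locked(Relation rel, ItemPointer tid)
{
	HeapTupleData tuple;
	Buffer		buf;
	HeapUpdateFailureData hufd;
	HTSU_Result result;

	tuple.t_self = *tid;		/* only t_self need be valid on input */
	result = heap_lock_tuple(rel, &tuple,
							 GetCurrentCommandId(true),
							 LockTupleExclusive, LockWaitSkip,
							 false /* don't follow the update chain */ ,
							 &buf, &hufd);

	ReleaseBuffer(buf);			/* the pin is held at exit in every case */

	/* HeapTupleWouldBlock means somebody else holds a conflicting lock */
	return (result == HeapTupleMayBeUpdated);
}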
5212 
5213 /*
5214  * Acquire heavyweight lock on the given tuple, in preparation for acquiring
5215  * its normal, Xmax-based tuple lock.
5216  *
5217  * have_tuple_lock is an input and output parameter: on input, it indicates
5218  * whether the lock has previously been acquired (and this function does
5219  * nothing in that case). If this function returns success, have_tuple_lock
5220  * has been flipped to true.
5221  *
5222  * Returns false if it was unable to obtain the lock; this can only happen if
5223  * wait_policy is Skip.
5224  */
5225 static bool
5226 heap_acquire_tuplock(Relation relation, ItemPointer tid, LockTupleMode mode,
5227  LockWaitPolicy wait_policy, bool *have_tuple_lock)
5228 {
5229  if (*have_tuple_lock)
5230  return true;
5231 
5232  switch (wait_policy)
5233  {
5234  case LockWaitBlock:
5235  LockTupleTuplock(relation, tid, mode);
5236  break;
5237 
5238  case LockWaitSkip:
5239  if (!ConditionalLockTupleTuplock(relation, tid, mode))
5240  return false;
5241  break;
5242 
5243  case LockWaitError:
5244  if (!ConditionalLockTupleTuplock(relation, tid, mode))
5245  ereport(ERROR,
5246  (errcode(ERRCODE_LOCK_NOT_AVAILABLE),
5247  errmsg("could not obtain lock on row in relation \"%s\"",
5248  RelationGetRelationName(relation))));
5249  break;
5250  }
5251  *have_tuple_lock = true;
5252 
5253  return true;
5254 }
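
/*
 * Illustrative sketch (not part of the original file): the calling pattern
 * heap_acquire_tuplock is designed for.  Everything except
 * heap_acquire_tuplock and UnlockTupleTuplock is hypothetical; the point is
 * only the in/out protocol of have_tuple_lock and the LockWaitSkip behavior
 * described in the comment above.
 */
static void
example_tuplock_caller(Relation relation, ItemPointer tid, LockTupleMode mode)
{
    bool        have_tuple_lock = false;

    /*
     * Take the heavyweight tuple lock before sleeping on a conflicting xmax;
     * calling this again later is cheap because the function returns
     * immediately once *have_tuple_lock is already true.
     */
    if (!heap_acquire_tuplock(relation, tid, mode, LockWaitSkip,
                              &have_tuple_lock))
        return;                 /* LockWaitSkip: give up without waiting */

    /* ... recheck the tuple, possibly looping back to acquire again ... */

    /* on every exit path, release the lmgr lock if this backend took it */
    if (have_tuple_lock)
        UnlockTupleTuplock(relation, tid, mode);
}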
5255 
5256 /*
5257  * Given an original set of Xmax and infomask, and a transaction (identified by
5258  * add_to_xmax) acquiring a new lock of some mode, compute the new Xmax and
5259  * corresponding infomasks to use on the tuple.
5260  *
5261  * Note that this might have side effects such as creating a new MultiXactId.
5262  *
5263  * Most callers will have called HeapTupleSatisfiesUpdate before this function;
5264  * that will have set the HEAP_XMAX_INVALID bit if the xmax was a MultiXactId
5265  * but it was not running anymore. There is a race condition, which is that the
5266  * MultiXactId may have finished since then, but that uncommon case is handled
5267  * either here, or within MultiXactIdExpand.
5268  *
5269  * There is a similar race condition possible when the old xmax was a regular
5270  * TransactionId. We test TransactionIdIsInProgress again just to narrow the
5271  * window, but it's still possible to end up creating an unnecessary
5272  * MultiXactId. Fortunately this is harmless.
5273  */
5274 static void
5275 compute_new_xmax_infomask(TransactionId xmax, uint16 old_infomask,
5276  uint16 old_infomask2, TransactionId add_to_xmax,
5277  LockTupleMode mode, bool is_update,
5278  TransactionId *result_xmax, uint16 *result_infomask,
5279  uint16 *result_infomask2)
5280 {
5281  TransactionId new_xmax;
5282  uint16 new_infomask,
5283  new_infomask2;
5284 
5285  Assert(TransactionIdIsCurrentTransactionId(add_to_xmax) || is_update);
5286 
5287 l5:
5288  new_infomask = 0;
5289  new_infomask2 = 0;
5290  if (old_infomask & HEAP_XMAX_INVALID)
5291  {
5292  /*
5293  * No previous locker; we just insert our own TransactionId.
5294  *
5295  * Note that it's critical that this case be the first one checked,
5296  * because there are several blocks below that come back to this one
5297  * to implement certain optimizations; old_infomask might contain
5298  * other dirty bits in those cases, but we don't really care.
5299  */
5300  if (is_update)
5301  {
5302  new_xmax = add_to_xmax;
5303  if (mode == LockTupleExclusive)
5304  new_infomask2 |= HEAP_KEYS_UPDATED;
5305  }
5306  else
5307  {
5308  new_infomask |= HEAP_XMAX_LOCK_ONLY;
5309  switch (mode)
5310  {
5311  case LockTupleKeyShare:
5312  new_xmax = add_to_xmax;
5313  new_infomask |= HEAP_XMAX_KEYSHR_LOCK;
5314  break;
5315  case LockTupleShare:
5316  new_xmax = add_to_xmax;
5317  new_infomask |= HEAP_XMAX_SHR_LOCK;
5318  break;
5319  case LockTupleNoKeyExclusive:
5320  new_xmax = add_to_xmax;
5321  new_infomask |= HEAP_XMAX_EXCL_LOCK;
5322  break;
5323  case LockTupleExclusive:
5324  new_xmax = add_to_xmax;
5325  new_infomask |= HEAP_XMAX_EXCL_LOCK;
5326  new_infomask2 |= HEAP_KEYS_UPDATED;
5327  break;
5328  default:
5329  new_xmax = InvalidTransactionId; /* silence compiler */
5330  elog(ERROR, "invalid lock mode");
5331  }
5332  }
5333  }
5334  else if (old_infomask & HEAP_XMAX_IS_MULTI)
5335  {
5336  MultiXactStatus new_status;
5337 
5338  /*
5339  * Currently we don't allow XMAX_COMMITTED to be set for multis, so
5340  * cross-check.
5341  */
5342  Assert(!(old_infomask & HEAP_XMAX_COMMITTED));
5343 
5344  /*
5345  * A multixact together with LOCK_ONLY set but neither lock bit set
5346  * (i.e. a pg_upgraded share locked tuple) cannot possibly be running
5347  * anymore. This check is critical for databases upgraded by
5348  * pg_upgrade; both MultiXactIdIsRunning and MultiXactIdExpand assume
5349  * that such multis are never passed.
5350  */
5351  if (HEAP_LOCKED_UPGRADED(old_infomask))
5352  {
5353  old_infomask &= ~HEAP_XMAX_IS_MULTI;
5354  old_infomask |= HEAP_XMAX_INVALID;
5355  goto l5;
5356  }
5357 
5358  /*
5359  * If the XMAX is already a MultiXactId, then we need to expand it to
5360  * include add_to_xmax; but if all the members were lockers and are
5361  * all gone, we can do away with the IS_MULTI bit and just set
5362  * add_to_xmax as the only locker/updater. If all lockers are gone
5363  * and we have an updater that aborted, we can also do without a
5364  * multi.
5365  *
5366  * The cost of doing GetMultiXactIdMembers would be paid by
5367  * MultiXactIdExpand if we weren't to do this, so this check is not
5368  * incurring extra work anyhow.
5369  */
5370  if (!MultiXactIdIsRunning(xmax, HEAP_XMAX_IS_LOCKED_ONLY(old_infomask)))
5371  {
5372  if (HEAP_XMAX_IS_LOCKED_ONLY(old_infomask) ||
5373  !TransactionIdDidCommit(MultiXactIdGetUpdateXid(xmax,
5374  old_infomask)))
5375  {
5376  /*
5377  * Reset these bits and restart; otherwise fall through to
5378  * create a new multi below.
5379  */
5380  old_infomask &= ~HEAP_XMAX_IS_MULTI;
5381  old_infomask |= HEAP_XMAX_INVALID;
5382  goto l5;
5383  }
5384  }
5385 
5386  new_status = get_mxact_status_for_lock(mode, is_update);
5387 
5388  new_xmax = MultiXactIdExpand((MultiXactId) xmax, add_to_xmax,
5389  new_status);
5390  GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2);
5391  }
5392  else if (old_infomask & HEAP_XMAX_COMMITTED)
5393  {
5394  /*
5395  * It's a committed update, so we need to preserve it as the updater of
5396  * the tuple.
5397  */
5398  MultiXactStatus status;
5399  MultiXactStatus new_status;
5400 
5401  if (old_infomask2 & HEAP_KEYS_UPDATED)
5402  status = MultiXactStatusUpdate;
5403  else
5404  status = MultiXactStatusNoKeyUpdate;
5405 
5406  new_status = get_mxact_status_for_lock(mode, is_update);
5407 
5408  /*
5409  * since it's not running, it's obviously impossible for the old
5410  * updater to be identical to the current one, so we need not check
5411  * for that case as we do in the block above.
5412  */
5413  new_xmax = MultiXactIdCreate(xmax, status, add_to_xmax, new_status);
5414  GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2);
5415  }
5416  else if (TransactionIdIsInProgress(xmax))
5417  {
5418  /*
5419  * If the XMAX is a valid, in-progress TransactionId, then we need to
5420  * create a new MultiXactId that includes both the old locker or
5421  * updater and our own TransactionId.
5422  */
5423  MultiXactStatus new_status;
5424  MultiXactStatus old_status;
5425  LockTupleMode old_mode;
5426 
5427  if (HEAP_XMAX_IS_LOCKED_ONLY(old_infomask))
5428  {
5429  if (HEAP_XMAX_IS_KEYSHR_LOCKED(old_infomask))
5430  old_status = MultiXactStatusForKeyShare;
5431  else if (HEAP_XMAX_IS_SHR_LOCKED(old_infomask))
5432  old_status = MultiXactStatusForShare;
5433  else if (HEAP_XMAX_IS_EXCL_LOCKED(old_infomask))
5434  {
5435  if (old_infomask2 & HEAP_KEYS_UPDATED)
5436  old_status = MultiXactStatusForUpdate;
5437  else
5438  old_status = MultiXactStatusForNoKeyUpdate;
5439  }
5440  else
5441  {
5442  /*
5443  * LOCK_ONLY can be present alone only when a page has been
5444  * upgraded by pg_upgrade. But in that case,
5445  * TransactionIdIsInProgress() should have returned false. We
5446  * assume it's no longer locked in this case.
5447  */
5448  elog(WARNING, "LOCK_ONLY found for Xid in progress %u", xmax);
5449  old_infomask |= HEAP_XMAX_INVALID;
5450  old_infomask &= ~HEAP_XMAX_LOCK_ONLY;
5451  goto l5;
5452  }
5453  }
5454  else
5455  {
5456  /* it's an update, but which kind? */
5457  if (old_infomask2 & HEAP_KEYS_UPDATED)
5458  old_status = MultiXactStatusUpdate;
5459  else
5460  old_status = MultiXactStatusNoKeyUpdate;
5461  }
5462 
5463  old_mode = TUPLOCK_from_mxstatus(old_status);
5464 
5465  /*
5466  * If the lock to be acquired is for the same TransactionId as the
5467  * existing lock, there's an optimization possible: consider only the
5468  * strongest of both locks as the only one present, and restart.
5469  */
5470  if (xmax == add_to_xmax)
5471  {
5472  /*
5473  * Note that it's not possible for the original tuple to be
5474  * updated: we wouldn't be here because the tuple would have been
5475  * invisible and we wouldn't try to update it. As a subtlety,
5476  * this code can also run when traversing an update chain to lock
5477  * future versions of a tuple. But we wouldn't be here either,
5478  * because the add_to_xmax would be different from the original
5479  * updater.
5480  */
5481  Assert(HEAP_XMAX_IS_LOCKED_ONLY(old_infomask));
5482 
5483  /* acquire the strongest of both */
5484  if (mode < old_mode)
5485  mode = old_mode;
5486  /* mustn't touch is_update */
5487 
5488  old_infomask |= HEAP_XMAX_INVALID;
5489  goto l5;
5490  }
5491 
5492  /* otherwise, just fall back to creating a new multixact */
5493  new_status = get_mxact_status_for_lock(mode, is_update);
5494  new_xmax = MultiXactIdCreate(xmax, old_status,
5495  add_to_xmax, new_status);
5496  GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2);
5497  }
5498  else if (!HEAP_XMAX_IS_LOCKED_ONLY(old_infomask) &&
5499  TransactionIdDidCommit(xmax))
5500  {
5501  /*
5502  * It's a committed update, so we must preserve it as the updater of the
5503  * tuple.
5504  */
5505  MultiXactStatus status;
5506  MultiXactStatus new_status;
5507 
5508  if (old_infomask2 & HEAP_KEYS_UPDATED)
5509  status = MultiXactStatusUpdate;
5510  else
5511  status = MultiXactStatusNoKeyUpdate;
5512 
5513  new_status = get_mxact_status_for_lock(mode, is_update);
5514 
5515  /*
5516  * since it's not running, it's obviously impossible for the old
5517  * updater to be identical to the current one, so we need not check
5518  * for that case as we do in the block above.
5519  */
5520  new_xmax = MultiXactIdCreate(xmax, status, add_to_xmax, new_status);
5521  GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2);
5522  }
5523  else
5524  {
5525  /*
5526  * Can get here iff the locking/updating transaction was running when
5527  * the infomask was extracted from the tuple, but finished before
5528  * TransactionIdIsInProgress got to run. Deal with it as if there was
5529  * no locker at all in the first place.
5530  */
5531  old_infomask |= HEAP_XMAX_INVALID;
5532  goto l5;
5533  }
5534 
5535  *result_infomask = new_infomask;
5536  *result_infomask2 = new_infomask2;
5537  *result_xmax = new_xmax;
5538 }
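
/*
 * Illustrative sketch (not part of the original file): how a caller
 * typically applies the values produced by compute_new_xmax_infomask to a
 * tuple header.  The helper name and arguments are hypothetical; the real
 * callers (heap_update, heap_lock_tuple and heap_lock_updated_tuple_rec
 * below) perform these steps with the buffer exclusively locked and inside a
 * critical section.
 */
static void
example_apply_new_xmax(HeapTupleHeader htup, TransactionId add_to_xmax,
                       LockTupleMode mode)
{
    TransactionId new_xmax;
    uint16      new_infomask,
                new_infomask2;

    compute_new_xmax_infomask(HeapTupleHeaderGetRawXmax(htup),
                              htup->t_infomask, htup->t_infomask2,
                              add_to_xmax, mode, false /* is_update */,
                              &new_xmax, &new_infomask, &new_infomask2);

    /* clear the old xmax-related bits before installing the new ones */
    htup->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
    htup->t_infomask2 &= ~HEAP_KEYS_UPDATED;
    htup->t_infomask |= new_infomask;
    htup->t_infomask2 |= new_infomask2;
    HeapTupleHeaderSetXmax(htup, new_xmax);
}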
5539 
5540 /*
5541  * Subroutine for heap_lock_updated_tuple_rec.
5542  *
5543  * Given a hypothetical multixact status held by the transaction identified
5544  * with the given xid, does the current transaction need to wait, fail, or can
5545  * it continue if it wanted to acquire a lock of the given mode? "needwait"
5546  * is set to true if waiting is necessary; if it can continue, then
5547  * HeapTupleMayBeUpdated is returned. If the lock is already held by the
5548  * current transaction, return HeapTupleSelfUpdated. In case of a conflict
5549  * with another transaction, a different HeapTupleSatisfiesUpdate return code
5550  * is returned.
5551  *
5552  * The held status is said to be hypothetical because it might correspond to a
5553  * lock held by a single Xid, i.e. not a real MultiXactId; we express it this
5554  * way for simplicity of API.
5555  */
5556 static HTSU_Result
5558  LockTupleMode mode, bool *needwait)
5559 {
5560  MultiXactStatus wantedstatus;
5561 
5562  *needwait = false;
5563  wantedstatus = get_mxact_status_for_lock(mode, false);
5564 
5565  /*
5566  * Note: we *must* check TransactionIdIsInProgress before
5567  * TransactionIdDidAbort/Commit; see comment at top of tqual.c for an
5568  * explanation.
5569  */
5570  if (TransactionIdIsCurrentTransactionId(xid))
5571  {
5572  /*
5573  * The tuple has already been locked by our own transaction. This is
5574  * very rare but can happen if multiple transactions are trying to
5575  * lock an ancient version of the same tuple.
5576  */
5577  return HeapTupleSelfUpdated;
5578  }
5579  else if (TransactionIdIsInProgress(xid))
5580  {
5581  /*
5582  * If the locking transaction is running, what we do depends on
5583  * whether the lock modes conflict: if they do, then we must wait for
5584  * it to finish; otherwise we can fall through to lock this tuple
5585  * version without waiting.
5586  */
5587  if (DoLockModesConflict(LOCKMODE_from_mxstatus(status),
5588  LOCKMODE_from_mxstatus(wantedstatus)))
5589  {
5590  *needwait = true;
5591  }
5592 
5593  /*
5594  * If we set needwait above, then this value doesn't matter;
5595  * otherwise, this value signals to caller that it's okay to proceed.
5596  */
5597  return HeapTupleMayBeUpdated;
5598  }
5599  else if (TransactionIdDidAbort(xid))
5600  return HeapTupleMayBeUpdated;
5601  else if (TransactionIdDidCommit(xid))
5602  {
5603  /*
5604  * The other transaction committed. If it was only a locker, then the
5605  * lock is completely gone now and we can return success; but if it
5606  * was an update, then what we do depends on whether the two lock
5607  * modes conflict. If they conflict, then we must report error to
5608  * caller. But if they don't, we can fall through to allow the current
5609  * transaction to lock the tuple.
5610  *
5611  * Note: the reason we worry about ISUPDATE here is because as soon as
5612  * a transaction ends, all its locks are gone and meaningless, and
5613  * thus we can ignore them; whereas its updates persist. In the
5614  * TransactionIdIsInProgress case, above, we don't need to check
5615  * because we know the lock is still "alive" and thus a conflict always
5616  * needs to be checked.
5617  */
5618  if (!ISUPDATE_from_mxstatus(status))
5619  return HeapTupleMayBeUpdated;
5620 
5621  if (DoLockModesConflict(LOCKMODE_from_mxstatus(status),
5622  LOCKMODE_from_mxstatus(wantedstatus)))
5623  /* bummer */
5624  return HeapTupleUpdated;
5625 
5626  return HeapTupleMayBeUpdated;
5627  }
5628 
5629  /* Not in progress, not aborted, not committed -- must have crashed */
5630  return HeapTupleMayBeUpdated;
5631 }
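
/*
 * Illustrative sketch (not part of the original file): using
 * test_lockmode_for_conflict to check every member of a multixact xmax
 * against a desired lock mode.  It condenses the pattern used by
 * heap_lock_updated_tuple_rec below; the function name and the simplified
 * boolean "conflict" result are hypothetical.
 */
static bool
example_multi_conflicts(MultiXactId multi, bool only_locks, LockTupleMode mode)
{
    MultiXactMember *members;
    int         nmembers;
    int         i;
    bool        conflict = false;

    nmembers = GetMultiXactIdMembers(multi, &members, false, only_locks);
    for (i = 0; i < nmembers; i++)
    {
        bool        needwait;
        HTSU_Result res;

        res = test_lockmode_for_conflict(members[i].status, members[i].xid,
                                         mode, &needwait);

        /* treat both "must wait" and a hard failure as a conflict here */
        if (needwait ||
            (res != HeapTupleMayBeUpdated && res != HeapTupleSelfUpdated))
            conflict = true;
    }
    if (nmembers > 0)
        pfree(members);
    return conflict;
}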
5632 
5633 
5634 /*
5635  * Recursive part of heap_lock_updated_tuple
5636  *
5637  * Fetch the tuple pointed to by tid in rel, and mark it as locked by the given
5638  * xid with the given mode; if this tuple is updated, recurse to lock the new
5639  * version as well.
5640  */
5641 static HTSU_Result
5642 heap_lock_updated_tuple_rec(Relation rel, ItemPointer tid, TransactionId xid,
5643  LockTupleMode mode)
5644 {
5645  HTSU_Result result;
5646  ItemPointerData tupid;
5647  HeapTupleData mytup;
5648  Buffer buf;
5649  uint16 new_infomask,
5650  new_infomask2,
5651  old_infomask,
5652  old_infomask2;
5653  TransactionId xmax,
5654  new_xmax;
5655  TransactionId priorXmax = InvalidTransactionId;
5656  bool cleared_all_frozen = false;
5657  Buffer vmbuffer = InvalidBuffer;
5658  BlockNumber block;
5659 
5660  ItemPointerCopy(tid, &tupid);
5661 
5662  for (;;)
5663  {
5664  new_infomask = 0;
5665  new_xmax = InvalidTransactionId;
5666  block = ItemPointerGetBlockNumber(&tupid);
5667  ItemPointerCopy(&tupid, &(mytup.t_self));
5668 
5669  if (!heap_fetch(rel, SnapshotAny, &mytup, &buf, false, NULL))
5670  {
5671  /*
5672  * if we fail to find the updated version of the tuple, it's
5673  * because it was vacuumed/pruned away after its creator
5674  * transaction aborted. So behave as if we got to the end of the
5675  * chain, and there's no further tuple to lock: return success to
5676  * caller.
5677  */
5678  return HeapTupleMayBeUpdated;
5679  }
5680 
5681 l4:
5682  CHECK_FOR_INTERRUPTS();
5683 
5684  /*
5685  * Before locking the buffer, pin the visibility map page if it
5686  * appears to be necessary. Since we haven't got the lock yet,
5687  * someone else might be in the middle of changing this, so we'll need
5688  * to recheck after we have the lock.
5689  */
5690  if (PageIsAllVisible(BufferGetPage(buf)))
5691  visibilitymap_pin(rel, block, &vmbuffer);
5692  else
5693  vmbuffer = InvalidBuffer;
5694 
5695  LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
5696 
5697  /*
5698  * If we didn't pin the visibility map page and the page has become
5699  * all visible while we were busy locking the buffer, we'll have to
5700  * unlock and re-lock, to avoid holding the buffer lock across I/O.
5701  * That's a bit unfortunate, but hopefully shouldn't happen often.
5702  */
5703  if (vmbuffer == InvalidBuffer && PageIsAllVisible(BufferGetPage(buf)))
5704  {
5705  LockBuffer(buf, BUFFER_LOCK_UNLOCK);
5706  visibilitymap_pin(rel, block, &vmbuffer);
5707  LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
5708  }
5709 
5710  /*
5711  * Check the tuple XMIN against prior XMAX, if any. If we reached the
5712  * end of the chain, we're done, so return success.
5713  */
5714  if (TransactionIdIsValid(priorXmax) &&
5715  !TransactionIdEquals(HeapTupleHeaderGetXmin(mytup.t_data),
5716  priorXmax))
5717  {
5718  result = HeapTupleMayBeUpdated;
5719  goto out_locked;
5720  }
5721 
5722  /*
5723  * Also check Xmin: if this tuple was created by an aborted
5724  * (sub)transaction, then we already locked the last live one in the
5725  * chain, thus we're done, so return success.
5726  */
5727  if (TransactionIdDidAbort(HeapTupleHeaderGetXmin(mytup.t_data)))
5728  {
5729  UnlockReleaseBuffer(buf);
5730  return HeapTupleMayBeUpdated;
5731  }
5732 
5733  old_infomask = mytup.t_data->t_infomask;
5734  old_infomask2 = mytup.t_data->t_infomask2;
5735  xmax = HeapTupleHeaderGetRawXmax(mytup.t_data);
5736 
5737  /*
5738  * If this tuple version has been updated or locked by some concurrent
5739  * transaction(s), what we do depends on whether our lock mode
5740  * conflicts with what those other transactions hold, and also on the
5741  * status of them.
5742  */
5743  if (!(old_infomask & HEAP_XMAX_INVALID))
5744  {
5745  TransactionId rawxmax;
5746  bool needwait;
5747 
5748  rawxmax = HeapTupleHeaderGetRawXmax(mytup.t_data);
5749  if (old_infomask & HEAP_XMAX_IS_MULTI)
5750  {
5751  int nmembers;
5752  int i;
5753  MultiXactMember *members;
5754 
5755  /*
5756  * We don't need a test for pg_upgrade'd tuples: this is only
5757  * applied to tuples after the first in an update chain. Said
5758  * first tuple in the chain may well be locked-in-9.2-and-
5759  * pg_upgraded, but that one was already locked by our caller,
5760  * not us; and any subsequent ones cannot be because our
5761  * caller must necessarily have obtained a snapshot later than
5762  * the pg_upgrade itself.
5763  */
5764  Assert(!HEAP_LOCKED_UPGRADED(mytup.t_data->t_infomask));
5765 
5766  nmembers = GetMultiXactIdMembers(rawxmax, &members, false,
5767  HEAP_XMAX_IS_LOCKED_ONLY(old_infomask));
5768  for (i = 0; i < nmembers; i++)
5769  {
5770  result = test_lockmode_for_conflict(members[i].status,
5771  members[i].xid,
5772  mode, &needwait);
5773 
5774  /*
5775  * If the tuple was already locked by ourselves in a
5776  * previous iteration of this (say heap_lock_tuple was
5777  * forced to restart the locking loop because of a change
5778  * in xmax), then we hold the lock already on this tuple
5779  * version and we don't need to do anything; and this is
5780  * not an error condition either. We just need to skip
5781  * this tuple and continue locking the next version in the
5782  * update chain.
5783  */
5784  if (result == HeapTupleSelfUpdated)
5785  {
5786  pfree(members);
5787  goto next;
5788  }
5789 
5790  if (needwait)
5791  {
5792  LockBuffer(buf, BUFFER_LOCK_UNLOCK);
5793  XactLockTableWait(members[i].xid, rel,
5794  &mytup.t_self,
5795  XLTW_LockUpdated);
5796  pfree(members);
5797  goto l4;
5798  }
5799  if (result != HeapTupleMayBeUpdated)
5800  {
5801  pfree(members);
5802  goto out_locked;
5803  }
5804  }
5805  if (members)
5806  pfree(members);
5807  }
5808  else
5809  {
5810  MultiXactStatus status;
5811 
5812  /*
5813  * For a non-multi Xmax, we first need to compute the
5814  * corresponding MultiXactStatus by using the infomask bits.
5815  */
5816  if (HEAP_XMAX_IS_LOCKED_ONLY(old_infomask))
5817  {
5818  if (HEAP_XMAX_IS_KEYSHR_LOCKED(old_infomask))
5819  status = MultiXactStatusForKeyShare;
5820  else if (HEAP_XMAX_IS_SHR_LOCKED(old_infomask))
5821  status = MultiXactStatusForShare;
5822  else if (HEAP_XMAX_IS_EXCL_LOCKED(old_infomask))
5823  {
5824  if (old_infomask2 & HEAP_KEYS_UPDATED)
5825  status = MultiXactStatusForUpdate;
5826  else
5827  status = MultiXactStatusForNoKeyUpdate;
5828  }
5829  else
5830  {
5831  /*
5832  * LOCK_ONLY present alone (a pg_upgraded tuple marked
5833  * as share-locked in the old cluster) shouldn't be
5834  * seen in the middle of an update chain.
5835  */
5836  elog(ERROR, "invalid lock status in tuple");
5837  }
5838  }
5839  else
5840  {
5841  /* it's an update, but which kind? */
5842  if (old_infomask2 & HEAP_KEYS_UPDATED)
5843  status = MultiXactStatusUpdate;
5844  else
5845  status = MultiXactStatusNoKeyUpdate;
5846  }
5847 
5848  result = test_lockmode_for_conflict(status, rawxmax, mode,
5849  &needwait);
5850 
5851  /*
5852  * If the tuple was already locked by ourselves in a previous
5853  * iteration of this (say heap_lock_tuple was forced to
5854  * restart the locking loop because of a change in xmax), then
5855  * we hold the lock already on this tuple version and we don't
5856  * need to do anything; and this is not an error condition
5857  * either. We just need to skip this tuple and continue
5858  * locking the next version in the update chain.
5859  */
5860  if (result == HeapTupleSelfUpdated)
5861  goto next;
5862 
5863  if (needwait)
5864  {
5865  LockBuffer(buf, BUFFER_LOCK_UNLOCK);
5866  XactLockTableWait(rawxmax, rel, &mytup.t_self,
5867  XLTW_LockUpdated);
5868  goto l4;
5869  }
5870  if (result != HeapTupleMayBeUpdated)
5871  {
5872  goto out_locked;
5873  }
5874  }
5875  }
5876 
5877  /* compute the new Xmax and infomask values for the tuple ... */
5878  compute_new_xmax_infomask(xmax, old_infomask, mytup.t_data->t_infomask2,
5879  xid, mode, false,
5880  &new_xmax, &new_infomask, &new_infomask2);
5881 
5882  if (PageIsAllVisible(BufferGetPage(buf)) &&
5883  visibilitymap_clear(rel, block, vmbuffer,
5884  VISIBILITYMAP_ALL_FROZEN))
5885  cleared_all_frozen = true;
5886 
5887  START_CRIT_SECTION();
5888 
5889  /* ... and set them */
5890  HeapTupleHeaderSetXmax(mytup.t_data, new_xmax);
5891  mytup.t_data->t_infomask &= ~HEAP_XMAX_BITS;
5892  mytup.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
5893  mytup.t_data->t_infomask |= new_infomask;
5894  mytup.t_data->t_infomask2 |= new_infomask2;
5895 
5896  MarkBufferDirty(buf);
5897 
5898  /* XLOG stuff */
5899  if (RelationNeedsWAL(rel))
5900  {
5901  xl_heap_lock_updated xlrec;
5902  XLogRecPtr recptr;
5903  Page page = BufferGetPage(buf);
5904 
5905  XLogBeginInsert();
5906  XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
5907 
5908  xlrec.offnum = ItemPointerGetOffsetNumber(&mytup.t_self);
5909  xlrec.xmax = new_xmax;
5910  xlrec.infobits_set =