heapam.c
1 /*-------------------------------------------------------------------------
2  *
3  * heapam.c
4  * heap access method code
5  *
6  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  *
10  * IDENTIFICATION
11  * src/backend/access/heap/heapam.c
12  *
13  *
14  * INTERFACE ROUTINES
15  * relation_open - open any relation by relation OID
16  * relation_openrv - open any relation specified by a RangeVar
17  * relation_close - close any relation
18  * heap_open - open a heap relation by relation OID
19  * heap_openrv - open a heap relation specified by a RangeVar
20  * heap_close - (now just a macro for relation_close)
21  * heap_beginscan - begin relation scan
22  * heap_rescan - restart a relation scan
23  * heap_endscan - end relation scan
24  * heap_getnext - retrieve next tuple in scan
25  * heap_fetch - retrieve tuple with given tid
26  * heap_insert - insert tuple into a relation
27  * heap_multi_insert - insert multiple tuples into a relation
28  * heap_delete - delete a tuple from a relation
29  * heap_update - replace a tuple in a relation with another tuple
30  * heap_sync - sync heap, for when no WAL has been written
31  *
32  * NOTES
33  * This file contains the heap_ routines which implement
34  * the POSTGRES heap access method used for all POSTGRES
35  * relations.
36  *
37  *-------------------------------------------------------------------------
38  */
39 #include "postgres.h"
40 
41 #include "access/bufmask.h"
42 #include "access/heapam.h"
43 #include "access/heapam_xlog.h"
44 #include "access/hio.h"
45 #include "access/multixact.h"
46 #include "access/parallel.h"
47 #include "access/relscan.h"
48 #include "access/sysattr.h"
49 #include "access/transam.h"
50 #include "access/tuptoaster.h"
51 #include "access/valid.h"
52 #include "access/visibilitymap.h"
53 #include "access/xact.h"
54 #include "access/xlog.h"
55 #include "access/xloginsert.h"
56 #include "access/xlogutils.h"
57 #include "catalog/catalog.h"
58 #include "catalog/namespace.h"
59 #include "miscadmin.h"
60 #include "pgstat.h"
61 #include "storage/bufmgr.h"
62 #include "storage/freespace.h"
63 #include "storage/lmgr.h"
64 #include "storage/predicate.h"
65 #include "storage/procarray.h"
66 #include "storage/smgr.h"
67 #include "storage/spin.h"
68 #include "storage/standby.h"
69 #include "utils/datum.h"
70 #include "utils/inval.h"
71 #include "utils/lsyscache.h"
72 #include "utils/relcache.h"
73 #include "utils/snapmgr.h"
74 #include "utils/syscache.h"
75 #include "utils/tqual.h"
76 
77 
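/*
 * Illustrative sketch (not part of heapam.c): the typical way a caller
 * drives the interface routines listed in the header comment above -- open
 * the relation, start a scan, fetch tuples, then tear everything down.
 * The relation OID and snapshot are caller-supplied assumptions; error
 * handling is omitted for brevity, and the block is compiled out.
 */
#ifdef HEAPAM_USAGE_SKETCH
static void
example_scan_relation(Oid relid, Snapshot snapshot)
{
	Relation	rel;
	HeapScanDesc scan;
	HeapTuple	tuple;

	/* open with a lock strong enough for reading */
	rel = heap_open(relid, AccessShareLock);

	/* plain sequential scan, no scan keys */
	scan = heap_beginscan(rel, snapshot, 0, NULL);

	while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
	{
		/* ... examine tuple; it is only valid until the next fetch ... */
	}

	heap_endscan(scan);
	heap_close(rel, AccessShareLock);	/* heap_close is relation_close */
}
#endif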
78 /* GUC variable */
79 bool synchronize_seqscans = true;
80 
81 
82 static HeapScanDesc heap_beginscan_internal(Relation relation,
83  Snapshot snapshot,
84  int nkeys, ScanKey key,
85  ParallelHeapScanDesc parallel_scan,
86  bool allow_strat,
87  bool allow_sync,
88  bool allow_pagemode,
89  bool is_bitmapscan,
90  bool is_samplescan,
91  bool temp_snap);
93 static HeapTuple heap_prepare_insert(Relation relation, HeapTuple tup,
94  TransactionId xid, CommandId cid, int options);
95 static XLogRecPtr log_heap_update(Relation reln, Buffer oldbuf,
96  Buffer newbuf, HeapTuple oldtup,
97  HeapTuple newtup, HeapTuple old_key_tup,
98  bool all_visible_cleared, bool new_all_visible_cleared);
99 static Bitmapset *HeapDetermineModifiedColumns(Relation relation,
100  Bitmapset *interesting_cols,
101  HeapTuple oldtup, HeapTuple newtup);
102 static bool heap_acquire_tuplock(Relation relation, ItemPointer tid,
103  LockTupleMode mode, LockWaitPolicy wait_policy,
104  bool *have_tuple_lock);
105 static void compute_new_xmax_infomask(TransactionId xmax, uint16 old_infomask,
106  uint16 old_infomask2, TransactionId add_to_xmax,
107  LockTupleMode mode, bool is_update,
108  TransactionId *result_xmax, uint16 *result_infomask,
109  uint16 *result_infomask2);
110 static HTSU_Result heap_lock_updated_tuple(Relation rel, HeapTuple tuple,
111  ItemPointer ctid, TransactionId xid,
112  LockTupleMode mode);
113 static void GetMultiXactIdHintBits(MultiXactId multi, uint16 *new_infomask,
114  uint16 *new_infomask2);
115 static TransactionId MultiXactIdGetUpdateXid(TransactionId xmax,
116  uint16 t_infomask);
117 static bool DoesMultiXactIdConflict(MultiXactId multi, uint16 infomask,
118  LockTupleMode lockmode);
119 static void MultiXactIdWait(MultiXactId multi, MultiXactStatus status, uint16 infomask,
120  Relation rel, ItemPointer ctid, XLTW_Oper oper,
121  int *remaining);
122 static bool ConditionalMultiXactIdWait(MultiXactId multi, MultiXactStatus status,
123  uint16 infomask, Relation rel, int *remaining);
124 static XLogRecPtr log_heap_new_cid(Relation relation, HeapTuple tup);
125 static HeapTuple ExtractReplicaIdentity(Relation rel, HeapTuple tup, bool key_modified,
126  bool *copy);
127 
128 
129 /*
130  * Each tuple lock mode has a corresponding heavyweight lock, and one or two
131  * corresponding MultiXactStatuses (one to merely lock tuples, another one to
132  * update them). This table (and the macros below) helps us determine the
133  * heavyweight lock mode and MultiXactStatus values to use for any particular
134  * tuple lock strength.
135  *
136  * Don't look at lockstatus/updstatus directly! Use get_mxact_status_for_lock
137  * instead.
138  */
139 static const struct
140 {
141  LOCKMODE hwlock;
142  int lockstatus;
143  int updstatus;
144 }
145 
146  tupleLockExtraInfo[MaxLockTupleMode + 1] =
147 {
148  { /* LockTupleKeyShare */
149  AccessShareLock,
150  MultiXactStatusForKeyShare,
151  -1 /* KeyShare does not allow updating tuples */
152  },
153  { /* LockTupleShare */
154  RowShareLock,
155  MultiXactStatusForShare,
156  -1 /* Share does not allow updating tuples */
157  },
158  { /* LockTupleNoKeyExclusive */
159  ExclusiveLock,
160  MultiXactStatusForNoKeyUpdate,
161  MultiXactStatusNoKeyUpdate
162  },
163  { /* LockTupleExclusive */
164  AccessExclusiveLock,
165  MultiXactStatusForUpdate,
166  MultiXactStatusUpdate
167  }
168 };
169 
170 /* Get the LOCKMODE for a given MultiXactStatus */
171 #define LOCKMODE_from_mxstatus(status) \
172  (tupleLockExtraInfo[TUPLOCK_from_mxstatus((status))].hwlock)
173 
174 /*
175  * Acquire heavyweight locks on tuples, using a LockTupleMode strength value.
176  * This is more readable than having every caller translate it to lock.h's
177  * LOCKMODE.
178  */
179 #define LockTupleTuplock(rel, tup, mode) \
180  LockTuple((rel), (tup), tupleLockExtraInfo[mode].hwlock)
181 #define UnlockTupleTuplock(rel, tup, mode) \
182  UnlockTuple((rel), (tup), tupleLockExtraInfo[mode].hwlock)
183 #define ConditionalLockTupleTuplock(rel, tup, mode) \
184  ConditionalLockTuple((rel), (tup), tupleLockExtraInfo[mode].hwlock)
185 
186 /*
187  * This table maps tuple lock strength values for each particular
188  * MultiXactStatus value.
189  */
190 static const int MultiXactStatusLock[MaxMultiXactStatus + 1] =
191 {
192  LockTupleKeyShare, /* ForKeyShare */
193  LockTupleShare, /* ForShare */
194  LockTupleNoKeyExclusive, /* ForNoKeyUpdate */
195  LockTupleExclusive, /* ForUpdate */
196  LockTupleNoKeyExclusive, /* NoKeyUpdate */
197  LockTupleExclusive /* Update */
198 };
199 
200 /* Get the LockTupleMode for a given MultiXactStatus */
201 #define TUPLOCK_from_mxstatus(status) \
202  (MultiXactStatusLock[(status)])
203 
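/*
 * Illustrative sketch (not part of heapam.c): how the two lookup tables
 * above compose.  Given a MultiXactStatus, TUPLOCK_from_mxstatus yields the
 * tuple lock strength, and tupleLockExtraInfo then yields the heavyweight
 * lock mode -- which is exactly what LOCKMODE_from_mxstatus expands to.
 */
#ifdef HEAPAM_USAGE_SKETCH
static LOCKMODE
example_hwlock_for_mxstatus(MultiXactStatus status)
{
	LockTupleMode mode = TUPLOCK_from_mxstatus(status);

	/* same result as LOCKMODE_from_mxstatus(status) */
	return tupleLockExtraInfo[mode].hwlock;
}
#endif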
204 /* ----------------------------------------------------------------
205  * heap support routines
206  * ----------------------------------------------------------------
207  */
208 
209 /* ----------------
210  * initscan - scan code common to heap_beginscan and heap_rescan
211  * ----------------
212  */
213 static void
214 initscan(HeapScanDesc scan, ScanKey key, bool keep_startblock)
215 {
216  bool allow_strat;
217  bool allow_sync;
218 
219  /*
220  * Determine the number of blocks we have to scan.
221  *
222  * It is sufficient to do this once at scan start, since any tuples added
223  * while the scan is in progress will be invisible to my snapshot anyway.
224  * (That is not true when using a non-MVCC snapshot. However, we couldn't
225  * guarantee to return tuples added after scan start anyway, since they
226  * might go into pages we already scanned. To guarantee consistent
227  * results for a non-MVCC snapshot, the caller must hold some higher-level
228  * lock that ensures the interesting tuple(s) won't change.)
229  */
230  if (scan->rs_parallel != NULL)
231  scan->rs_nblocks = scan->rs_parallel->phs_nblocks;
232  else
233  scan->rs_nblocks = RelationGetNumberOfBlocks(scan->rs_rd);
234 
235  /*
236  * If the table is large relative to NBuffers, use a bulk-read access
237  * strategy and enable synchronized scanning (see syncscan.c). Although
238  * the thresholds for these features could be different, we make them the
239  * same so that there are only two behaviors to tune rather than four.
240  * (However, some callers need to be able to disable one or both of these
241  * behaviors, independently of the size of the table; also there is a GUC
242  * variable that can disable synchronized scanning.)
243  *
244  * Note that heap_parallelscan_initialize has a very similar test; if you
245  * change this, consider changing that one, too.
246  */
247  if (!RelationUsesLocalBuffers(scan->rs_rd) &&
248  scan->rs_nblocks > NBuffers / 4)
249  {
250  allow_strat = scan->rs_allow_strat;
251  allow_sync = scan->rs_allow_sync;
252  }
253  else
254  allow_strat = allow_sync = false;
255 
256  if (allow_strat)
257  {
258  /* During a rescan, keep the previous strategy object. */
259  if (scan->rs_strategy == NULL)
260  scan->rs_strategy = GetAccessStrategy(BAS_BULKREAD);
261  }
262  else
263  {
264  if (scan->rs_strategy != NULL)
265  FreeAccessStrategy(scan->rs_strategy);
266  scan->rs_strategy = NULL;
267  }
268 
269  if (scan->rs_parallel != NULL)
270  {
271  /* For parallel scan, believe whatever ParallelHeapScanDesc says. */
272  scan->rs_syncscan = scan->rs_parallel->phs_syncscan;
273  }
274  else if (keep_startblock)
275  {
276  /*
277  * When rescanning, we want to keep the previous startblock setting,
278  * so that rewinding a cursor doesn't generate surprising results.
279  * Reset the active syncscan setting, though.
280  */
281  scan->rs_syncscan = (allow_sync && synchronize_seqscans);
282  }
283  else if (allow_sync && synchronize_seqscans)
284  {
285  scan->rs_syncscan = true;
286  scan->rs_startblock = ss_get_location(scan->rs_rd, scan->rs_nblocks);
287  }
288  else
289  {
290  scan->rs_syncscan = false;
291  scan->rs_startblock = 0;
292  }
293 
295  scan->rs_inited = false;
296  scan->rs_ctup.t_data = NULL;
297  ItemPointerSetInvalid(&(scan->rs_ctup.t_self));
298  scan->rs_cbuf = InvalidBuffer;
299  scan->rs_cblock = InvalidBlockNumber;
300 
301  /* page-at-a-time fields are always invalid when not rs_inited */
302 
303  /*
304  * copy the scan key, if appropriate
305  */
306  if (key != NULL)
307  memcpy(scan->rs_key, key, scan->rs_nkeys * sizeof(ScanKeyData));
308 
309  /*
310  * Currently, we don't have a stats counter for bitmap heap scans (but the
311  * underlying bitmap index scans will be counted) or sample scans (we only
312  * update stats for tuple fetches there)
313  */
314  if (!scan->rs_bitmapscan && !scan->rs_samplescan)
315  pgstat_count_heap_scan(scan->rs_rd);
316 }
317 
318 /*
319  * heap_setscanlimits - restrict range of a heapscan
320  *
321  * startBlk is the page to start at
322  * numBlks is number of pages to scan (InvalidBlockNumber means "all")
323  */
324 void
325 heap_setscanlimits(HeapScanDesc scan, BlockNumber startBlk, BlockNumber numBlks)
326 {
327  Assert(!scan->rs_inited); /* else too late to change */
328  Assert(!scan->rs_syncscan); /* else rs_startblock is significant */
329 
330  /* Check startBlk is valid (but allow case of zero blocks...) */
331  Assert(startBlk == 0 || startBlk < scan->rs_nblocks);
332 
333  scan->rs_startblock = startBlk;
334  scan->rs_numblocks = numBlks;
335 }
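/*
 * Illustrative sketch (not part of heapam.c): restricting a scan to a block
 * range.  The limits must be installed before the first tuple is fetched,
 * and syncscan must be off so that rs_startblock stays where we put it;
 * heap_beginscan_strat with allow_sync = false arranges that.  startBlk and
 * numBlks are caller-supplied assumptions.
 */
#ifdef HEAPAM_USAGE_SKETCH
static void
example_scan_block_range(Relation rel, Snapshot snapshot,
						 BlockNumber startBlk, BlockNumber numBlks)
{
	HeapScanDesc scan;
	HeapTuple	tuple;

	scan = heap_beginscan_strat(rel, snapshot, 0, NULL,
								true,	/* allow_strat */
								false); /* allow_sync */
	heap_setscanlimits(scan, startBlk, numBlks);

	while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
	{
		/* ... only tuples from the requested block range show up here ... */
	}

	heap_endscan(scan);
}
#endif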
336 
337 /*
338  * heapgetpage - subroutine for heapgettup()
339  *
340  * This routine reads and pins the specified page of the relation.
341  * In page-at-a-time mode it performs additional work, namely determining
342  * which tuples on the page are visible.
343  */
344 void
345 heapgetpage(HeapScanDesc scan, BlockNumber page)
346 {
347  Buffer buffer;
348  Snapshot snapshot;
349  Page dp;
350  int lines;
351  int ntup;
352  OffsetNumber lineoff;
353  ItemId lpp;
354  bool all_visible;
355 
356  Assert(page < scan->rs_nblocks);
357 
358  /* release previous scan buffer, if any */
359  if (BufferIsValid(scan->rs_cbuf))
360  {
361  ReleaseBuffer(scan->rs_cbuf);
362  scan->rs_cbuf = InvalidBuffer;
363  }
364 
365  /*
366  * Be sure to check for interrupts at least once per page. Checks at
367  * higher code levels won't be able to stop a seqscan that encounters many
368  * pages' worth of consecutive dead tuples.
369  */
370  CHECK_FOR_INTERRUPTS();
371 
372  /* read page using selected strategy */
373  scan->rs_cbuf = ReadBufferExtended(scan->rs_rd, MAIN_FORKNUM, page,
374  RBM_NORMAL, scan->rs_strategy);
375  scan->rs_cblock = page;
376 
377  if (!scan->rs_pageatatime)
378  return;
379 
380  buffer = scan->rs_cbuf;
381  snapshot = scan->rs_snapshot;
382 
383  /*
384  * Prune and repair fragmentation for the whole page, if possible.
385  */
386  heap_page_prune_opt(scan->rs_rd, buffer);
387 
388  /*
389  * We must hold share lock on the buffer content while examining tuple
390  * visibility. Afterwards, however, the tuples we have found to be
391  * visible are guaranteed good as long as we hold the buffer pin.
392  */
393  LockBuffer(buffer, BUFFER_LOCK_SHARE);
394 
395  dp = BufferGetPage(buffer);
396  TestForOldSnapshot(snapshot, scan->rs_rd, dp);
397  lines = PageGetMaxOffsetNumber(dp);
398  ntup = 0;
399 
400  /*
401  * If the all-visible flag indicates that all tuples on the page are
402  * visible to everyone, we can skip the per-tuple visibility tests.
403  *
404  * Note: In hot standby, a tuple that's already visible to all
405  * transactions in the master might still be invisible to a read-only
406  * transaction in the standby. We partly handle this problem by tracking
407  * the minimum xmin of visible tuples as the cut-off XID while marking a
408  * page all-visible on master and WAL log that along with the visibility
409  * map SET operation. In hot standby, we wait for (or abort) all
410  * transactions that might potentially not see one or more tuples on the
411  * page. That's how index-only scans work fine in hot standby. A crucial
412  * difference between index-only scans and heap scans is that the
413  * index-only scan completely relies on the visibility map, whereas a heap
414  * scan looks at the page-level PD_ALL_VISIBLE flag. We are not sure if
415  * the page-level flag can be trusted in the same way, because it might
416  * get propagated somehow without being explicitly WAL-logged, e.g. via a
417  * full page write. Until we can prove that beyond doubt, let's check each
418  * tuple for visibility the hard way.
419  */
420  all_visible = PageIsAllVisible(dp) && !snapshot->takenDuringRecovery;
421 
422  for (lineoff = FirstOffsetNumber, lpp = PageGetItemId(dp, lineoff);
423  lineoff <= lines;
424  lineoff++, lpp++)
425  {
426  if (ItemIdIsNormal(lpp))
427  {
428  HeapTupleData loctup;
429  bool valid;
430 
431  loctup.t_tableOid = RelationGetRelid(scan->rs_rd);
432  loctup.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
433  loctup.t_len = ItemIdGetLength(lpp);
434  ItemPointerSet(&(loctup.t_self), page, lineoff);
435 
436  if (all_visible)
437  valid = true;
438  else
439  valid = HeapTupleSatisfiesVisibility(&loctup, snapshot, buffer);
440 
441  CheckForSerializableConflictOut(valid, scan->rs_rd, &loctup,
442  buffer, snapshot);
443 
444  if (valid)
445  scan->rs_vistuples[ntup++] = lineoff;
446  }
447  }
448 
449  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
450 
451  Assert(ntup <= MaxHeapTuplesPerPage);
452  scan->rs_ntuples = ntup;
453 }
454 
455 /* ----------------
456  * heapgettup - fetch next heap tuple
457  *
458  * Initialize the scan if not already done; then advance to the next
459  * tuple as indicated by "dir"; return the next tuple in scan->rs_ctup,
460  * or set scan->rs_ctup.t_data = NULL if no more tuples.
461  *
462  * dir == NoMovementScanDirection means "re-fetch the tuple indicated
463  * by scan->rs_ctup".
464  *
465  * Note: the reason nkeys/key are passed separately, even though they are
466  * kept in the scan descriptor, is that the caller may not want us to check
467  * the scankeys.
468  *
469  * Note: when we fall off the end of the scan in either direction, we
470  * reset rs_inited. This means that a further request with the same
471  * scan direction will restart the scan, which is a bit odd, but a
472  * request with the opposite scan direction will start a fresh scan
473  * in the proper direction. The latter is required behavior for cursors,
474  * while the former case is generally undefined behavior in Postgres
475  * so we don't care too much.
476  * ----------------
477  */
478 static void
479 heapgettup(HeapScanDesc scan,
480  ScanDirection dir,
481  int nkeys,
482  ScanKey key)
483 {
484  HeapTuple tuple = &(scan->rs_ctup);
485  Snapshot snapshot = scan->rs_snapshot;
486  bool backward = ScanDirectionIsBackward(dir);
487  BlockNumber page;
488  bool finished;
489  Page dp;
490  int lines;
491  OffsetNumber lineoff;
492  int linesleft;
493  ItemId lpp;
494 
495  /*
496  * calculate next starting lineoff, given scan direction
497  */
498  if (ScanDirectionIsForward(dir))
499  {
500  if (!scan->rs_inited)
501  {
502  /*
503  * return null immediately if relation is empty
504  */
505  if (scan->rs_nblocks == 0 || scan->rs_numblocks == 0)
506  {
507  Assert(!BufferIsValid(scan->rs_cbuf));
508  tuple->t_data = NULL;
509  return;
510  }
511  if (scan->rs_parallel != NULL)
512  {
513  page = heap_parallelscan_nextpage(scan);
514 
515  /* Other processes might have already finished the scan. */
516  if (page == InvalidBlockNumber)
517  {
518  Assert(!BufferIsValid(scan->rs_cbuf));
519  tuple->t_data = NULL;
520  return;
521  }
522  }
523  else
524  page = scan->rs_startblock; /* first page */
525  heapgetpage(scan, page);
526  lineoff = FirstOffsetNumber; /* first offnum */
527  scan->rs_inited = true;
528  }
529  else
530  {
531  /* continue from previously returned page/tuple */
532  page = scan->rs_cblock; /* current page */
533  lineoff = /* next offnum */
535  }
536 
537  LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);
538 
539  dp = BufferGetPage(scan->rs_cbuf);
540  TestForOldSnapshot(snapshot, scan->rs_rd, dp);
541  lines = PageGetMaxOffsetNumber(dp);
542  /* page and lineoff now reference the physically next tid */
543 
544  linesleft = lines - lineoff + 1;
545  }
546  else if (backward)
547  {
548  /* backward parallel scan not supported */
549  Assert(scan->rs_parallel == NULL);
550 
551  if (!scan->rs_inited)
552  {
553  /*
554  * return null immediately if relation is empty
555  */
556  if (scan->rs_nblocks == 0 || scan->rs_numblocks == 0)
557  {
558  Assert(!BufferIsValid(scan->rs_cbuf));
559  tuple->t_data = NULL;
560  return;
561  }
562 
563  /*
564  * Disable reporting to syncscan logic in a backwards scan; it's
565  * not very likely anyone else is doing the same thing at the same
566  * time, and much more likely that we'll just bollix things for
567  * forward scanners.
568  */
569  scan->rs_syncscan = false;
570  /* start from last page of the scan */
571  if (scan->rs_startblock > 0)
572  page = scan->rs_startblock - 1;
573  else
574  page = scan->rs_nblocks - 1;
575  heapgetpage(scan, page);
576  }
577  else
578  {
579  /* continue from previously returned page/tuple */
580  page = scan->rs_cblock; /* current page */
581  }
582 
583  LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);
584 
585  dp = BufferGetPage(scan->rs_cbuf);
586  TestForOldSnapshot(snapshot, scan->rs_rd, dp);
587  lines = PageGetMaxOffsetNumber(dp);
588 
589  if (!scan->rs_inited)
590  {
591  lineoff = lines; /* final offnum */
592  scan->rs_inited = true;
593  }
594  else
595  {
596  lineoff = /* previous offnum */
598  }
599  /* page and lineoff now reference the physically previous tid */
600 
601  linesleft = lineoff;
602  }
603  else
604  {
605  /*
606  * ``no movement'' scan direction: refetch prior tuple
607  */
608  if (!scan->rs_inited)
609  {
610  Assert(!BufferIsValid(scan->rs_cbuf));
611  tuple->t_data = NULL;
612  return;
613  }
614 
615  page = ItemPointerGetBlockNumber(&(tuple->t_self));
616  if (page != scan->rs_cblock)
617  heapgetpage(scan, page);
618 
619  /* Since the tuple was previously fetched, needn't lock page here */
620  dp = BufferGetPage(scan->rs_cbuf);
621  TestForOldSnapshot(snapshot, scan->rs_rd, dp);
622  lineoff = ItemPointerGetOffsetNumber(&(tuple->t_self));
623  lpp = PageGetItemId(dp, lineoff);
624  Assert(ItemIdIsNormal(lpp));
625 
626  tuple->t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
627  tuple->t_len = ItemIdGetLength(lpp);
628 
629  return;
630  }
631 
632  /*
633  * advance the scan until we find a qualifying tuple or run out of stuff
634  * to scan
635  */
636  lpp = PageGetItemId(dp, lineoff);
637  for (;;)
638  {
639  while (linesleft > 0)
640  {
641  if (ItemIdIsNormal(lpp))
642  {
643  bool valid;
644 
645  tuple->t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
646  tuple->t_len = ItemIdGetLength(lpp);
647  ItemPointerSet(&(tuple->t_self), page, lineoff);
648 
649  /*
650  * if current tuple qualifies, return it.
651  */
652  valid = HeapTupleSatisfiesVisibility(tuple,
653  snapshot,
654  scan->rs_cbuf);
655 
656  CheckForSerializableConflictOut(valid, scan->rs_rd, tuple,
657  scan->rs_cbuf, snapshot);
658 
659  if (valid && key != NULL)
660  HeapKeyTest(tuple, RelationGetDescr(scan->rs_rd),
661  nkeys, key, valid);
662 
663  if (valid)
664  {
665  LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK);
666  return;
667  }
668  }
669 
670  /*
671  * otherwise move to the next item on the page
672  */
673  --linesleft;
674  if (backward)
675  {
676  --lpp; /* move back in this page's ItemId array */
677  --lineoff;
678  }
679  else
680  {
681  ++lpp; /* move forward in this page's ItemId array */
682  ++lineoff;
683  }
684  }
685 
686  /*
687  * if we get here, it means we've exhausted the items on this page and
688  * it's time to move to the next.
689  */
690  LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK);
691 
692  /*
693  * advance to next/prior page and detect end of scan
694  */
695  if (backward)
696  {
697  finished = (page == scan->rs_startblock) ||
698  (scan->rs_numblocks != InvalidBlockNumber ? --scan->rs_numblocks == 0 : false);
699  if (page == 0)
700  page = scan->rs_nblocks;
701  page--;
702  }
703  else if (scan->rs_parallel != NULL)
704  {
705  page = heap_parallelscan_nextpage(scan);
706  finished = (page == InvalidBlockNumber);
707  }
708  else
709  {
710  page++;
711  if (page >= scan->rs_nblocks)
712  page = 0;
713  finished = (page == scan->rs_startblock) ||
714  (scan->rs_numblocks != InvalidBlockNumber ? --scan->rs_numblocks == 0 : false);
715 
716  /*
717  * Report our new scan position for synchronization purposes. We
718  * don't do that when moving backwards, however. That would just
719  * mess up any other forward-moving scanners.
720  *
721  * Note: we do this before checking for end of scan so that the
722  * final state of the position hint is back at the start of the
723  * rel. That's not strictly necessary, but otherwise when you run
724  * the same query multiple times the starting position would shift
725  * a little bit backwards on every invocation, which is confusing.
726  * We don't guarantee any specific ordering in general, though.
727  */
728  if (scan->rs_syncscan)
729  ss_report_location(scan->rs_rd, page);
730  }
731 
732  /*
733  * return NULL if we've exhausted all the pages
734  */
735  if (finished)
736  {
737  if (BufferIsValid(scan->rs_cbuf))
738  ReleaseBuffer(scan->rs_cbuf);
739  scan->rs_cbuf = InvalidBuffer;
740  scan->rs_cblock = InvalidBlockNumber;
741  tuple->t_data = NULL;
742  scan->rs_inited = false;
743  return;
744  }
745 
746  heapgetpage(scan, page);
747 
748  LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);
749 
750  dp = BufferGetPage(scan->rs_cbuf);
751  TestForOldSnapshot(snapshot, scan->rs_rd, dp);
752  lines = PageGetMaxOffsetNumber((Page) dp);
753  linesleft = lines;
754  if (backward)
755  {
756  lineoff = lines;
757  lpp = PageGetItemId(dp, lines);
758  }
759  else
760  {
761  lineoff = FirstOffsetNumber;
762  lpp = PageGetItemId(dp, FirstOffsetNumber);
763  }
764  }
765 }
766 
767 /* ----------------
768  * heapgettup_pagemode - fetch next heap tuple in page-at-a-time mode
769  *
770  * Same API as heapgettup, but used in page-at-a-time mode
771  *
772  * The internal logic is much the same as heapgettup's too, but there are some
773  * differences: we do not take the buffer content lock (that only needs to
774  * happen inside heapgetpage), and we iterate through just the tuples listed
775  * in rs_vistuples[] rather than all tuples on the page. Notice that
776  * lineindex is 0-based, where the corresponding loop variable lineoff in
777  * heapgettup is 1-based.
778  * ----------------
779  */
780 static void
781 heapgettup_pagemode(HeapScanDesc scan,
782  ScanDirection dir,
783  int nkeys,
784  ScanKey key)
785 {
786  HeapTuple tuple = &(scan->rs_ctup);
787  bool backward = ScanDirectionIsBackward(dir);
788  BlockNumber page;
789  bool finished;
790  Page dp;
791  int lines;
792  int lineindex;
793  OffsetNumber lineoff;
794  int linesleft;
795  ItemId lpp;
796 
797  /*
798  * calculate next starting lineindex, given scan direction
799  */
800  if (ScanDirectionIsForward(dir))
801  {
802  if (!scan->rs_inited)
803  {
804  /*
805  * return null immediately if relation is empty
806  */
807  if (scan->rs_nblocks == 0 || scan->rs_numblocks == 0)
808  {
809  Assert(!BufferIsValid(scan->rs_cbuf));
810  tuple->t_data = NULL;
811  return;
812  }
813  if (scan->rs_parallel != NULL)
814  {
815  page = heap_parallelscan_nextpage(scan);
816 
817  /* Other processes might have already finished the scan. */
818  if (page == InvalidBlockNumber)
819  {
820  Assert(!BufferIsValid(scan->rs_cbuf));
821  tuple->t_data = NULL;
822  return;
823  }
824  }
825  else
826  page = scan->rs_startblock; /* first page */
827  heapgetpage(scan, page);
828  lineindex = 0;
829  scan->rs_inited = true;
830  }
831  else
832  {
833  /* continue from previously returned page/tuple */
834  page = scan->rs_cblock; /* current page */
835  lineindex = scan->rs_cindex + 1;
836  }
837 
838  dp = BufferGetPage(scan->rs_cbuf);
839  TestForOldSnapshot(scan->rs_snapshot, scan->rs_rd, dp);
840  lines = scan->rs_ntuples;
841  /* page and lineindex now reference the next visible tid */
842 
843  linesleft = lines - lineindex;
844  }
845  else if (backward)
846  {
847  /* backward parallel scan not supported */
848  Assert(scan->rs_parallel == NULL);
849 
850  if (!scan->rs_inited)
851  {
852  /*
853  * return null immediately if relation is empty
854  */
855  if (scan->rs_nblocks == 0 || scan->rs_numblocks == 0)
856  {
857  Assert(!BufferIsValid(scan->rs_cbuf));
858  tuple->t_data = NULL;
859  return;
860  }
861 
862  /*
863  * Disable reporting to syncscan logic in a backwards scan; it's
864  * not very likely anyone else is doing the same thing at the same
865  * time, and much more likely that we'll just bollix things for
866  * forward scanners.
867  */
868  scan->rs_syncscan = false;
869  /* start from last page of the scan */
870  if (scan->rs_startblock > 0)
871  page = scan->rs_startblock - 1;
872  else
873  page = scan->rs_nblocks - 1;
874  heapgetpage(scan, page);
875  }
876  else
877  {
878  /* continue from previously returned page/tuple */
879  page = scan->rs_cblock; /* current page */
880  }
881 
882  dp = BufferGetPage(scan->rs_cbuf);
883  TestForOldSnapshot(scan->rs_snapshot, scan->rs_rd, dp);
884  lines = scan->rs_ntuples;
885 
886  if (!scan->rs_inited)
887  {
888  lineindex = lines - 1;
889  scan->rs_inited = true;
890  }
891  else
892  {
893  lineindex = scan->rs_cindex - 1;
894  }
895  /* page and lineindex now reference the previous visible tid */
896 
897  linesleft = lineindex + 1;
898  }
899  else
900  {
901  /*
902  * ``no movement'' scan direction: refetch prior tuple
903  */
904  if (!scan->rs_inited)
905  {
906  Assert(!BufferIsValid(scan->rs_cbuf));
907  tuple->t_data = NULL;
908  return;
909  }
910 
911  page = ItemPointerGetBlockNumber(&(tuple->t_self));
912  if (page != scan->rs_cblock)
913  heapgetpage(scan, page);
914 
915  /* Since the tuple was previously fetched, needn't lock page here */
916  dp = BufferGetPage(scan->rs_cbuf);
917  TestForOldSnapshot(scan->rs_snapshot, scan->rs_rd, dp);
918  lineoff = ItemPointerGetOffsetNumber(&(tuple->t_self));
919  lpp = PageGetItemId(dp, lineoff);
920  Assert(ItemIdIsNormal(lpp));
921 
922  tuple->t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
923  tuple->t_len = ItemIdGetLength(lpp);
924 
925  /* check that rs_cindex is in sync */
926  Assert(scan->rs_cindex < scan->rs_ntuples);
927  Assert(lineoff == scan->rs_vistuples[scan->rs_cindex]);
928 
929  return;
930  }
931 
932  /*
933  * advance the scan until we find a qualifying tuple or run out of stuff
934  * to scan
935  */
936  for (;;)
937  {
938  while (linesleft > 0)
939  {
940  lineoff = scan->rs_vistuples[lineindex];
941  lpp = PageGetItemId(dp, lineoff);
942  Assert(ItemIdIsNormal(lpp));
943 
944  tuple->t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
945  tuple->t_len = ItemIdGetLength(lpp);
946  ItemPointerSet(&(tuple->t_self), page, lineoff);
947 
948  /*
949  * if current tuple qualifies, return it.
950  */
951  if (key != NULL)
952  {
953  bool valid;
954 
955  HeapKeyTest(tuple, RelationGetDescr(scan->rs_rd),
956  nkeys, key, valid);
957  if (valid)
958  {
959  scan->rs_cindex = lineindex;
960  return;
961  }
962  }
963  else
964  {
965  scan->rs_cindex = lineindex;
966  return;
967  }
968 
969  /*
970  * otherwise move to the next item on the page
971  */
972  --linesleft;
973  if (backward)
974  --lineindex;
975  else
976  ++lineindex;
977  }
978 
979  /*
980  * if we get here, it means we've exhausted the items on this page and
981  * it's time to move to the next.
982  */
983  if (backward)
984  {
985  finished = (page == scan->rs_startblock) ||
986  (scan->rs_numblocks != InvalidBlockNumber ? --scan->rs_numblocks == 0 : false);
987  if (page == 0)
988  page = scan->rs_nblocks;
989  page--;
990  }
991  else if (scan->rs_parallel != NULL)
992  {
993  page = heap_parallelscan_nextpage(scan);
994  finished = (page == InvalidBlockNumber);
995  }
996  else
997  {
998  page++;
999  if (page >= scan->rs_nblocks)
1000  page = 0;
1001  finished = (page == scan->rs_startblock) ||
1002  (scan->rs_numblocks != InvalidBlockNumber ? --scan->rs_numblocks == 0 : false);
1003 
1004  /*
1005  * Report our new scan position for synchronization purposes. We
1006  * don't do that when moving backwards, however. That would just
1007  * mess up any other forward-moving scanners.
1008  *
1009  * Note: we do this before checking for end of scan so that the
1010  * final state of the position hint is back at the start of the
1011  * rel. That's not strictly necessary, but otherwise when you run
1012  * the same query multiple times the starting position would shift
1013  * a little bit backwards on every invocation, which is confusing.
1014  * We don't guarantee any specific ordering in general, though.
1015  */
1016  if (scan->rs_syncscan)
1017  ss_report_location(scan->rs_rd, page);
1018  }
1019 
1020  /*
1021  * return NULL if we've exhausted all the pages
1022  */
1023  if (finished)
1024  {
1025  if (BufferIsValid(scan->rs_cbuf))
1026  ReleaseBuffer(scan->rs_cbuf);
1027  scan->rs_cbuf = InvalidBuffer;
1028  scan->rs_cblock = InvalidBlockNumber;
1029  tuple->t_data = NULL;
1030  scan->rs_inited = false;
1031  return;
1032  }
1033 
1034  heapgetpage(scan, page);
1035 
1036  dp = BufferGetPage(scan->rs_cbuf);
1037  TestForOldSnapshot(scan->rs_snapshot, scan->rs_rd, dp);
1038  lines = scan->rs_ntuples;
1039  linesleft = lines;
1040  if (backward)
1041  lineindex = lines - 1;
1042  else
1043  lineindex = 0;
1044  }
1045 }
1046 
1047 
1048 #if defined(DISABLE_COMPLEX_MACRO)
1049 /*
1050  * This is formatted so oddly so that the correspondence to the macro
1051  * definition in access/htup_details.h is maintained.
1052  */
1053 Datum
1054 fastgetattr(HeapTuple tup, int attnum, TupleDesc tupleDesc,
1055  bool *isnull)
1056 {
1057  return (
1058  (attnum) > 0 ?
1059  (
1060  (*(isnull) = false),
1061  HeapTupleNoNulls(tup) ?
1062  (
1063  (tupleDesc)->attrs[(attnum) - 1]->attcacheoff >= 0 ?
1064  (
1065  fetchatt((tupleDesc)->attrs[(attnum) - 1],
1066  (char *) (tup)->t_data + (tup)->t_data->t_hoff +
1067  (tupleDesc)->attrs[(attnum) - 1]->attcacheoff)
1068  )
1069  :
1070  nocachegetattr((tup), (attnum), (tupleDesc))
1071  )
1072  :
1073  (
1074  att_isnull((attnum) - 1, (tup)->t_data->t_bits) ?
1075  (
1076  (*(isnull) = true),
1077  (Datum) NULL
1078  )
1079  :
1080  (
1081  nocachegetattr((tup), (attnum), (tupleDesc))
1082  )
1083  )
1084  )
1085  :
1086  (
1087  (Datum) NULL
1088  )
1089  );
1090 }
1091 #endif /* defined(DISABLE_COMPLEX_MACRO) */
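/*
 * Illustrative sketch (not part of heapam.c): fetching one attribute with
 * fastgetattr.  The attribute number is 1-based; *isnull reports whether the
 * stored value was NULL, in which case the returned Datum is meaningless.
 */
#ifdef HEAPAM_USAGE_SKETCH
static Datum
example_get_first_attribute(HeapTuple tuple, TupleDesc tupdesc, bool *isnull)
{
	/* attribute 1 of the tuple, decoded according to tupdesc */
	return fastgetattr(tuple, 1, tupdesc, isnull);
}
#endif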
1092 
1093 
1094 /* ----------------------------------------------------------------
1095  * heap access method interface
1096  * ----------------------------------------------------------------
1097  */
1098 
1099 /* ----------------
1100  * relation_open - open any relation by relation OID
1101  *
1102  * If lockmode is not "NoLock", the specified kind of lock is
1103  * obtained on the relation. (Generally, NoLock should only be
1104  * used if the caller knows it has some appropriate lock on the
1105  * relation already.)
1106  *
1107  * An error is raised if the relation does not exist.
1108  *
1109  * NB: a "relation" is anything with a pg_class entry. The caller is
1110  * expected to check whether the relkind is something it can handle.
1111  * ----------------
1112  */
1113 Relation
1114 relation_open(Oid relationId, LOCKMODE lockmode)
1115 {
1116  Relation r;
1117 
1118  Assert(lockmode >= NoLock && lockmode < MAX_LOCKMODES);
1119 
1120  /* Get the lock before trying to open the relcache entry */
1121  if (lockmode != NoLock)
1122  LockRelationOid(relationId, lockmode);
1123 
1124  /* The relcache does all the real work... */
1125  r = RelationIdGetRelation(relationId);
1126 
1127  if (!RelationIsValid(r))
1128  elog(ERROR, "could not open relation with OID %u", relationId);
1129 
1130  /* Make note that we've accessed a temporary relation */
1131  if (RelationUsesLocalBuffers(r))
1132  MyXactAccessedTempRel = true;
1133 
1134  pgstat_initstats(r);
1135 
1136  return r;
1137 }
1138 
1139 /* ----------------
1140  * try_relation_open - open any relation by relation OID
1141  *
1142  * Same as relation_open, except return NULL instead of failing
1143  * if the relation does not exist.
1144  * ----------------
1145  */
1146 Relation
1147 try_relation_open(Oid relationId, LOCKMODE lockmode)
1148 {
1149  Relation r;
1150 
1151  Assert(lockmode >= NoLock && lockmode < MAX_LOCKMODES);
1152 
1153  /* Get the lock first */
1154  if (lockmode != NoLock)
1155  LockRelationOid(relationId, lockmode);
1156 
1157  /*
1158  * Now that we have the lock, probe to see if the relation really exists
1159  * or not.
1160  */
1161  if (!SearchSysCacheExists1(RELOID, ObjectIdGetDatum(relationId)))
1162  {
1163  /* Release useless lock */
1164  if (lockmode != NoLock)
1165  UnlockRelationOid(relationId, lockmode);
1166 
1167  return NULL;
1168  }
1169 
1170  /* Should be safe to do a relcache load */
1171  r = RelationIdGetRelation(relationId);
1172 
1173  if (!RelationIsValid(r))
1174  elog(ERROR, "could not open relation with OID %u", relationId);
1175 
1176  /* Make note that we've accessed a temporary relation */
1177  if (RelationUsesLocalBuffers(r))
1178  MyXactAccessedTempRel = true;
1179 
1180  pgstat_initstats(r);
1181 
1182  return r;
1183 }
1184 
1185 /* ----------------
1186  * relation_openrv - open any relation specified by a RangeVar
1187  *
1188  * Same as relation_open, but the relation is specified by a RangeVar.
1189  * ----------------
1190  */
1191 Relation
1192 relation_openrv(const RangeVar *relation, LOCKMODE lockmode)
1193 {
1194  Oid relOid;
1195 
1196  /*
1197  * Check for shared-cache-inval messages before trying to open the
1198  * relation. This is needed even if we already hold a lock on the
1199  * relation, because GRANT/REVOKE are executed without taking any lock on
1200  * the target relation, and we want to be sure we see current ACL
1201  * information. We can skip this if asked for NoLock, on the assumption
1202  * that such a call is not the first one in the current command, and so we
1203  * should be reasonably up-to-date already. (XXX this all could stand to
1204  * be redesigned, but for the moment we'll keep doing this like it's been
1205  * done historically.)
1206  */
1207  if (lockmode != NoLock)
1208  AcceptInvalidationMessages();
1209 
1210  /* Look up and lock the appropriate relation using namespace search */
1211  relOid = RangeVarGetRelid(relation, lockmode, false);
1212 
1213  /* Let relation_open do the rest */
1214  return relation_open(relOid, NoLock);
1215 }
1216 
1217 /* ----------------
1218  * relation_openrv_extended - open any relation specified by a RangeVar
1219  *
1220  * Same as relation_openrv, but with an additional missing_ok argument
1221  * allowing a NULL return rather than an error if the relation is not
1222  * found. (Note that some other causes, such as permissions problems,
1223  * will still result in an ereport.)
1224  * ----------------
1225  */
1226 Relation
1227 relation_openrv_extended(const RangeVar *relation, LOCKMODE lockmode,
1228  bool missing_ok)
1229 {
1230  Oid relOid;
1231 
1232  /*
1233  * Check for shared-cache-inval messages before trying to open the
1234  * relation. See comments in relation_openrv().
1235  */
1236  if (lockmode != NoLock)
1237  AcceptInvalidationMessages();
1238 
1239  /* Look up and lock the appropriate relation using namespace search */
1240  relOid = RangeVarGetRelid(relation, lockmode, missing_ok);
1241 
1242  /* Return NULL on not-found */
1243  if (!OidIsValid(relOid))
1244  return NULL;
1245 
1246  /* Let relation_open do the rest */
1247  return relation_open(relOid, NoLock);
1248 }
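/*
 * Illustrative sketch (not part of heapam.c): the missing_ok variant lets a
 * caller treat "relation does not exist" as a soft failure instead of an
 * error.  The RangeVar and lock mode are caller-supplied assumptions.
 */
#ifdef HEAPAM_USAGE_SKETCH
static bool
example_open_if_exists(const RangeVar *rv, LOCKMODE lockmode)
{
	Relation	rel;

	rel = relation_openrv_extended(rv, lockmode, true /* missing_ok */ );
	if (rel == NULL)
		return false;			/* relation is not there; no lock is held */

	/* ... work with rel ... */

	relation_close(rel, lockmode);
	return true;
}
#endif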
1249 
1250 /* ----------------
1251  * relation_close - close any relation
1252  *
1253  * If lockmode is not "NoLock", we then release the specified lock.
1254  *
1255  * Note that it is often sensible to hold a lock beyond relation_close;
1256  * in that case, the lock is released automatically at xact end.
1257  * ----------------
1258  */
1259 void
1260 relation_close(Relation relation, LOCKMODE lockmode)
1261 {
1262  LockRelId relid = relation->rd_lockInfo.lockRelId;
1263 
1264  Assert(lockmode >= NoLock && lockmode < MAX_LOCKMODES);
1265 
1266  /* The relcache does the real work... */
1267  RelationClose(relation);
1268 
1269  if (lockmode != NoLock)
1270  UnlockRelationId(&relid, lockmode);
1271 }
1272 
1273 
1274 /* ----------------
1275  * heap_open - open a heap relation by relation OID
1276  *
1277  * This is essentially relation_open plus check that the relation
1278  * is not an index nor a composite type. (The caller should also
1279  * check that it's not a view or foreign table before assuming it has
1280  * storage.)
1281  * ----------------
1282  */
1283 Relation
1284 heap_open(Oid relationId, LOCKMODE lockmode)
1285 {
1286  Relation r;
1287 
1288  r = relation_open(relationId, lockmode);
1289 
1290  if (r->rd_rel->relkind == RELKIND_INDEX)
1291  ereport(ERROR,
1292  (errcode(ERRCODE_WRONG_OBJECT_TYPE),
1293  errmsg("\"%s\" is an index",
1294  RelationGetRelationName(r))));
1295  else if (r->rd_rel->relkind == RELKIND_COMPOSITE_TYPE)
1296  ereport(ERROR,
1297  (errcode(ERRCODE_WRONG_OBJECT_TYPE),
1298  errmsg("\"%s\" is a composite type",
1299  RelationGetRelationName(r))));
1300 
1301  return r;
1302 }
1303 
1304 /* ----------------
1305  * heap_openrv - open a heap relation specified
1306  * by a RangeVar node
1307  *
1308  * As above, but relation is specified by a RangeVar.
1309  * ----------------
1310  */
1311 Relation
1312 heap_openrv(const RangeVar *relation, LOCKMODE lockmode)
1313 {
1314  Relation r;
1315 
1316  r = relation_openrv(relation, lockmode);
1317 
1318  if (r->rd_rel->relkind == RELKIND_INDEX)
1319  ereport(ERROR,
1320  (errcode(ERRCODE_WRONG_OBJECT_TYPE),
1321  errmsg("\"%s\" is an index",
1322  RelationGetRelationName(r))));
1323  else if (r->rd_rel->relkind == RELKIND_COMPOSITE_TYPE)
1324  ereport(ERROR,
1325  (errcode(ERRCODE_WRONG_OBJECT_TYPE),
1326  errmsg("\"%s\" is a composite type",
1327  RelationGetRelationName(r))));
1328 
1329  return r;
1330 }
1331 
1332 /* ----------------
1333  * heap_openrv_extended - open a heap relation specified
1334  * by a RangeVar node
1335  *
1336  * As above, but optionally return NULL instead of failing for
1337  * relation-not-found.
1338  * ----------------
1339  */
1340 Relation
1341 heap_openrv_extended(const RangeVar *relation, LOCKMODE lockmode,
1342  bool missing_ok)
1343 {
1344  Relation r;
1345 
1346  r = relation_openrv_extended(relation, lockmode, missing_ok);
1347 
1348  if (r)
1349  {
1350  if (r->rd_rel->relkind == RELKIND_INDEX)
1351  ereport(ERROR,
1352  (errcode(ERRCODE_WRONG_OBJECT_TYPE),
1353  errmsg("\"%s\" is an index",
1354  RelationGetRelationName(r))));
1355  else if (r->rd_rel->relkind == RELKIND_COMPOSITE_TYPE)
1356  ereport(ERROR,
1357  (errcode(ERRCODE_WRONG_OBJECT_TYPE),
1358  errmsg("\"%s\" is a composite type",
1359  RelationGetRelationName(r))));
1360  }
1361 
1362  return r;
1363 }
1364 
1365 
1366 /* ----------------
1367  * heap_beginscan - begin relation scan
1368  *
1369  * heap_beginscan is the "standard" case.
1370  *
1371  * heap_beginscan_catalog differs in setting up its own temporary snapshot.
1372  *
1373  * heap_beginscan_strat offers an extended API that lets the caller control
1374  * whether a nondefault buffer access strategy can be used, and whether
1375  * syncscan can be chosen (possibly resulting in the scan not starting from
1376  * block zero). Both of these default to TRUE with plain heap_beginscan.
1377  *
1378  * heap_beginscan_bm is an alternative entry point for setting up a
1379  * HeapScanDesc for a bitmap heap scan. Although that scan technology is
1380  * really quite unlike a standard seqscan, there is just enough commonality
1381  * to make it worth using the same data structure.
1382  *
1383  * heap_beginscan_sampling is an alternative entry point for setting up a
1384  * HeapScanDesc for a TABLESAMPLE scan. As with bitmap scans, it's worth
1385  * using the same data structure although the behavior is rather different.
1386  * In addition to the options offered by heap_beginscan_strat, this call
1387  * also allows control of whether page-mode visibility checking is used.
1388  * ----------------
1389  */
1390 HeapScanDesc
1391 heap_beginscan(Relation relation, Snapshot snapshot,
1392  int nkeys, ScanKey key)
1393 {
1394  return heap_beginscan_internal(relation, snapshot, nkeys, key, NULL,
1395  true, true, true, false, false, false);
1396 }
1397 
1398 HeapScanDesc
1399 heap_beginscan_catalog(Relation relation, int nkeys, ScanKey key)
1400 {
1401  Oid relid = RelationGetRelid(relation);
1402  Snapshot snapshot = RegisterSnapshot(GetCatalogSnapshot(relid));
1403 
1404  return heap_beginscan_internal(relation, snapshot, nkeys, key, NULL,
1405  true, true, true, false, false, true);
1406 }
1407 
1408 HeapScanDesc
1409 heap_beginscan_strat(Relation relation, Snapshot snapshot,
1410  int nkeys, ScanKey key,
1411  bool allow_strat, bool allow_sync)
1412 {
1413  return heap_beginscan_internal(relation, snapshot, nkeys, key, NULL,
1414  allow_strat, allow_sync, true,
1415  false, false, false);
1416 }
1417 
1418 HeapScanDesc
1419 heap_beginscan_bm(Relation relation, Snapshot snapshot,
1420  int nkeys, ScanKey key)
1421 {
1422  return heap_beginscan_internal(relation, snapshot, nkeys, key, NULL,
1423  false, false, true, true, false, false);
1424 }
1425 
1426 HeapScanDesc
1427 heap_beginscan_sampling(Relation relation, Snapshot snapshot,
1428  int nkeys, ScanKey key,
1429  bool allow_strat, bool allow_sync, bool allow_pagemode)
1430 {
1431  return heap_beginscan_internal(relation, snapshot, nkeys, key, NULL,
1432  allow_strat, allow_sync, allow_pagemode,
1433  false, true, false);
1434 }
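/*
 * Illustrative sketch (not part of heapam.c): a catalog scan with one scan
 * key.  heap_beginscan_catalog registers its own catalog snapshot, so the
 * caller only supplies the keys.  The choice of pg_class and a relname
 * equality key here is an assumption made for the example.
 */
#ifdef HEAPAM_USAGE_SKETCH
static void
example_scan_pg_class_by_name(Relation pg_class_rel, const char *relname)
{
	ScanKeyData key;
	HeapScanDesc scan;
	HeapTuple	tuple;

	ScanKeyInit(&key,
				Anum_pg_class_relname,
				BTEqualStrategyNumber, F_NAMEEQ,
				CStringGetDatum(relname));

	scan = heap_beginscan_catalog(pg_class_rel, 1, &key);

	while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
	{
		/* ... only rows whose relname matches are returned ... */
	}

	heap_endscan(scan);
}
#endif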
1435 
1436 static HeapScanDesc
1437 heap_beginscan_internal(Relation relation, Snapshot snapshot,
1438  int nkeys, ScanKey key,
1439  ParallelHeapScanDesc parallel_scan,
1440  bool allow_strat,
1441  bool allow_sync,
1442  bool allow_pagemode,
1443  bool is_bitmapscan,
1444  bool is_samplescan,
1445  bool temp_snap)
1446 {
1447  HeapScanDesc scan;
1448 
1449  /*
1450  * increment relation ref count while scanning relation
1451  *
1452  * This is just to make really sure the relcache entry won't go away while
1453  * the scan has a pointer to it. Caller should be holding the rel open
1454  * anyway, so this is redundant in all normal scenarios...
1455  */
1456  RelationIncrementReferenceCount(relation);
1457 
1458  /*
1459  * allocate and initialize scan descriptor
1460  */
1461  scan = (HeapScanDesc) palloc(sizeof(HeapScanDescData));
1462 
1463  scan->rs_rd = relation;
1464  scan->rs_snapshot = snapshot;
1465  scan->rs_nkeys = nkeys;
1466  scan->rs_bitmapscan = is_bitmapscan;
1467  scan->rs_samplescan = is_samplescan;
1468  scan->rs_strategy = NULL; /* set in initscan */
1469  scan->rs_allow_strat = allow_strat;
1470  scan->rs_allow_sync = allow_sync;
1471  scan->rs_temp_snap = temp_snap;
1472  scan->rs_parallel = parallel_scan;
1473 
1474  /*
1475  * we can use page-at-a-time mode if it's an MVCC-safe snapshot
1476  */
1477  scan->rs_pageatatime = allow_pagemode && IsMVCCSnapshot(snapshot);
1478 
1479  /*
1480  * For a seqscan in a serializable transaction, acquire a predicate lock
1481  * on the entire relation. This is required not only to lock all the
1482  * matching tuples, but also to conflict with new insertions into the
1483  * table. In an indexscan, we take page locks on the index pages covering
1484  * the range specified in the scan qual, but in a heap scan there is
1485  * nothing more fine-grained to lock. A bitmap scan is a different story,
1486  * there we have already scanned the index and locked the index pages
1487  * covering the predicate. But in that case we still have to lock any
1488  * matching heap tuples.
1489  */
1490  if (!is_bitmapscan)
1491  PredicateLockRelation(relation, snapshot);
1492 
1493  /* we only need to set this up once */
1494  scan->rs_ctup.t_tableOid = RelationGetRelid(relation);
1495 
1496  /*
1497  * we do this here instead of in initscan() because heap_rescan also calls
1498  * initscan() and we don't want to allocate memory again
1499  */
1500  if (nkeys > 0)
1501  scan->rs_key = (ScanKey) palloc(sizeof(ScanKeyData) * nkeys);
1502  else
1503  scan->rs_key = NULL;
1504 
1505  initscan(scan, key, false);
1506 
1507  return scan;
1508 }
1509 
1510 /* ----------------
1511  * heap_rescan - restart a relation scan
1512  * ----------------
1513  */
1514 void
1515 heap_rescan(HeapScanDesc scan,
1516  ScanKey key)
1517 {
1518  /*
1519  * unpin scan buffers
1520  */
1521  if (BufferIsValid(scan->rs_cbuf))
1522  ReleaseBuffer(scan->rs_cbuf);
1523 
1524  /*
1525  * reinitialize scan descriptor
1526  */
1527  initscan(scan, key, true);
1528 
1529  /*
1530  * reset parallel scan, if present
1531  */
1532  if (scan->rs_parallel != NULL)
1533  {
1534  ParallelHeapScanDesc parallel_scan;
1535 
1536  /*
1537  * Caller is responsible for making sure that all workers have
1538  * finished the scan before calling this, so it really shouldn't be
1539  * necessary to acquire the mutex at all. We acquire it anyway, just
1540  * to be tidy.
1541  */
1542  parallel_scan = scan->rs_parallel;
1543  SpinLockAcquire(&parallel_scan->phs_mutex);
1544  parallel_scan->phs_cblock = parallel_scan->phs_startblock;
1545  SpinLockRelease(&parallel_scan->phs_mutex);
1546  }
1547 }
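/*
 * Illustrative sketch (not part of heapam.c): rewinding a scan.  Passing
 * key = NULL keeps the scan keys that were supplied to heap_beginscan; the
 * scan restarts from the previously chosen start block, which is what
 * cursor rewind needs.
 */
#ifdef HEAPAM_USAGE_SKETCH
static void
example_rewind_and_rescan(HeapScanDesc scan)
{
	HeapTuple	tuple;

	heap_rescan(scan, NULL);

	while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
	{
		/* ... the same tuples are returned again, in scan order ... */
	}
}
#endif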
1548 
1549 /* ----------------
1550  * heap_rescan_set_params - restart a relation scan after changing params
1551  *
1552  * This call allows changing the buffer strategy, syncscan, and pagemode
1553  * options before starting a fresh scan. Note that although the actual use
1554  * of syncscan might change (effectively, enabling or disabling reporting),
1555  * the previously selected startblock will be kept.
1556  * ----------------
1557  */
1558 void
1559 heap_rescan_set_params(HeapScanDesc scan, ScanKey key,
1560  bool allow_strat, bool allow_sync, bool allow_pagemode)
1561 {
1562  /* adjust parameters */
1563  scan->rs_allow_strat = allow_strat;
1564  scan->rs_allow_sync = allow_sync;
1565  scan->rs_pageatatime = allow_pagemode && IsMVCCSnapshot(scan->rs_snapshot);
1566  /* ... and rescan */
1567  heap_rescan(scan, key);
1568 }
1569 
1570 /* ----------------
1571  * heap_endscan - end relation scan
1572  *
1573  * See how to integrate with index scans.
1574  * Check handling of reldesc caching.
1575  * ----------------
1576  */
1577 void
1578 heap_endscan(HeapScanDesc scan)
1579 {
1580  /* Note: no locking manipulations needed */
1581 
1582  /*
1583  * unpin scan buffers
1584  */
1585  if (BufferIsValid(scan->rs_cbuf))
1586  ReleaseBuffer(scan->rs_cbuf);
1587 
1588  /*
1589  * decrement relation reference count and free scan descriptor storage
1590  */
1591  RelationDecrementReferenceCount(scan->rs_rd);
1592 
1593  if (scan->rs_key)
1594  pfree(scan->rs_key);
1595 
1596  if (scan->rs_strategy != NULL)
1597  FreeAccessStrategy(scan->rs_strategy);
1598 
1599  if (scan->rs_temp_snap)
1600  UnregisterSnapshot(scan->rs_snapshot);
1601 
1602  pfree(scan);
1603 }
1604 
1605 /* ----------------
1606  * heap_parallelscan_estimate - estimate storage for ParallelHeapScanDesc
1607  *
1608  * Sadly, this doesn't reduce to a constant, because the size required
1609  * to serialize the snapshot can vary.
1610  * ----------------
1611  */
1612 Size
1613 heap_parallelscan_estimate(Snapshot snapshot)
1614 {
1615  return add_size(offsetof(ParallelHeapScanDescData, phs_snapshot_data),
1616  EstimateSnapshotSpace(snapshot));
1617 }
1618 
1619 /* ----------------
1620  * heap_parallelscan_initialize - initialize ParallelHeapScanDesc
1621  *
1622  * Must allow as many bytes of shared memory as returned by
1623  * heap_parallelscan_estimate. Call this just once in the leader
1624  * process; then, individual workers attach via heap_beginscan_parallel.
1625  * ----------------
1626  */
1627 void
1628 heap_parallelscan_initialize(ParallelHeapScanDesc target, Relation relation,
1629  Snapshot snapshot)
1630 {
1631  target->phs_relid = RelationGetRelid(relation);
1632  target->phs_nblocks = RelationGetNumberOfBlocks(relation);
1633  /* compare phs_syncscan initialization to similar logic in initscan */
1634  target->phs_syncscan = synchronize_seqscans &&
1635  !RelationUsesLocalBuffers(relation) &&
1636  target->phs_nblocks > NBuffers / 4;
1637  SpinLockInit(&target->phs_mutex);
1638  target->phs_cblock = InvalidBlockNumber;
1639  target->phs_startblock = InvalidBlockNumber;
1640  SerializeSnapshot(snapshot, target->phs_snapshot_data);
1641 }
1642 
1643 /* ----------------
1644  * heap_beginscan_parallel - join a parallel scan
1645  *
1646  * Caller must hold a suitable lock on the correct relation.
1647  * ----------------
1648  */
1649 HeapScanDesc
1650 heap_beginscan_parallel(Relation relation, ParallelHeapScanDesc parallel_scan)
1651 {
1652  Snapshot snapshot;
1653 
1654  Assert(RelationGetRelid(relation) == parallel_scan->phs_relid);
1655  snapshot = RestoreSnapshot(parallel_scan->phs_snapshot_data);
1656  RegisterSnapshot(snapshot);
1657 
1658  return heap_beginscan_internal(relation, snapshot, 0, NULL, parallel_scan,
1659  true, true, true, false, false, true);
1660 }
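/*
 * Illustrative sketch (not part of heapam.c): the leader/worker protocol for
 * a parallel heap scan.  The shared-memory allocation step is represented by
 * a hypothetical placeholder argument (shm_space), since the real code
 * obtains the space through the parallel-context machinery.
 */
#ifdef HEAPAM_USAGE_SKETCH
static HeapScanDesc
example_parallel_scan_setup(Relation rel, Snapshot snapshot, void *shm_space)
{
	ParallelHeapScanDesc pscan = (ParallelHeapScanDesc) shm_space;

	/* leader: shm_space must hold at least heap_parallelscan_estimate bytes */
	heap_parallelscan_initialize(pscan, rel, snapshot);

	/* each participant (leader and workers) then attaches like this */
	return heap_beginscan_parallel(rel, pscan);
}
#endif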
1661 
1662 /* ----------------
1663  * heap_parallelscan_nextpage - get the next page to scan
1664  *
1665  * Get the next page to scan. Even if there are no pages left to scan,
1666  * another backend could have grabbed a page to scan and not yet finished
1667  * looking at it, so it doesn't follow that the scan is done when the
1668  * first backend gets an InvalidBlockNumber return.
1669  * ----------------
1670  */
1671 static BlockNumber
1672 heap_parallelscan_nextpage(HeapScanDesc scan)
1673 {
1674  BlockNumber page;
1675  BlockNumber sync_startpage = InvalidBlockNumber;
1676  BlockNumber report_page = InvalidBlockNumber;
1677  ParallelHeapScanDesc parallel_scan;
1678 
1679  Assert(scan->rs_parallel);
1680  parallel_scan = scan->rs_parallel;
1681 
1682 retry:
1683  /* Grab the spinlock. */
1684  SpinLockAcquire(&parallel_scan->phs_mutex);
1685 
1686  /*
1687  * If the scan's startblock has not yet been initialized, we must do so
1688  * now. If this is not a synchronized scan, we just start at block 0, but
1689  * if it is a synchronized scan, we must get the starting position from
1690  * the synchronized scan machinery. We can't hold the spinlock while
1691  * doing that, though, so release the spinlock, get the information we
1692  * need, and retry. If nobody else has initialized the scan in the
1693  * meantime, we'll fill in the value we fetched on the second time
1694  * through.
1695  */
1696  if (parallel_scan->phs_startblock == InvalidBlockNumber)
1697  {
1698  if (!parallel_scan->phs_syncscan)
1699  parallel_scan->phs_startblock = 0;
1700  else if (sync_startpage != InvalidBlockNumber)
1701  parallel_scan->phs_startblock = sync_startpage;
1702  else
1703  {
1704  SpinLockRelease(&parallel_scan->phs_mutex);
1705  sync_startpage = ss_get_location(scan->rs_rd, scan->rs_nblocks);
1706  goto retry;
1707  }
1708  parallel_scan->phs_cblock = parallel_scan->phs_startblock;
1709  }
1710 
1711  /*
1712  * The current block number is the next one that needs to be scanned,
1713  * unless it's InvalidBlockNumber already, in which case there are no more
1714  * blocks to scan. After remembering the current value, we must advance
1715  * it so that the next call to this function returns the next block to be
1716  * scanned.
1717  */
1718  page = parallel_scan->phs_cblock;
1719  if (page != InvalidBlockNumber)
1720  {
1721  parallel_scan->phs_cblock++;
1722  if (parallel_scan->phs_cblock >= scan->rs_nblocks)
1723  parallel_scan->phs_cblock = 0;
1724  if (parallel_scan->phs_cblock == parallel_scan->phs_startblock)
1725  {
1726  parallel_scan->phs_cblock = InvalidBlockNumber;
1727  report_page = parallel_scan->phs_startblock;
1728  }
1729  }
1730 
1731  /* Release the lock. */
1732  SpinLockRelease(&parallel_scan->phs_mutex);
1733 
1734  /*
1735  * Report scan location. Normally, we report the current page number.
1736  * When we reach the end of the scan, though, we report the starting page,
1737  * not the ending page, just so the starting positions for later scans
1738  * don't slew backwards. We only report the position at the end of the
1739  * scan once, though: subsequent callers will report nothing, since
1740  * they will have page == InvalidBlockNumber.
1741  */
1742  if (scan->rs_syncscan)
1743  {
1744  if (report_page == InvalidBlockNumber)
1745  report_page = page;
1746  if (report_page != InvalidBlockNumber)
1747  ss_report_location(scan->rs_rd, report_page);
1748  }
1749 
1750  return page;
1751 }
1752 
1753 /* ----------------
1754  * heap_update_snapshot
1755  *
1756  * Update snapshot info in heap scan descriptor.
1757  * ----------------
1758  */
1759 void
1760 heap_update_snapshot(HeapScanDesc scan, Snapshot snapshot)
1761 {
1762  Assert(IsMVCCSnapshot(snapshot));
1763 
1764  RegisterSnapshot(snapshot);
1765  scan->rs_snapshot = snapshot;
1766  scan->rs_temp_snap = true;
1767 }
1768 
1769 /* ----------------
1770  * heap_getnext - retrieve next tuple in scan
1771  *
1772  * Fix to work with index relations.
1773  * We don't return the buffer anymore, but you can get it from the
1774  * returned HeapTuple.
1775  * ----------------
1776  */
1777 
1778 #ifdef HEAPDEBUGALL
1779 #define HEAPDEBUG_1 \
1780  elog(DEBUG2, "heap_getnext([%s,nkeys=%d],dir=%d) called", \
1781  RelationGetRelationName(scan->rs_rd), scan->rs_nkeys, (int) direction)
1782 #define HEAPDEBUG_2 \
1783  elog(DEBUG2, "heap_getnext returning EOS")
1784 #define HEAPDEBUG_3 \
1785  elog(DEBUG2, "heap_getnext returning tuple")
1786 #else
1787 #define HEAPDEBUG_1
1788 #define HEAPDEBUG_2
1789 #define HEAPDEBUG_3
1790 #endif /* !defined(HEAPDEBUGALL) */
1791 
1792 
1793 HeapTuple
1794 heap_getnext(HeapScanDesc scan, ScanDirection direction)
1795 {
1796  /* Note: no locking manipulations needed */
1797 
1798  HEAPDEBUG_1; /* heap_getnext( info ) */
1799 
1800  if (scan->rs_pageatatime)
1801  heapgettup_pagemode(scan, direction,
1802  scan->rs_nkeys, scan->rs_key);
1803  else
1804  heapgettup(scan, direction, scan->rs_nkeys, scan->rs_key);
1805 
1806  if (scan->rs_ctup.t_data == NULL)
1807  {
1808  HEAPDEBUG_2; /* heap_getnext returning EOS */
1809  return NULL;
1810  }
1811 
1812  /*
1813  * if we get here it means we have a new current scan tuple, so point to
1814  * the proper return buffer and return the tuple.
1815  */
1816  HEAPDEBUG_3; /* heap_getnext returning tuple */
1817 
1818  pgstat_count_heap_getnext(scan->rs_rd);
1819 
1820  return &(scan->rs_ctup);
1821 }
1822 
1823 /*
1824  * heap_fetch - retrieve tuple with given tid
1825  *
1826  * On entry, tuple->t_self is the TID to fetch. We pin the buffer holding
1827  * the tuple, fill in the remaining fields of *tuple, and check the tuple
1828  * against the specified snapshot.
1829  *
1830  * If successful (tuple found and passes snapshot time qual), then *userbuf
1831  * is set to the buffer holding the tuple and TRUE is returned. The caller
1832  * must unpin the buffer when done with the tuple.
1833  *
1834  * If the tuple is not found (ie, item number references a deleted slot),
1835  * then tuple->t_data is set to NULL and FALSE is returned.
1836  *
1837  * If the tuple is found but fails the time qual check, then FALSE is returned
1838  * but tuple->t_data is left pointing to the tuple.
1839  *
1840  * keep_buf determines what is done with the buffer in the FALSE-result cases.
1841  * When the caller specifies keep_buf = true, we retain the pin on the buffer
1842  * and return it in *userbuf (so the caller must eventually unpin it); when
1843  * keep_buf = false, the pin is released and *userbuf is set to InvalidBuffer.
1844  *
1845  * stats_relation is the relation to charge the heap_fetch operation against
1846  * for statistical purposes. (This could be the heap rel itself, an
1847  * associated index, or NULL to not count the fetch at all.)
1848  *
1849  * heap_fetch does not follow HOT chains: only the exact TID requested will
1850  * be fetched.
1851  *
1852  * It is somewhat inconsistent that we ereport() on invalid block number but
1853  * return false on invalid item number. There are a couple of reasons though.
1854  * One is that the caller can relatively easily check the block number for
1855  * validity, but cannot check the item number without reading the page
1856  * himself. Another is that when we are following a t_ctid link, we can be
1857  * reasonably confident that the page number is valid (since VACUUM shouldn't
1858  * truncate off the destination page without having killed the referencing
1859  * tuple first), but the item number might well not be good.
1860  */
1861 bool
1862 heap_fetch(Relation relation,
1863  Snapshot snapshot,
1864  HeapTuple tuple,
1865  Buffer *userbuf,
1866  bool keep_buf,
1867  Relation stats_relation)
1868 {
1869  ItemPointer tid = &(tuple->t_self);
1870  ItemId lp;
1871  Buffer buffer;
1872  Page page;
1873  OffsetNumber offnum;
1874  bool valid;
1875 
1876  /*
1877  * Fetch and pin the appropriate page of the relation.
1878  */
1879  buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid));
1880 
1881  /*
1882  * Need share lock on buffer to examine tuple commit status.
1883  */
1884  LockBuffer(buffer, BUFFER_LOCK_SHARE);
1885  page = BufferGetPage(buffer);
1886  TestForOldSnapshot(snapshot, relation, page);
1887 
1888  /*
1889  * We'd better check for an out-of-range offnum in case the page has been
1890  * modified by VACUUM since the TID was obtained.
1891  */
1892  offnum = ItemPointerGetOffsetNumber(tid);
1893  if (offnum < FirstOffsetNumber || offnum > PageGetMaxOffsetNumber(page))
1894  {
1895  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
1896  if (keep_buf)
1897  *userbuf = buffer;
1898  else
1899  {
1900  ReleaseBuffer(buffer);
1901  *userbuf = InvalidBuffer;
1902  }
1903  tuple->t_data = NULL;
1904  return false;
1905  }
1906 
1907  /*
1908  * get the item line pointer corresponding to the requested tid
1909  */
1910  lp = PageGetItemId(page, offnum);
1911 
1912  /*
1913  * Must check for deleted tuple.
1914  */
1915  if (!ItemIdIsNormal(lp))
1916  {
1917  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
1918  if (keep_buf)
1919  *userbuf = buffer;
1920  else
1921  {
1922  ReleaseBuffer(buffer);
1923  *userbuf = InvalidBuffer;
1924  }
1925  tuple->t_data = NULL;
1926  return false;
1927  }
1928 
1929  /*
1930  * fill in *tuple fields
1931  */
1932  tuple->t_data = (HeapTupleHeader) PageGetItem(page, lp);
1933  tuple->t_len = ItemIdGetLength(lp);
1934  tuple->t_tableOid = RelationGetRelid(relation);
1935 
1936  /*
1937  * check time qualification of tuple, then release lock
1938  */
1939  valid = HeapTupleSatisfiesVisibility(tuple, snapshot, buffer);
1940 
1941  if (valid)
1942  PredicateLockTuple(relation, tuple, snapshot);
1943 
1944  CheckForSerializableConflictOut(valid, relation, tuple, buffer, snapshot);
1945 
1946  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
1947 
1948  if (valid)
1949  {
1950  /*
1951  * All checks passed, so return the tuple as valid. Caller is now
1952  * responsible for releasing the buffer.
1953  */
1954  *userbuf = buffer;
1955 
1956  /* Count the successful fetch against appropriate rel, if any */
1957  if (stats_relation != NULL)
1958  pgstat_count_heap_fetch(stats_relation);
1959 
1960  return true;
1961  }
1962 
1963  /* Tuple failed time qual, but maybe caller wants to see it anyway. */
1964  if (keep_buf)
1965  *userbuf = buffer;
1966  else
1967  {
1968  ReleaseBuffer(buffer);
1969  *userbuf = InvalidBuffer;
1970  }
1971 
1972  return false;
1973 }
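A hedged sketch (illustrative only, not from this file) of calling heap_fetch() as documented above: the caller fills t_self with the TID, passes keep_buf = false, and must release the returned pin on success. The block/offset parameters are assumed inputs.

static bool
example_fetch(Relation rel, Snapshot snapshot,
              BlockNumber blkno, OffsetNumber offnum)
{
    HeapTupleData tuple;
    Buffer        buf;

    ItemPointerSet(&tuple.t_self, blkno, offnum);
    if (heap_fetch(rel, snapshot, &tuple, &buf, false, NULL))
    {
        /* tuple.t_data is valid while the buffer pin is held */
        ReleaseBuffer(buf);      /* caller must unpin when done */
        return true;
    }
    /* with keep_buf = false, no pin is held in the failure cases */
    return false;
}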
1974 
1975 /*
1976  * heap_hot_search_buffer - search HOT chain for tuple satisfying snapshot
1977  *
1978  * On entry, *tid is the TID of a tuple (either a simple tuple, or the root
1979  * of a HOT chain), and buffer is the buffer holding this tuple. We search
1980  * for the first chain member satisfying the given snapshot. If one is
1981  * found, we update *tid to reference that tuple's offset number, and
1982  * return TRUE. If no match, return FALSE without modifying *tid.
1983  *
1984  * heapTuple is a caller-supplied buffer. When a match is found, we return
1985  * the tuple here, in addition to updating *tid. If no match is found, the
1986  * contents of this buffer on return are undefined.
1987  *
1988  * If all_dead is not NULL, we check non-visible tuples to see if they are
1989  * globally dead; *all_dead is set TRUE if all members of the HOT chain
1990  * are vacuumable, FALSE if not.
1991  *
1992  * Unlike heap_fetch, the caller must already have pin and (at least) share
1993  * lock on the buffer; it is still pinned/locked at exit. Also unlike
1994  * heap_fetch, we do not report any pgstats count; caller may do so if wanted.
1995  */
1996 bool
1997 heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer,
1998  Snapshot snapshot, HeapTuple heapTuple,
1999  bool *all_dead, bool first_call)
2000 {
2001  Page dp = (Page) BufferGetPage(buffer);
2002  TransactionId prev_xmax = InvalidTransactionId;
2003  OffsetNumber offnum;
2004  bool at_chain_start;
2005  bool valid;
2006  bool skip;
2007 
2008  /* If this is not the first call, previous call returned a (live!) tuple */
2009  if (all_dead)
2010  *all_dead = first_call;
2011 
2012  Assert(TransactionIdIsValid(RecentGlobalXmin));
2013 
2014  Assert(ItemPointerGetBlockNumber(tid) == BufferGetBlockNumber(buffer));
2015  offnum = ItemPointerGetOffsetNumber(tid);
2016  at_chain_start = first_call;
2017  skip = !first_call;
2018 
2019  heapTuple->t_self = *tid;
2020 
2021  /* Scan through possible multiple members of HOT-chain */
2022  for (;;)
2023  {
2024  ItemId lp;
2025 
2026  /* check for bogus TID */
2027  if (offnum < FirstOffsetNumber || offnum > PageGetMaxOffsetNumber(dp))
2028  break;
2029 
2030  lp = PageGetItemId(dp, offnum);
2031 
2032  /* check for unused, dead, or redirected items */
2033  if (!ItemIdIsNormal(lp))
2034  {
2035  /* We should only see a redirect at start of chain */
2036  if (ItemIdIsRedirected(lp) && at_chain_start)
2037  {
2038  /* Follow the redirect */
2039  offnum = ItemIdGetRedirect(lp);
2040  at_chain_start = false;
2041  continue;
2042  }
2043  /* else must be end of chain */
2044  break;
2045  }
2046 
2047  heapTuple->t_data = (HeapTupleHeader) PageGetItem(dp, lp);
2048  heapTuple->t_len = ItemIdGetLength(lp);
2049  heapTuple->t_tableOid = RelationGetRelid(relation);
2050  ItemPointerSetOffsetNumber(&heapTuple->t_self, offnum);
2051 
2052  /*
2053  * Shouldn't see a HEAP_ONLY tuple at chain start.
2054  */
2055  if (at_chain_start && HeapTupleIsHeapOnly(heapTuple))
2056  break;
2057 
2058  /*
2059  * The xmin should match the previous xmax value, else chain is
2060  * broken.
2061  */
2062  if (TransactionIdIsValid(prev_xmax) &&
2063  !TransactionIdEquals(prev_xmax,
2064  HeapTupleHeaderGetXmin(heapTuple->t_data)))
2065  break;
2066 
2067  /*
2068  * When first_call is true (and thus, skip is initially false) we'll
2069  * return the first tuple we find. But on later passes, heapTuple
2070  * will initially be pointing to the tuple we returned last time.
2071  * Returning it again would be incorrect (and would loop forever), so
2072  * we skip it and return the next match we find.
2073  */
2074  if (!skip)
2075  {
2076  /*
2077  * For the benefit of logical decoding, have t_self point at the
2078  * element of the HOT chain we're currently investigating instead
2079  * of the root tuple of the HOT chain. This is important because
2080  * the *Satisfies routine for historical mvcc snapshots needs the
2081  * correct tid to decide about the visibility in some cases.
2082  */
2083  ItemPointerSet(&(heapTuple->t_self), BufferGetBlockNumber(buffer), offnum);
2084 
2085  /* If it's visible per the snapshot, we must return it */
2086  valid = HeapTupleSatisfiesVisibility(heapTuple, snapshot, buffer);
2087  CheckForSerializableConflictOut(valid, relation, heapTuple,
2088  buffer, snapshot);
2089  /* reset to original, non-redirected, tid */
2090  heapTuple->t_self = *tid;
2091 
2092  if (valid)
2093  {
2094  ItemPointerSetOffsetNumber(tid, offnum);
2095  PredicateLockTuple(relation, heapTuple, snapshot);
2096  if (all_dead)
2097  *all_dead = false;
2098  return true;
2099  }
2100  }
2101  skip = false;
2102 
2103  /*
2104  * If we can't see it, maybe no one else can either. At caller
2105  * request, check whether all chain members are dead to all
2106  * transactions.
2107  */
2108  if (all_dead && *all_dead &&
2109  !HeapTupleIsSurelyDead(heapTuple, RecentGlobalXmin))
2110  *all_dead = false;
2111 
2112  /*
2113  * Check to see if HOT chain continues past this tuple; if so fetch
2114  * the next offnum and loop around.
2115  */
2116  if (HeapTupleIsHotUpdated(heapTuple))
2117  {
2118  Assert(ItemPointerGetBlockNumber(&heapTuple->t_data->t_ctid) ==
2119  BufferGetBlockNumber(buffer));
2120  offnum = ItemPointerGetOffsetNumber(&heapTuple->t_data->t_ctid);
2121  at_chain_start = false;
2122  prev_xmax = HeapTupleHeaderGetUpdateXid(heapTuple->t_data);
2123  }
2124  else
2125  break; /* end of chain */
2126  }
2127 
2128  return false;
2129 }
2130 
2131 /*
2132  * heap_hot_search - search HOT chain for tuple satisfying snapshot
2133  *
2134  * This has the same API as heap_hot_search_buffer, except that the caller
2135  * does not provide the buffer containing the page, rather we access it
2136  * locally.
2137  */
2138 bool
2139 heap_hot_search(ItemPointer tid, Relation relation, Snapshot snapshot,
2140  bool *all_dead)
2141 {
2142  bool result;
2143  Buffer buffer;
2144  HeapTupleData heapTuple;
2145 
2146  buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid));
2147  LockBuffer(buffer, BUFFER_LOCK_SHARE);
2148  result = heap_hot_search_buffer(tid, relation, buffer, snapshot,
2149  &heapTuple, all_dead, true);
2150  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
2151  ReleaseBuffer(buffer);
2152  return result;
2153 }
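Illustrative sketch of heap_hot_search() usage (not from this file): on success *tid is updated to the visible member of the HOT chain, and all_dead can be used, for example, to set kill hints in an index.

static bool
example_hot_visible(Relation rel, Snapshot snapshot, ItemPointer tid)
{
    bool        all_dead;

    if (heap_hot_search(tid, rel, snapshot, &all_dead))
        return true;            /* *tid now names the visible chain member */

    /* all_dead == true means every chain member is vacuumable */
    return false;
}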
2154 
2155 /*
2156  * heap_get_latest_tid - get the latest tid of a specified tuple
2157  *
2158  * Actually, this gets the latest version that is visible according to
2159  * the passed snapshot. You can pass SnapshotDirty to get the very latest,
2160  * possibly uncommitted version.
2161  *
2162  * *tid is both an input and an output parameter: it is updated to
2163  * show the latest version of the row. Note that it will not be changed
2164  * if no version of the row passes the snapshot test.
2165  */
2166 void
2167 heap_get_latest_tid(Relation relation,
2168  Snapshot snapshot,
2169  ItemPointer tid)
2170 {
2171  BlockNumber blk;
2172  ItemPointerData ctid;
2173  TransactionId priorXmax;
2174 
2175  /* this is to avoid Assert failures on bad input */
2176  if (!ItemPointerIsValid(tid))
2177  return;
2178 
2179  /*
2180  * Since this can be called with user-supplied TID, don't trust the input
2181  * too much. (RelationGetNumberOfBlocks is an expensive check, so we
2182  * don't check t_ctid links again this way. Note that it would not do to
2183  * call it just once and save the result, either.)
2184  */
2185  blk = ItemPointerGetBlockNumber(tid);
2186  if (blk >= RelationGetNumberOfBlocks(relation))
2187  elog(ERROR, "block number %u is out of range for relation \"%s\"",
2188  blk, RelationGetRelationName(relation));
2189 
2190  /*
2191  * Loop to chase down t_ctid links. At top of loop, ctid is the tuple we
2192  * need to examine, and *tid is the TID we will return if ctid turns out
2193  * to be bogus.
2194  *
2195  * Note that we will loop until we reach the end of the t_ctid chain.
2196  * Depending on the snapshot passed, there might be at most one visible
2197  * version of the row, but we don't try to optimize for that.
2198  */
2199  ctid = *tid;
2200  priorXmax = InvalidTransactionId; /* cannot check first XMIN */
2201  for (;;)
2202  {
2203  Buffer buffer;
2204  Page page;
2205  OffsetNumber offnum;
2206  ItemId lp;
2207  HeapTupleData tp;
2208  bool valid;
2209 
2210  /*
2211  * Read, pin, and lock the page.
2212  */
2213  buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(&ctid));
2214  LockBuffer(buffer, BUFFER_LOCK_SHARE);
2215  page = BufferGetPage(buffer);
2216  TestForOldSnapshot(snapshot, relation, page);
2217 
2218  /*
2219  * Check for bogus item number. This is not treated as an error
2220  * condition because it can happen while following a t_ctid link. We
2221  * just assume that the prior tid is OK and return it unchanged.
2222  */
2223  offnum = ItemPointerGetOffsetNumber(&ctid);
2224  if (offnum < FirstOffsetNumber || offnum > PageGetMaxOffsetNumber(page))
2225  {
2226  UnlockReleaseBuffer(buffer);
2227  break;
2228  }
2229  lp = PageGetItemId(page, offnum);
2230  if (!ItemIdIsNormal(lp))
2231  {
2232  UnlockReleaseBuffer(buffer);
2233  break;
2234  }
2235 
2236  /* OK to access the tuple */
2237  tp.t_self = ctid;
2238  tp.t_data = (HeapTupleHeader) PageGetItem(page, lp);
2239  tp.t_len = ItemIdGetLength(lp);
2240  tp.t_tableOid = RelationGetRelid(relation);
2241 
2242  /*
2243  * After following a t_ctid link, we might arrive at an unrelated
2244  * tuple. Check for XMIN match.
2245  */
2246  if (TransactionIdIsValid(priorXmax) &&
2247  !TransactionIdEquals(HeapTupleHeaderGetXmin(tp.t_data), priorXmax))
2248  {
2249  UnlockReleaseBuffer(buffer);
2250  break;
2251  }
2252 
2253  /*
2254  * Check time qualification of tuple; if visible, set it as the new
2255  * result candidate.
2256  */
2257  valid = HeapTupleSatisfiesVisibility(&tp, snapshot, buffer);
2258  CheckForSerializableConflictOut(valid, relation, &tp, buffer, snapshot);
2259  if (valid)
2260  *tid = ctid;
2261 
2262  /*
2263  * If there's a valid t_ctid link, follow it, else we're done.
2264  */
2265  if ((tp.t_data->t_infomask & HEAP_XMAX_INVALID) ||
2266  HeapTupleHeaderIsOnlyLocked(tp.t_data) ||
2267  ItemPointerEquals(&tp.t_self, &tp.t_data->t_ctid))
2268  {
2269  UnlockReleaseBuffer(buffer);
2270  break;
2271  }
2272 
2273  ctid = tp.t_data->t_ctid;
2274  priorXmax = HeapTupleHeaderGetUpdateXid(tp.t_data);
2275  UnlockReleaseBuffer(buffer);
2276  } /* end of loop */
2277 }
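A small sketch (assuming a caller-provided relation and snapshot) showing that heap_get_latest_tid() updates the TID in place only when a visible version is found.

static ItemPointerData
example_latest_tid(Relation rel, Snapshot snapshot, ItemPointerData tid)
{
    /* tid is left unchanged if no version passes the snapshot test */
    heap_get_latest_tid(rel, snapshot, &tid);
    return tid;
}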
2278 
2279 
2280 /*
2281  * UpdateXmaxHintBits - update tuple hint bits after xmax transaction ends
2282  *
2283  * This is called after we have waited for the XMAX transaction to terminate.
2284  * If the transaction aborted, we guarantee the XMAX_INVALID hint bit will
2285  * be set on exit. If the transaction committed, we set the XMAX_COMMITTED
2286  * hint bit if possible --- but beware that that may not yet be possible,
2287  * if the transaction committed asynchronously.
2288  *
2289  * Note that if the transaction was a locker only, we set HEAP_XMAX_INVALID
2290  * even if it commits.
2291  *
2292  * Hence callers should look only at XMAX_INVALID.
2293  *
2294  * Note this is not allowed for tuples whose xmax is a multixact.
2295  */
2296 static void
2297 UpdateXmaxHintBits(HeapTupleHeader tuple, Buffer buffer, TransactionId xid)
2298 {
2299  Assert(TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple), xid));
2300  Assert(!(tuple->t_infomask & HEAP_XMAX_IS_MULTI));
2301 
2302  if (!(tuple->t_infomask & (HEAP_XMAX_COMMITTED | HEAP_XMAX_INVALID)))
2303  {
2304  if (!HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask) &&
2305  TransactionIdDidCommit(xid))
2306  HeapTupleSetHintBits(tuple, buffer, HEAP_XMAX_COMMITTED,
2307  xid);
2308  else
2309  HeapTupleSetHintBits(tuple, buffer, HEAP_XMAX_INVALID,
2310  InvalidTransactionId);
2311  }
2312 }
2313 
2314 
2315 /*
2316  * GetBulkInsertState - prepare status object for a bulk insert
2317  */
2318 BulkInsertState
2319 GetBulkInsertState(void)
2320 {
2321  BulkInsertState bistate;
2322 
2323  bistate = (BulkInsertState) palloc(sizeof(BulkInsertStateData));
2324  bistate->strategy = GetAccessStrategy(BAS_BULKWRITE);
2325  bistate->current_buf = InvalidBuffer;
2326  return bistate;
2327 }
2328 
2329 /*
2330  * FreeBulkInsertState - clean up after finishing a bulk insert
2331  */
2332 void
2333 FreeBulkInsertState(BulkInsertState bistate)
2334 {
2335  if (bistate->current_buf != InvalidBuffer)
2336  ReleaseBuffer(bistate->current_buf);
2337  FreeAccessStrategy(bistate->strategy);
2338  pfree(bistate);
2339 }
2340 
2341 /*
2342  * ReleaseBulkInsertStatePin - release a buffer currently held in bistate
2343  */
2344 void
2345 ReleaseBulkInsertStatePin(BulkInsertState bistate)
2346 {
2347  if (bistate->current_buf != InvalidBuffer)
2348  ReleaseBuffer(bistate->current_buf);
2349  bistate->current_buf = InvalidBuffer;
2350 }
2351 
2352 
2353 /*
2354  * heap_insert - insert tuple into a heap
2355  *
2356  * The new tuple is stamped with current transaction ID and the specified
2357  * command ID.
2358  *
2359  * If the HEAP_INSERT_SKIP_WAL option is specified, the new tuple is not
2360  * logged in WAL, even for a non-temp relation. Safe usage of this behavior
2361  * requires that we arrange that all new tuples go into new pages not
2362  * containing any tuples from other transactions, and that the relation gets
2363  * fsync'd before commit. (See also heap_sync() comments)
2364  *
2365  * The HEAP_INSERT_SKIP_FSM option is passed directly to
2366  * RelationGetBufferForTuple, which see for more info.
2367  *
2368  * HEAP_INSERT_FROZEN should only be specified for inserts into
2369  * relfilenodes created during the current subtransaction and when
2370  * there are no prior snapshots or pre-existing portals open.
2371  * This causes rows to be frozen, which is an MVCC violation and
2372  * requires explicit options chosen by user.
2373  *
2374  * HEAP_INSERT_IS_SPECULATIVE is used on so-called "speculative insertions",
2375  * which can be backed out afterwards without aborting the whole transaction.
2376  * Other sessions can wait for the speculative insertion to be confirmed,
2377  * turning it into a regular tuple, or aborted, as if it never existed.
2378  * Speculatively inserted tuples behave as "value locks" of short duration,
2379  * used to implement INSERT .. ON CONFLICT.
2380  *
2381  * Note that most of these options will be applied when inserting into the
2382  * heap's TOAST table, too, if the tuple requires any out-of-line data. Only
2383  * HEAP_INSERT_IS_SPECULATIVE is explicitly ignored, as the toast data does
2384  * not partake in speculative insertion.
2385  *
2386  * The BulkInsertState object (if any; bistate can be NULL for default
2387  * behavior) is also just passed through to RelationGetBufferForTuple.
2388  *
2389  * The return value is the OID assigned to the tuple (either here or by the
2390  * caller), or InvalidOid if no OID. The header fields of *tup are updated
2391  * to match the stored tuple; in particular tup->t_self receives the actual
2392  * TID where the tuple was stored. But note that any toasting of fields
2393  * within the tuple data is NOT reflected into *tup.
2394  */
2395 Oid
2396 heap_insert(Relation relation, HeapTuple tup, CommandId cid,
2397  int options, BulkInsertState bistate)
2398 {
2399  TransactionId xid = GetCurrentTransactionId();
2400  HeapTuple heaptup;
2401  Buffer buffer;
2402  Buffer vmbuffer = InvalidBuffer;
2403  bool all_visible_cleared = false;
2404 
2405  /*
2406  * Fill in tuple header fields, assign an OID, and toast the tuple if
2407  * necessary.
2408  *
2409  * Note: below this point, heaptup is the data we actually intend to store
2410  * into the relation; tup is the caller's original untoasted data.
2411  */
2412  heaptup = heap_prepare_insert(relation, tup, xid, cid, options);
2413 
2414  /*
2415  * Find buffer to insert this tuple into. If the page is all visible,
2416  * this will also pin the requisite visibility map page.
2417  */
2418  buffer = RelationGetBufferForTuple(relation, heaptup->t_len,
2419  InvalidBuffer, options, bistate,
2420  &vmbuffer, NULL);
2421 
2422  /*
2423  * We're about to do the actual insert -- but check for conflict first, to
2424  * avoid possibly having to roll back work we've just done.
2425  *
2426  * This is safe without a recheck as long as there is no possibility of
2427  * another process scanning the page between this check and the insert
2428  * being visible to the scan (i.e., an exclusive buffer content lock is
2429  * continuously held from this point until the tuple insert is visible).
2430  *
2431  * For a heap insert, we only need to check for table-level SSI locks. Our
2432  * new tuple can't possibly conflict with existing tuple locks, and heap
2433  * page locks are only consolidated versions of tuple locks; they do not
2434  * lock "gaps" as index page locks do. So we don't need to specify a
2435  * buffer when making the call, which makes for a faster check.
2436  */
2437  CheckForSerializableConflictIn(relation, NULL, InvalidBuffer);
2438 
2439  /* NO EREPORT(ERROR) from here till changes are logged */
2440  START_CRIT_SECTION();
2441 
2442  RelationPutHeapTuple(relation, buffer, heaptup,
2443  (options & HEAP_INSERT_SPECULATIVE) != 0);
2444 
2445  if (PageIsAllVisible(BufferGetPage(buffer)))
2446  {
2447  all_visible_cleared = true;
2448  PageClearAllVisible(BufferGetPage(buffer));
2449  visibilitymap_clear(relation,
2450  ItemPointerGetBlockNumber(&(heaptup->t_self)),
2451  vmbuffer, VISIBILITYMAP_VALID_BITS);
2452  }
2453 
2454  /*
2455  * XXX Should we set PageSetPrunable on this page ?
2456  *
2457  * The inserting transaction may eventually abort thus making this tuple
2458  * DEAD and hence available for pruning. Though we don't want to optimize
2459  * for aborts, if no other tuple in this page is UPDATEd/DELETEd, the
2460  * aborted tuple will never be pruned until next vacuum is triggered.
2461  *
2462  * If you do add PageSetPrunable here, add it in heap_xlog_insert too.
2463  */
2464 
2465  MarkBufferDirty(buffer);
2466 
2467  /* XLOG stuff */
2468  if (!(options & HEAP_INSERT_SKIP_WAL) && RelationNeedsWAL(relation))
2469  {
2470  xl_heap_insert xlrec;
2471  xl_heap_header xlhdr;
2472  XLogRecPtr recptr;
2473  Page page = BufferGetPage(buffer);
2474  uint8 info = XLOG_HEAP_INSERT;
2475  int bufflags = 0;
2476 
2477  /*
2478  * If this is a catalog, we need to transmit combocids to properly
2479  * decode, so log that as well.
2480  */
2481  if (RelationIsAccessibleInLogicalDecoding(relation))
2482  log_heap_new_cid(relation, heaptup);
2483 
2484  /*
2485  * If this is the single and first tuple on page, we can reinit the
2486  * page instead of restoring the whole thing. Set flag, and hide
2487  * buffer references from XLogInsert.
2488  */
2489  if (ItemPointerGetOffsetNumber(&(heaptup->t_self)) == FirstOffsetNumber &&
2490  PageGetMaxOffsetNumber(page) == FirstOffsetNumber)
2491  {
2492  info |= XLOG_HEAP_INIT_PAGE;
2493  bufflags |= REGBUF_WILL_INIT;
2494  }
2495 
2496  xlrec.offnum = ItemPointerGetOffsetNumber(&heaptup->t_self);
2497  xlrec.flags = 0;
2498  if (all_visible_cleared)
2499  xlrec.flags |= XLH_INSERT_ALL_VISIBLE_CLEARED;
2500  if (options & HEAP_INSERT_SPECULATIVE)
2501  xlrec.flags |= XLH_INSERT_IS_SPECULATIVE;
2502  Assert(ItemPointerGetBlockNumber(&heaptup->t_self) == BufferGetBlockNumber(buffer));
2503 
2504  /*
2505  * For logical decoding, we need the tuple even if we're doing a full
2506  * page write, so make sure it's included even if we take a full-page
2507  * image. (XXX We could alternatively store a pointer into the FPW).
2508  */
2509  if (RelationIsLogicallyLogged(relation))
2510  {
2511  xlrec.flags |= XLH_INSERT_CONTAINS_NEW_TUPLE;
2512  bufflags |= REGBUF_KEEP_DATA;
2513  }
2514 
2515  XLogBeginInsert();
2516  XLogRegisterData((char *) &xlrec, SizeOfHeapInsert);
2517 
2518  xlhdr.t_infomask2 = heaptup->t_data->t_infomask2;
2519  xlhdr.t_infomask = heaptup->t_data->t_infomask;
2520  xlhdr.t_hoff = heaptup->t_data->t_hoff;
2521 
2522  /*
2523  * note we mark xlhdr as belonging to buffer; if XLogInsert decides to
2524  * write the whole page to the xlog, we don't need to store
2525  * xl_heap_header in the xlog.
2526  */
2527  XLogRegisterBuffer(0, buffer, REGBUF_STANDARD | bufflags);
2528  XLogRegisterBufData(0, (char *) &xlhdr, SizeOfHeapHeader);
2529  /* PG73FORMAT: write bitmap [+ padding] [+ oid] + data */
2530  XLogRegisterBufData(0,
2531  (char *) heaptup->t_data + SizeofHeapTupleHeader,
2532  heaptup->t_len - SizeofHeapTupleHeader);
2533 
2534  /* filtering by origin on a row level is much more efficient */
2535  XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN);
2536 
2537  recptr = XLogInsert(RM_HEAP_ID, info);
2538 
2539  PageSetLSN(page, recptr);
2540  }
2541 
2542  END_CRIT_SECTION();
2543 
2544  UnlockReleaseBuffer(buffer);
2545  if (vmbuffer != InvalidBuffer)
2546  ReleaseBuffer(vmbuffer);
2547 
2548  /*
2549  * If tuple is cachable, mark it for invalidation from the caches in case
2550  * we abort. Note it is OK to do this after releasing the buffer, because
2551  * the heaptup data structure is all in local memory, not in the shared
2552  * buffer.
2553  */
2554  CacheInvalidateHeapTuple(relation, heaptup, NULL);
2555 
2556  /* Note: speculative insertions are counted too, even if aborted later */
2557  pgstat_count_heap_insert(relation, 1);
2558 
2559  /*
2560  * If heaptup is a private copy, release it. Don't forget to copy t_self
2561  * back to the caller's image, too.
2562  */
2563  if (heaptup != tup)
2564  {
2565  tup->t_self = heaptup->t_self;
2566  heap_freetuple(heaptup);
2567  }
2568 
2569  return HeapTupleGetOid(tup);
2570 }
2571 
2572 /*
2573  * Subroutine for heap_insert(). Prepares a tuple for insertion. This sets the
2574  * tuple header fields, assigns an OID, and toasts the tuple if necessary.
2575  * Returns a toasted version of the tuple if it was toasted, or the original
2576  * tuple if not. Note that in any case, the header fields are also set in
2577  * the original tuple.
2578  */
2579 static HeapTuple
2580 heap_prepare_insert(Relation relation, HeapTuple tup, TransactionId xid,
2581  CommandId cid, int options)
2582 {
2583  /*
2584  * For now, parallel operations are required to be strictly read-only.
2585  * Unlike heap_update() and heap_delete(), an insert should never create a
2586  * combo CID, so it might be possible to relax this restriction, but not
2587  * without more thought and testing.
2588  */
2589  if (IsInParallelMode())
2590  ereport(ERROR,
2591  (errcode(ERRCODE_INVALID_TRANSACTION_STATE),
2592  errmsg("cannot insert tuples during a parallel operation")));
2593 
2594  if (relation->rd_rel->relhasoids)
2595  {
2596 #ifdef NOT_USED
2597  /* this is redundant with an Assert in HeapTupleSetOid */
2598  Assert(tup->t_data->t_infomask & HEAP_HASOID);
2599 #endif
2600 
2601  /*
2602  * If the object id of this tuple has already been assigned, trust the
2603  * caller. There are a couple of ways this can happen. At initial db
2604  * creation, the backend program sets oids for tuples. When we define
2605  * an index, we set the oid. Finally, in the future, we may allow
2606  * users to set their own object ids in order to support a persistent
2607  * object store (objects need to contain pointers to one another).
2608  */
2609  if (!OidIsValid(HeapTupleGetOid(tup)))
2610  HeapTupleSetOid(tup, GetNewOid(relation));
2611  }
2612  else
2613  {
2614  /* check there is no space for an OID */
2615  Assert(!(tup->t_data->t_infomask & HEAP_HASOID));
2616  }
2617 
2618  tup->t_data->t_infomask &= ~(HEAP_XACT_MASK);
2619  tup->t_data->t_infomask2 &= ~(HEAP2_XACT_MASK);
2620  tup->t_data->t_infomask |= HEAP_XMAX_INVALID;
2621  HeapTupleHeaderSetXmin(tup->t_data, xid);
2622  if (options & HEAP_INSERT_FROZEN)
2623  HeapTupleHeaderSetXminFrozen(tup->t_data);
2624 
2625  HeapTupleHeaderSetCmin(tup->t_data, cid);
2626  HeapTupleHeaderSetXmax(tup->t_data, 0); /* for cleanliness */
2627  tup->t_tableOid = RelationGetRelid(relation);
2628 
2629  /*
2630  * If the new tuple is too big for storage or contains already toasted
2631  * out-of-line attributes from some other relation, invoke the toaster.
2632  */
2633  if (relation->rd_rel->relkind != RELKIND_RELATION &&
2634  relation->rd_rel->relkind != RELKIND_MATVIEW)
2635  {
2636  /* toast table entries should never be recursively toasted */
2637  Assert(!HeapTupleHasExternal(tup));
2638  return tup;
2639  }
2640  else if (HeapTupleHasExternal(tup) || tup->t_len > TOAST_TUPLE_THRESHOLD)
2641  return toast_insert_or_update(relation, tup, NULL, options);
2642  else
2643  return tup;
2644 }
2645 
2646 /*
2647  * heap_multi_insert - insert multiple tuples into a heap
2648  *
2649  * This is like heap_insert(), but inserts multiple tuples in one operation.
2650  * That's faster than calling heap_insert() in a loop, because when multiple
2651  * tuples can be inserted on a single page, we can write just a single WAL
2652  * record covering all of them, and only need to lock/unlock the page once.
2653  *
2654  * Note: this leaks memory into the current memory context. You can create a
2655  * temporary context before calling this, if that's a problem.
2656  */
2657 void
2658 heap_multi_insert(Relation relation, HeapTuple *tuples, int ntuples,
2659  CommandId cid, int options, BulkInsertState bistate)
2660 {
2661  TransactionId xid = GetCurrentTransactionId();
2662  HeapTuple *heaptuples;
2663  int i;
2664  int ndone;
2665  char *scratch = NULL;
2666  Page page;
2667  bool needwal;
2668  Size saveFreeSpace;
2669  bool need_tuple_data = RelationIsLogicallyLogged(relation);
2670  bool need_cids = RelationIsAccessibleInLogicalDecoding(relation);
2671 
2672  needwal = !(options & HEAP_INSERT_SKIP_WAL) && RelationNeedsWAL(relation);
2673  saveFreeSpace = RelationGetTargetPageFreeSpace(relation,
2674  HEAP_DEFAULT_FILLFACTOR);
2675 
2676  /* Toast and set header data in all the tuples */
2677  heaptuples = palloc(ntuples * sizeof(HeapTuple));
2678  for (i = 0; i < ntuples; i++)
2679  heaptuples[i] = heap_prepare_insert(relation, tuples[i],
2680  xid, cid, options);
2681 
2682  /*
2683  * Allocate some memory to use for constructing the WAL record. Using
2684  * palloc() within a critical section is not safe, so we allocate this
2685  * beforehand.
2686  */
2687  if (needwal)
2688  scratch = palloc(BLCKSZ);
2689 
2690  /*
2691  * We're about to do the actual inserts -- but check for conflict first,
2692  * to minimize the possibility of having to roll back work we've just
2693  * done.
2694  *
2695  * A check here does not definitively prevent a serialization anomaly;
2696  * that check MUST be done at least past the point of acquiring an
2697  * exclusive buffer content lock on every buffer that will be affected,
2698  * and MAY be done after all inserts are reflected in the buffers and
2699  * those locks are released; otherwise there is a race condition. Since
2700  * multiple buffers can be locked and unlocked in the loop below, and it
2701  * would not be feasible to identify and lock all of those buffers before
2702  * the loop, we must do a final check at the end.
2703  *
2704  * The check here could be omitted with no loss of correctness; it is
2705  * present strictly as an optimization.
2706  *
2707  * For heap inserts, we only need to check for table-level SSI locks. Our
2708  * new tuples can't possibly conflict with existing tuple locks, and heap
2709  * page locks are only consolidated versions of tuple locks; they do not
2710  * lock "gaps" as index page locks do. So we don't need to specify a
2711  * buffer when making the call, which makes for a faster check.
2712  */
2713  CheckForSerializableConflictIn(relation, NULL, InvalidBuffer);
2714 
2715  ndone = 0;
2716  while (ndone < ntuples)
2717  {
2718  Buffer buffer;
2719  Buffer vmbuffer = InvalidBuffer;
2720  bool all_visible_cleared = false;
2721  int nthispage;
2722 
2723  CHECK_FOR_INTERRUPTS();
2724 
2725  /*
2726  * Find buffer where at least the next tuple will fit. If the page is
2727  * all-visible, this will also pin the requisite visibility map page.
2728  */
2729  buffer = RelationGetBufferForTuple(relation, heaptuples[ndone]->t_len,
2730  InvalidBuffer, options, bistate,
2731  &vmbuffer, NULL);
2732  page = BufferGetPage(buffer);
2733 
2734  /* NO EREPORT(ERROR) from here till changes are logged */
2735  START_CRIT_SECTION();
2736 
2737  /*
2738  * RelationGetBufferForTuple has ensured that the first tuple fits.
2739  * Put that on the page, and then as many other tuples as fit.
2740  */
2741  RelationPutHeapTuple(relation, buffer, heaptuples[ndone], false);
2742  for (nthispage = 1; ndone + nthispage < ntuples; nthispage++)
2743  {
2744  HeapTuple heaptup = heaptuples[ndone + nthispage];
2745 
2746  if (PageGetHeapFreeSpace(page) < MAXALIGN(heaptup->t_len) + saveFreeSpace)
2747  break;
2748 
2749  RelationPutHeapTuple(relation, buffer, heaptup, false);
2750 
2751  /*
2752  * We don't use heap_multi_insert for catalog tuples yet, but
2753  * better be prepared...
2754  */
2755  if (needwal && need_cids)
2756  log_heap_new_cid(relation, heaptup);
2757  }
2758 
2759  if (PageIsAllVisible(page))
2760  {
2761  all_visible_cleared = true;
2762  PageClearAllVisible(page);
2763  visibilitymap_clear(relation,
2764  BufferGetBlockNumber(buffer),
2765  vmbuffer, VISIBILITYMAP_VALID_BITS);
2766  }
2767 
2768  /*
2769  * XXX Should we set PageSetPrunable on this page ? See heap_insert()
2770  */
2771 
2772  MarkBufferDirty(buffer);
2773 
2774  /* XLOG stuff */
2775  if (needwal)
2776  {
2777  XLogRecPtr recptr;
2778  xl_heap_multi_insert *xlrec;
2779  uint8 info = XLOG_HEAP2_MULTI_INSERT;
2780  char *tupledata;
2781  int totaldatalen;
2782  char *scratchptr = scratch;
2783  bool init;
2784  int bufflags = 0;
2785 
2786  /*
2787  * If the page was previously empty, we can reinit the page
2788  * instead of restoring the whole thing.
2789  */
2790  init = (ItemPointerGetOffsetNumber(&(heaptuples[ndone]->t_self)) == FirstOffsetNumber &&
2791  PageGetMaxOffsetNumber(page) == FirstOffsetNumber + nthispage - 1);
2792 
2793  /* allocate xl_heap_multi_insert struct from the scratch area */
2794  xlrec = (xl_heap_multi_insert *) scratchptr;
2795  scratchptr += SizeOfHeapMultiInsert;
2796 
2797  /*
2798  * Allocate the offsets array, unless we're reinitializing the page,
2799  * in which case the tuples are stored in order starting at
2800  * FirstOffsetNumber and we don't need to store the offsets
2801  * explicitly.
2802  */
2803  if (!init)
2804  scratchptr += nthispage * sizeof(OffsetNumber);
2805 
2806  /* the rest of the scratch space is used for tuple data */
2807  tupledata = scratchptr;
2808 
2809  xlrec->flags = all_visible_cleared ? XLH_INSERT_ALL_VISIBLE_CLEARED : 0;
2810  xlrec->ntuples = nthispage;
2811 
2812  /*
2813  * Write out an xl_multi_insert_tuple and the tuple data itself
2814  * for each tuple.
2815  */
2816  for (i = 0; i < nthispage; i++)
2817  {
2818  HeapTuple heaptup = heaptuples[ndone + i];
2819  xl_multi_insert_tuple *tuphdr;
2820  int datalen;
2821 
2822  if (!init)
2823  xlrec->offsets[i] = ItemPointerGetOffsetNumber(&heaptup->t_self);
2824  /* xl_multi_insert_tuple needs two-byte alignment. */
2825  tuphdr = (xl_multi_insert_tuple *) SHORTALIGN(scratchptr);
2826  scratchptr = ((char *) tuphdr) + SizeOfMultiInsertTuple;
2827 
2828  tuphdr->t_infomask2 = heaptup->t_data->t_infomask2;
2829  tuphdr->t_infomask = heaptup->t_data->t_infomask;
2830  tuphdr->t_hoff = heaptup->t_data->t_hoff;
2831 
2832  /* write bitmap [+ padding] [+ oid] + data */
2833  datalen = heaptup->t_len - SizeofHeapTupleHeader;
2834  memcpy(scratchptr,
2835  (char *) heaptup->t_data + SizeofHeapTupleHeader,
2836  datalen);
2837  tuphdr->datalen = datalen;
2838  scratchptr += datalen;
2839  }
2840  totaldatalen = scratchptr - tupledata;
2841  Assert((scratchptr - scratch) < BLCKSZ);
2842 
2843  if (need_tuple_data)
2844  xlrec->flags |= XLH_INSERT_CONTAINS_NEW_TUPLE;
2845 
2846  /*
2847  * Signal that this is the last xl_heap_multi_insert record
2848  * emitted by this call to heap_multi_insert(). Needed for logical
2849  * decoding so it knows when to cleanup temporary data.
2850  */
2851  if (ndone + nthispage == ntuples)
2852  xlrec->flags |= XLH_INSERT_LAST_IN_MULTI;
2853 
2854  if (init)
2855  {
2856  info |= XLOG_HEAP_INIT_PAGE;
2857  bufflags |= REGBUF_WILL_INIT;
2858  }
2859 
2860  /*
2861  * If we're doing logical decoding, include the new tuple data
2862  * even if we take a full-page image of the page.
2863  */
2864  if (need_tuple_data)
2865  bufflags |= REGBUF_KEEP_DATA;
2866 
2867  XLogBeginInsert();
2868  XLogRegisterData((char *) xlrec, tupledata - scratch);
2869  XLogRegisterBuffer(0, buffer, REGBUF_STANDARD | bufflags);
2870 
2871  XLogRegisterBufData(0, tupledata, totaldatalen);
2872 
2873  /* filtering by origin on a row level is much more efficient */
2874  XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN);
2875 
2876  recptr = XLogInsert(RM_HEAP2_ID, info);
2877 
2878  PageSetLSN(page, recptr);
2879  }
2880 
2881  END_CRIT_SECTION();
2882 
2883  UnlockReleaseBuffer(buffer);
2884  if (vmbuffer != InvalidBuffer)
2885  ReleaseBuffer(vmbuffer);
2886 
2887  ndone += nthispage;
2888  }
2889 
2890  /*
2891  * We're done with the actual inserts. Check for conflicts again, to
2892  * ensure that all rw-conflicts in to these inserts are detected. Without
2893  * this final check, a sequential scan of the heap may have locked the
2894  * table after the "before" check, missing one opportunity to detect the
2895  * conflict, and then scanned the table before the new tuples were there,
2896  * missing the other chance to detect the conflict.
2897  *
2898  * For heap inserts, we only need to check for table-level SSI locks. Our
2899  * new tuples can't possibly conflict with existing tuple locks, and heap
2900  * page locks are only consolidated versions of tuple locks; they do not
2901  * lock "gaps" as index page locks do. So we don't need to specify a
2902  * buffer when making the call.
2903  */
2904  CheckForSerializableConflictIn(relation, NULL, InvalidBuffer);
2905 
2906  /*
2907  * If tuples are cachable, mark them for invalidation from the caches in
2908  * case we abort. Note it is OK to do this after releasing the buffer,
2909  * because the heaptuples data structure is all in local memory, not in
2910  * the shared buffer.
2911  */
2912  if (IsCatalogRelation(relation))
2913  {
2914  for (i = 0; i < ntuples; i++)
2915  CacheInvalidateHeapTuple(relation, heaptuples[i], NULL);
2916  }
2917 
2918  /*
2919  * Copy t_self fields back to the caller's original tuples. This does
2920  * nothing for untoasted tuples (tuples[i] == heaptuples[i]), but it's
2921  * probably faster to always copy than check.
2922  */
2923  for (i = 0; i < ntuples; i++)
2924  tuples[i]->t_self = heaptuples[i]->t_self;
2925 
2926  pgstat_count_heap_insert(relation, ntuples);
2927 }
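Following the note above about leaking into the current memory context, a hedged sketch of wrapping heap_multi_insert() in a short-lived context; the context name and helper name are illustrative.

static void
example_multi_insert(Relation rel, HeapTuple *tuples, int ntuples)
{
    MemoryContext insert_cxt;
    MemoryContext old_cxt;

    insert_cxt = AllocSetContextCreate(CurrentMemoryContext,
                                       "example multi-insert",
                                       ALLOCSET_DEFAULT_SIZES);
    old_cxt = MemoryContextSwitchTo(insert_cxt);

    heap_multi_insert(rel, tuples, ntuples,
                      GetCurrentCommandId(true), 0, NULL);

    /* t_self has already been copied back into the caller's tuples[] */
    MemoryContextSwitchTo(old_cxt);
    MemoryContextDelete(insert_cxt);
}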
2928 
2929 /*
2930  * simple_heap_insert - insert a tuple
2931  *
2932  * Currently, this routine differs from heap_insert only in supplying
2933  * a default command ID and not allowing access to the speedup options.
2934  *
2935  * This should be used rather than using heap_insert directly in most places
2936  * where we are modifying system catalogs.
2937  */
2938 Oid
2939 simple_heap_insert(Relation relation, HeapTuple tup)
2940 {
2941  return heap_insert(relation, tup, GetCurrentCommandId(true), 0, NULL);
2942 }
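A brief sketch (not part of this file) of forming a tuple and inserting it with simple_heap_insert(); the values/nulls arrays are assumed to match the relation's tuple descriptor.

static Oid
example_simple_insert(Relation rel, Datum *values, bool *nulls)
{
    HeapTuple   tup = heap_form_tuple(RelationGetDescr(rel), values, nulls);
    Oid         oid = simple_heap_insert(rel, tup);

    /* tup->t_self now holds the TID the tuple was stored at */
    heap_freetuple(tup);
    return oid;
}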
2943 
2944 /*
2945  * Given infomask/infomask2, compute the bits that must be saved in the
2946  * "infobits" field of xl_heap_delete, xl_heap_update, xl_heap_lock,
2947  * xl_heap_lock_updated WAL records.
2948  *
2949  * See fix_infomask_from_infobits.
2950  */
2951 static uint8
2952 compute_infobits(uint16 infomask, uint16 infomask2)
2953 {
2954  return
2955  ((infomask & HEAP_XMAX_IS_MULTI) != 0 ? XLHL_XMAX_IS_MULTI : 0) |
2956  ((infomask & HEAP_XMAX_LOCK_ONLY) != 0 ? XLHL_XMAX_LOCK_ONLY : 0) |
2957  ((infomask & HEAP_XMAX_EXCL_LOCK) != 0 ? XLHL_XMAX_EXCL_LOCK : 0) |
2958  /* note we ignore HEAP_XMAX_SHR_LOCK here */
2959  ((infomask & HEAP_XMAX_KEYSHR_LOCK) != 0 ? XLHL_XMAX_KEYSHR_LOCK : 0) |
2960  ((infomask2 & HEAP_KEYS_UPDATED) != 0 ?
2961  XLHL_KEYS_UPDATED : 0);
2962 }
2963 
2964 /*
2965  * Given two versions of the same t_infomask for a tuple, compare them and
2966  * return whether the relevant status for a tuple Xmax has changed. This is
2967  * used after a buffer lock has been released and reacquired: we want to ensure
2968  * that the tuple state continues to be the same it was when we previously
2969  * examined it.
2970  *
2971  * Note the Xmax field itself must be compared separately.
2972  */
2973 static inline bool
2974 xmax_infomask_changed(uint16 new_infomask, uint16 old_infomask)
2975 {
2976  const uint16 interesting =
2977  HEAP_XMAX_IS_MULTI | HEAP_LOCK_MASK | HEAP_XMAX_COMMITTED | HEAP_XMAX_INVALID;
2978 
2979  if ((new_infomask & interesting) != (old_infomask & interesting))
2980  return true;
2981 
2982  return false;
2983 }
2984 
2985 /*
2986  * heap_delete - delete a tuple
2987  *
2988  * NB: do not call this directly unless you are prepared to deal with
2989  * concurrent-update conditions. Use simple_heap_delete instead.
2990  *
2991  * relation - table to be modified (caller must hold suitable lock)
2992  * tid - TID of tuple to be deleted
2993  * cid - delete command ID (used for visibility test, and stored into
2994  * cmax if successful)
2995  * crosscheck - if not InvalidSnapshot, also check tuple against this
2996  * wait - true if should wait for any conflicting update to commit/abort
2997  * hufd - output parameter, filled in failure cases (see below)
2998  *
2999  * Normal, successful return value is HeapTupleMayBeUpdated, which
3000  * actually means we did delete it. Failure return codes are
3001  * HeapTupleSelfUpdated, HeapTupleUpdated, or HeapTupleBeingUpdated
3002  * (the last only possible if wait == false).
3003  *
3004  * In the failure cases, the routine fills *hufd with the tuple's t_ctid,
3005  * t_xmax (resolving a possible MultiXact, if necessary), and t_cmax
3006  * (the last only for HeapTupleSelfUpdated, since we
3007  * cannot obtain cmax from a combocid generated by another transaction).
3008  * See comments for struct HeapUpdateFailureData for additional info.
3009  */
3012  CommandId cid, Snapshot crosscheck, bool wait,
3013  HeapUpdateFailureData *hufd)
3014 {
3015  HTSU_Result result;
3016  TransactionId xid = GetCurrentTransactionId();
3017  ItemId lp;
3018  HeapTupleData tp;
3019  Page page;
3020  BlockNumber block;
3021  Buffer buffer;
3022  Buffer vmbuffer = InvalidBuffer;
3023  TransactionId new_xmax;
3024  uint16 new_infomask,
3025  new_infomask2;
3026  bool have_tuple_lock = false;
3027  bool iscombo;
3028  bool all_visible_cleared = false;
3029  HeapTuple old_key_tuple = NULL; /* replica identity of the tuple */
3030  bool old_key_copied = false;
3031 
3032  Assert(ItemPointerIsValid(tid));
3033 
3034  /*
3035  * Forbid this during a parallel operation, lest it allocate a combocid.
3036  * Other workers might need that combocid for visibility checks, and we
3037  * have no provision for broadcasting it to them.
3038  */
3039  if (IsInParallelMode())
3040  ereport(ERROR,
3041  (errcode(ERRCODE_INVALID_TRANSACTION_STATE),
3042  errmsg("cannot delete tuples during a parallel operation")));
3043 
3044  block = ItemPointerGetBlockNumber(tid);
3045  buffer = ReadBuffer(relation, block);
3046  page = BufferGetPage(buffer);
3047 
3048  /*
3049  * Before locking the buffer, pin the visibility map page if it appears to
3050  * be necessary. Since we haven't got the lock yet, someone else might be
3051  * in the middle of changing this, so we'll need to recheck after we have
3052  * the lock.
3053  */
3054  if (PageIsAllVisible(page))
3055  visibilitymap_pin(relation, block, &vmbuffer);
3056 
3057  LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3058 
3059  /*
3060  * If we didn't pin the visibility map page and the page has become all
3061  * visible while we were busy locking the buffer, we'll have to unlock and
3062  * re-lock, to avoid holding the buffer lock across an I/O. That's a bit
3063  * unfortunate, but hopefully shouldn't happen often.
3064  */
3065  if (vmbuffer == InvalidBuffer && PageIsAllVisible(page))
3066  {
3067  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3068  visibilitymap_pin(relation, block, &vmbuffer);
3069  LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3070  }
3071 
3072  lp = PageGetItemId(page, ItemPointerGetOffsetNumber(tid));
3073  Assert(ItemIdIsNormal(lp));
3074 
3075  tp.t_tableOid = RelationGetRelid(relation);
3076  tp.t_data = (HeapTupleHeader) PageGetItem(page, lp);
3077  tp.t_len = ItemIdGetLength(lp);
3078  tp.t_self = *tid;
3079 
3080 l1:
3081  result = HeapTupleSatisfiesUpdate(&tp, cid, buffer);
3082 
3083  if (result == HeapTupleInvisible)
3084  {
3085  UnlockReleaseBuffer(buffer);
3086  ereport(ERROR,
3087  (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
3088  errmsg("attempted to delete invisible tuple")));
3089  }
3090  else if (result == HeapTupleBeingUpdated && wait)
3091  {
3092  TransactionId xwait;
3093  uint16 infomask;
3094 
3095  /* must copy state data before unlocking buffer */
3096  xwait = HeapTupleHeaderGetRawXmax(tp.t_data);
3097  infomask = tp.t_data->t_infomask;
3098 
3099  /*
3100  * Sleep until concurrent transaction ends -- except when there's a
3101  * single locker and it's our own transaction. Note we don't care
3102  * which lock mode the locker has, because we need the strongest one.
3103  *
3104  * Before sleeping, we need to acquire tuple lock to establish our
3105  * priority for the tuple (see heap_lock_tuple). LockTuple will
3106  * release us when we are next-in-line for the tuple.
3107  *
3108  * If we are forced to "start over" below, we keep the tuple lock;
3109  * this arranges that we stay at the head of the line while rechecking
3110  * tuple state.
3111  */
3112  if (infomask & HEAP_XMAX_IS_MULTI)
3113  {
3114  /* wait for multixact */
3115  if (DoesMultiXactIdConflict((MultiXactId) xwait, infomask,
3116  LockTupleExclusive))
3117  {
3118  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3119 
3120  /* acquire tuple lock, if necessary */
3121  heap_acquire_tuplock(relation, &(tp.t_self), LockTupleExclusive,
3122  LockWaitBlock, &have_tuple_lock);
3123 
3124  /* wait for multixact */
3125  MultiXactIdWait((MultiXactId) xwait, MultiXactStatusUpdate, infomask,
3126  relation, &(tp.t_self), XLTW_Delete,
3127  NULL);
3128  LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3129 
3130  /*
3131  * If xwait had just locked the tuple then some other xact
3132  * could update this tuple before we get to this point. Check
3133  * for xmax change, and start over if so.
3134  */
3135  if (xmax_infomask_changed(tp.t_data->t_infomask, infomask) ||
3136  !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tp.t_data),
3137  xwait))
3138  goto l1;
3139  }
3140 
3141  /*
3142  * You might think the multixact is necessarily done here, but not
3143  * so: it could have surviving members, namely our own xact or
3144  * other subxacts of this backend. It is legal for us to delete
3145  * the tuple in either case, however (the latter case is
3146  * essentially a situation of upgrading our former shared lock to
3147  * exclusive). We don't bother changing the on-disk hint bits
3148  * since we are about to overwrite the xmax altogether.
3149  */
3150  }
3151  else if (!TransactionIdIsCurrentTransactionId(xwait))
3152  {
3153  /*
3154  * Wait for regular transaction to end; but first, acquire tuple
3155  * lock.
3156  */
3157  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3158  heap_acquire_tuplock(relation, &(tp.t_self), LockTupleExclusive,
3159  LockWaitBlock, &have_tuple_lock);
3160  XactLockTableWait(xwait, relation, &(tp.t_self), XLTW_Delete);
3161  LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3162 
3163  /*
3164  * xwait is done, but if xwait had just locked the tuple then some
3165  * other xact could update this tuple before we get to this point.
3166  * Check for xmax change, and start over if so.
3167  */
3168  if (xmax_infomask_changed(tp.t_data->t_infomask, infomask) ||
3169  !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tp.t_data),
3170  xwait))
3171  goto l1;
3172 
3173  /* Otherwise check if it committed or aborted */
3174  UpdateXmaxHintBits(tp.t_data, buffer, xwait);
3175  }
3176 
3177  /*
3178  * We may overwrite if previous xmax aborted, or if it committed but
3179  * only locked the tuple without updating it.
3180  */
3181  if ((tp.t_data->t_infomask & HEAP_XMAX_INVALID) ||
3182  HEAP_XMAX_IS_LOCKED_ONLY(tp.t_data->t_infomask) ||
3183  HeapTupleHeaderIsOnlyLocked(tp.t_data))
3184  result = HeapTupleMayBeUpdated;
3185  else
3186  result = HeapTupleUpdated;
3187  }
3188 
3189  if (crosscheck != InvalidSnapshot && result == HeapTupleMayBeUpdated)
3190  {
3191  /* Perform additional check for transaction-snapshot mode RI updates */
3192  if (!HeapTupleSatisfiesVisibility(&tp, crosscheck, buffer))
3193  result = HeapTupleUpdated;
3194  }
3195 
3196  if (result != HeapTupleMayBeUpdated)
3197  {
3198  Assert(result == HeapTupleSelfUpdated ||
3199  result == HeapTupleUpdated ||
3200  result == HeapTupleBeingUpdated);
3201  Assert(!(tp.t_data->t_infomask & HEAP_XMAX_INVALID));
3202  hufd->ctid = tp.t_data->t_ctid;
3203  hufd->xmax = HeapTupleHeaderGetUpdateXid(tp.t_data);
3204  if (result == HeapTupleSelfUpdated)
3205  hufd->cmax = HeapTupleHeaderGetCmax(tp.t_data);
3206  else
3207  hufd->cmax = InvalidCommandId;
3208  UnlockReleaseBuffer(buffer);
3209  if (have_tuple_lock)
3210  UnlockTupleTuplock(relation, &(tp.t_self), LockTupleExclusive);
3211  if (vmbuffer != InvalidBuffer)
3212  ReleaseBuffer(vmbuffer);
3213  return result;
3214  }
3215 
3216  /*
3217  * We're about to do the actual delete -- check for conflict first, to
3218  * avoid possibly having to roll back work we've just done.
3219  *
3220  * This is safe without a recheck as long as there is no possibility of
3221  * another process scanning the page between this check and the delete
3222  * being visible to the scan (i.e., an exclusive buffer content lock is
3223  * continuously held from this point until the tuple delete is visible).
3224  */
3225  CheckForSerializableConflictIn(relation, &tp, buffer);
3226 
3227  /* replace cid with a combo cid if necessary */
3228  HeapTupleHeaderAdjustCmax(tp.t_data, &cid, &iscombo);
3229 
3230  /*
3231  * Compute replica identity tuple before entering the critical section so
3232  * we don't PANIC upon a memory allocation failure.
3233  */
3234  old_key_tuple = ExtractReplicaIdentity(relation, &tp, true, &old_key_copied);
3235 
3236  /*
3237  * If this is the first possibly-multixact-able operation in the current
3238  * transaction, set my per-backend OldestMemberMXactId setting. We can be
3239  * certain that the transaction will never become a member of any older
3240  * MultiXactIds than that. (We have to do this even if we end up just
3241  * using our own TransactionId below, since some other backend could
3242  * incorporate our XID into a MultiXact immediately afterwards.)
3243  */
3244  MultiXactIdSetOldestMember();
3245 
3246  compute_new_xmax_infomask(HeapTupleHeaderGetRawXmax(tp.t_data),
3247  tp.t_data->t_infomask, tp.t_data->t_infomask2,
3248  xid, LockTupleExclusive, true,
3249  &new_xmax, &new_infomask, &new_infomask2);
3250 
3251  START_CRIT_SECTION();
3252 
3253  /*
3254  * If this transaction commits, the tuple will become DEAD sooner or
3255  * later. Set flag that this page is a candidate for pruning once our xid
3256  * falls below the OldestXmin horizon. If the transaction finally aborts,
3257  * the subsequent page pruning will be a no-op and the hint will be
3258  * cleared.
3259  */
3260  PageSetPrunable(page, xid);
3261 
3262  if (PageIsAllVisible(page))
3263  {
3264  all_visible_cleared = true;
3265  PageClearAllVisible(page);
3266  visibilitymap_clear(relation, BufferGetBlockNumber(buffer),
3267  vmbuffer, VISIBILITYMAP_VALID_BITS);
3268  }
3269 
3270  /* store transaction information of xact deleting the tuple */
3271  tp.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
3272  tp.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
3273  tp.t_data->t_infomask |= new_infomask;
3274  tp.t_data->t_infomask2 |= new_infomask2;
3275  HeapTupleHeaderClearHotUpdated(tp.t_data);
3276  HeapTupleHeaderSetXmax(tp.t_data, new_xmax);
3277  HeapTupleHeaderSetCmax(tp.t_data, cid, iscombo);
3278  /* Make sure there is no forward chain link in t_ctid */
3279  tp.t_data->t_ctid = tp.t_self;
3280 
3281  MarkBufferDirty(buffer);
3282 
3283  /*
3284  * XLOG stuff
3285  *
3286  * NB: heap_abort_speculative() uses the same xlog record and replay
3287  * routines.
3288  */
3289  if (RelationNeedsWAL(relation))
3290  {
3291  xl_heap_delete xlrec;
3292  XLogRecPtr recptr;
3293 
3294  /* For logical decode we need combocids to properly decode the catalog */
3295  if (RelationIsAccessibleInLogicalDecoding(relation))
3296  log_heap_new_cid(relation, &tp);
3297 
3298  xlrec.flags = all_visible_cleared ? XLH_DELETE_ALL_VISIBLE_CLEARED : 0;
3299  xlrec.infobits_set = compute_infobits(tp.t_data->t_infomask,
3300  tp.t_data->t_infomask2);
3301  xlrec.offnum = ItemPointerGetOffsetNumber(&tp.t_self);
3302  xlrec.xmax = new_xmax;
3303 
3304  if (old_key_tuple != NULL)
3305  {
3306  if (relation->rd_rel->relreplident == REPLICA_IDENTITY_FULL)
3307  xlrec.flags |= XLH_DELETE_CONTAINS_OLD_TUPLE;
3308  else
3309  xlrec.flags |= XLH_DELETE_CONTAINS_OLD_KEY;
3310  }
3311 
3312  XLogBeginInsert();
3313  XLogRegisterData((char *) &xlrec, SizeOfHeapDelete);
3314 
3315  XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
3316 
3317  /*
3318  * Log replica identity of the deleted tuple if there is one
3319  */
3320  if (old_key_tuple != NULL)
3321  {
3322  xl_heap_header xlhdr;
3323 
3324  xlhdr.t_infomask2 = old_key_tuple->t_data->t_infomask2;
3325  xlhdr.t_infomask = old_key_tuple->t_data->t_infomask;
3326  xlhdr.t_hoff = old_key_tuple->t_data->t_hoff;
3327 
3328  XLogRegisterData((char *) &xlhdr, SizeOfHeapHeader);
3329  XLogRegisterData((char *) old_key_tuple->t_data
3330  + SizeofHeapTupleHeader,
3331  old_key_tuple->t_len
3332  - SizeofHeapTupleHeader);
3333  }
3334 
3335  /* filtering by origin on a row level is much more efficient */
3336  XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN);
3337 
3338  recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_DELETE);
3339 
3340  PageSetLSN(page, recptr);
3341  }
3342 
3343  END_CRIT_SECTION();
3344 
3345  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3346 
3347  if (vmbuffer != InvalidBuffer)
3348  ReleaseBuffer(vmbuffer);
3349 
3350  /*
3351  * If the tuple has toasted out-of-line attributes, we need to delete
3352  * those items too. We have to do this before releasing the buffer
3353  * because we need to look at the contents of the tuple, but it's OK to
3354  * release the content lock on the buffer first.
3355  */
3356  if (relation->rd_rel->relkind != RELKIND_RELATION &&
3357  relation->rd_rel->relkind != RELKIND_MATVIEW)
3358  {
3359  /* toast table entries should never be recursively toasted */
3360  Assert(!HeapTupleHasExternal(&tp));
3361  }
3362  else if (HeapTupleHasExternal(&tp))
3363  toast_delete(relation, &tp, false);
3364 
3365  /*
3366  * Mark tuple for invalidation from system caches at next command
3367  * boundary. We have to do this before releasing the buffer because we
3368  * need to look at the contents of the tuple.
3369  */
3370  CacheInvalidateHeapTuple(relation, &tp, NULL);
3371 
3372  /* Now we can release the buffer */
3373  ReleaseBuffer(buffer);
3374 
3375  /*
3376  * Release the lmgr tuple lock, if we had it.
3377  */
3378  if (have_tuple_lock)
3379  UnlockTupleTuplock(relation, &(tp.t_self), LockTupleExclusive);
3380 
3381  pgstat_count_heap_delete(relation);
3382 
3383  if (old_key_tuple != NULL && old_key_copied)
3384  heap_freetuple(old_key_tuple);
3385 
3386  return HeapTupleMayBeUpdated;
3387 }
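An illustrative sketch of a caller that, unlike simple_heap_delete() below, is prepared for the concurrent-update outcomes documented above and inspects the HeapUpdateFailureData; the follow-the-newer-version logic is elided.

static void
example_concurrent_delete(Relation rel, ItemPointer tid)
{
    HeapUpdateFailureData hufd;
    HTSU_Result result;

    result = heap_delete(rel, tid, GetCurrentCommandId(true),
                         InvalidSnapshot, true /* wait */ , &hufd);
    switch (result)
    {
        case HeapTupleMayBeUpdated:
            break;              /* deleted it */
        case HeapTupleUpdated:
            /* hufd.ctid and hufd.xmax identify the newer version to chase */
            break;
        case HeapTupleSelfUpdated:
            /* deleted by a later command in our own transaction */
            break;
        default:
            elog(ERROR, "unexpected heap_delete result: %u", result);
            break;
    }
}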
3388 
3389 /*
3390  * simple_heap_delete - delete a tuple
3391  *
3392  * This routine may be used to delete a tuple when concurrent updates of
3393  * the target tuple are not expected (for example, because we have a lock
3394  * on the relation associated with the tuple). Any failure is reported
3395  * via ereport().
3396  */
3397 void
3398 simple_heap_delete(Relation relation, ItemPointer tid)
3399 {
3400  HTSU_Result result;
3401  HeapUpdateFailureData hufd;
3402 
3403  result = heap_delete(relation, tid,
3404  GetCurrentCommandId(true), InvalidSnapshot,
3405  true /* wait for commit */ ,
3406  &hufd);
3407  switch (result)
3408  {
3409  case HeapTupleSelfUpdated:
3410  /* Tuple was already updated in current command? */
3411  elog(ERROR, "tuple already updated by self");
3412  break;
3413 
3414  case HeapTupleMayBeUpdated:
3415  /* done successfully */
3416  break;
3417 
3418  case HeapTupleUpdated:
3419  elog(ERROR, "tuple concurrently updated");
3420  break;
3421 
3422  default:
3423  elog(ERROR, "unrecognized heap_delete status: %u", result);
3424  break;
3425  }
3426 }
3427 
3428 /*
3429  * heap_update - replace a tuple
3430  *
3431  * NB: do not call this directly unless you are prepared to deal with
3432  * concurrent-update conditions. Use simple_heap_update instead.
3433  *
3434  * relation - table to be modified (caller must hold suitable lock)
3435  * otid - TID of old tuple to be replaced
3436  * newtup - newly constructed tuple data to store
3437  * cid - update command ID (used for visibility test, and stored into
3438  * cmax/cmin if successful)
3439  * crosscheck - if not InvalidSnapshot, also check old tuple against this
3440  * wait - true if should wait for any conflicting update to commit/abort
3441  * hufd - output parameter, filled in failure cases (see below)
3442  * lockmode - output parameter, filled with lock mode acquired on tuple
3443  *
3444  * Normal, successful return value is HeapTupleMayBeUpdated, which
3445  * actually means we *did* update it. Failure return codes are
3446  * HeapTupleSelfUpdated, HeapTupleUpdated, or HeapTupleBeingUpdated
3447  * (the last only possible if wait == false).
3448  *
3449  * On success, the header fields of *newtup are updated to match the new
3450  * stored tuple; in particular, newtup->t_self is set to the TID where the
3451  * new tuple was inserted, and its HEAP_ONLY_TUPLE flag is set iff a HOT
3452  * update was done. However, any TOAST changes in the new tuple's
3453  * data are not reflected into *newtup.
3454  *
3455  * In the failure cases, the routine fills *hufd with the tuple's t_ctid,
3456  * t_xmax (resolving a possible MultiXact, if necessary), and t_cmax
3457  * (the last only for HeapTupleSelfUpdated, since we
3458  * cannot obtain cmax from a combocid generated by another transaction).
3459  * See comments for struct HeapUpdateFailureData for additional info.
3460  */
3461 HTSU_Result
3462 heap_update(Relation relation, ItemPointer otid, HeapTuple newtup,
3463  CommandId cid, Snapshot crosscheck, bool wait,
3464  HeapUpdateFailureData *hufd, LockTupleMode *lockmode)
3465 {
3466  HTSU_Result result;
3467  TransactionId xid = GetCurrentTransactionId();
3468  Bitmapset *hot_attrs;
3469  Bitmapset *key_attrs;
3470  Bitmapset *id_attrs;
3471  Bitmapset *interesting_attrs;
3472  Bitmapset *modified_attrs;
3473  ItemId lp;
3474  HeapTupleData oldtup;
3475  HeapTuple heaptup;
3476  HeapTuple old_key_tuple = NULL;
3477  bool old_key_copied = false;
3478  Page page;
3479  BlockNumber block;
3480  MultiXactStatus mxact_status;
3481  Buffer buffer,
3482  newbuf,
3483  vmbuffer = InvalidBuffer,
3484  vmbuffer_new = InvalidBuffer;
3485  bool need_toast;
3486  Size newtupsize,
3487  pagefree;
3488  bool have_tuple_lock = false;
3489  bool iscombo;
3490  bool use_hot_update = false;
3491  bool hot_attrs_checked = false;
3492  bool key_intact;
3493  bool all_visible_cleared = false;
3494  bool all_visible_cleared_new = false;
3495  bool checked_lockers;
3496  bool locker_remains;
3497  TransactionId xmax_new_tuple,
3498  xmax_old_tuple;
3499  uint16 infomask_old_tuple,
3500  infomask2_old_tuple,
3501  infomask_new_tuple,
3502  infomask2_new_tuple;
3503 
3504  Assert(ItemPointerIsValid(otid));
3505 
3506  /*
3507  * Forbid this during a parallel operation, lest it allocate a combocid.
3508  * Other workers might need that combocid for visibility checks, and we
3509  * have no provision for broadcasting it to them.
3510  */
3511  if (IsInParallelMode())
3512  ereport(ERROR,
3513  (errcode(ERRCODE_INVALID_TRANSACTION_STATE),
3514  errmsg("cannot update tuples during a parallel operation")));
3515 
3516  /*
3517  * Fetch the list of attributes to be checked for various operations.
3518  *
3519  * For HOT considerations, this is wasted effort if we fail to update or
3520  * have to put the new tuple on a different page. But we must compute the
3521  * list before obtaining buffer lock --- in the worst case, if we are doing
3522  * an update on one of the relevant system catalogs, we could deadlock if
3523  * we try to fetch the list later. In any case, the relcache caches the
3524  * data so this is usually pretty cheap.
3525  *
3526  * We also need columns used by the replica identity and columns that are
3527  * considered the "key" of rows in the table.
3528  *
3529  * Note that we get copies of each bitmap, so we need not worry about
3530  * relcache flush happening midway through.
3531  */
3532  hot_attrs = RelationGetIndexAttrBitmap(relation, INDEX_ATTR_BITMAP_ALL);
3533  key_attrs = RelationGetIndexAttrBitmap(relation, INDEX_ATTR_BITMAP_KEY);
3534  id_attrs = RelationGetIndexAttrBitmap(relation,
3535  INDEX_ATTR_BITMAP_IDENTITY_KEY);
3536 
3537 
3538  block = ItemPointerGetBlockNumber(otid);
3539  buffer = ReadBuffer(relation, block);
3540  page = BufferGetPage(buffer);
3541 
3542  interesting_attrs = NULL;
3543  /*
3544  * If the page is already full, there is hardly any chance of doing a HOT
3545  * update on this page. It might be wasteful effort to look for index
3546  * column updates only to later reject HOT updates for lack of space in the
3547  * same page. So we be conservative and only fetch hot_attrs if the page is
3548  * not already full. Since we are already holding a pin on the buffer,
3549  * there is no chance that the buffer can get cleaned up concurrently and
3550  * even if that was possible, in the worst case we lose a chance to do a
3551  * HOT update.
3552  */
3553  if (!PageIsFull(page))
3554  {
3555  interesting_attrs = bms_add_members(interesting_attrs, hot_attrs);
3556  hot_attrs_checked = true;
3557  }
3558  interesting_attrs = bms_add_members(interesting_attrs, key_attrs);
3559  interesting_attrs = bms_add_members(interesting_attrs, id_attrs);
3560 
3561  /*
3562  * Before locking the buffer, pin the visibility map page if it appears to
3563  * be necessary. Since we haven't got the lock yet, someone else might be
3564  * in the middle of changing this, so we'll need to recheck after we have
3565  * the lock.
3566  */
3567  if (PageIsAllVisible(page))
3568  visibilitymap_pin(relation, block, &vmbuffer);
3569 
3570  LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3571 
3572  lp = PageGetItemId(page, ItemPointerGetOffsetNumber(otid));
3573  Assert(ItemIdIsNormal(lp));
3574 
3575  /*
3576  * Fill in enough data in oldtup for HeapDetermineModifiedColumns to work
3577  * properly.
3578  */
3579  oldtup.t_tableOid = RelationGetRelid(relation);
3580  oldtup.t_data = (HeapTupleHeader) PageGetItem(page, lp);
3581  oldtup.t_len = ItemIdGetLength(lp);
3582  oldtup.t_self = *otid;
3583 
3584  /* the new tuple is ready, except for this: */
3585  newtup->t_tableOid = RelationGetRelid(relation);
3586 
3587  /* Fill in OID for newtup */
3588  if (relation->rd_rel->relhasoids)
3589  {
3590 #ifdef NOT_USED
3591  /* this is redundant with an Assert in HeapTupleSetOid */
3592  Assert(newtup->t_data->t_infomask & HEAP_HASOID);
3593 #endif
3594  HeapTupleSetOid(newtup, HeapTupleGetOid(&oldtup));
3595  }
3596  else
3597  {
3598  /* check there is no space for an OID */
3599  Assert(!(newtup->t_data->t_infomask & HEAP_HASOID));
3600  }
3601 
3602  /* Determine columns modified by the update. */
3603  modified_attrs = HeapDetermineModifiedColumns(relation, interesting_attrs,
3604  &oldtup, newtup);
3605 
3606  /*
3607  * If we're not updating any "key" column, we can grab a weaker lock type.
3608  * This allows for more concurrency when we are running simultaneously
3609  * with foreign key checks.
3610  *
3611  * Note that if a column gets detoasted while executing the update, but
3612  * the value ends up being the same, this test will fail and we will use
3613  * the stronger lock. This is acceptable; the important case to optimize
3614  * is updates that don't manipulate key columns, not those that
3615  * serendipitously arrive at the same key values.
3616  */
3617  if (!bms_overlap(modified_attrs, key_attrs))
3618  {
3619  *lockmode = LockTupleNoKeyExclusive;
3620  mxact_status = MultiXactStatusNoKeyUpdate;
3621  key_intact = true;
3622 
3623  /*
3624  * If this is the first possibly-multixact-able operation in the
3625  * current transaction, set my per-backend OldestMemberMXactId
3626  * setting. We can be certain that the transaction will never become a
3627  * member of any older MultiXactIds than that. (We have to do this
3628  * even if we end up just using our own TransactionId below, since
3629  * some other backend could incorporate our XID into a MultiXact
3630  * immediately afterwards.)
3631  */
3632  MultiXactIdSetOldestMember();
3633  }
3634  else
3635  {
3636  *lockmode = LockTupleExclusive;
3637  mxact_status = MultiXactStatusUpdate;
3638  key_intact = false;
3639  }
3640 
3641  /*
3642  * Note: beyond this point, use oldtup not otid to refer to old tuple.
3643  * otid may very well point at newtup->t_self, which we will overwrite
3644  * with the new tuple's location, so there's great risk of confusion if we
3645  * use otid anymore.
3646  */
3647 
3648 l2:
3649  checked_lockers = false;
3650  locker_remains = false;
3651  result = HeapTupleSatisfiesUpdate(&oldtup, cid, buffer);
3652 
3653  /* see below about the "no wait" case */
3654  Assert(result != HeapTupleBeingUpdated || wait);
3655 
3656  if (result == HeapTupleInvisible)
3657  {
3658  UnlockReleaseBuffer(buffer);
3659  ereport(ERROR,
3660  (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
3661  errmsg("attempted to update invisible tuple")));
3662  }
3663  else if (result == HeapTupleBeingUpdated && wait)
3664  {
3665  TransactionId xwait;
3666  uint16 infomask;
3667  bool can_continue = false;
3668 
3669  /*
3670  * XXX note that we don't consider the "no wait" case here. This
3671  * isn't a problem currently because no caller uses that case, but it
3672  * should be fixed if such a caller is introduced. It wasn't a
3673  * problem previously because this code would always wait, but now
3674  * that some tuple locks do not conflict with one of the lock modes we
3675  * use, it is possible that this case is interesting to handle
3676  * specially.
3677  *
3678  * This may cause failures with third-party code that calls
3679  * heap_update directly.
3680  */
3681 
3682  /* must copy state data before unlocking buffer */
3683  xwait = HeapTupleHeaderGetRawXmax(oldtup.t_data);
3684  infomask = oldtup.t_data->t_infomask;
3685 
3686  /*
3687  * Now we have to do something about the existing locker. If it's a
3688  * multi, sleep on it; we might be awakened before it is completely
3689  * gone (or even not sleep at all in some cases); we need to preserve
3690  * it as locker, unless it is gone completely.
3691  *
3692  * If it's not a multi, we need to check for sleeping conditions
3693  * before actually going to sleep. If the update doesn't conflict
3694  * with the locks, we just continue without sleeping (but making sure
3695  * it is preserved).
3696  *
3697  * Before sleeping, we need to acquire tuple lock to establish our
3698  * priority for the tuple (see heap_lock_tuple). LockTuple will
3699  * release us when we are next-in-line for the tuple. Note we must
3700  * not acquire the tuple lock until we're sure we're going to sleep;
3701  * otherwise we're open for race conditions with other transactions
3702  * holding the tuple lock which sleep on us.
3703  *
3704  * If we are forced to "start over" below, we keep the tuple lock;
3705  * this arranges that we stay at the head of the line while rechecking
3706  * tuple state.
3707  */
3708  if (infomask & HEAP_XMAX_IS_MULTI)
3709  {
3710  TransactionId update_xact;
3711  int remain;
3712 
3713  if (DoesMultiXactIdConflict((MultiXactId) xwait, infomask,
3714  *lockmode))
3715  {
3716  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3717 
3718  /* acquire tuple lock, if necessary */
3719  heap_acquire_tuplock(relation, &(oldtup.t_self), *lockmode,
3720  LockWaitBlock, &have_tuple_lock);
3721 
3722  /* wait for multixact */
3723  MultiXactIdWait((MultiXactId) xwait, mxact_status, infomask,
3724  relation, &oldtup.t_self, XLTW_Update,
3725  &remain);
3726  checked_lockers = true;
3727  locker_remains = remain != 0;
3728  LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3729 
3730  /*
3731  * If xwait had just locked the tuple then some other xact
3732  * could update this tuple before we get to this point. Check
3733  * for xmax change, and start over if so.
3734  */
3735  if (xmax_infomask_changed(oldtup.t_data->t_infomask,
3736  infomask) ||
3737  !TransactionIdEquals(HeapTupleHeaderGetRawXmax(oldtup.t_data),
3738  xwait))
3739  goto l2;
3740  }
3741 
3742  /*
3743  * Note that the multixact may not be done by now. It could have
3744  * surviving members; our own xact or other subxacts of this
3745  * backend, and also any other concurrent transaction that locked
3746  * the tuple with KeyShare if we only got LockTupleNoKeyExclusive. If
3747  * this is the case, we have to be careful to mark the updated
3748  * tuple with the surviving members in Xmax.
3749  *
3750  * Note that there could have been another update in the
3751  * MultiXact. In that case, we need to check whether it committed
3752  * or aborted. If it aborted we are safe to update it again;
3753  * otherwise there is an update conflict, and we have to return
3754  * HeapTupleUpdated below.
3755  *
3756  * In the LockTupleExclusive case, we still need to preserve the
3757  * surviving members: those would include the tuple locks we had
3758  * before this one, which are important to keep in case this
3759  * subxact aborts.
3760  */
3761  if (!HEAP_XMAX_IS_LOCKED_ONLY(oldtup.t_data->t_infomask))
3762  update_xact = HeapTupleGetUpdateXid(oldtup.t_data);
3763  else
3764  update_xact = InvalidTransactionId;
3765 
3766  /*
3767  * There was no UPDATE in the MultiXact; or it aborted. No
3768  * TransactionIdIsInProgress() call needed here, since we called
3769  * MultiXactIdWait() above.
3770  */
3771  if (!TransactionIdIsValid(update_xact) ||
3772  TransactionIdDidAbort(update_xact))
3773  can_continue = true;
3774  }
3775  else if (TransactionIdIsCurrentTransactionId(xwait))
3776  {
3777  /*
3778  * The only locker is ourselves; we can avoid grabbing the tuple
3779  * lock here, but must preserve our locking information.
3780  */
3781  checked_lockers = true;
3782  locker_remains = true;
3783  can_continue = true;
3784  }
3785  else if (HEAP_XMAX_IS_KEYSHR_LOCKED(infomask) && key_intact)
3786  {
3787  /*
3788  * If it's just a key-share locker, and we're not changing the key
3789  * columns, we don't need to wait for it to end; but we need to
3790  * preserve it as locker.
3791  */
3792  checked_lockers = true;
3793  locker_remains = true;
3794  can_continue = true;
3795  }
3796  else
3797  {
3798  /*
3799  * Wait for regular transaction to end; but first, acquire tuple
3800  * lock.
3801  */
3802  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3803  heap_acquire_tuplock(relation, &(oldtup.t_self), *lockmode,
3804  LockWaitBlock, &have_tuple_lock);
3805  XactLockTableWait(xwait, relation, &oldtup.t_self,
3806  XLTW_Update);
3807  checked_lockers = true;
3808  LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3809 
3810  /*
3811  * xwait is done, but if xwait had just locked the tuple then some
3812  * other xact could update this tuple before we get to this point.
3813  * Check for xmax change, and start over if so.
3814  */
3815  if (xmax_infomask_changed(oldtup.t_data->t_infomask, infomask) ||
3816  !TransactionIdEquals(xwait,
3817  HeapTupleHeaderGetRawXmax(oldtup.t_data)))
3818  goto l2;
3819 
3820  /* Otherwise check if it committed or aborted */
3821  UpdateXmaxHintBits(oldtup.t_data, buffer, xwait);
3822  if (oldtup.t_data->t_infomask & HEAP_XMAX_INVALID)
3823  can_continue = true;
3824  }
3825 
3826  result = can_continue ? HeapTupleMayBeUpdated : HeapTupleUpdated;
3827  }
3828 
3829  if (crosscheck != InvalidSnapshot && result == HeapTupleMayBeUpdated)
3830  {
3831  /* Perform additional check for transaction-snapshot mode RI updates */
3832  if (!HeapTupleSatisfiesVisibility(&oldtup, crosscheck, buffer))
3833  result = HeapTupleUpdated;
3834  }
3835 
3836  if (result != HeapTupleMayBeUpdated)
3837  {
3838  Assert(result == HeapTupleSelfUpdated ||
3839  result == HeapTupleUpdated ||
3840  result == HeapTupleBeingUpdated);
3841  Assert(!(oldtup.t_data->t_infomask & HEAP_XMAX_INVALID));
3842  hufd->ctid = oldtup.t_data->t_ctid;
3843  hufd->xmax = HeapTupleHeaderGetUpdateXid(oldtup.t_data);
3844  if (result == HeapTupleSelfUpdated)
3845  hufd->cmax = HeapTupleHeaderGetCmax(oldtup.t_data);
3846  else
3847  hufd->cmax = InvalidCommandId;
3848  UnlockReleaseBuffer(buffer);
3849  if (have_tuple_lock)
3850  UnlockTupleTuplock(relation, &(oldtup.t_self), *lockmode);
3851  if (vmbuffer != InvalidBuffer)
3852  ReleaseBuffer(vmbuffer);
3853  bms_free(hot_attrs);
3854  bms_free(key_attrs);
3855  bms_free(id_attrs);
3856  bms_free(modified_attrs);
3857  bms_free(interesting_attrs);
3858  return result;
3859  }
3860 
3861  /*
3862  * If we didn't pin the visibility map page and the page has become all
3863  * visible while we were busy locking the buffer, or during some
3864  * subsequent window during which we had it unlocked, we'll have to unlock
3865  * and re-lock, to avoid holding the buffer lock across an I/O. That's a
3866  * bit unfortunate, especially since we'll now have to recheck whether the
3867  * tuple has been locked or updated under us, but hopefully it won't
3868  * happen very often.
3869  */
3870  if (vmbuffer == InvalidBuffer && PageIsAllVisible(page))
3871  {
3872  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3873  visibilitymap_pin(relation, block, &vmbuffer);
3874  LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3875  goto l2;
3876  }
3877 
3878  /* Fill in transaction status data */
3879 
3880  /*
3881  * If the tuple we're updating is locked, we need to preserve the locking
3882  * info in the old tuple's Xmax. Prepare a new Xmax value for this.
3883  */
3884  compute_new_xmax_infomask(HeapTupleHeaderGetRawXmax(oldtup.t_data),
3885  oldtup.t_data->t_infomask,
3886  oldtup.t_data->t_infomask2,
3887  xid, *lockmode, true,
3888  &xmax_old_tuple, &infomask_old_tuple,
3889  &infomask2_old_tuple);
3890 
3891  /*
3892  * And also prepare an Xmax value for the new copy of the tuple. If there
3893  * was no xmax previously, or there was one but all lockers are now gone,
3894  * then use InvalidXid; otherwise, get the xmax from the old tuple. (In
3895  * rare cases that might also be InvalidXid and yet not have the
3896  * HEAP_XMAX_INVALID bit set; that's fine.)
3897  */
3898  if ((oldtup.t_data->t_infomask & HEAP_XMAX_INVALID) ||
3899  HEAP_LOCKED_UPGRADED(oldtup.t_data->t_infomask) ||
3900  (checked_lockers && !locker_remains))
3901  xmax_new_tuple = InvalidTransactionId;
3902  else
3903  xmax_new_tuple = HeapTupleHeaderGetRawXmax(oldtup.t_data);
3904 
3905  if (!TransactionIdIsValid(xmax_new_tuple))
3906  {
3907  infomask_new_tuple = HEAP_XMAX_INVALID;
3908  infomask2_new_tuple = 0;
3909  }
3910  else
3911  {
3912  /*
3913  * If we found a valid Xmax for the new tuple, then the infomask bits
3914  * to use on the new tuple depend on what was there on the old one.
3915  * Note that since we're doing an update, the only possibility is that
3916  * the lockers had FOR KEY SHARE lock.
3917  */
3918  if (oldtup.t_data->t_infomask & HEAP_XMAX_IS_MULTI)
3919  {
3920  GetMultiXactIdHintBits(xmax_new_tuple, &infomask_new_tuple,
3921  &infomask2_new_tuple);
3922  }
3923  else
3924  {
3925  infomask_new_tuple = HEAP_XMAX_KEYSHR_LOCK | HEAP_XMAX_LOCK_ONLY;
3926  infomask2_new_tuple = 0;
3927  }
3928  }
3929 
3930  /*
3931  * Prepare the new tuple with the appropriate initial values of Xmin and
3932  * Xmax, as well as initial infomask bits as computed above.
3933  */
3934  newtup->t_data->t_infomask &= ~(HEAP_XACT_MASK);
3935  newtup->t_data->t_infomask2 &= ~(HEAP2_XACT_MASK);
3936  HeapTupleHeaderSetXmin(newtup->t_data, xid);
3937  HeapTupleHeaderSetCmin(newtup->t_data, cid);
3938  newtup->t_data->t_infomask |= HEAP_UPDATED | infomask_new_tuple;
3939  newtup->t_data->t_infomask2 |= infomask2_new_tuple;
3940  HeapTupleHeaderSetXmax(newtup->t_data, xmax_new_tuple);
3941 
3942  /*
3943  * Replace cid with a combo cid if necessary. Note that we already put
3944  * the plain cid into the new tuple.
3945  */
3946  HeapTupleHeaderAdjustCmax(oldtup.t_data, &cid, &iscombo);
3947 
3948  /*
3949  * If the toaster needs to be activated, OR if the new tuple will not fit
3950  * on the same page as the old, then we need to release the content lock
3951  * (but not the pin!) on the old tuple's buffer while we are off doing
3952  * TOAST and/or table-file-extension work. We must mark the old tuple to
3953  * show that it's locked, else other processes may try to update it
3954  * themselves.
3955  *
3956  * We need to invoke the toaster if there are already any out-of-line
3957  * toasted values present, or if the new tuple is over-threshold.
3958  */
3959  if (relation->rd_rel->relkind != RELKIND_RELATION &&
3960  relation->rd_rel->relkind != RELKIND_MATVIEW)
3961  {
3962  /* toast table entries should never be recursively toasted */
3963  Assert(!HeapTupleHasExternal(&oldtup));
3964  Assert(!HeapTupleHasExternal(newtup));
3965  need_toast = false;
3966  }
3967  else
3968  need_toast = (HeapTupleHasExternal(&oldtup) ||
3969  HeapTupleHasExternal(newtup) ||
3970  newtup->t_len > TOAST_TUPLE_THRESHOLD);
3971 
3972  pagefree = PageGetHeapFreeSpace(page);
3973 
3974  newtupsize = MAXALIGN(newtup->t_len);
3975 
3976  if (need_toast || newtupsize > pagefree)
3977  {
3978  TransactionId xmax_lock_old_tuple;
3979  uint16 infomask_lock_old_tuple,
3980  infomask2_lock_old_tuple;
3981  bool cleared_all_frozen = false;
3982 
3983  /*
3984  * To prevent concurrent sessions from updating the tuple, we have to
3985  * temporarily mark it locked, while we release the lock.
3986  *
3987  * To satisfy the rule that any xid potentially appearing in a buffer
3988  * written out to disk must first be covered by WAL, we unfortunately have to WAL log this
3989  * temporary modification. We can reuse xl_heap_lock for this
3990  * purpose. If we crash/error before following through with the
3991  * actual update, xmax will be of an aborted transaction, allowing
3992  * other sessions to proceed.
3993  */
3994 
3995  /*
3996  * Compute xmax / infomask appropriate for locking the tuple. This has
3997  * to be done separately from the lock, because the potentially
3998  * created multixact would otherwise be wrong.
3999  */
4000  compute_new_xmax_infomask(HeapTupleHeaderGetRawXmax(oldtup.t_data),
4001  oldtup.t_data->t_infomask,
4002  oldtup.t_data->t_infomask2,
4003  xid, *lockmode, false,
4004  &xmax_lock_old_tuple, &infomask_lock_old_tuple,
4005  &infomask2_lock_old_tuple);
4006 
4007  Assert(HEAP_XMAX_IS_LOCKED_ONLY(infomask_lock_old_tuple));
4008 
4009  START_CRIT_SECTION();
4010 
4011  /* Clear obsolete visibility flags ... */
4012  oldtup.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
4013  oldtup.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
4014  HeapTupleClearHotUpdated(&oldtup);
4015  /* ... and store info about transaction updating this tuple */
4016  Assert(TransactionIdIsValid(xmax_lock_old_tuple));
4017  HeapTupleHeaderSetXmax(oldtup.t_data, xmax_lock_old_tuple);
4018  oldtup.t_data->t_infomask |= infomask_lock_old_tuple;
4019  oldtup.t_data->t_infomask2 |= infomask2_lock_old_tuple;
4020  HeapTupleHeaderSetCmax(oldtup.t_data, cid, iscombo);
4021 
4022  /* temporarily make it look not-updated, but locked */
4023  oldtup.t_data->t_ctid = oldtup.t_self;
4024 
4025  /*
4026  * Clear all-frozen bit on visibility map if needed. We could
4027  * immediately reset ALL_VISIBLE, but given that the WAL logging
4028  * overhead would be unchanged, that doesn't seem necessarily
4029  * worthwhile.
4030  */
4031  if (PageIsAllVisible(BufferGetPage(buffer)) &&
4032  visibilitymap_clear(relation, block, vmbuffer,
4033  VISIBILITYMAP_ALL_FROZEN))
4034  cleared_all_frozen = true;
4035 
4036  MarkBufferDirty(buffer);
4037 
4038  if (RelationNeedsWAL(relation))
4039  {
4040  xl_heap_lock xlrec;
4041  XLogRecPtr recptr;
4042 
4043  XLogBeginInsert();
4044  XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
4045 
4046  xlrec.offnum = ItemPointerGetOffsetNumber(&oldtup.t_self);
4047  xlrec.locking_xid = xmax_lock_old_tuple;
4048  xlrec.infobits_set = compute_infobits(oldtup.t_data->t_infomask,
4049  oldtup.t_data->t_infomask2);
4050  xlrec.flags =
4051  cleared_all_frozen ? XLH_LOCK_ALL_FROZEN_CLEARED : 0;
4052  XLogRegisterData((char *) &xlrec, SizeOfHeapLock);
4053  recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_LOCK);
4054  PageSetLSN(page, recptr);
4055  }
4056 
4057  END_CRIT_SECTION();
4058 
4059  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
4060 
4061  /*
4062  * Let the toaster do its thing, if needed.
4063  *
4064  * Note: below this point, heaptup is the data we actually intend to
4065  * store into the relation; newtup is the caller's original untoasted
4066  * data.
4067  */
4068  if (need_toast)
4069  {
4070  /* Note we always use WAL and FSM during updates */
4071  heaptup = toast_insert_or_update(relation, newtup, &oldtup, 0);
4072  newtupsize = MAXALIGN(heaptup->t_len);
4073  }
4074  else
4075  heaptup = newtup;
4076 
4077  /*
4078  * Now, do we need a new page for the tuple, or not? This is a bit
4079  * tricky since someone else could have added tuples to the page while
4080  * we weren't looking. We have to recheck the available space after
4081  * reacquiring the buffer lock. But don't bother to do that if the
4082  * former amount of free space is still not enough; it's unlikely
4083  * there's more free now than before.
4084  *
4085  * What's more, if we need to get a new page, we will need to acquire
4086  * buffer locks on both old and new pages. To avoid deadlock against
4087  * some other backend trying to get the same two locks in the other
4088  * order, we must be consistent about the order we get the locks in.
4089  * We use the rule "lock the lower-numbered page of the relation
4090  * first". To implement this, we must do RelationGetBufferForTuple
4091  * while not holding the lock on the old page, and we must rely on it
4092  * to get the locks on both pages in the correct order.
4093  */
4094  if (newtupsize > pagefree)
4095  {
4096  /* Assume there's no chance to put heaptup on same page. */
4097  newbuf = RelationGetBufferForTuple(relation, heaptup->t_len,
4098  buffer, 0, NULL,
4099  &vmbuffer_new, &vmbuffer);
4100  }
4101  else
4102  {
4103  /* Re-acquire the lock on the old tuple's page. */
4104  LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
4105  /* Re-check using the up-to-date free space */
4106  pagefree = PageGetHeapFreeSpace(page);
4107  if (newtupsize > pagefree)
4108  {
4109  /*
4110  * Rats, it doesn't fit anymore. We must now unlock and
4111  * relock to avoid deadlock. Fortunately, this path should
4112  * seldom be taken.
4113  */
4114  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
4115  newbuf = RelationGetBufferForTuple(relation, heaptup->t_len,
4116  buffer, 0, NULL,
4117  &vmbuffer_new, &vmbuffer);
4118  }
4119  else
4120  {
4121  /* OK, it fits here, so we're done. */
4122  newbuf = buffer;
4123  }
4124  }
4125  }
4126  else
4127  {
4128  /* No TOAST work needed, and it'll fit on same page */
4129  newbuf = buffer;
4130  heaptup = newtup;
4131  }
4132 
4133  /*
4134  * We're about to do the actual update -- check for conflict first, to
4135  * avoid possibly having to roll back work we've just done.
4136  *
4137  * This is safe without a recheck as long as there is no possibility of
4138  * another process scanning the pages between this check and the update
4139  * being visible to the scan (i.e., exclusive buffer content lock(s) are
4140  * continuously held from this point until the tuple update is visible).
4141  *
4142  * For the new tuple the only check needed is at the relation level, but
4143  * since both tuples are in the same relation and the check for oldtup
4144  * will include checking the relation level, there is no benefit to a
4145  * separate check for the new tuple.
4146  */
4147  CheckForSerializableConflictIn(relation, &oldtup, buffer);
4148 
4149  /*
4150  * At this point newbuf and buffer are both pinned and locked, and newbuf
4151  * has enough space for the new tuple. If they are the same buffer, only
4152  * one pin is held.
4153  */
4154 
4155  if (newbuf == buffer)
4156  {
4157  /*
4158  * Since the new tuple is going into the same page, we might be able
4159  * to do a HOT update. Check if any of the index columns have been
4160  * changed. If the page was already full, we may have skipped checking
4161  * for index columns, in which case we cannot do a HOT update.
4162  */
4163  if (hot_attrs_checked && !bms_overlap(modified_attrs, hot_attrs))
4164  use_hot_update = true;
4165  }
4166  else
4167  {
4168  /* Set a hint that the old page could use prune/defrag */
4169  PageSetFull(page);
4170  }
4171 
4172  /*
4173  * Compute replica identity tuple before entering the critical section so
4174  * we don't PANIC upon a memory allocation failure.
4175  * ExtractReplicaIdentity() will return NULL if nothing needs to be
4176  * logged.
4177  */
4178  old_key_tuple = ExtractReplicaIdentity(relation, &oldtup,
4179  bms_overlap(modified_attrs, id_attrs),
4180  &old_key_copied);
4181 
4182  /* NO EREPORT(ERROR) from here till changes are logged */
4183  START_CRIT_SECTION();
4184 
4185  /*
4186  * If this transaction commits, the old tuple will become DEAD sooner or
4187  * later. Set flag that this page is a candidate for pruning once our xid
4188  * falls below the OldestXmin horizon. If the transaction finally aborts,
4189  * the subsequent page pruning will be a no-op and the hint will be
4190  * cleared.
4191  *
4192  * XXX Should we set hint on newbuf as well? If the transaction aborts,
4193  * there would be a prunable tuple in the newbuf; but for now we choose
4194  * not to optimize for aborts. Note that heap_xlog_update must be kept in
4195  * sync if this decision changes.
4196  */
4197  PageSetPrunable(page, xid);
4198 
4199  if (use_hot_update)
4200  {
4201  /* Mark the old tuple as HOT-updated */
4202  HeapTupleSetHotUpdated(&oldtup);
4203  /* And mark the new tuple as heap-only */
4204  HeapTupleSetHeapOnly(heaptup);
4205  /* Mark the caller's copy too, in case different from heaptup */
4206  HeapTupleSetHeapOnly(newtup);
4207  }
4208  else
4209  {
4210  /* Make sure tuples are correctly marked as not-HOT */
4211  HeapTupleClearHotUpdated(&oldtup);
4212  HeapTupleClearHeapOnly(heaptup);
4213  HeapTupleClearHeapOnly(newtup);
4214  }
4215 
4216  RelationPutHeapTuple(relation, newbuf, heaptup, false); /* insert new tuple */
4217 
4218 
4219  /* Clear obsolete visibility flags, possibly set by ourselves above... */
4220  oldtup.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
4221  oldtup.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
4222  /* ... and store info about transaction updating this tuple */
4223  Assert(TransactionIdIsValid(xmax_old_tuple));
4224  HeapTupleHeaderSetXmax(oldtup.t_data, xmax_old_tuple);
4225  oldtup.t_data->t_infomask |= infomask_old_tuple;
4226  oldtup.t_data->t_infomask2 |= infomask2_old_tuple;
4227  HeapTupleHeaderSetCmax(oldtup.t_data, cid, iscombo);
4228 
4229  /* record address of new tuple in t_ctid of old one */
4230  oldtup.t_data->t_ctid = heaptup->t_self;
4231 
4232  /* clear PD_ALL_VISIBLE flags, reset all visibilitymap bits */
4233  if (PageIsAllVisible(BufferGetPage(buffer)))
4234  {
4235  all_visible_cleared = true;
4236  PageClearAllVisible(BufferGetPage(buffer));
4237  visibilitymap_clear(relation, BufferGetBlockNumber(buffer),
4238  vmbuffer, VISIBILITYMAP_VALID_BITS);
4239  }
4240  if (newbuf != buffer && PageIsAllVisible(BufferGetPage(newbuf)))
4241  {
4242  all_visible_cleared_new = true;
4243  PageClearAllVisible(BufferGetPage(newbuf));
4244  visibilitymap_clear(relation, BufferGetBlockNumber(newbuf),
4245  vmbuffer_new, VISIBILITYMAP_VALID_BITS);
4246  }
4247 
4248  if (newbuf != buffer)
4249  MarkBufferDirty(newbuf);
4250  MarkBufferDirty(buffer);
4251 
4252  /* XLOG stuff */
4253  if (RelationNeedsWAL(relation))
4254  {
4255  XLogRecPtr recptr;
4256 
4257  /*
4258  * For logical decoding we need combocids to properly decode the
4259  * catalog.
4260  */
4261  if (RelationIsAccessibleInLogicalDecoding(relation))
4262  {
4263  log_heap_new_cid(relation, &oldtup);
4264  log_heap_new_cid(relation, heaptup);
4265  }
4266 
4267  recptr = log_heap_update(relation, buffer,
4268  newbuf, &oldtup, heaptup,
4269  old_key_tuple,
4270  all_visible_cleared,
4271  all_visible_cleared_new);
4272  if (newbuf != buffer)
4273  {
4274  PageSetLSN(BufferGetPage(newbuf), recptr);
4275  }
4276  PageSetLSN(BufferGetPage(buffer), recptr);
4277  }
4278 
4279  END_CRIT_SECTION();
4280 
4281  if (newbuf != buffer)
4282  LockBuffer(newbuf, BUFFER_LOCK_UNLOCK);
4283  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
4284 
4285  /*
4286  * Mark old tuple for invalidation from system caches at next command
4287  * boundary, and mark the new tuple for invalidation in case we abort. We
4288  * have to do this before releasing the buffer because oldtup is in the
4289  * buffer. (heaptup is all in local memory, but it's necessary to process
4290  * both tuple versions in one call to inval.c so we can avoid redundant
4291  * sinval messages.)
4292  */
4293  CacheInvalidateHeapTuple(relation, &oldtup, heaptup);
4294 
4295  /* Now we can release the buffer(s) */
4296  if (newbuf != buffer)
4297  ReleaseBuffer(newbuf);
4298  ReleaseBuffer(buffer);
4299  if (BufferIsValid(vmbuffer_new))
4300  ReleaseBuffer(vmbuffer_new);
4301  if (BufferIsValid(vmbuffer))
4302  ReleaseBuffer(vmbuffer);
4303 
4304  /*
4305  * Release the lmgr tuple lock, if we had it.
4306  */
4307  if (have_tuple_lock)
4308  UnlockTupleTuplock(relation, &(oldtup.t_self), *lockmode);
4309 
4310  pgstat_count_heap_update(relation, use_hot_update);
4311 
4312  /*
4313  * If heaptup is a private copy, release it. Don't forget to copy t_self
4314  * back to the caller's image, too.
4315  */
4316  if (heaptup != newtup)
4317  {
4318  newtup->t_self = heaptup->t_self;
4319  heap_freetuple(heaptup);
4320  }
4321 
4322  if (old_key_tuple != NULL && old_key_copied)
4323  heap_freetuple(old_key_tuple);
4324 
4325  bms_free(hot_attrs);
4326  bms_free(key_attrs);
4327  bms_free(id_attrs);
4328  bms_free(modified_attrs);
4329  bms_free(interesting_attrs);
4330 
4331  return HeapTupleMayBeUpdated;
4332 }
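 /*
  * Callers are expected to handle heap_update's failure results themselves:
  * the executor's ExecUpdate, for example, responds to HeapTupleUpdated by
  * re-fetching the row version that hufd->ctid points at (EvalPlanQual),
  * whereas simple_heap_update below just raises an error.
  */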
4333 
4334 /*
4335  * Check if the specified attribute's value is same in both given tuples.
4336  * Subroutine for HeapDetermineModifiedColumns.
4337  */
4338 static bool
4339 heap_tuple_attr_equals(TupleDesc tupdesc, int attrnum,
4340  HeapTuple tup1, HeapTuple tup2)
4341 {
4342  Datum value1,
4343  value2;
4344  bool isnull1,
4345  isnull2;
4346  Form_pg_attribute att;
4347 
4348  /*
4349  * If it's a whole-tuple reference, say "not equal". It's not really
4350  * worth supporting this case, since it could only succeed after a no-op
4351  * update, which is hardly a case worth optimizing for.
4352  */
4353  if (attrnum == 0)
4354  return false;
4355 
4356  /*
4357  * Likewise, automatically say "not equal" for any system attribute other
4358  * than OID and tableOID; we cannot expect these to be consistent in a HOT
4359  * chain, or even to be set correctly yet in the new tuple.
4360  */
4361  if (attrnum < 0)
4362  {
4363  if (attrnum != ObjectIdAttributeNumber &&
4364  attrnum != TableOidAttributeNumber)
4365  return false;
4366  }
4367 
4368  /*
4369  * Extract the corresponding values. XXX this is pretty inefficient if
4370  * there are many indexed columns. Should HeapDetermineModifiedColumns do
4371  * a single heap_deform_tuple call on each tuple, instead? But that
4372  * doesn't work for system columns ...
4373  */
4374  value1 = heap_getattr(tup1, attrnum, tupdesc, &isnull1);
4375  value2 = heap_getattr(tup2, attrnum, tupdesc, &isnull2);
4376 
4377  /*
4378  * If one value is NULL and other is not, then they are certainly not
4379  * equal
4380  */
4381  if (isnull1 != isnull2)
4382  return false;
4383 
4384  /*
4385  * If both are NULL, they can be considered equal.
4386  */
4387  if (isnull1)
4388  return true;
4389 
4390  /*
4391  * We do simple binary comparison of the two datums. This may be overly
4392  * strict because there can be multiple binary representations for the
4393  * same logical value. But we should be OK as long as there are no false
4394  * positives. Using a type-specific equality operator is messy because
4395  * there could be multiple notions of equality in different operator
4396  * classes; furthermore, we cannot safely invoke user-defined functions
4397  * while holding exclusive buffer lock.
4398  */
4399  if (attrnum <= 0)
4400  {
4401  /* The only allowed system columns are OIDs, so do this */
4402  return (DatumGetObjectId(value1) == DatumGetObjectId(value2));
4403  }
4404  else
4405  {
4406  Assert(attrnum <= tupdesc->natts);
4407  att = tupdesc->attrs[attrnum - 1];
4408  return datumIsEqual(value1, value2, att->attbyval, att->attlen);
4409  }
4410 }
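 /*
  * Because datumIsEqual compares raw bytes, two different representations of
  * the same logical value (for example a compressed versus an uncompressed
  * varlena) are reported as "not equal" here; the only consequence is that
  * the column is conservatively treated as modified.
  */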
4411 
4412 /*
4413  * Check which columns are being updated.
4414  *
4415  * Given an updated tuple, determine (and return into the output bitmapset),
4416  * from those listed as interesting, the set of columns that changed.
4417  *
4418  * The input bitmapset is destructively modified; that is OK since this is
4419  * invoked at most once in heap_update.
4420  */
4421 static Bitmapset *
4422  HeapDetermineModifiedColumns(Relation relation, Bitmapset *interesting_cols,
4423  HeapTuple oldtup, HeapTuple newtup)
4424 {
4425  int attnum;
4426  Bitmapset *modified = NULL;
4427 
4428  while ((attnum = bms_first_member(interesting_cols)) >= 0)
4429  {
4430  attnum += FirstLowInvalidHeapAttributeNumber;
4431 
4432  if (!heap_tuple_attr_equals(RelationGetDescr(relation),
4433  attnum, oldtup, newtup))
4434  modified = bms_add_member(modified,
4435  attnum - FirstLowInvalidHeapAttributeNumber);
4436  }
4437 
4438  return modified;
4439 }
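 /*
  * As in RelationGetIndexAttrBitmap, attribute numbers in these bitmapsets
  * are stored offset by FirstLowInvalidHeapAttributeNumber so that system
  * (negative) attribute numbers can be represented; hence the adjustments on
  * the way into and out of heap_tuple_attr_equals above.
  */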
4440 
4441 /*
4442  * simple_heap_update - replace a tuple
4443  *
4444  * This routine may be used to update a tuple when concurrent updates of
4445  * the target tuple are not expected (for example, because we have a lock
4446  * on the relation associated with the tuple). Any failure is reported
4447  * via ereport().
4448  */
4449 void
4450 simple_heap_update(Relation relation, ItemPointer otid, HeapTuple tup)
4451 {
4452  HTSU_Result result;
4453  HeapUpdateFailureData hufd;
4454  LockTupleMode lockmode;
4455 
4456  result = heap_update(relation, otid, tup,
4457  GetCurrentCommandId(true), InvalidSnapshot,
4458  true /* wait for commit */ ,
4459  &hufd, &lockmode);
4460  switch (result)
4461  {
4462  case HeapTupleSelfUpdated:
4463  /* Tuple was already updated in current command? */
4464  elog(ERROR, "tuple already updated by self");
4465  break;
4466 
4467  case HeapTupleMayBeUpdated:
4468  /* done successfully */
4469  break;
4470 
4471  case HeapTupleUpdated:
4472  elog(ERROR, "tuple concurrently updated");
4473  break;
4474 
4475  default:
4476  elog(ERROR, "unrecognized heap_update status: %u", result);
4477  break;
4478  }
4479 }
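 /*
  * Note that simple_heap_update only replaces the heap tuple itself; callers
  * updating catalog rows must also keep the catalog's indexes up to date
  * (typically via the CatalogTuple* helpers in catalog/indexing.c, which
  * wrap this function).
  */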
4480 
4481 
4482 /*
4483  * Return the MultiXactStatus corresponding to the given tuple lock mode.
4484  */
4485 static MultiXactStatus
4486 get_mxact_status_for_lock(LockTupleMode mode, bool is_update)
4487 {
4488  int retval;
4489 
4490  if (is_update)
4491  retval = tupleLockExtraInfo[mode].updstatus;
4492  else
4493  retval = tupleLockExtraInfo[mode].lockstatus;
4494 
4495  if (retval == -1)
4496  elog(ERROR, "invalid lock tuple mode %d/%s", mode,
4497  is_update ? "true" : "false");
4498 
4499  return (MultiXactStatus) retval;
4500 }
4501 
4502 /*
4503  * heap_lock_tuple - lock a tuple in shared or exclusive mode
4504  *
4505  * Note that this acquires a buffer pin, which the caller must release.
4506  *
4507  * Input parameters:
4508  * relation: relation containing tuple (caller must hold suitable lock)
4509  * tuple->t_self: TID of tuple to lock (rest of struct need not be valid)
4510  * cid: current command ID (used for visibility test, and stored into
4511  * tuple's cmax if lock is successful)
4512  * mode: indicates if shared or exclusive tuple lock is desired
4513  * wait_policy: what to do if tuple lock is not available
4514  * follow_updates: if true, follow the update chain to also lock descendant
4515  * tuples.
4516  *
4517  * Output parameters:
4518  * *tuple: all fields filled in
4519  * *buffer: set to buffer holding tuple (pinned but not locked at exit)
4520  * *hufd: filled in failure cases (see below)
4521  *
4522  * Function result may be:
4523  * HeapTupleMayBeUpdated: lock was successfully acquired
4524  * HeapTupleInvisible: lock failed because tuple was never visible to us
4525  * HeapTupleSelfUpdated: lock failed because tuple updated by self
4526  * HeapTupleUpdated: lock failed because tuple updated by other xact
4527  * HeapTupleWouldBlock: lock couldn't be acquired and wait_policy is skip
4528  *
4529  * In the failure cases other than HeapTupleInvisible, the routine fills
4530  * *hufd with the tuple's t_ctid, t_xmax (resolving a possible MultiXact,
4531  * if necessary), and t_cmax (the last only for HeapTupleSelfUpdated,
4532  * since we cannot obtain cmax from a combocid generated by another
4533  * transaction).
4534  * See comments for struct HeapUpdateFailureData for additional info.
4535  *
4536  * See README.tuplock for a thorough explanation of this mechanism.
4537  */
4538 HTSU_Result
4539 heap_lock_tuple(Relation relation, HeapTuple tuple,
4540  CommandId cid, LockTupleMode mode, LockWaitPolicy wait_policy,
4541  bool follow_updates,
4542  Buffer *buffer, HeapUpdateFailureData *hufd)
4543 {
4544  HTSU_Result result;
4545  ItemPointer tid = &(tuple->t_self);
4546  ItemId lp;
4547  Page page;
4548  Buffer vmbuffer = InvalidBuffer;
4549  BlockNumber block;
4550  TransactionId xid,
4551  xmax;
4552  uint16 old_infomask,
4553  new_infomask,
4554  new_infomask2;
4555  bool first_time = true;
4556  bool have_tuple_lock = false;
4557  bool cleared_all_frozen = false;
4558 
4559  *buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid));
4560  block = ItemPointerGetBlockNumber(tid);
4561 
4562  /*
4563  * Before locking the buffer, pin the visibility map page if it appears to
4564  * be necessary. Since we haven't got the lock yet, someone else might be
4565  * in the middle of changing this, so we'll need to recheck after we have
4566  * the lock.
4567  */
4568  if (PageIsAllVisible(BufferGetPage(*buffer)))
4569  visibilitymap_pin(relation, block, &vmbuffer);
4570 
4571  LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4572 
4573  page = BufferGetPage(*buffer);
4574  lp = PageGetItemId(page, ItemPointerGetOffsetNumber(tid));
4575  Assert(ItemIdIsNormal(lp));
4576 
4577  tuple->t_data = (HeapTupleHeader) PageGetItem(page, lp);
4578  tuple->t_len = ItemIdGetLength(lp);
4579  tuple->t_tableOid = RelationGetRelid(relation);
4580 
4581 l3:
4582  result = HeapTupleSatisfiesUpdate(tuple, cid, *buffer);
4583 
4584  if (result == HeapTupleInvisible)
4585  {
4586  /*
4587  * This is possible, but only when locking a tuple for ON CONFLICT
4588  * UPDATE. We return this value here rather than throwing an error in
4589  * order to give that case the opportunity to throw a more specific
4590  * error.
4591  */
4592  result = HeapTupleInvisible;
4593  goto out_locked;
4594  }
4595  else if (result == HeapTupleBeingUpdated || result == HeapTupleUpdated)
4596  {
4597  TransactionId xwait;
4598  uint16 infomask;
4599  uint16 infomask2;
4600  bool require_sleep;
4601  ItemPointerData t_ctid;
4602 
4603  /* must copy state data before unlocking buffer */
4604  xwait = HeapTupleHeaderGetRawXmax(tuple->t_data);
4605  infomask = tuple->t_data->t_infomask;
4606  infomask2 = tuple->t_data->t_infomask2;
4607  ItemPointerCopy(&tuple->t_data->t_ctid, &t_ctid);
4608 
4609  LockBuffer(*buffer, BUFFER_LOCK_UNLOCK);
4610 
4611  /*
4612  * If any subtransaction of the current top transaction already holds
4613  * a lock as strong as or stronger than what we're requesting, we
4614  * effectively hold the desired lock already. We *must* succeed
4615  * without trying to take the tuple lock, else we will deadlock
4616  * against anyone wanting to acquire a stronger lock.
4617  *
4618  * Note we only do this the first time we loop on the HTSU result;
4619  * there is no point in testing in subsequent passes, because
4620  * evidently our own transaction cannot have acquired a new lock after
4621  * the first time we checked.
4622  */
4623  if (first_time)
4624  {
4625  first_time = false;
4626 
4627  if (infomask & HEAP_XMAX_IS_MULTI)
4628  {
4629  int i;
4630  int nmembers;
4631  MultiXactMember *members;
4632 
4633  /*
4634  * We don't need to allow old multixacts here; if that had
4635  * been the case, HeapTupleSatisfiesUpdate would have returned
4636  * MayBeUpdated and we wouldn't be here.
4637  */
4638  nmembers =
4639  GetMultiXactIdMembers(xwait, &members, false,
4640  HEAP_XMAX_IS_LOCKED_ONLY(infomask));
4641 
4642  for (i = 0; i < nmembers; i++)
4643  {
4644  /* only consider members of our own transaction */
4645  if (!TransactionIdIsCurrentTransactionId(members[i].xid))
4646  continue;
4647 
4648  if (TUPLOCK_from_mxstatus(members[i].status) >= mode)
4649  {
4650  pfree(members);
4651  result = HeapTupleMayBeUpdated;
4652  goto out_unlocked;
4653  }
4654  }
4655 
4656  if (members)
4657  pfree(members);
4658  }
4659  else if (TransactionIdIsCurrentTransactionId(xwait))
4660  {
4661  switch (mode)
4662  {
4663  case LockTupleKeyShare:
4664  Assert(HEAP_XMAX_IS_KEYSHR_LOCKED(infomask) ||
4665  HEAP_XMAX_IS_SHR_LOCKED(infomask) ||
4666  HEAP_XMAX_IS_EXCL_LOCKED(infomask));
4667  result = HeapTupleMayBeUpdated;
4668  goto out_unlocked;
4669  case LockTupleShare:
4670  if (HEAP_XMAX_IS_SHR_LOCKED(infomask) ||
4671  HEAP_XMAX_IS_EXCL_LOCKED(infomask))
4672  {
4673  result = HeapTupleMayBeUpdated;
4674  goto out_unlocked;
4675  }
4676  break;
4677  case LockTupleNoKeyExclusive:
4678  if (HEAP_XMAX_IS_EXCL_LOCKED(infomask))
4679  {
4680  result = HeapTupleMayBeUpdated;
4681  goto out_unlocked;
4682  }
4683  break;
4684  case LockTupleExclusive:
4685  if (HEAP_XMAX_IS_EXCL_LOCKED(infomask) &&
4686  infomask2 & HEAP_KEYS_UPDATED)
4687  {
4688  result = HeapTupleMayBeUpdated;
4689  goto out_unlocked;
4690  }
4691  break;
4692  }
4693  }
4694  }
4695 
4696  /*
4697  * Initially assume that we will have to wait for the locking
4698  * transaction(s) to finish. We check various cases below in which
4699  * this can be turned off.
4700  */
4701  require_sleep = true;
4702  if (mode == LockTupleKeyShare)
4703  {
4704  /*
4705  * If we're requesting KeyShare, and there's no update present, we
4706  * don't need to wait. Even if there is an update, we can still
4707  * continue if the key hasn't been modified.
4708  *
4709  * However, if there are updates, we need to walk the update chain
4710  * to mark future versions of the row as locked, too. That way,
4711  * if somebody deletes that future version, we're protected
4712  * against the key going away. This locking of future versions
4713  * could block momentarily, if a concurrent transaction is
4714  * deleting a key; or it could return a value to the effect that
4715  * the transaction deleting the key has already committed. So we
4716  * do this before re-locking the buffer; otherwise this would be
4717  * prone to deadlocks.
4718  *
4719  * Note that the TID we're locking was grabbed before we unlocked
4720  * the buffer. For it to change while we're not looking, the
4721  * other properties we're testing for below after re-locking the
4722  * buffer would also change, in which case we would restart this
4723  * loop above.
4724  */
4725  if (!(infomask2 & HEAP_KEYS_UPDATED))
4726  {
4727  bool updated;
4728 
4729  updated = !HEAP_XMAX_IS_LOCKED_ONLY(infomask);
4730 
4731  /*
4732  * If there are updates, follow the update chain; bail out if
4733  * that cannot be done.
4734  */
4735  if (follow_updates && updated)
4736  {
4737  HTSU_Result res;
4738 
4739  res = heap_lock_updated_tuple(relation, tuple, &t_ctid,
4740  GetCurrentTransactionId(),
4741  mode);
4742  if (res != HeapTupleMayBeUpdated)
4743  {
4744  result = res;
4745  /* recovery code expects to have buffer lock held */
4746  LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4747  goto failed;
4748  }
4749  }
4750 
4751  LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4752 
4753  /*
4754  * Make sure it's still an appropriate lock, else start over.
4755  * Also, if it wasn't updated before we released the lock, but
4756  * is updated now, we start over too; the reason is that we
4757  * now need to follow the update chain to lock the new
4758  * versions.
4759  */
4760  if (!HeapTupleHeaderIsOnlyLocked(tuple->t_data) &&
4761  ((tuple->t_data->t_infomask2 & HEAP_KEYS_UPDATED) ||
4762  !updated))
4763  goto l3;
4764 
4765  /* Things look okay, so we can skip sleeping */
4766  require_sleep = false;
4767 
4768  /*
4769  * Note we allow Xmax to change here; other updaters/lockers
4770  * could have modified it before we grabbed the buffer lock.
4771  * However, this is not a problem, because with the recheck we
4772  * just did we ensure that they still don't conflict with the
4773  * lock we want.
4774  */
4775  }
4776  }
4777  else if (mode == LockTupleShare)
4778  {
4779  /*
4780  * If we're requesting Share, we can similarly avoid sleeping if
4781  * there's no update and no exclusive lock present.
4782  */
4783  if (HEAP_XMAX_IS_LOCKED_ONLY(infomask) &&
4784  !HEAP_XMAX_IS_EXCL_LOCKED(infomask))
4785  {
4786  LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4787 
4788  /*
4789  * Make sure it's still an appropriate lock, else start over.
4790  * See above about allowing xmax to change.
4791  */
4792  if (!HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_data->t_infomask) ||
4793  HEAP_XMAX_IS_EXCL_LOCKED(tuple->t_data->t_infomask))
4794  goto l3;
4795  require_sleep = false;
4796  }
4797  }
4798  else if (mode == LockTupleNoKeyExclusive)
4799  {
4800  /*
4801  * If we're requesting NoKeyExclusive, we might also be able to
4802  * avoid sleeping; just ensure that there is no conflicting lock
4803  * already acquired.
4804  */
4805  if (infomask & HEAP_XMAX_IS_MULTI)
4806  {
4807  if (!DoesMultiXactIdConflict((MultiXactId) xwait, infomask,
4808  mode))
4809  {
4810  /*
4811  * No conflict, but if the xmax changed under us in the
4812  * meantime, start over.
4813  */
4814  LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4815  if (xmax_infomask_changed(tuple->t_data->t_infomask, infomask) ||
4816  !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple->t_data),
4817  xwait))
4818  goto l3;
4819 
4820  /* otherwise, we're good */
4821  require_sleep = false;
4822  }
4823  }
4824  else if (HEAP_XMAX_IS_KEYSHR_LOCKED(infomask))
4825  {
4826  LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4827 
4828  /* if the xmax changed in the meantime, start over */
4829  if (xmax_infomask_changed(tuple->t_data->t_infomask, infomask) ||
4830  !TransactionIdEquals(
4831  HeapTupleHeaderGetRawXmax(tuple->t_data),
4832  xwait))
4833  goto l3;
4834  /* otherwise, we're good */
4835  require_sleep = false;
4836  }
4837  }
4838 
4839  /*
4840  * As a check independent from those above, we can also avoid sleeping
4841  * if the current transaction is the sole locker of the tuple. Note
4842  * that the strength of the lock already held is irrelevant; this is
4843  * not about recording the lock in Xmax (which will be done regardless
4844  * of this optimization, below). Also, note that the cases where we
4845  * hold a lock stronger than we are requesting are already handled
4846  * above by not doing anything.
4847  *
4848  * Note we only deal with the non-multixact case here; MultiXactIdWait
4849  * is well equipped to deal with this situation on its own.
4850  */
4851  if (require_sleep && !(infomask & HEAP_XMAX_IS_MULTI) &&
4852  TransactionIdIsCurrentTransactionId(xwait))
4853  {
4854  /* ... but if the xmax changed in the meantime, start over */
4855  LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4856  if (xmax_infomask_changed(tuple->t_data->t_infomask, infomask) ||
4857  !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple->t_data),
4858  xwait))
4859  goto l3;
4860  result = HeapTupleMayBeUpdated;
4861  require_sleep = false;
4862  }
4863 
4864  /*
4865  * Time to sleep on the other transaction/multixact, if necessary.
4866  *
4867  * If the other transaction is an update that's already committed,
4868  * then sleeping cannot possibly do any good: if we're required to
4869  * sleep, get out to raise an error instead.
4870  *
4871  * By here, we either have already acquired the buffer exclusive lock,
4872  * or we must wait for the locking transaction or multixact; so below
4873  * we ensure that we grab buffer lock after the sleep.
4874  */
4875  if (require_sleep && result == HeapTupleUpdated)
4876  {
4877  LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4878  goto failed;
4879  }
4880  else if (require_sleep)
4881  {
4882  /*
4883  * Acquire tuple lock to establish our priority for the tuple, or
4884  * die trying. LockTuple will release us when we are next-in-line
4885  * for the tuple. We must do this even if we are share-locking.
4886  *
4887  * If we are forced to "start over" below, we keep the tuple lock;
4888  * this arranges that we stay at the head of the line while
4889  * rechecking tuple state.
4890  */
4891  if (!heap_acquire_tuplock(relation, tid, mode, wait_policy,
4892  &have_tuple_lock))
4893  {
4894  /*
4895  * This can only happen if wait_policy is Skip and the lock
4896  * couldn't be obtained.
4897  */
4898  result = HeapTupleWouldBlock;
4899  /* recovery code expects to have buffer lock held */
4900  LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4901  goto failed;
4902  }
4903 
4904  if (infomask & HEAP_XMAX_IS_MULTI)
4905  {
4906  MultiXactStatus status = get_mxact_status_for_lock(mode, false);
4907 
4908  /* We only ever lock tuples, never update them */
4909  if (status >= MultiXactStatusNoKeyUpdate)
4910  elog(ERROR, "invalid lock mode in heap_lock_tuple");
4911 
4912  /* wait for multixact to end, or die trying */
4913  switch (wait_policy)
4914  {
4915  case LockWaitBlock:
4916  MultiXactIdWait((MultiXactId) xwait, status, infomask,
4917  relation, &tuple->t_self, XLTW_Lock, NULL);
4918  break;
4919  case LockWaitSkip:
4920  if (!ConditionalMultiXactIdWait((MultiXactId) xwait,
4921  status, infomask, relation,
4922  NULL))
4923  {
4924  result = HeapTupleWouldBlock;
4925  /* recovery code expects to have buffer lock held */
4926  LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4927  goto failed;
4928  }
4929  break;
4930  case LockWaitError:
4931  if (!ConditionalMultiXactIdWait((MultiXactId) xwait,
4932  status, infomask, relation,
4933  NULL))
4934  ereport(ERROR,
4935  (errcode(ERRCODE_LOCK_NOT_AVAILABLE),
4936  errmsg("could not obtain lock on row in relation \"%s\"",
4937  RelationGetRelationName(relation))));
4938 
4939  break;
4940  }
4941 
4942  /*
4943  * Of course, the multixact might not be done here: if we're
4944  * requesting a light lock mode, other transactions with light
4945  * locks could still be alive, as well as locks owned by our
4946  * own xact or other subxacts of this backend. We need to
4947  * preserve the surviving MultiXact members. Note that it
4948  * isn't absolutely necessary in the latter case, but doing so
4949  * is simpler.
4950  */
4951  }
4952  else
4953  {
4954  /* wait for regular transaction to end, or die trying */
4955  switch (wait_policy)
4956  {
4957  case LockWaitBlock:
4958  XactLockTableWait(xwait, relation, &tuple->t_self,
4959  XLTW_Lock);
4960  break;
4961  case LockWaitSkip:
4962  if (!ConditionalXactLockTableWait(xwait))
4963  {
4964  result = HeapTupleWouldBlock;
4965  /* recovery code expects to have buffer lock held */
4966  LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4967  goto failed;
4968  }
4969  break;
4970  case LockWaitError:
4971  if (!ConditionalXactLockTableWait(xwait))
4972  ereport(ERROR,
4973  (errcode(ERRCODE_LOCK_NOT_AVAILABLE),
4974  errmsg("could not obtain lock on row in relation \"%s\"",
4975  RelationGetRelationName(relation))));
4976  break;
4977  }
4978  }
4979 
4980  /* if there are updates, follow the update chain */
4981  if (follow_updates && !HEAP_XMAX_IS_LOCKED_ONLY(infomask))
4982  {
4983  HTSU_Result res;
4984 
4985  res = heap_lock_updated_tuple(relation, tuple, &t_ctid,
4986  GetCurrentTransactionId(),
4987  mode);
4988  if (res != HeapTupleMayBeUpdated)
4989  {
4990  result = res;
4991  /* recovery code expects to have buffer lock held */
4992  LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4993  goto failed;
4994  }
4995  }
4996 
4997  LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4998 
4999  /*
5000  * xwait is done, but if xwait had just locked the tuple then some
5001  * other xact could update this tuple before we get to this point.
5002  * Check for xmax change, and start over if so.
5003  */
5004  if (xmax_infomask_changed(tuple->t_data->t_infomask, infomask) ||
5005  !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple->t_data),
5006  xwait))
5007  goto l3;
5008 
5009  if (!(infomask & HEAP_XMAX_IS_MULTI))
5010  {
5011  /*
5012  * Otherwise check if it committed or aborted. Note we cannot
5013  * be here if the tuple was only locked by somebody who didn't
5014  * conflict with us; that would have been handled above. So
5015  * that transaction must necessarily be gone by now. But
5016  * don't check for this in the multixact case, because some
5017  * locker transactions might still be running.
5018  */
5019  UpdateXmaxHintBits(tuple->t_data, *buffer, xwait);
5020  }
5021  }
5022 
5023  /* By here, we're certain that we hold buffer exclusive lock again */
5024 
5025  /*
5026  * We may lock if previous xmax aborted, or if it committed but only
5027  * locked the tuple without updating it; or if we didn't have to wait
5028  * at all for whatever reason.
5029  */
5030  if (!require_sleep ||
5031  (tuple->t_data->t_infomask & HEAP_XMAX_INVALID) ||
5032  HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_data->t_infomask) ||
5033  HeapTupleHeaderIsOnlyLocked(tuple->t_data))
5034  result = HeapTupleMayBeUpdated;
5035  else
5036  result = HeapTupleUpdated;
5037  }
5038 
5039 failed:
5040  if (result != HeapTupleMayBeUpdated)
5041  {
5042  Assert(result == HeapTupleSelfUpdated || result == HeapTupleUpdated ||
5043  result == HeapTupleWouldBlock);
5044  Assert(!(tuple->t_data->t_infomask & HEAP_XMAX_INVALID));
5045  hufd->ctid = tuple->t_data->t_ctid;
5046  hufd->xmax = HeapTupleHeaderGetUpdateXid(tuple->t_data);
5047  if (result == HeapTupleSelfUpdated)
5048  hufd->cmax = HeapTupleHeaderGetCmax(tuple->t_data);
5049  else
5050  hufd->cmax = InvalidCommandId;
5051  goto out_locked;
5052  }
5053 
5054  /*
5055  * If we didn't pin the visibility map page and the page has become all
5056  * visible while we were busy locking the buffer, or during some
5057  * subsequent window during which we had it unlocked, we'll have to unlock
5058  * and re-lock, to avoid holding the buffer lock across I/O. That's a bit
5059  * unfortunate, especially since we'll now have to recheck whether the
5060  * tuple has been locked or updated under us, but hopefully it won't
5061  * happen very often.
5062  */
5063  if (vmbuffer == InvalidBuffer && PageIsAllVisible(page))
5064  {
5065  LockBuffer(*buffer, BUFFER_LOCK_UNLOCK);
5066  visibilitymap_pin(relation, block, &vmbuffer);
5067  LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
5068  goto l3;
5069  }
5070 
5071  xmax = HeapTupleHeaderGetRawXmax(tuple->t_data);
5072  old_infomask = tuple->t_data->t_infomask;
5073 
5074  /*
5075  * If this is the first possibly-multixact-able operation in the current
5076  * transaction, set my per-backend OldestMemberMXactId setting. We can be
5077  * certain that the transaction will never become a member of any older
5078  * MultiXactIds than that. (We have to do this even if we end up just
5079  * using our own TransactionId below, since some other backend could
5080  * incorporate our XID into a MultiXact immediately afterwards.)
5081  */
5082  MultiXactIdSetOldestMember();
5083 
5084  /*
5085  * Compute the new xmax and infomask to store into the tuple. Note we do
5086  * not modify the tuple just yet, because that would leave it in the wrong
5087  * state if multixact.c elogs.
5088  */
5089  compute_new_xmax_infomask(xmax, old_infomask, tuple->t_data->t_infomask2,
5090  GetCurrentTransactionId(), mode, false,
5091  &xid, &new_infomask, &new_infomask2);
5092 
5093  START_CRIT_SECTION();
5094 
5095  /*
5096  * Store transaction information of xact locking the tuple.
5097  *
5098  * Note: Cmax is meaningless in this context, so don't set it; this avoids
5099  * possibly generating a useless combo CID. Moreover, if we're locking a
5100  * previously updated tuple, it's important to preserve the Cmax.
5101  *
5102  * Also reset the HOT UPDATE bit, but only if there's no update; otherwise
5103  * we would break the HOT chain.
5104  */
5105  tuple->t_data->t_infomask &= ~HEAP_XMAX_BITS;
5106  tuple->t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
5107  tuple->t_data->t_infomask |= new_infomask;
5108  tuple->t_data->t_infomask2 |= new_infomask2;
5109  if (HEAP_XMAX_IS_LOCKED_ONLY(new_infomask))
5110  HeapTupleHeaderClearHotUpdated(tuple->t_data);
5111  HeapTupleHeaderSetXmax(tuple->t_data, xid);
5112 
5113  /*
5114  * Make sure there is no forward chain link in t_ctid. Note that in the
5115  * cases where the tuple has been updated, we must not overwrite t_ctid,
5116  * because it was set by the updater. Moreover, if the tuple has been
5117  * updated, we need to follow the update chain to lock the new versions of
5118  * the tuple as well.
5119  */
5120  if (HEAP_XMAX_IS_LOCKED_ONLY(new_infomask))
5121  tuple->t_data->t_ctid = *tid;
5122 
5123  /* Clear only the all-frozen bit on visibility map if needed */
5124  if (PageIsAllVisible(page) &&
5125  visibilitymap_clear(relation, block, vmbuffer,
5126  VISIBILITYMAP_ALL_FROZEN))
5127  cleared_all_frozen = true;
5128 
5129 
5130  MarkBufferDirty(*buffer);
5131 
5132  /*
5133  * XLOG stuff. You might think that we don't need an XLOG record because
5134  * there is no state change worth restoring after a crash. You would be
5135  * wrong however: we have just written either a TransactionId or a
5136  * MultiXactId that may never have been seen on disk before, and we need
5137  * to make sure that there are XLOG entries covering those ID numbers.
5138  * Else the same IDs might be re-used after a crash, which would be
5139  * disastrous if this page made it to disk before the crash. Essentially
5140  * we have to enforce the WAL log-before-data rule even in this case.
5141  * (Also, in a PITR log-shipping or 2PC environment, we have to have XLOG
5142  * entries for everything anyway.)
5143  */
5144  if (RelationNeedsWAL(relation))
5145  {
5146  xl_heap_lock xlrec;
5147  XLogRecPtr recptr;
5148 
5149  XLogBeginInsert();
5150  XLogRegisterBuffer(0, *buffer, REGBUF_STANDARD);
5151 
5152  xlrec.offnum = ItemPointerGetOffsetNumber(&tuple->t_self);
5153  xlrec.locking_xid = xid;
5154  xlrec.infobits_set = compute_infobits(new_infomask,
5155  tuple->t_data->t_infomask2);
5156  xlrec.flags = cleared_all_frozen ? XLH_LOCK_ALL_FROZEN_CLEARED : 0;
5157  XLogRegisterData((char *) &xlrec, SizeOfHeapLock);
5158 
5159  /* we don't decode row locks atm, so no need to log the origin */
5160 
5161  recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_LOCK);
5162 
5163  PageSetLSN(page, recptr);
5164  }
5165 
5166  END_CRIT_SECTION();
5167 
5168  result = HeapTupleMayBeUpdated;
5169 
5170 out_locked:
5171  LockBuffer(*buffer, BUFFER_LOCK_UNLOCK);
5172 
5173 out_unlocked:
5174  if (BufferIsValid(vmbuffer))
5175  ReleaseBuffer(vmbuffer);
5176 
5177  /*
5178  * Don't update the visibility map here. Locking a tuple doesn't change
5179  * visibility info.
5180  */
5181 
5182  /*
5183  * Now that we have successfully marked the tuple as locked, we can
5184  * release the lmgr tuple lock, if we had it.
5185  */
5186  if (have_tuple_lock)
5187  UnlockTupleTuplock(relation, tid, mode);
5188 
5189  return result;
5190 }
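 /*
  * heap_lock_tuple is the workhorse behind row-level locking: ExecLockRows
  * uses it to implement SELECT ... FOR UPDATE / FOR NO KEY UPDATE / FOR
  * SHARE / FOR KEY SHARE, and the foreign-key triggers reach it indirectly
  * through the SELECT ... FOR KEY SHARE queries they issue.
  */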
5191 
5192 /*
5193  * Acquire heavyweight lock on the given tuple, in preparation for acquiring
5194  * its normal, Xmax-based tuple lock.
5195  *
5196  * have_tuple_lock is an input and output parameter: on input, it indicates
5197  * whether the lock has previously been acquired (and this function does
5198  * nothing in that case). If this function returns success, have_tuple_lock
5199  * has been flipped to true.
5200  *
5201  * Returns false if it was unable to obtain the lock; this can only happen if
5202  * wait_policy is Skip.
5203  */
5204 static bool
5205 heap_acquire_tuplock(Relation relation, ItemPointer tid, LockTupleMode mode,
5206  LockWaitPolicy wait_policy, bool *have_tuple_lock)
5207 {
5208  if (*have_tuple_lock)
5209  return true;
5210 
5211  switch (wait_policy)
5212  {
5213  case LockWaitBlock:
5214  LockTupleTuplock(relation, tid, mode);
5215  break;
5216 
5217  case LockWaitSkip:
5218  if (!ConditionalLockTupleTuplock(relation, tid, mode))
5219  return false;
5220  break;
5221 
5222  case LockWaitError:
5223  if (!ConditionalLockTupleTuplock(relation, tid, mode))
5224  ereport(ERROR,
5225  (errcode(ERRCODE_LOCK_NOT_AVAILABLE),
5226  errmsg("could not obtain lock on row in relation \"%s\"",
5227  RelationGetRelationName(relation))));
5228  break;
5229  }
5230  *have_tuple_lock = true;
5231 
5232  return true;
5233 }
5234 
5235 /*
5236  * Given an original set of Xmax and infomask, and a transaction (identified by
5237  * add_to_xmax) acquiring a new lock of some mode, compute the new Xmax and
5238  * corresponding infomasks to use on the tuple.
5239  *
5240  * Note that this might have side effects such as creating a new MultiXactId.
5241  *
5242  * Most callers will have called HeapTupleSatisfiesUpdate before this function;
5243  * that will have set the HEAP_XMAX_INVALID bit if the xmax was a MultiXactId
5244  * but it was no longer running. There is a race condition: the MultiXactId
5245  * may have finished since then, but that uncommon case is handled either
5246  * here or within MultiXactIdExpand.
5247  *
5248  * There is a similar race condition possible when the old xmax was a regular
5249  * TransactionId. We test TransactionIdIsInProgress again just to narrow the
5250  * window, but it's still possible to end up creating an unnecessary
5251  * MultiXactId. Fortunately this is harmless.
5252  */
5253 static void
5254 compute_new_xmax_infomask(TransactionId xmax, uint16 old_infomask,
5255  uint16 old_infomask2, TransactionId add_to_xmax,
5256  LockTupleMode mode, bool is_update,
5257  TransactionId *result_xmax, uint16 *result_infomask,
5258  uint16 *result_infomask2)
5259 {
5260  TransactionId new_xmax;
5261  uint16 new_infomask,
5262  new_infomask2;
5263 
5264  Assert(TransactionIdIsCurrentTransactionId(add_to_xmax) || is_update);
5265 
5266 l5:
5267  new_infomask = 0;
5268  new_infomask2 = 0;
5269  if (old_infomask & HEAP_XMAX_INVALID)
5270  {
5271  /*
5272  * No previous locker; we just insert our own TransactionId.
5273  *
5274  * Note that it's critical that this case be the first one checked,
5275  * because there are several blocks below that come back to this one
5276  * to implement certain optimizations; old_infomask might contain
5277  * other dirty bits in those cases, but we don't really care.
5278  */
5279  if (is_update)
5280  {
5281  new_xmax = add_to_xmax;
5282  if (mode == LockTupleExclusive)
5283  new_infomask2 |= HEAP_KEYS_UPDATED;
5284  }
5285  else
5286  {
5287  new_infomask |= HEAP_XMAX_LOCK_ONLY;
5288  switch (mode)
5289  {
5290  case LockTupleKeyShare:
5291  new_xmax = add_to_xmax;
5292  new_infomask |= HEAP_XMAX_KEYSHR_LOCK;
5293  break;
5294  case LockTupleShare:
5295  new_xmax = add_to_xmax;
5296  new_infomask |= HEAP_XMAX_SHR_LOCK;
5297  break;
5298  case LockTupleNoKeyExclusive:
5299  new_xmax = add_to_xmax;
5300  new_infomask |= HEAP_XMAX_EXCL_LOCK;
5301  break;
5302  case LockTupleExclusive:
5303  new_xmax = add_to_xmax;
5304  new_infomask |= HEAP_XMAX_EXCL_LOCK;
5305  new_infomask2 |= HEAP_KEYS_UPDATED;
5306  break;
5307  default:
5308  new_xmax = InvalidTransactionId; /* silence compiler */
5309  elog(ERROR, "invalid lock mode");
5310  }
5311  }
5312  }
5313  else if (old_infomask & HEAP_XMAX_IS_MULTI)
5314  {
5315  MultiXactStatus new_status;
5316 
5317  /*
5318  * Currently we don't allow XMAX_COMMITTED to be set for multis, so
5319  * cross-check.
5320  */
5321  Assert(!(old_infomask & HEAP_XMAX_COMMITTED));
5322 
5323  /*
5324  * A multixact together with LOCK_ONLY set but neither lock bit set
5325  * (i.e. a pg_upgraded share locked tuple) cannot possibly be running
5326  * anymore. This check is critical for databases upgraded by
5327  * pg_upgrade; both MultiXactIdIsRunning and MultiXactIdExpand assume
5328  * that such multis are never passed.
5329  */
5330  if (HEAP_LOCKED_UPGRADED(old_infomask))
5331  {
5332  old_infomask &= ~HEAP_XMAX_IS_MULTI;
5333  old_infomask |= HEAP_XMAX_INVALID;
5334  goto l5;
5335  }
5336 
5337  /*
5338  * If the XMAX is already a MultiXactId, then we need to expand it to
5339  * include add_to_xmax; but if all the members were lockers and are
5340  * all gone, we can do away with the IS_MULTI bit and just set
5341  * add_to_xmax as the only locker/updater. If all lockers are gone
5342  * and we have an updater that aborted, we can also do without a
5343  * multi.
5344  *
5345  * The cost of doing GetMultiXactIdMembers would be paid by
5346  * MultiXactIdExpand if we didn't do this, so this check does not
5347  * incur extra work anyhow.
5348  */
5349  if (!MultiXactIdIsRunning(xmax, HEAP_XMAX_IS_LOCKED_ONLY(old_infomask)))
5350  {
5351  if (HEAP_XMAX_IS_LOCKED_ONLY(old_infomask) ||
5352  !TransactionIdDidCommit(MultiXactIdGetUpdateXid(xmax,
5353  old_infomask)))
5354  {
5355  /*
5356  * Reset these bits and restart; otherwise fall through to
5357  * create a new multi below.
5358  */
5359  old_infomask &= ~HEAP_XMAX_IS_MULTI;
5360  old_infomask |= HEAP_XMAX_INVALID;
5361  goto l5;
5362  }
5363  }
5364 
5365  new_status = get_mxact_status_for_lock(mode, is_update);
5366 
5367  new_xmax = MultiXactIdExpand((MultiXactId) xmax, add_to_xmax,
5368  new_status);
5369  GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2);
5370  }
5371  else if (old_infomask & HEAP_XMAX_COMMITTED)
5372  {
5373  /*
5374  * It's a committed update, so we need to preserve it as the updater
5375  * of the tuple.
5376  */
5377  MultiXactStatus status;
5378  MultiXactStatus new_status;
5379 
5380  if (old_infomask2 & HEAP_KEYS_UPDATED)
5381  status = MultiXactStatusUpdate;
5382  else
5383  status = MultiXactStatusNoKeyUpdate;
5384 
5385  new_status = get_mxact_status_for_lock(mode, is_update);
5386 
5387  /*
5388  * Since it's not running, it's obviously impossible for the old
5389  * updater to be identical to the current one, so we need not check
5390  * for that case as we do in the block above.
5391  */
5392  new_xmax = MultiXactIdCreate(xmax, status, add_to_xmax, new_status);
5393  GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2);
5394  }
5395  else if (TransactionIdIsInProgress(xmax))
5396  {
5397  /*
5398  * If the XMAX is a valid, in-progress TransactionId, then we need to
5399  * create a new MultiXactId that includes both the old locker or
5400  * updater and our own TransactionId.
5401  */
5402  MultiXactStatus new_status;
5403  MultiXactStatus old_status;
5404  LockTupleMode old_mode;
5405 
5406  if (HEAP_XMAX_IS_LOCKED_ONLY(old_infomask))
5407  {
5408  if (HEAP_XMAX_IS_KEYSHR_LOCKED(old_infomask))
5409  old_status = MultiXactStatusForKeyShare;
5410  else if (HEAP_XMAX_IS_SHR_LOCKED(old_infomask))
5411  old_status = MultiXactStatusForShare;
5412  else if (HEAP_XMAX_IS_EXCL_LOCKED(old_infomask))
5413  {
5414  if (old_infomask2 & HEAP_KEYS_UPDATED)
5415  old_status = MultiXactStatusForUpdate;
5416  else
5417  old_status = MultiXactStatusForNoKeyUpdate;
5418  }
5419  else
5420  {
5421  /*
5422  * LOCK_ONLY can be present alone only when a page has been
5423  * upgraded by pg_upgrade. But in that case,
5424  * TransactionIdIsInProgress() should have returned false. We
5425  * assume it's no longer locked in this case.
5426  */
5427  elog(WARNING, "LOCK_ONLY found for Xid in progress %u", xmax);
5428  old_infomask |= HEAP_XMAX_INVALID;
5429  old_infomask &= ~HEAP_XMAX_LOCK_ONLY;
5430  goto l5;
5431  }
5432  }
5433  else
5434  {
5435  /* it's an update, but which kind? */
5436  if (old_infomask2 & HEAP_KEYS_UPDATED)
5437  old_status = MultiXactStatusUpdate;
5438  else
5439  old_status = MultiXactStatusNoKeyUpdate;
5440  }
5441 
5442  old_mode = TUPLOCK_from_mxstatus(old_status);
5443 
5444  /*
5445  * If the lock to be acquired is for the same TransactionId as the
5446  * existing lock, there's an optimization possible: consider only the
5447  * strongest of both locks as the only one present, and restart.
5448  */
5449  if (xmax == add_to_xmax)
5450  {
5451  /*
5452  * Note that it's not possible for the original tuple to be
5453  * updated: we wouldn't be here because the tuple would have been
5454  * invisible and we wouldn't try to update it. As a subtlety,
5455  * this code can also run when traversing an update chain to lock
5456  * future versions of a tuple. But we wouldn't be here either,
5457  * because the add_to_xmax would be different from the original
5458  * updater.
5459  */
5460  Assert(HEAP_XMAX_IS_LOCKED_ONLY(old_infomask));
5461 
5462  /* acquire the strongest of both */
5463  if (mode < old_mode)
5464  mode = old_mode;
5465  /* mustn't touch is_update */
5466 
5467  old_infomask |= HEAP_XMAX_INVALID;
5468  goto l5;
5469  }
5470 
5471  /* otherwise, just fall back to creating a new multixact */
5472  new_status = get_mxact_status_for_lock(mode, is_update);
5473  new_xmax = MultiXactIdCreate(xmax, old_status,
5474  add_to_xmax, new_status);
5475  GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2);
5476  }
5477  else if (!HEAP_XMAX_IS_LOCKED_ONLY(old_infomask) &&
5478  TransactionIdDidCommit(xmax))
5479  {
5480  /*
5481  * It's a committed update, so we must preserve it as the updater of
5482  * the tuple.
5483  */
5484  MultiXactStatus status;
5485  MultiXactStatus new_status;
5486 
5487  if (old_infomask2 & HEAP_KEYS_UPDATED)
5488  status = MultiXactStatusUpdate;
5489  else
5490  status = MultiXactStatusNoKeyUpdate;
5491 
5492  new_status = get_mxact_status_for_lock(mode, is_update);
5493 
5494  /*
5495  * Since it's not running, it's obviously impossible for the old
5496  * updater to be identical to the current one, so we need not check
5497  * for that case as we do in the block above.
5498  */
5499  new_xmax = MultiXactIdCreate(xmax, status, add_to_xmax, new_status);
5500  GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2);
5501  }
5502  else
5503  {
5504  /*
5505  * Can get here iff the locking/updating transaction was running when
5506  * the infomask was extracted from the tuple, but finished before
5507  * TransactionIdIsInProgress got to run. Deal with it as if there was
5508  * no locker at all in the first place.
5509  */
5510  old_infomask |= HEAP_XMAX_INVALID;
5511  goto l5;
5512  }
5513 
5514  *result_infomask = new_infomask;
5515  *result_infomask2 = new_infomask2;
5516  *result_xmax = new_xmax;
5517 }
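To make the in/out contract concrete, here is a compressed sketch of how this file's callers consume the three output values, assuming it lives in heapam.c (compute_new_xmax_infomask is file-local, so the file's own includes suffice). apply_new_lock_sketch, buf and tup are placeholders, and the WAL record that a real caller emits inside the critical section is reduced to a comment.

/*
 * Illustrative sketch: apply a lock of 'mode' held by 'xid' to the tuple in
 * 'buf', using the outputs of compute_new_xmax_infomask().
 */
static void
apply_new_lock_sketch(Buffer buf, HeapTuple tup,
					  TransactionId xid, LockTupleMode mode)
{
	TransactionId new_xmax;
	uint16		new_infomask;
	uint16		new_infomask2;

	/* may create a new MultiXactId as a side effect */
	compute_new_xmax_infomask(HeapTupleHeaderGetRawXmax(tup->t_data),
							  tup->t_data->t_infomask,
							  tup->t_data->t_infomask2,
							  xid, mode, false,	/* lock, not update */
							  &new_xmax, &new_infomask, &new_infomask2);

	START_CRIT_SECTION();

	/* clear the old lock/update bits, then install the new ones */
	HeapTupleHeaderSetXmax(tup->t_data, new_xmax);
	tup->t_data->t_infomask &= ~HEAP_XMAX_BITS;
	tup->t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
	tup->t_data->t_infomask |= new_infomask;
	tup->t_data->t_infomask2 |= new_infomask2;

	MarkBufferDirty(buf);

	/* a real caller would XLogInsert() an XLOG_HEAP_LOCK record here */

	END_CRIT_SECTION();
}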
5518 
5519 /*
5520  * Subroutine for heap_lock_updated_tuple_rec.
5521  *
5522  * Given a hypothetical multixact status held by the transaction identified
5523  * with the given xid, does the current transaction need to wait, fail, or can
5524  * it continue if it wanted to acquire a lock of the given mode? "needwait"
5525  * is set to true if waiting is necessary; if it can continue, then
5526  * HeapTupleMayBeUpdated is returned. In case of a conflict, a different
5527  * HeapTupleSatisfiesUpdate return code is returned.
5528  *
5529  * The held status is said to be hypothetical because it might correspond to a
5530  * lock held by a single Xid, i.e. not a real MultiXactId; we express it this
5531  * way for simplicity of API.
5532  */
5533 static HTSU_Result
5534 test_lockmode_for_conflict(MultiXactStatus status, TransactionId xid,
5535  LockTupleMode mode, bool *needwait)
5536 {
5537  MultiXactStatus wantedstatus;
5538 
5539  *needwait = false;
5540  wantedstatus = get_mxact_status_for_lock(mode, false);
5541 
5542  /*
5543  * Note: we *must* check TransactionIdIsInProgress before
5544  * TransactionIdDidAbort/Commit; see comment at top of tqual.c for an
5545  * explanation.
5546  */
5547  if (TransactionIdIsCurrentTransactionId(xid))
5548  {
5549  /*
5550  * Updated by our own transaction? Just return failure. This
5551  * shouldn't normally happen.
5552  */
5553  return HeapTupleSelfUpdated;
5554  }
5555  else if (TransactionIdIsInProgress(xid))
5556  {
5557  /*
5558  * If the locking transaction is running, what we do depends on
5559  * whether the lock modes conflict: if they do, then we must wait for
5560  * it to finish; otherwise we can fall through to lock this tuple
5561  * version without waiting.
5562  */
5563  if (DoLockModesConflict(LOCKMODE_from_mxstatus(status),
5564  LOCKMODE_from_mxstatus(wantedstatus)))
5565  {
5566  *needwait = true;
5567  }
5568 
5569  /*
5570  * If we set needwait above, then this value doesn't matter;
5571  * otherwise, this value signals to caller that it's okay to proceed.
5572  */
5573  return HeapTupleMayBeUpdated;
5574  }
5575  else if (TransactionIdDidAbort(xid))
5576  return HeapTupleMayBeUpdated;
5577  else if (TransactionIdDidCommit(xid))
5578  {
5579  /*
5580  * The other transaction committed. If it was only a locker, then the
5581  * lock is completely gone now and we can return success; but if it
5582  * was an update, then what we do depends on whether the two lock
5583  * modes conflict. If they conflict, then we must report error to
5584  * caller. But if they don't, we can fall through to allow the current
5585  * transaction to lock the tuple.
5586  *
5587  * Note: the reason we worry about ISUPDATE here is that as soon as a
5588  * transaction ends, all its locks are gone and meaningless, and thus
5589  * we can ignore them; whereas its updates persist. In the
5590  * TransactionIdIsInProgress case, above, we don't need to check,
5591  * because we know the lock is still "alive" and thus a conflict
5592  * always needs to be checked.
5593  */
5594  if (!ISUPDATE_from_mxstatus(status))
5595  return HeapTupleMayBeUpdated;
5596  return HeapTupleMayBeUpdated;
5597  if (DoLockModesConflict(LOCKMODE_from_mxstatus(status),
5598  LOCKMODE_from_mxstatus(wantedstatus)))
5599  /* bummer */
5600  return HeapTupleUpdated;
5601 
5602  return HeapTupleMayBeUpdated;
5603  }
5604 
5605  /* Not in progress, not aborted, not committed -- must have crashed */
5606  return HeapTupleMayBeUpdated;
5607 }
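The conflict question above boils down to mapping both the held status and the requested mode onto heavyweight lock modes and asking the lock manager whether they conflict. A hedged sketch of just that mapping follows, assuming it is written inside heapam.c (get_mxact_status_for_lock and LOCKMODE_from_mxstatus are file-local, and DoLockModesConflict comes from storage/lock.h, already included by this file); mxstatus_conflicts_sketch is a hypothetical helper, not part of this file.

/*
 * Illustrative helper: does a holder of multixact status 'held' conflict
 * with a new request for tuple-lock mode 'wanted_mode'?
 */
static bool
mxstatus_conflicts_sketch(MultiXactStatus held, LockTupleMode wanted_mode)
{
	MultiXactStatus wanted = get_mxact_status_for_lock(wanted_mode, false);

	return DoLockModesConflict(LOCKMODE_from_mxstatus(held),
							   LOCKMODE_from_mxstatus(wanted));
}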
5608 
5609 
5610 /*
5611  * Recursive part of heap_lock_updated_tuple
5612  *
5613  * Fetch the tuple pointed to by tid in rel, and mark it as locked by the given
5614  * xid with the given mode; if this tuple is updated, recurse to lock the new
5615  * version as well.
5616  */
5617 static HTSU_Result
5618 heap_lock_updated_tuple_rec(Relation rel, ItemPointer tid, TransactionId xid,
5619  LockTupleMode mode)
5620 {
5621  HTSU_Result result;
5622  ItemPointerData tupid;
5623  HeapTupleData mytup;
5624  Buffer buf;
5625  uint16 new_infomask,
5626  new_infomask2,
5627  old_infomask,
5628  old_infomask2;
5629  TransactionId xmax,
5630  new_xmax;
5631  TransactionId priorXmax = InvalidTransactionId;
5632  bool cleared_all_frozen = false;
5633  Buffer vmbuffer = InvalidBuffer;
5634  BlockNumber block;
5635 
5636  ItemPointerCopy(tid, &tupid);
5637 
5638  for (;;)
5639  {
5640  new_infomask = 0;
5641  new_xmax = InvalidTransactionId;
5642  block = ItemPointerGetBlockNumber(&tupid);
5643  ItemPointerCopy(&tupid, &(mytup.t_self));
5644 
5645  if (!heap_fetch(rel, SnapshotAny, &mytup, &buf, false, NULL))
5646  {
5647  /*
5648  * if we fail to find the updated version of the tuple, it's
5649  * because it was vacuumed/pruned away after its creator
5650  * transaction aborted. So behave as if we got to the end of the
5651  * chain, and there's no further tuple to lock: return success to
5652  * caller.
5653  */
5654  return HeapTupleMayBeUpdated;
5655  }
5656 
5657 l4:
5658  CHECK_FOR_INTERRUPTS();
5659 
5660  /*
5661  * Before locking the buffer, pin the visibility map page if it
5662  * appears to be necessary. Since we haven't got the lock yet,
5663  * someone else might be in the middle of changing this, so we'll need
5664  * to recheck after we have the lock.
5665  */
5666  if (PageIsAllVisible(BufferGetPage(buf)))
5667  visibilitymap_pin(rel, block, &vmbuffer);
5668  else
5669  vmbuffer = InvalidBuffer;
5670 
5671  LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
5672 
5673  /*
5674  * If we didn't pin the visibility map page and the page has become
5675  * all visible while we were busy locking the buffer, we'll have to
5676  * unlock and re-lock, to avoid holding the buffer lock across I/O.
5677  * That's a bit unfortunate, but hopefully shouldn't happen often.
5678  */
5679  if (vmbuffer == InvalidBuffer && PageIsAllVisible(BufferGetPage(buf)))
5680  {
5681  LockBuffer(buf, BUFFER_LOCK_UNLOCK);
5682  visibilitymap_pin(rel, block, &vmbuffer);
5683  LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
5684  }
5685 
5686  /*
5687  * Check the tuple XMIN against prior XMAX, if any. If we reached the
5688  * end of the chain, we're done, so return success.
5689  */
5690  if (TransactionIdIsValid(priorXmax) &&
5691  !TransactionIdEquals(HeapTupleHeaderGetXmin(mytup.t_data),
5692  priorXmax))
5693  {
5694  result = HeapTupleMayBeUpdated;
5695  goto out_locked;
5696  }
5697 
5698  /*
5699  * Also check Xmin: if this tuple was created by an aborted
5700  * (sub)transaction, then we already locked the last live one in the
5701  * chain, thus we're done, so return success.
5702  */
5703  if (TransactionIdDidAbort(HeapTupleHeaderGetXmin(mytup.t_data)))
5704  {
5705  UnlockReleaseBuffer(buf);
5706  return HeapTupleMayBeUpdated;
5707  }
5708 
5709  old_infomask = mytup.t_data->t_infomask;
5710  old_infomask2 = mytup.t_data->t_infomask2;
5711  xmax = HeapTupleHeaderGetRawXmax(mytup.t_data);
5712 
5713  /*
5714  * If this tuple version has been updated or locked by some concurrent
5715  * transaction(s), what we do depends on whether our lock mode
5716  * conflicts with what those other transactions hold, and also on the
5717  * status of them.
5718  */
5719  if (!(old_infomask & HEAP_XMAX_INVALID))
5720  {
5721  TransactionId rawxmax;
5722  bool needwait;
5723 
5724  rawxmax = HeapTupleHeaderGetRawXmax(mytup.t_data);
5725  if (old_infomask & HEAP_XMAX_IS_MULTI)
5726  {
5727  int nmembers;
5728  int i;
5729  MultiXactMember *members;
5730 
5731  /*
5732  * We don't need a test for pg_upgrade'd tuples: this is only
5733  * applied to tuples after the first in an update chain. Said
5734  * first tuple in the chain may well be locked-in-9.2-and-
5735  * pg_upgraded, but that one was already locked by our caller,
5736  * not us; and any subsequent ones cannot be because our
5737  * caller must necessarily have obtained a snapshot later than
5738  * the pg_upgrade itself.
5739  */
5740  Assert(!HEAP_LOCKED_UPGRADED(mytup.t_data->t_infomask));
5741 
5742  nmembers = GetMultiXactIdMembers(rawxmax, &members, false,
5743  HEAP_XMAX_IS_LOCKED_ONLY(old_infomask));
5744  for (i = 0; i < nmembers; i++)
5745  {
5746  result = test_lockmode_for_conflict(members[i].status,
5747  members[i].xid,
5748  mode, &needwait);
5749 
5750  if (needwait)
5751  {
5752  LockBuffer(buf, BUFFER_LOCK_UNLOCK);
5753  XactLockTableWait(members[i].xid, rel,
5754  &mytup.t_self,
5755  XLTW_LockUpdated);
5756  pfree(members);
5757  goto l4;
5758  }
5759  if (result != HeapTupleMayBeUpdated)
5760  {
5761  pfree(members);
5762  goto out_locked;
5763  }
5764  }
5765  if (members)
5766  pfree(members);
5767  }
5768  else
5769  {
5770  MultiXactStatus status;
5771 
5772  /*
5773  * For a non-multi Xmax, we first need to compute the
5774  * corresponding MultiXactStatus by using the infomask bits.
5775  */
5776  if (HEAP_XMAX_IS_LOCKED_ONLY(old_infomask))
5777  {
5778  if (HEAP_XMAX_IS_KEYSHR_LOCKED(old_infomask))
5779  status = MultiXactStatusForKeyShare;
5780  else if (HEAP_XMAX_IS_SHR_LOCKED(old_infomask))
5781  status = MultiXactStatusForShare;
5782  else if (HEAP_XMAX_IS_EXCL_LOCKED(old_infomask))
5783  {
5784  if (old_infomask2 & HEAP_KEYS_UPDATED)
5785  status = MultiXactStatusForUpdate;
5786  else
5787  status = MultiXactStatusForNoKeyUpdate;
5788  }
5789  else
5790  {
5791  /*
5792  * LOCK_ONLY present alone (a pg_upgraded tuple marked
5793  * as share-locked in the old cluster) shouldn't be
5794  * seen in the middle of an update chain.
5795  */
5796  elog(ERROR, "invalid lock status in tuple");
5797  }
5798  }
5799  else
5800  {
5801  /* it's an update, but which kind? */
5802  if (old_infomask2 & HEAP_KEYS_UPDATED)
5803  status = MultiXactStatusUpdate;
5804  else
5805  status = MultiXactStatusNoKeyUpdate;
5806  }
5807 
5808  result = test_lockmode_for_conflict(status, rawxmax, mode,
5809  &needwait);
5810  if (needwait)
5811  {
5812  LockBuffer(buf, BUFFER_LOCK_UNLOCK);
5813  XactLockTableWait(rawxmax, rel, &mytup.t_self,
5814  XLTW_LockUpdated);
5815  goto l4;
5816  }
5817  if (result != HeapTupleMayBeUpdated)
5818  {
5819  goto out_locked;
5820  }
5821  }
5822  }
5823 
5824  /* compute the new Xmax and infomask values for the tuple ... */
5825  compute_new_xmax_infomask(xmax, old_infomask, mytup.t_data->t_infomask2,
5826  xid, mode, false,
5827  &new_xmax, &new_infomask, &new_infomask2);
5828 
5829  if (PageIsAllVisible(BufferGetPage(buf)) &&
5830  visibilitymap_clear(rel, block, vmbuffer,
5831  VISIBILITYMAP_ALL_FROZEN))
5832  cleared_all_frozen = true;
5833 
5834  START_CRIT_SECTION();
5835 
5836  /* ... and set them */
5837  HeapTupleHeaderSetXmax(mytup.t_data, new_xmax);
5838  mytup.t_data->t_infomask &= ~HEAP_XMAX_BITS;
5839  mytup.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
5840  mytup.t_data->t_infomask |= new_infomask;
5841  mytup.t_data->t_infomask2 |= new_infomask2;
5842 
5843