heapam.c
1 /*-------------------------------------------------------------------------
2  *
3  * heapam.c
4  * heap access method code
5  *
6  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  *
10  * IDENTIFICATION
11  * src/backend/access/heap/heapam.c
12  *
13  *
14  * INTERFACE ROUTINES
15  * relation_open - open any relation by relation OID
16  * relation_openrv - open any relation specified by a RangeVar
17  * relation_close - close any relation
18  * heap_open - open a heap relation by relation OID
19  * heap_openrv - open a heap relation specified by a RangeVar
20  * heap_close - (now just a macro for relation_close)
21  * heap_beginscan - begin relation scan
22  * heap_rescan - restart a relation scan
23  * heap_endscan - end relation scan
24  * heap_getnext - retrieve next tuple in scan
25  * heap_fetch - retrieve tuple with given tid
26  * heap_insert - insert tuple into a relation
27  * heap_multi_insert - insert multiple tuples into a relation
28  * heap_delete - delete a tuple from a relation
29  * heap_update - replace a tuple in a relation with another tuple
30  * heap_sync - sync heap, for when no WAL has been written
31  *
32  * NOTES
33  * This file contains the heap_ routines which implement
34  * the POSTGRES heap access method used for all POSTGRES
35  * relations.
36  *
37  *-------------------------------------------------------------------------
38  */
39 #include "postgres.h"
40 
41 #include "access/bufmask.h"
42 #include "access/heapam.h"
43 #include "access/heapam_xlog.h"
44 #include "access/hio.h"
45 #include "access/multixact.h"
46 #include "access/parallel.h"
47 #include "access/relscan.h"
48 #include "access/sysattr.h"
49 #include "access/transam.h"
50 #include "access/tuptoaster.h"
51 #include "access/valid.h"
52 #include "access/visibilitymap.h"
53 #include "access/xact.h"
54 #include "access/xlog.h"
55 #include "access/xloginsert.h"
56 #include "access/xlogutils.h"
57 #include "catalog/catalog.h"
58 #include "catalog/namespace.h"
59 #include "miscadmin.h"
60 #include "pgstat.h"
61 #include "storage/bufmgr.h"
62 #include "storage/freespace.h"
63 #include "storage/lmgr.h"
64 #include "storage/predicate.h"
65 #include "storage/procarray.h"
66 #include "storage/smgr.h"
67 #include "storage/spin.h"
68 #include "storage/standby.h"
69 #include "utils/datum.h"
70 #include "utils/inval.h"
71 #include "utils/lsyscache.h"
72 #include "utils/relcache.h"
73 #include "utils/snapmgr.h"
74 #include "utils/syscache.h"
75 #include "utils/tqual.h"
76 
77 
78 /* GUC variable */
79 bool synchronize_seqscans = true;
80 
81 
82 static HeapScanDesc heap_beginscan_internal(Relation relation,
 83  Snapshot snapshot,
84  int nkeys, ScanKey key,
85  ParallelHeapScanDesc parallel_scan,
86  bool allow_strat,
87  bool allow_sync,
88  bool allow_pagemode,
89  bool is_bitmapscan,
90  bool is_samplescan,
91  bool temp_snap);
93 static HeapTuple heap_prepare_insert(Relation relation, HeapTuple tup,
94  TransactionId xid, CommandId cid, int options);
95 static XLogRecPtr log_heap_update(Relation reln, Buffer oldbuf,
96  Buffer newbuf, HeapTuple oldtup,
97  HeapTuple newtup, HeapTuple old_key_tup,
98  bool all_visible_cleared, bool new_all_visible_cleared);
99 static Bitmapset *HeapDetermineModifiedColumns(Relation relation,
 100  Bitmapset *interesting_cols,
101  HeapTuple oldtup, HeapTuple newtup);
102 static bool heap_acquire_tuplock(Relation relation, ItemPointer tid,
103  LockTupleMode mode, LockWaitPolicy wait_policy,
104  bool *have_tuple_lock);
105 static void compute_new_xmax_infomask(TransactionId xmax, uint16 old_infomask,
106  uint16 old_infomask2, TransactionId add_to_xmax,
107  LockTupleMode mode, bool is_update,
108  TransactionId *result_xmax, uint16 *result_infomask,
109  uint16 *result_infomask2);
110 static HTSU_Result heap_lock_updated_tuple(Relation rel, HeapTuple tuple,
 111  ItemPointer ctid, TransactionId xid,
112  LockTupleMode mode);
113 static void GetMultiXactIdHintBits(MultiXactId multi, uint16 *new_infomask,
114  uint16 *new_infomask2);
115 static TransactionId MultiXactIdGetUpdateXid(TransactionId xmax,
 116  uint16 t_infomask);
117 static bool DoesMultiXactIdConflict(MultiXactId multi, uint16 infomask,
118  LockTupleMode lockmode);
119 static void MultiXactIdWait(MultiXactId multi, MultiXactStatus status, uint16 infomask,
120  Relation rel, ItemPointer ctid, XLTW_Oper oper,
121  int *remaining);
122 static bool ConditionalMultiXactIdWait(MultiXactId multi, MultiXactStatus status,
 123  uint16 infomask, Relation rel, int *remaining);
124 static XLogRecPtr log_heap_new_cid(Relation relation, HeapTuple tup);
125 static HeapTuple ExtractReplicaIdentity(Relation rel, HeapTuple tup, bool key_modified,
126  bool *copy);
127 
128 
129 /*
130  * Each tuple lock mode has a corresponding heavyweight lock, and one or two
131  * corresponding MultiXactStatuses (one to merely lock tuples, another one to
132  * update them). This table (and the macros below) helps us determine the
133  * heavyweight lock mode and MultiXactStatus values to use for any particular
134  * tuple lock strength.
135  *
136  * Don't look at lockstatus/updstatus directly! Use get_mxact_status_for_lock
137  * instead.
138  */
139 static const struct
140 {
 141  LOCKMODE hwlock;
 142  int lockstatus;
 143  int updstatus;
 144 }
 145 
 146  tupleLockExtraInfo[MaxLockTupleMode + 1] =
 147 {
 148  { /* LockTupleKeyShare */
 149  AccessShareLock,
 150  MultiXactStatusForKeyShare,
 151  -1 /* KeyShare does not allow updating tuples */
 152  },
 153  { /* LockTupleShare */
 154  RowShareLock,
 155  MultiXactStatusForShare,
 156  -1 /* Share does not allow updating tuples */
 157  },
 158  { /* LockTupleNoKeyExclusive */
 159  ExclusiveLock,
 160  MultiXactStatusForNoKeyUpdate,
 161  MultiXactStatusNoKeyUpdate
 162  },
 163  { /* LockTupleExclusive */
 164  AccessExclusiveLock,
 165  MultiXactStatusForUpdate,
 166  MultiXactStatusUpdate
 167  }
 168 };
169 
170 /* Get the LOCKMODE for a given MultiXactStatus */
171 #define LOCKMODE_from_mxstatus(status) \
172  (tupleLockExtraInfo[TUPLOCK_from_mxstatus((status))].hwlock)
173 
174 /*
175  * Acquire heavyweight locks on tuples, using a LockTupleMode strength value.
176  * This is more readable than having every caller translate it to lock.h's
177  * LOCKMODE.
178  */
179 #define LockTupleTuplock(rel, tup, mode) \
180  LockTuple((rel), (tup), tupleLockExtraInfo[mode].hwlock)
181 #define UnlockTupleTuplock(rel, tup, mode) \
182  UnlockTuple((rel), (tup), tupleLockExtraInfo[mode].hwlock)
183 #define ConditionalLockTupleTuplock(rel, tup, mode) \
184  ConditionalLockTuple((rel), (tup), tupleLockExtraInfo[mode].hwlock)
185 
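As a hedged illustration (not part of heapam.c itself), the sketch below shows how a caller-style routine could take and release the heavyweight tuple lock for a given LockTupleMode through the macros above; the helper name is hypothetical.

static void
example_lock_tuple_share(Relation rel, ItemPointer tid)
{
    /* tupleLockExtraInfo maps LockTupleShare to its heavyweight lock mode */
    LockTupleTuplock(rel, tid, LockTupleShare);

    /* ... examine or wait on the tuple while holding the tuple lock ... */

    UnlockTupleTuplock(rel, tid, LockTupleShare);
}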
186 /*
187  * This table maps tuple lock strength values for each particular
188  * MultiXactStatus value.
189  */
190 static const int MultiXactStatusLock[MaxMultiXactStatus + 1] =
 191 {
192  LockTupleKeyShare, /* ForKeyShare */
193  LockTupleShare, /* ForShare */
194  LockTupleNoKeyExclusive, /* ForNoKeyUpdate */
195  LockTupleExclusive, /* ForUpdate */
196  LockTupleNoKeyExclusive, /* NoKeyUpdate */
197  LockTupleExclusive /* Update */
198 };
199 
200 /* Get the LockTupleMode for a given MultiXactStatus */
201 #define TUPLOCK_from_mxstatus(status) \
202  (MultiXactStatusLock[(status)])
203 
204 /* ----------------------------------------------------------------
205  * heap support routines
206  * ----------------------------------------------------------------
207  */
208 
209 /* ----------------
210  * initscan - scan code common to heap_beginscan and heap_rescan
211  * ----------------
212  */
213 static void
214 initscan(HeapScanDesc scan, ScanKey key, bool keep_startblock)
215 {
216  bool allow_strat;
217  bool allow_sync;
218 
219  /*
220  * Determine the number of blocks we have to scan.
221  *
222  * It is sufficient to do this once at scan start, since any tuples added
223  * while the scan is in progress will be invisible to my snapshot anyway.
224  * (That is not true when using a non-MVCC snapshot. However, we couldn't
225  * guarantee to return tuples added after scan start anyway, since they
226  * might go into pages we already scanned. To guarantee consistent
227  * results for a non-MVCC snapshot, the caller must hold some higher-level
228  * lock that ensures the interesting tuple(s) won't change.)
229  */
230  if (scan->rs_parallel != NULL)
231  scan->rs_nblocks = scan->rs_parallel->phs_nblocks;
232  else
233  scan->rs_nblocks = RelationGetNumberOfBlocks(scan->rs_rd);
 234 
235  /*
236  * If the table is large relative to NBuffers, use a bulk-read access
237  * strategy and enable synchronized scanning (see syncscan.c). Although
238  * the thresholds for these features could be different, we make them the
239  * same so that there are only two behaviors to tune rather than four.
240  * (However, some callers need to be able to disable one or both of these
241  * behaviors, independently of the size of the table; also there is a GUC
242  * variable that can disable synchronized scanning.)
243  *
244  * Note that heap_parallelscan_initialize has a very similar test; if you
245  * change this, consider changing that one, too.
246  */
247  if (!RelationUsesLocalBuffers(scan->rs_rd) &&
248  scan->rs_nblocks > NBuffers / 4)
249  {
250  allow_strat = scan->rs_allow_strat;
251  allow_sync = scan->rs_allow_sync;
252  }
253  else
254  allow_strat = allow_sync = false;
255 
256  if (allow_strat)
257  {
258  /* During a rescan, keep the previous strategy object. */
259  if (scan->rs_strategy == NULL)
260  scan->rs_strategy = GetAccessStrategy(BAS_BULKREAD);
 261  }
262  else
263  {
264  if (scan->rs_strategy != NULL)
265  FreeAccessStrategy(scan->rs_strategy);
 266  scan->rs_strategy = NULL;
267  }
268 
269  if (scan->rs_parallel != NULL)
270  {
271  /* For parallel scan, believe whatever ParallelHeapScanDesc says. */
272  scan->rs_syncscan = scan->rs_parallel->phs_syncscan;
273  }
274  else if (keep_startblock)
275  {
276  /*
277  * When rescanning, we want to keep the previous startblock setting,
278  * so that rewinding a cursor doesn't generate surprising results.
279  * Reset the active syncscan setting, though.
280  */
281  scan->rs_syncscan = (allow_sync && synchronize_seqscans);
282  }
283  else if (allow_sync && synchronize_seqscans)
284  {
285  scan->rs_syncscan = true;
286  scan->rs_startblock = ss_get_location(scan->rs_rd, scan->rs_nblocks);
287  }
288  else
289  {
290  scan->rs_syncscan = false;
291  scan->rs_startblock = 0;
292  }
293 
294  scan->rs_numblocks = InvalidBlockNumber;
 295  scan->rs_inited = false;
296  scan->rs_ctup.t_data = NULL;
297  ItemPointerSetInvalid(&(scan->rs_ctup.t_self));
 298  scan->rs_cbuf = InvalidBuffer;
 299  scan->rs_cblock = InvalidBlockNumber;
300 
301  /* page-at-a-time fields are always invalid when not rs_inited */
302 
303  /*
304  * copy the scan key, if appropriate
305  */
306  if (key != NULL)
307  memcpy(scan->rs_key, key, scan->rs_nkeys * sizeof(ScanKeyData));
308 
309  /*
310  * Currently, we don't have a stats counter for bitmap heap scans (but the
311  * underlying bitmap index scans will be counted) or sample scans (we only
312  * update stats for tuple fetches there)
313  */
314  if (!scan->rs_bitmapscan && !scan->rs_samplescan)
315  pgstat_count_heap_scan(scan->rs_rd);
 316 }
317 
318 /*
319  * heap_setscanlimits - restrict range of a heapscan
320  *
321  * startBlk is the page to start at
322  * numBlks is number of pages to scan (InvalidBlockNumber means "all")
323  */
324 void
325 heap_setscanlimits(HeapScanDesc scan, BlockNumber startBlk, BlockNumber numBlks)
 326 {
327  Assert(!scan->rs_inited); /* else too late to change */
328  Assert(!scan->rs_syncscan); /* else rs_startblock is significant */
329 
330  /* Check startBlk is valid (but allow case of zero blocks...) */
331  Assert(startBlk == 0 || startBlk < scan->rs_nblocks);
332 
333  scan->rs_startblock = startBlk;
334  scan->rs_numblocks = numBlks;
335 }
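A minimal usage sketch (helper name invented, caller assumed to hold a suitable lock and snapshot): restrict a scan to the first four blocks. heap_beginscan_strat is used with allow_sync = false so that rs_syncscan stays off, which the asserts above require.

static void
example_scan_block_range(Relation rel, Snapshot snapshot)
{
    HeapScanDesc scan;
    HeapTuple    tuple;

    scan = heap_beginscan_strat(rel, snapshot, 0, NULL,
                                true,   /* allow_strat */
                                false); /* allow_sync: keep rs_syncscan off */
    heap_setscanlimits(scan, 0, 4);     /* scan blocks 0..3 only */

    while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
    {
        /* process tuple; it is valid only while the scan holds its pin */
    }

    heap_endscan(scan);
}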
336 
337 /*
338  * heapgetpage - subroutine for heapgettup()
339  *
340  * This routine reads and pins the specified page of the relation.
341  * In page-at-a-time mode it performs additional work, namely determining
342  * which tuples on the page are visible.
343  */
344 void
345 heapgetpage(HeapScanDesc scan, BlockNumber page)
 346 {
347  Buffer buffer;
348  Snapshot snapshot;
349  Page dp;
350  int lines;
351  int ntup;
352  OffsetNumber lineoff;
353  ItemId lpp;
354  bool all_visible;
355 
356  Assert(page < scan->rs_nblocks);
357 
358  /* release previous scan buffer, if any */
359  if (BufferIsValid(scan->rs_cbuf))
360  {
361  ReleaseBuffer(scan->rs_cbuf);
362  scan->rs_cbuf = InvalidBuffer;
363  }
364 
365  /*
366  * Be sure to check for interrupts at least once per page. Checks at
367  * higher code levels won't be able to stop a seqscan that encounters many
368  * pages' worth of consecutive dead tuples.
369  */
370  CHECK_FOR_INTERRUPTS();
 371 
372  /* read page using selected strategy */
373  scan->rs_cbuf = ReadBufferExtended(scan->rs_rd, MAIN_FORKNUM, page,
374  RBM_NORMAL, scan->rs_strategy);
375  scan->rs_cblock = page;
376 
377  if (!scan->rs_pageatatime)
378  return;
379 
380  buffer = scan->rs_cbuf;
381  snapshot = scan->rs_snapshot;
382 
383  /*
384  * Prune and repair fragmentation for the whole page, if possible.
385  */
386  heap_page_prune_opt(scan->rs_rd, buffer);
387 
388  /*
389  * We must hold share lock on the buffer content while examining tuple
390  * visibility. Afterwards, however, the tuples we have found to be
391  * visible are guaranteed good as long as we hold the buffer pin.
392  */
393  LockBuffer(buffer, BUFFER_LOCK_SHARE);
394 
395  dp = BufferGetPage(buffer);
396  TestForOldSnapshot(snapshot, scan->rs_rd, dp);
397  lines = PageGetMaxOffsetNumber(dp);
398  ntup = 0;
399 
400  /*
401  * If the all-visible flag indicates that all tuples on the page are
402  * visible to everyone, we can skip the per-tuple visibility tests.
403  *
404  * Note: In hot standby, a tuple that's already visible to all
405  * transactions in the master might still be invisible to a read-only
406  * transaction in the standby. We partly handle this problem by tracking
407  * the minimum xmin of visible tuples as the cut-off XID while marking a
408  * page all-visible on master and WAL log that along with the visibility
409  * map SET operation. In hot standby, we wait for (or abort) all
410  * transactions that potentially cannot see one or more tuples on the
411  * page. That's how index-only scans work fine in hot standby. A crucial
412  * difference between index-only scans and heap scans is that the
413  * index-only scan completely relies on the visibility map, whereas a heap
414  * scan looks at the page-level PD_ALL_VISIBLE flag. We are not sure if
415  * the page-level flag can be trusted in the same way, because it might
416  * get propagated somehow without being explicitly WAL-logged, e.g. via a
417  * full page write. Until we can prove that beyond doubt, let's check each
418  * tuple for visibility the hard way.
419  */
420  all_visible = PageIsAllVisible(dp) && !snapshot->takenDuringRecovery;
421 
422  for (lineoff = FirstOffsetNumber, lpp = PageGetItemId(dp, lineoff);
423  lineoff <= lines;
424  lineoff++, lpp++)
425  {
426  if (ItemIdIsNormal(lpp))
427  {
428  HeapTupleData loctup;
429  bool valid;
430 
431  loctup.t_tableOid = RelationGetRelid(scan->rs_rd);
432  loctup.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
433  loctup.t_len = ItemIdGetLength(lpp);
434  ItemPointerSet(&(loctup.t_self), page, lineoff);
435 
436  if (all_visible)
437  valid = true;
438  else
439  valid = HeapTupleSatisfiesVisibility(&loctup, snapshot, buffer);
440 
441  CheckForSerializableConflictOut(valid, scan->rs_rd, &loctup,
442  buffer, snapshot);
443 
444  if (valid)
445  scan->rs_vistuples[ntup++] = lineoff;
446  }
447  }
448 
449  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
 450 
451  Assert(ntup <= MaxHeapTuplesPerPage);
452  scan->rs_ntuples = ntup;
453 }
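A sketch of how a page-at-a-time caller (for example a TABLESAMPLE method) might consume the results: after heapgetpage, rs_ntuples and rs_vistuples[] describe the visible line pointers on the pinned page. The helper name is hypothetical and assumes scan->rs_pageatatime.

static void
example_walk_visible_tuples(HeapScanDesc scan, BlockNumber blkno)
{
    Page    dp;
    int     i;

    heapgetpage(scan, blkno);           /* pins the page, fills rs_vistuples */
    dp = BufferGetPage(scan->rs_cbuf);

    for (i = 0; i < scan->rs_ntuples; i++)
    {
        OffsetNumber  lineoff = scan->rs_vistuples[i];
        ItemId        lpp = PageGetItemId(dp, lineoff);
        HeapTupleData loctup;

        loctup.t_tableOid = RelationGetRelid(scan->rs_rd);
        loctup.t_data = (HeapTupleHeader) PageGetItem(dp, lpp);
        loctup.t_len = ItemIdGetLength(lpp);
        ItemPointerSet(&loctup.t_self, blkno, lineoff);

        /* ... use loctup while the buffer pin is held ... */
    }
}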
454 
455 /* ----------------
456  * heapgettup - fetch next heap tuple
457  *
458  * Initialize the scan if not already done; then advance to the next
459  * tuple as indicated by "dir"; return the next tuple in scan->rs_ctup,
460  * or set scan->rs_ctup.t_data = NULL if no more tuples.
461  *
462  * dir == NoMovementScanDirection means "re-fetch the tuple indicated
463  * by scan->rs_ctup".
464  *
465  * Note: the reason nkeys/key are passed separately, even though they are
466  * kept in the scan descriptor, is that the caller may not want us to check
467  * the scankeys.
468  *
469  * Note: when we fall off the end of the scan in either direction, we
470  * reset rs_inited. This means that a further request with the same
471  * scan direction will restart the scan, which is a bit odd, but a
472  * request with the opposite scan direction will start a fresh scan
473  * in the proper direction. The latter is required behavior for cursors,
474  * while the former case is generally undefined behavior in Postgres
475  * so we don't care too much.
476  * ----------------
477  */
478 static void
479 heapgettup(HeapScanDesc scan,
 480  ScanDirection dir,
481  int nkeys,
482  ScanKey key)
483 {
484  HeapTuple tuple = &(scan->rs_ctup);
485  Snapshot snapshot = scan->rs_snapshot;
486  bool backward = ScanDirectionIsBackward(dir);
487  BlockNumber page;
488  bool finished;
489  Page dp;
490  int lines;
491  OffsetNumber lineoff;
492  int linesleft;
493  ItemId lpp;
494 
495  /*
496  * calculate next starting lineoff, given scan direction
497  */
498  if (ScanDirectionIsForward(dir))
499  {
500  if (!scan->rs_inited)
501  {
502  /*
503  * return null immediately if relation is empty
504  */
505  if (scan->rs_nblocks == 0 || scan->rs_numblocks == 0)
506  {
507  Assert(!BufferIsValid(scan->rs_cbuf));
508  tuple->t_data = NULL;
509  return;
510  }
511  if (scan->rs_parallel != NULL)
512  {
513  page = heap_parallelscan_nextpage(scan);
514 
515  /* Other processes might have already finished the scan. */
516  if (page == InvalidBlockNumber)
517  {
518  Assert(!BufferIsValid(scan->rs_cbuf));
519  tuple->t_data = NULL;
520  return;
521  }
522  }
523  else
524  page = scan->rs_startblock; /* first page */
525  heapgetpage(scan, page);
526  lineoff = FirstOffsetNumber; /* first offnum */
527  scan->rs_inited = true;
528  }
529  else
530  {
531  /* continue from previously returned page/tuple */
532  page = scan->rs_cblock; /* current page */
533  lineoff = /* next offnum */
534  OffsetNumberNext(ItemPointerGetOffsetNumber(&(tuple->t_self)));
 535  }
536 
537  LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);
 538 
539  dp = BufferGetPage(scan->rs_cbuf);
540  TestForOldSnapshot(snapshot, scan->rs_rd, dp);
541  lines = PageGetMaxOffsetNumber(dp);
542  /* page and lineoff now reference the physically next tid */
543 
544  linesleft = lines - lineoff + 1;
545  }
546  else if (backward)
547  {
548  /* backward parallel scan not supported */
549  Assert(scan->rs_parallel == NULL);
550 
551  if (!scan->rs_inited)
552  {
553  /*
554  * return null immediately if relation is empty
555  */
556  if (scan->rs_nblocks == 0 || scan->rs_numblocks == 0)
557  {
558  Assert(!BufferIsValid(scan->rs_cbuf));
559  tuple->t_data = NULL;
560  return;
561  }
562 
563  /*
564  * Disable reporting to syncscan logic in a backwards scan; it's
565  * not very likely anyone else is doing the same thing at the same
566  * time, and much more likely that we'll just bollix things for
567  * forward scanners.
568  */
569  scan->rs_syncscan = false;
570  /* start from last page of the scan */
571  if (scan->rs_startblock > 0)
572  page = scan->rs_startblock - 1;
573  else
574  page = scan->rs_nblocks - 1;
575  heapgetpage(scan, page);
576  }
577  else
578  {
579  /* continue from previously returned page/tuple */
580  page = scan->rs_cblock; /* current page */
581  }
582 
583  LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);
 584 
585  dp = BufferGetPage(scan->rs_cbuf);
586  TestForOldSnapshot(snapshot, scan->rs_rd, dp);
587  lines = PageGetMaxOffsetNumber(dp);
588 
589  if (!scan->rs_inited)
590  {
591  lineoff = lines; /* final offnum */
592  scan->rs_inited = true;
593  }
594  else
595  {
596  lineoff = /* previous offnum */
597  OffsetNumberPrev(ItemPointerGetOffsetNumber(&(tuple->t_self)));
 598  }
599  /* page and lineoff now reference the physically previous tid */
600 
601  linesleft = lineoff;
602  }
603  else
604  {
605  /*
606  * ``no movement'' scan direction: refetch prior tuple
607  */
608  if (!scan->rs_inited)
609  {
610  Assert(!BufferIsValid(scan->rs_cbuf));
611  tuple->t_data = NULL;
612  return;
613  }
614 
615  page = ItemPointerGetBlockNumber(&(tuple->t_self));
616  if (page != scan->rs_cblock)
617  heapgetpage(scan, page);
618 
619  /* Since the tuple was previously fetched, needn't lock page here */
620  dp = BufferGetPage(scan->rs_cbuf);
621  TestForOldSnapshot(snapshot, scan->rs_rd, dp);
622  lineoff = ItemPointerGetOffsetNumber(&(tuple->t_self));
623  lpp = PageGetItemId(dp, lineoff);
624  Assert(ItemIdIsNormal(lpp));
625 
626  tuple->t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
627  tuple->t_len = ItemIdGetLength(lpp);
628 
629  return;
630  }
631 
632  /*
633  * advance the scan until we find a qualifying tuple or run out of stuff
634  * to scan
635  */
636  lpp = PageGetItemId(dp, lineoff);
637  for (;;)
638  {
639  while (linesleft > 0)
640  {
641  if (ItemIdIsNormal(lpp))
642  {
643  bool valid;
644 
645  tuple->t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
646  tuple->t_len = ItemIdGetLength(lpp);
647  ItemPointerSet(&(tuple->t_self), page, lineoff);
648 
649  /*
650  * if current tuple qualifies, return it.
651  */
652  valid = HeapTupleSatisfiesVisibility(tuple,
653  snapshot,
654  scan->rs_cbuf);
655 
656  CheckForSerializableConflictOut(valid, scan->rs_rd, tuple,
657  scan->rs_cbuf, snapshot);
658 
659  if (valid && key != NULL)
660  HeapKeyTest(tuple, RelationGetDescr(scan->rs_rd),
661  nkeys, key, valid);
662 
663  if (valid)
664  {
665  LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK);
 666  return;
667  }
668  }
669 
670  /*
671  * otherwise move to the next item on the page
672  */
673  --linesleft;
674  if (backward)
675  {
676  --lpp; /* move back in this page's ItemId array */
677  --lineoff;
678  }
679  else
680  {
681  ++lpp; /* move forward in this page's ItemId array */
682  ++lineoff;
683  }
684  }
685 
686  /*
687  * if we get here, it means we've exhausted the items on this page and
688  * it's time to move to the next.
689  */
690  LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK);
 691 
692  /*
693  * advance to next/prior page and detect end of scan
694  */
695  if (backward)
696  {
697  finished = (page == scan->rs_startblock) ||
698  (scan->rs_numblocks != InvalidBlockNumber ? --scan->rs_numblocks == 0 : false);
699  if (page == 0)
700  page = scan->rs_nblocks;
701  page--;
702  }
703  else if (scan->rs_parallel != NULL)
704  {
705  page = heap_parallelscan_nextpage(scan);
706  finished = (page == InvalidBlockNumber);
707  }
708  else
709  {
710  page++;
711  if (page >= scan->rs_nblocks)
712  page = 0;
713  finished = (page == scan->rs_startblock) ||
714  (scan->rs_numblocks != InvalidBlockNumber ? --scan->rs_numblocks == 0 : false);
715 
716  /*
717  * Report our new scan position for synchronization purposes. We
718  * don't do that when moving backwards, however. That would just
719  * mess up any other forward-moving scanners.
720  *
721  * Note: we do this before checking for end of scan so that the
722  * final state of the position hint is back at the start of the
723  * rel. That's not strictly necessary, but otherwise when you run
724  * the same query multiple times the starting position would shift
725  * a little bit backwards on every invocation, which is confusing.
726  * We don't guarantee any specific ordering in general, though.
727  */
728  if (scan->rs_syncscan)
729  ss_report_location(scan->rs_rd, page);
730  }
731 
732  /*
733  * return NULL if we've exhausted all the pages
734  */
735  if (finished)
736  {
737  if (BufferIsValid(scan->rs_cbuf))
738  ReleaseBuffer(scan->rs_cbuf);
739  scan->rs_cbuf = InvalidBuffer;
740  scan->rs_cblock = InvalidBlockNumber;
 741  tuple->t_data = NULL;
742  scan->rs_inited = false;
743  return;
744  }
745 
746  heapgetpage(scan, page);
747 
748  LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);
 749 
750  dp = BufferGetPage(scan->rs_cbuf);
751  TestForOldSnapshot(snapshot, scan->rs_rd, dp);
752  lines = PageGetMaxOffsetNumber((Page) dp);
753  linesleft = lines;
754  if (backward)
755  {
756  lineoff = lines;
757  lpp = PageGetItemId(dp, lines);
758  }
759  else
760  {
761  lineoff = FirstOffsetNumber;
762  lpp = PageGetItemId(dp, FirstOffsetNumber);
763  }
764  }
765 }
766 
767 /* ----------------
768  * heapgettup_pagemode - fetch next heap tuple in page-at-a-time mode
769  *
770  * Same API as heapgettup, but used in page-at-a-time mode
771  *
772  * The internal logic is much the same as heapgettup's too, but there are some
773  * differences: we do not take the buffer content lock (that only needs to
774  * happen inside heapgetpage), and we iterate through just the tuples listed
775  * in rs_vistuples[] rather than all tuples on the page. Notice that
776  * lineindex is 0-based, where the corresponding loop variable lineoff in
777  * heapgettup is 1-based.
778  * ----------------
779  */
780 static void
781 heapgettup_pagemode(HeapScanDesc scan,
 782  ScanDirection dir,
783  int nkeys,
784  ScanKey key)
785 {
786  HeapTuple tuple = &(scan->rs_ctup);
787  bool backward = ScanDirectionIsBackward(dir);
788  BlockNumber page;
789  bool finished;
790  Page dp;
791  int lines;
792  int lineindex;
793  OffsetNumber lineoff;
794  int linesleft;
795  ItemId lpp;
796 
797  /*
798  * calculate next starting lineindex, given scan direction
799  */
800  if (ScanDirectionIsForward(dir))
801  {
802  if (!scan->rs_inited)
803  {
804  /*
805  * return null immediately if relation is empty
806  */
807  if (scan->rs_nblocks == 0 || scan->rs_numblocks == 0)
808  {
809  Assert(!BufferIsValid(scan->rs_cbuf));
810  tuple->t_data = NULL;
811  return;
812  }
813  if (scan->rs_parallel != NULL)
814  {
815  page = heap_parallelscan_nextpage(scan);
816 
817  /* Other processes might have already finished the scan. */
818  if (page == InvalidBlockNumber)
819  {
820  Assert(!BufferIsValid(scan->rs_cbuf));
821  tuple->t_data = NULL;
822  return;
823  }
824  }
825  else
826  page = scan->rs_startblock; /* first page */
827  heapgetpage(scan, page);
828  lineindex = 0;
829  scan->rs_inited = true;
830  }
831  else
832  {
833  /* continue from previously returned page/tuple */
834  page = scan->rs_cblock; /* current page */
835  lineindex = scan->rs_cindex + 1;
836  }
837 
838  dp = BufferGetPage(scan->rs_cbuf);
839  TestForOldSnapshot(scan->rs_snapshot, scan->rs_rd, dp);
840  lines = scan->rs_ntuples;
841  /* page and lineindex now reference the next visible tid */
842 
843  linesleft = lines - lineindex;
844  }
845  else if (backward)
846  {
847  /* backward parallel scan not supported */
848  Assert(scan->rs_parallel == NULL);
849 
850  if (!scan->rs_inited)
851  {
852  /*
853  * return null immediately if relation is empty
854  */
855  if (scan->rs_nblocks == 0 || scan->rs_numblocks == 0)
856  {
857  Assert(!BufferIsValid(scan->rs_cbuf));
858  tuple->t_data = NULL;
859  return;
860  }
861 
862  /*
863  * Disable reporting to syncscan logic in a backwards scan; it's
864  * not very likely anyone else is doing the same thing at the same
865  * time, and much more likely that we'll just bollix things for
866  * forward scanners.
867  */
868  scan->rs_syncscan = false;
869  /* start from last page of the scan */
870  if (scan->rs_startblock > 0)
871  page = scan->rs_startblock - 1;
872  else
873  page = scan->rs_nblocks - 1;
874  heapgetpage(scan, page);
875  }
876  else
877  {
878  /* continue from previously returned page/tuple */
879  page = scan->rs_cblock; /* current page */
880  }
881 
882  dp = BufferGetPage(scan->rs_cbuf);
883  TestForOldSnapshot(scan->rs_snapshot, scan->rs_rd, dp);
884  lines = scan->rs_ntuples;
885 
886  if (!scan->rs_inited)
887  {
888  lineindex = lines - 1;
889  scan->rs_inited = true;
890  }
891  else
892  {
893  lineindex = scan->rs_cindex - 1;
894  }
895  /* page and lineindex now reference the previous visible tid */
896 
897  linesleft = lineindex + 1;
898  }
899  else
900  {
901  /*
902  * ``no movement'' scan direction: refetch prior tuple
903  */
904  if (!scan->rs_inited)
905  {
906  Assert(!BufferIsValid(scan->rs_cbuf));
907  tuple->t_data = NULL;
908  return;
909  }
910 
911  page = ItemPointerGetBlockNumber(&(tuple->t_self));
912  if (page != scan->rs_cblock)
913  heapgetpage(scan, page);
914 
915  /* Since the tuple was previously fetched, needn't lock page here */
916  dp = BufferGetPage(scan->rs_cbuf);
917  TestForOldSnapshot(scan->rs_snapshot, scan->rs_rd, dp);
918  lineoff = ItemPointerGetOffsetNumber(&(tuple->t_self));
919  lpp = PageGetItemId(dp, lineoff);
920  Assert(ItemIdIsNormal(lpp));
921 
922  tuple->t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
923  tuple->t_len = ItemIdGetLength(lpp);
924 
925  /* check that rs_cindex is in sync */
926  Assert(scan->rs_cindex < scan->rs_ntuples);
927  Assert(lineoff == scan->rs_vistuples[scan->rs_cindex]);
928 
929  return;
930  }
931 
932  /*
933  * advance the scan until we find a qualifying tuple or run out of stuff
934  * to scan
935  */
936  for (;;)
937  {
938  while (linesleft > 0)
939  {
940  lineoff = scan->rs_vistuples[lineindex];
941  lpp = PageGetItemId(dp, lineoff);
942  Assert(ItemIdIsNormal(lpp));
943 
944  tuple->t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
945  tuple->t_len = ItemIdGetLength(lpp);
946  ItemPointerSet(&(tuple->t_self), page, lineoff);
947 
948  /*
949  * if current tuple qualifies, return it.
950  */
951  if (key != NULL)
952  {
953  bool valid;
954 
955  HeapKeyTest(tuple, RelationGetDescr(scan->rs_rd),
956  nkeys, key, valid);
957  if (valid)
958  {
959  scan->rs_cindex = lineindex;
960  return;
961  }
962  }
963  else
964  {
965  scan->rs_cindex = lineindex;
966  return;
967  }
968 
969  /*
970  * otherwise move to the next item on the page
971  */
972  --linesleft;
973  if (backward)
974  --lineindex;
975  else
976  ++lineindex;
977  }
978 
979  /*
980  * if we get here, it means we've exhausted the items on this page and
981  * it's time to move to the next.
982  */
983  if (backward)
984  {
985  finished = (page == scan->rs_startblock) ||
986  (scan->rs_numblocks != InvalidBlockNumber ? --scan->rs_numblocks == 0 : false);
987  if (page == 0)
988  page = scan->rs_nblocks;
989  page--;
990  }
991  else if (scan->rs_parallel != NULL)
992  {
993  page = heap_parallelscan_nextpage(scan);
994  finished = (page == InvalidBlockNumber);
995  }
996  else
997  {
998  page++;
999  if (page >= scan->rs_nblocks)
1000  page = 0;
1001  finished = (page == scan->rs_startblock) ||
1002  (scan->rs_numblocks != InvalidBlockNumber ? --scan->rs_numblocks == 0 : false);
1003 
1004  /*
1005  * Report our new scan position for synchronization purposes. We
1006  * don't do that when moving backwards, however. That would just
1007  * mess up any other forward-moving scanners.
1008  *
1009  * Note: we do this before checking for end of scan so that the
1010  * final state of the position hint is back at the start of the
1011  * rel. That's not strictly necessary, but otherwise when you run
1012  * the same query multiple times the starting position would shift
1013  * a little bit backwards on every invocation, which is confusing.
1014  * We don't guarantee any specific ordering in general, though.
1015  */
1016  if (scan->rs_syncscan)
1017  ss_report_location(scan->rs_rd, page);
1018  }
1019 
1020  /*
1021  * return NULL if we've exhausted all the pages
1022  */
1023  if (finished)
1024  {
1025  if (BufferIsValid(scan->rs_cbuf))
1026  ReleaseBuffer(scan->rs_cbuf);
1027  scan->rs_cbuf = InvalidBuffer;
1028  scan->rs_cblock = InvalidBlockNumber;
1029  tuple->t_data = NULL;
1030  scan->rs_inited = false;
1031  return;
1032  }
1033 
1034  heapgetpage(scan, page);
1035 
1036  dp = BufferGetPage(scan->rs_cbuf);
1037  TestForOldSnapshot(scan->rs_snapshot, scan->rs_rd, dp);
1038  lines = scan->rs_ntuples;
1039  linesleft = lines;
1040  if (backward)
1041  lineindex = lines - 1;
1042  else
1043  lineindex = 0;
1044  }
1045 }
1046 
1047 
1048 #if defined(DISABLE_COMPLEX_MACRO)
1049 /*
1050  * This is formatted so oddly so that the correspondence to the macro
1051  * definition in access/htup_details.h is maintained.
1052  */
1053 Datum
1054 fastgetattr(HeapTuple tup, int attnum, TupleDesc tupleDesc,
1055  bool *isnull)
1056 {
1057  return (
1058  (attnum) > 0 ?
1059  (
1060  (*(isnull) = false),
1061  HeapTupleNoNulls(tup) ?
1062  (
1063  (tupleDesc)->attrs[(attnum) - 1]->attcacheoff >= 0 ?
1064  (
1065  fetchatt((tupleDesc)->attrs[(attnum) - 1],
1066  (char *) (tup)->t_data + (tup)->t_data->t_hoff +
1067  (tupleDesc)->attrs[(attnum) - 1]->attcacheoff)
1068  )
1069  :
1070  nocachegetattr((tup), (attnum), (tupleDesc))
1071  )
1072  :
1073  (
1074  att_isnull((attnum) - 1, (tup)->t_data->t_bits) ?
1075  (
1076  (*(isnull) = true),
1077  (Datum) NULL
1078  )
1079  :
1080  (
1081  nocachegetattr((tup), (attnum), (tupleDesc))
1082  )
1083  )
1084  )
1085  :
1086  (
1087  (Datum) NULL
1088  )
1089  );
1090 }
1091 #endif /* defined(DISABLE_COMPLEX_MACRO) */
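A brief usage sketch of fastgetattr, assuming attribute 1 is an int4 column; the helper name and the NULL default are illustrative only.

static int32
example_get_first_int32(HeapTuple tup, TupleDesc tupleDesc)
{
    bool    isnull;
    Datum   d;

    d = fastgetattr(tup, 1, tupleDesc, &isnull);
    if (isnull)
        return 0;               /* caller-defined default for NULL */
    return DatumGetInt32(d);
}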
1092 
1093 
1094 /* ----------------------------------------------------------------
1095  * heap access method interface
1096  * ----------------------------------------------------------------
1097  */
1098 
1099 /* ----------------
1100  * relation_open - open any relation by relation OID
1101  *
1102  * If lockmode is not "NoLock", the specified kind of lock is
1103  * obtained on the relation. (Generally, NoLock should only be
1104  * used if the caller knows it has some appropriate lock on the
1105  * relation already.)
1106  *
1107  * An error is raised if the relation does not exist.
1108  *
1109  * NB: a "relation" is anything with a pg_class entry. The caller is
1110  * expected to check whether the relkind is something it can handle.
1111  * ----------------
1112  */
1113 Relation
1114 relation_open(Oid relationId, LOCKMODE lockmode)
1115 {
1116  Relation r;
1117 
1118  Assert(lockmode >= NoLock && lockmode < MAX_LOCKMODES);
1119 
1120  /* Get the lock before trying to open the relcache entry */
1121  if (lockmode != NoLock)
1122  LockRelationOid(relationId, lockmode);
1123 
1124  /* The relcache does all the real work... */
1125  r = RelationIdGetRelation(relationId);
1126 
1127  if (!RelationIsValid(r))
1128  elog(ERROR, "could not open relation with OID %u", relationId);
1129 
1130  /* Make note that we've accessed a temporary relation */
1131  if (RelationUsesLocalBuffers(r))
1132  MyXactAccessedTempRel = true;
 1133 
1134  pgstat_initstats(r);
1135 
1136  return r;
1137 }
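A common calling pattern, shown here as a hypothetical helper: open by OID with AccessShareLock, then close the relcache entry while keeping the lock until end of transaction.

static void
example_touch_relation(Oid relid)
{
    Relation rel;

    rel = relation_open(relid, AccessShareLock);

    /* ... work with rel, e.g. via RelationGetDescr(rel) ... */

    /* NoLock here: the AccessShareLock is held until transaction end */
    relation_close(rel, NoLock);
}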
1138 
1139 /* ----------------
1140  * try_relation_open - open any relation by relation OID
1141  *
1142  * Same as relation_open, except return NULL instead of failing
1143  * if the relation does not exist.
1144  * ----------------
1145  */
1146 Relation
1147 try_relation_open(Oid relationId, LOCKMODE lockmode)
1148 {
1149  Relation r;
1150 
1151  Assert(lockmode >= NoLock && lockmode < MAX_LOCKMODES);
1152 
1153  /* Get the lock first */
1154  if (lockmode != NoLock)
1155  LockRelationOid(relationId, lockmode);
1156 
1157  /*
1158  * Now that we have the lock, probe to see if the relation really exists
1159  * or not.
1160  */
1161  if (!SearchSysCacheExists1(RELOID, ObjectIdGetDatum(relationId)))
1162  {
1163  /* Release useless lock */
1164  if (lockmode != NoLock)
1165  UnlockRelationOid(relationId, lockmode);
1166 
1167  return NULL;
1168  }
1169 
1170  /* Should be safe to do a relcache load */
1171  r = RelationIdGetRelation(relationId);
1172 
1173  if (!RelationIsValid(r))
1174  elog(ERROR, "could not open relation with OID %u", relationId);
1175 
1176  /* Make note that we've accessed a temporary relation */
1177  if (RelationUsesLocalBuffers(r))
1178  MyXactAccessedTempRel = true;
 1179 
1180  pgstat_initstats(r);
1181 
1182  return r;
1183 }
1184 
1185 /* ----------------
1186  * relation_openrv - open any relation specified by a RangeVar
1187  *
1188  * Same as relation_open, but the relation is specified by a RangeVar.
1189  * ----------------
1190  */
1191 Relation
1192 relation_openrv(const RangeVar *relation, LOCKMODE lockmode)
1193 {
1194  Oid relOid;
1195 
1196  /*
1197  * Check for shared-cache-inval messages before trying to open the
1198  * relation. This is needed even if we already hold a lock on the
1199  * relation, because GRANT/REVOKE are executed without taking any lock on
1200  * the target relation, and we want to be sure we see current ACL
1201  * information. We can skip this if asked for NoLock, on the assumption
1202  * that such a call is not the first one in the current command, and so we
1203  * should be reasonably up-to-date already. (XXX this all could stand to
1204  * be redesigned, but for the moment we'll keep doing this like it's been
1205  * done historically.)
1206  */
1207  if (lockmode != NoLock)
1208  AcceptInvalidationMessages();
 1209 
1210  /* Look up and lock the appropriate relation using namespace search */
1211  relOid = RangeVarGetRelid(relation, lockmode, false);
1212 
1213  /* Let relation_open do the rest */
1214  return relation_open(relOid, NoLock);
1215 }
1216 
1217 /* ----------------
1218  * relation_openrv_extended - open any relation specified by a RangeVar
1219  *
1220  * Same as relation_openrv, but with an additional missing_ok argument
1221  * allowing a NULL return rather than an error if the relation is not
1222  * found. (Note that some other causes, such as permissions problems,
1223  * will still result in an ereport.)
1224  * ----------------
1225  */
1226 Relation
1227 relation_openrv_extended(const RangeVar *relation, LOCKMODE lockmode,
1228  bool missing_ok)
1229 {
1230  Oid relOid;
1231 
1232  /*
1233  * Check for shared-cache-inval messages before trying to open the
1234  * relation. See comments in relation_openrv().
1235  */
1236  if (lockmode != NoLock)
1237  AcceptInvalidationMessages();
 1238 
1239  /* Look up and lock the appropriate relation using namespace search */
1240  relOid = RangeVarGetRelid(relation, lockmode, missing_ok);
1241 
1242  /* Return NULL on not-found */
1243  if (!OidIsValid(relOid))
1244  return NULL;
1245 
1246  /* Let relation_open do the rest */
1247  return relation_open(relOid, NoLock);
1248 }
1249 
1250 /* ----------------
1251  * relation_close - close any relation
1252  *
1253  * If lockmode is not "NoLock", we then release the specified lock.
1254  *
1255  * Note that it is often sensible to hold a lock beyond relation_close;
1256  * in that case, the lock is released automatically at xact end.
1257  * ----------------
1258  */
1259 void
1260 relation_close(Relation relation, LOCKMODE lockmode)
1261 {
1262  LockRelId relid = relation->rd_lockInfo.lockRelId;
1263 
1264  Assert(lockmode >= NoLock && lockmode < MAX_LOCKMODES);
1265 
1266  /* The relcache does the real work... */
1267  RelationClose(relation);
1268 
1269  if (lockmode != NoLock)
1270  UnlockRelationId(&relid, lockmode);
1271 }
1272 
1273 
1274 /* ----------------
1275  * heap_open - open a heap relation by relation OID
1276  *
1277  * This is essentially relation_open plus check that the relation
1278  * is not an index nor a composite type. (The caller should also
1279  * check that it's not a view or foreign table before assuming it has
1280  * storage.)
1281  * ----------------
1282  */
1283 Relation
1284 heap_open(Oid relationId, LOCKMODE lockmode)
1285 {
1286  Relation r;
1287 
1288  r = relation_open(relationId, lockmode);
1289 
1290  if (r->rd_rel->relkind == RELKIND_INDEX)
1291  ereport(ERROR,
1292  (errcode(ERRCODE_WRONG_OBJECT_TYPE),
1293  errmsg("\"%s\" is an index",
1294  RelationGetRelationName(r))));
 1295  else if (r->rd_rel->relkind == RELKIND_COMPOSITE_TYPE)
1296  ereport(ERROR,
1297  (errcode(ERRCODE_WRONG_OBJECT_TYPE),
1298  errmsg("\"%s\" is a composite type",
1299  RelationGetRelationName(r))));
 1300 
1301  return r;
1302 }
1303 
1304 /* ----------------
1305  * heap_openrv - open a heap relation specified
1306  * by a RangeVar node
1307  *
1308  * As above, but relation is specified by a RangeVar.
1309  * ----------------
1310  */
1311 Relation
1312 heap_openrv(const RangeVar *relation, LOCKMODE lockmode)
1313 {
1314  Relation r;
1315 
1316  r = relation_openrv(relation, lockmode);
1317 
1318  if (r->rd_rel->relkind == RELKIND_INDEX)
1319  ereport(ERROR,
1320  (errcode(ERRCODE_WRONG_OBJECT_TYPE),
1321  errmsg("\"%s\" is an index",
1322  RelationGetRelationName(r))));
 1323  else if (r->rd_rel->relkind == RELKIND_COMPOSITE_TYPE)
1324  ereport(ERROR,
1325  (errcode(ERRCODE_WRONG_OBJECT_TYPE),
1326  errmsg("\"%s\" is a composite type",
1327  RelationGetRelationName(r))));
 1328 
1329  return r;
1330 }
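A sketch of opening a heap by name: build a RangeVar (makeRangeVar is in nodes/makefuncs.h) and hand it to heap_openrv. The schema and table names are placeholders.

static Relation
example_open_by_name(void)
{
    RangeVar *rv = makeRangeVar("public", "my_table", -1);

    /* errors out if the relation does not exist or is not a heap */
    return heap_openrv(rv, RowExclusiveLock);
}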
1331 
1332 /* ----------------
1333  * heap_openrv_extended - open a heap relation specified
1334  * by a RangeVar node
1335  *
1336  * As above, but optionally return NULL instead of failing for
1337  * relation-not-found.
1338  * ----------------
1339  */
1340 Relation
1341 heap_openrv_extended(const RangeVar *relation, LOCKMODE lockmode,
1342  bool missing_ok)
1343 {
1344  Relation r;
1345 
1346  r = relation_openrv_extended(relation, lockmode, missing_ok);
1347 
1348  if (r)
1349  {
1350  if (r->rd_rel->relkind == RELKIND_INDEX)
1351  ereport(ERROR,
1352  (errcode(ERRCODE_WRONG_OBJECT_TYPE),
1353  errmsg("\"%s\" is an index",
1354  RelationGetRelationName(r))));
 1355  else if (r->rd_rel->relkind == RELKIND_COMPOSITE_TYPE)
1356  ereport(ERROR,
1357  (errcode(ERRCODE_WRONG_OBJECT_TYPE),
1358  errmsg("\"%s\" is a composite type",
1359  RelationGetRelationName(r))));
 1360  }
1361 
1362  return r;
1363 }
1364 
1365 
1366 /* ----------------
1367  * heap_beginscan - begin relation scan
1368  *
1369  * heap_beginscan is the "standard" case.
1370  *
1371  * heap_beginscan_catalog differs in setting up its own temporary snapshot.
1372  *
1373  * heap_beginscan_strat offers an extended API that lets the caller control
1374  * whether a nondefault buffer access strategy can be used, and whether
1375  * syncscan can be chosen (possibly resulting in the scan not starting from
1376  * block zero). Both of these default to TRUE with plain heap_beginscan.
1377  *
1378  * heap_beginscan_bm is an alternative entry point for setting up a
1379  * HeapScanDesc for a bitmap heap scan. Although that scan technology is
1380  * really quite unlike a standard seqscan, there is just enough commonality
1381  * to make it worth using the same data structure.
1382  *
1383  * heap_beginscan_sampling is an alternative entry point for setting up a
1384  * HeapScanDesc for a TABLESAMPLE scan. As with bitmap scans, it's worth
1385  * using the same data structure although the behavior is rather different.
1386  * In addition to the options offered by heap_beginscan_strat, this call
1387  * also allows control of whether page-mode visibility checking is used.
1388  * ----------------
1389  */
1390 HeapScanDesc
 1391 heap_beginscan(Relation relation, Snapshot snapshot,
1392  int nkeys, ScanKey key)
1393 {
1394  return heap_beginscan_internal(relation, snapshot, nkeys, key, NULL,
1395  true, true, true, false, false, false);
1396 }
1397 
1398 HeapScanDesc
 1399 heap_beginscan_catalog(Relation relation, int nkeys, ScanKey key)
1400 {
1401  Oid relid = RelationGetRelid(relation);
1402  Snapshot snapshot = RegisterSnapshot(GetCatalogSnapshot(relid));
1403 
1404  return heap_beginscan_internal(relation, snapshot, nkeys, key, NULL,
1405  true, true, true, false, false, true);
1406 }
1407 
1408 HeapScanDesc
 1409 heap_beginscan_strat(Relation relation, Snapshot snapshot,
 1410  int nkeys, ScanKey key,
1411  bool allow_strat, bool allow_sync)
1412 {
1413  return heap_beginscan_internal(relation, snapshot, nkeys, key, NULL,
1414  allow_strat, allow_sync, true,
1415  false, false, false);
1416 }
1417 
1418 HeapScanDesc
 1419 heap_beginscan_bm(Relation relation, Snapshot snapshot,
 1420  int nkeys, ScanKey key)
1421 {
1422  return heap_beginscan_internal(relation, snapshot, nkeys, key, NULL,
1423  false, false, true, true, false, false);
1424 }
1425 
1426 HeapScanDesc
 1427 heap_beginscan_sampling(Relation relation, Snapshot snapshot,
 1428  int nkeys, ScanKey key,
1429  bool allow_strat, bool allow_sync, bool allow_pagemode)
1430 {
1431  return heap_beginscan_internal(relation, snapshot, nkeys, key, NULL,
1432  allow_strat, allow_sync, allow_pagemode,
1433  false, true, false);
1434 }
1435 
1436 static HeapScanDesc
1437 heap_beginscan_internal(Relation relation, Snapshot snapshot,
 1438  int nkeys, ScanKey key,
1439  ParallelHeapScanDesc parallel_scan,
1440  bool allow_strat,
1441  bool allow_sync,
1442  bool allow_pagemode,
1443  bool is_bitmapscan,
1444  bool is_samplescan,
1445  bool temp_snap)
1446 {
1447  HeapScanDesc scan;
1448 
1449  /*
1450  * increment relation ref count while scanning relation
1451  *
1452  * This is just to make really sure the relcache entry won't go away while
1453  * the scan has a pointer to it. Caller should be holding the rel open
1454  * anyway, so this is redundant in all normal scenarios...
1455  */
1456  RelationIncrementReferenceCount(relation);
 1457 
1458  /*
1459  * allocate and initialize scan descriptor
1460  */
1461  scan = (HeapScanDesc) palloc(sizeof(HeapScanDescData));
1462 
1463  scan->rs_rd = relation;
1464  scan->rs_snapshot = snapshot;
1465  scan->rs_nkeys = nkeys;
1466  scan->rs_bitmapscan = is_bitmapscan;
1467  scan->rs_samplescan = is_samplescan;
1468  scan->rs_strategy = NULL; /* set in initscan */
1469  scan->rs_allow_strat = allow_strat;
1470  scan->rs_allow_sync = allow_sync;
1471  scan->rs_temp_snap = temp_snap;
1472  scan->rs_parallel = parallel_scan;
1473 
1474  /*
1475  * we can use page-at-a-time mode if it's an MVCC-safe snapshot
1476  */
1477  scan->rs_pageatatime = allow_pagemode && IsMVCCSnapshot(snapshot);
1478 
1479  /*
1480  * For a seqscan in a serializable transaction, acquire a predicate lock
1481  * on the entire relation. This is required not only to lock all the
1482  * matching tuples, but also to conflict with new insertions into the
1483  * table. In an indexscan, we take page locks on the index pages covering
1484  * the range specified in the scan qual, but in a heap scan there is
1485  * nothing more fine-grained to lock. A bitmap scan is a different story,
1486  * there we have already scanned the index and locked the index pages
1487  * covering the predicate. But in that case we still have to lock any
1488  * matching heap tuples.
1489  */
1490  if (!is_bitmapscan)
1491  PredicateLockRelation(relation, snapshot);
1492 
1493  /* we only need to set this up once */
1494  scan->rs_ctup.t_tableOid = RelationGetRelid(relation);
1495 
1496  /*
1497  * we do this here instead of in initscan() because heap_rescan also calls
1498  * initscan() and we don't want to allocate memory again
1499  */
1500  if (nkeys > 0)
1501  scan->rs_key = (ScanKey) palloc(sizeof(ScanKeyData) * nkeys);
1502  else
1503  scan->rs_key = NULL;
1504 
1505  initscan(scan, key, false);
1506 
1507  return scan;
1508 }
1509 
1510 /* ----------------
1511  * heap_rescan - restart a relation scan
1512  * ----------------
1513  */
1514 void
1515 heap_rescan(HeapScanDesc scan,
 1516  ScanKey key)
1517 {
1518  /*
1519  * unpin scan buffers
1520  */
1521  if (BufferIsValid(scan->rs_cbuf))
1522  ReleaseBuffer(scan->rs_cbuf);
1523 
1524  /*
1525  * reinitialize scan descriptor
1526  */
1527  initscan(scan, key, true);
1528 
1529  /*
1530  * reset parallel scan, if present
1531  */
1532  if (scan->rs_parallel != NULL)
1533  {
1534  ParallelHeapScanDesc parallel_scan;
1535 
1536  /*
1537  * Caller is responsible for making sure that all workers have
1538  * finished the scan before calling this, so it really shouldn't be
1539  * necessary to acquire the mutex at all. We acquire it anyway, just
1540  * to be tidy.
1541  */
1542  parallel_scan = scan->rs_parallel;
1543  SpinLockAcquire(&parallel_scan->phs_mutex);
1544  parallel_scan->phs_cblock = parallel_scan->phs_startblock;
1545  SpinLockRelease(&parallel_scan->phs_mutex);
1546  }
1547 }
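A small sketch of rescanning: run the same scan twice without rebuilding the descriptor, much as a rewound cursor would. heap_rescan keeps the previously chosen startblock, so both passes visit pages in the same order; the helper name is invented.

static void
example_scan_twice(HeapScanDesc scan)
{
    HeapTuple tuple;

    while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
        ;                       /* first pass over the relation */

    heap_rescan(scan, NULL);    /* reset to the start, keep scan keys */

    while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
        ;                       /* second pass, same page order */
}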
1548 
1549 /* ----------------
1550  * heap_rescan_set_params - restart a relation scan after changing params
1551  *
1552  * This call allows changing the buffer strategy, syncscan, and pagemode
1553  * options before starting a fresh scan. Note that although the actual use
1554  * of syncscan might change (effectively, enabling or disabling reporting),
1555  * the previously selected startblock will be kept.
1556  * ----------------
1557  */
1558 void
1559 heap_rescan_set_params(HeapScanDesc scan, ScanKey key,
 1560  bool allow_strat, bool allow_sync, bool allow_pagemode)
1561 {
1562  /* adjust parameters */
1563  scan->rs_allow_strat = allow_strat;
1564  scan->rs_allow_sync = allow_sync;
1565  scan->rs_pageatatime = allow_pagemode && IsMVCCSnapshot(scan->rs_snapshot);
1566  /* ... and rescan */
1567  heap_rescan(scan, key);
1568 }
1569 
1570 /* ----------------
1571  * heap_endscan - end relation scan
1572  *
1573  * See how to integrate with index scans.
1574  * Check handling of reldesc caching.
1575  * ----------------
1576  */
1577 void
1578 heap_endscan(HeapScanDesc scan)
 1579 {
1580  /* Note: no locking manipulations needed */
1581 
1582  /*
1583  * unpin scan buffers
1584  */
1585  if (BufferIsValid(scan->rs_cbuf))
1586  ReleaseBuffer(scan->rs_cbuf);
1587 
1588  /*
1589  * decrement relation reference count and free scan descriptor storage
1590  */
1591  RelationDecrementReferenceCount(scan->rs_rd);
 1592 
1593  if (scan->rs_key)
1594  pfree(scan->rs_key);
1595 
1596  if (scan->rs_strategy != NULL)
1597  FreeAccessStrategy(scan->rs_strategy);
 1598 
1599  if (scan->rs_temp_snap)
1600  UnregisterSnapshot(scan->rs_snapshot);
 1601 
1602  pfree(scan);
1603 }
1604 
1605 /* ----------------
1606  * heap_parallelscan_estimate - estimate storage for ParallelHeapScanDesc
1607  *
1608  * Sadly, this doesn't reduce to a constant, because the size required
1609  * to serialize the snapshot can vary.
1610  * ----------------
1611  */
1612 Size
1613 heap_parallelscan_estimate(Snapshot snapshot)
 1614 {
1615  return add_size(offsetof(ParallelHeapScanDescData, phs_snapshot_data),
1616  EstimateSnapshotSpace(snapshot));
1617 }
1618 
1619 /* ----------------
1620  * heap_parallelscan_initialize - initialize ParallelHeapScanDesc
1621  *
1622  * Must allow as many bytes of shared memory as returned by
1623  * heap_parallelscan_estimate. Call this just once in the leader
1624  * process; then, individual workers attach via heap_beginscan_parallel.
1625  * ----------------
1626  */
1627 void
1628 heap_parallelscan_initialize(ParallelHeapScanDesc target, Relation relation,
 1629  Snapshot snapshot)
1630 {
1631  target->phs_relid = RelationGetRelid(relation);
1632  target->phs_nblocks = RelationGetNumberOfBlocks(relation);
1633  /* compare phs_syncscan initialization to similar logic in initscan */
1634  target->phs_syncscan = synchronize_seqscans &&
1635  !RelationUsesLocalBuffers(relation) &&
1636  target->phs_nblocks > NBuffers / 4;
1637  SpinLockInit(&target->phs_mutex);
1638  target->phs_cblock = InvalidBlockNumber;
1639  target->phs_startblock = InvalidBlockNumber;
 1640  SerializeSnapshot(snapshot, target->phs_snapshot_data);
1641 }
1642 
1643 /* ----------------
1644  * heap_beginscan_parallel - join a parallel scan
1645  *
1646  * Caller must hold a suitable lock on the correct relation.
1647  * ----------------
1648  */
1649 HeapScanDesc
 1650 heap_beginscan_parallel(Relation relation, ParallelHeapScanDesc parallel_scan)
 1651 {
1652  Snapshot snapshot;
1653 
1654  Assert(RelationGetRelid(relation) == parallel_scan->phs_relid);
1655  snapshot = RestoreSnapshot(parallel_scan->phs_snapshot_data);
1656  RegisterSnapshot(snapshot);
1657 
1658  return heap_beginscan_internal(relation, snapshot, 0, NULL, parallel_scan,
1659  true, true, true, false, false, true);
1660 }
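A leader-side setup sketch. In the real executor the ParallelHeapScanDesc lives in dynamic shared memory set up via access/parallel.c; palloc is used here only to keep the illustration self-contained, and the helper name is invented.

static HeapScanDesc
example_setup_parallel_scan(Relation rel, Snapshot snapshot)
{
    Size        size = heap_parallelscan_estimate(snapshot);
    ParallelHeapScanDesc pscan = (ParallelHeapScanDesc) palloc(size);

    heap_parallelscan_initialize(pscan, rel, snapshot);

    /*
     * Each participant (leader and workers) would then attach with
     * heap_beginscan_parallel and drive the scan with heap_getnext.
     */
    return heap_beginscan_parallel(rel, pscan);
}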
1661 
1662 /* ----------------
1663  * heap_parallelscan_nextpage - get the next page to scan
1664  *
1665  * Get the next page to scan. Even if there are no pages left to scan,
1666  * another backend could have grabbed a page to scan and not yet finished
1667  * looking at it, so it doesn't follow that the scan is done when the
1668  * first backend gets an InvalidBlockNumber return.
1669  * ----------------
1670  */
1671 static BlockNumber
1672 heap_parallelscan_nextpage(HeapScanDesc scan)
 1673 {
 1674  BlockNumber page = InvalidBlockNumber;
 1675  BlockNumber sync_startpage = InvalidBlockNumber;
1676  BlockNumber report_page = InvalidBlockNumber;
1677  ParallelHeapScanDesc parallel_scan;
1678 
1679  Assert(scan->rs_parallel);
1680  parallel_scan = scan->rs_parallel;
1681 
1682 retry:
1683  /* Grab the spinlock. */
1684  SpinLockAcquire(&parallel_scan->phs_mutex);
1685 
1686  /*
1687  * If the scan's startblock has not yet been initialized, we must do so
1688  * now. If this is not a synchronized scan, we just start at block 0, but
1689  * if it is a synchronized scan, we must get the starting position from
1690  * the synchronized scan machinery. We can't hold the spinlock while
1691  * doing that, though, so release the spinlock, get the information we
1692  * need, and retry. If nobody else has initialized the scan in the
1693  * meantime, we'll fill in the value we fetched on the second time
1694  * through.
1695  */
1696  if (parallel_scan->phs_startblock == InvalidBlockNumber)
1697  {
1698  if (!parallel_scan->phs_syncscan)
1699  parallel_scan->phs_startblock = 0;
1700  else if (sync_startpage != InvalidBlockNumber)
1701  parallel_scan->phs_startblock = sync_startpage;
1702  else
1703  {
1704  SpinLockRelease(&parallel_scan->phs_mutex);
1705  sync_startpage = ss_get_location(scan->rs_rd, scan->rs_nblocks);
1706  goto retry;
1707  }
1708  parallel_scan->phs_cblock = parallel_scan->phs_startblock;
1709  }
1710 
1711  /*
1712  * The current block number is the next one that needs to be scanned,
1713  * unless it's InvalidBlockNumber already, in which case there are no more
1714  * blocks to scan. After remembering the current value, we must advance
1715  * it so that the next call to this function returns the next block to be
1716  * scanned.
1717  */
1718  page = parallel_scan->phs_cblock;
1719  if (page != InvalidBlockNumber)
1720  {
1721  parallel_scan->phs_cblock++;
1722  if (parallel_scan->phs_cblock >= scan->rs_nblocks)
1723  parallel_scan->phs_cblock = 0;
1724  if (parallel_scan->phs_cblock == parallel_scan->phs_startblock)
1725  {
1726  parallel_scan->phs_cblock = InvalidBlockNumber;
1727  report_page = parallel_scan->phs_startblock;
1728  }
1729  }
1730 
1731  /* Release the lock. */
1732  SpinLockRelease(&parallel_scan->phs_mutex);
1733 
1734  /*
1735  * Report scan location. Normally, we report the current page number.
1736  * When we reach the end of the scan, though, we report the starting page,
1737  * not the ending page, just so the starting positions for later scans
1738  * don't slew backwards. We only report the position at the end of the
 1739  * scan once, though: subsequent callers will report nothing, since
1740  * they will have page == InvalidBlockNumber.
1741  */
1742  if (scan->rs_syncscan)
1743  {
1744  if (report_page == InvalidBlockNumber)
1745  report_page = page;
1746  if (report_page != InvalidBlockNumber)
1747  ss_report_location(scan->rs_rd, report_page);
1748  }
1749 
1750  return page;
1751 }
1752 
1753 /* ----------------
1754  * heap_update_snapshot
1755  *
1756  * Update snapshot info in heap scan descriptor.
1757  * ----------------
1758  */
1759 void
1760 heap_update_snapshot(HeapScanDesc scan, Snapshot snapshot)
 1761 {
1762  Assert(IsMVCCSnapshot(snapshot));
1763 
1764  RegisterSnapshot(snapshot);
1765  scan->rs_snapshot = snapshot;
1766  scan->rs_temp_snap = true;
1767 }
1768 
1769 /* ----------------
1770  * heap_getnext - retrieve next tuple in scan
1771  *
1772  * Fix to work with index relations.
1773  * We don't return the buffer anymore, but you can get it from the
1774  * returned HeapTuple.
1775  * ----------------
1776  */
1777 
1778 #ifdef HEAPDEBUGALL
1779 #define HEAPDEBUG_1 \
1780  elog(DEBUG2, "heap_getnext([%s,nkeys=%d],dir=%d) called", \
1781  RelationGetRelationName(scan->rs_rd), scan->rs_nkeys, (int) direction)
1782 #define HEAPDEBUG_2 \
1783  elog(DEBUG2, "heap_getnext returning EOS")
1784 #define HEAPDEBUG_3 \
1785  elog(DEBUG2, "heap_getnext returning tuple")
1786 #else
1787 #define HEAPDEBUG_1
1788 #define HEAPDEBUG_2
1789 #define HEAPDEBUG_3
1790 #endif /* !defined(HEAPDEBUGALL) */
1791 
1792 
1793 HeapTuple
1794 heap_getnext(HeapScanDesc scan, ScanDirection direction)
 1795 {
1796  /* Note: no locking manipulations needed */
1797 
1798  HEAPDEBUG_1; /* heap_getnext( info ) */
1799 
1800  if (scan->rs_pageatatime)
1801  heapgettup_pagemode(scan, direction,
1802  scan->rs_nkeys, scan->rs_key);
1803  else
1804  heapgettup(scan, direction, scan->rs_nkeys, scan->rs_key);
1805 
1806  if (scan->rs_ctup.t_data == NULL)
1807  {
1808  HEAPDEBUG_2; /* heap_getnext returning EOS */
1809  return NULL;
1810  }
1811 
1812  /*
1813  * if we get here it means we have a new current scan tuple, so point to
1814  * the proper return buffer and return the tuple.
1815  */
1816  HEAPDEBUG_3; /* heap_getnext returning tuple */
1817 
1818  pgstat_count_heap_getnext(scan->rs_rd);
 1819 
1820  return &(scan->rs_ctup);
1821 }
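The classic sequential-scan loop built from the routines above, as a hedged sketch: it assumes the caller is inside a transaction with an active snapshot available (GetActiveSnapshot is from utils/snapmgr.h), and the helper name and OID parameter are illustrative.

static void
example_seqscan(Oid relid)
{
    Relation     rel;
    HeapScanDesc scan;
    HeapTuple    tuple;

    rel = heap_open(relid, AccessShareLock);
    scan = heap_beginscan(rel, GetActiveSnapshot(), 0, NULL);

    while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
    {
        /* tuple points into the scan's current buffer; copy it if kept */
    }

    heap_endscan(scan);
    heap_close(rel, AccessShareLock);
}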
1822 
1823 /*
1824  * heap_fetch - retrieve tuple with given tid
1825  *
1826  * On entry, tuple->t_self is the TID to fetch. We pin the buffer holding
1827  * the tuple, fill in the remaining fields of *tuple, and check the tuple
1828  * against the specified snapshot.
1829  *
1830  * If successful (tuple found and passes snapshot time qual), then *userbuf
1831  * is set to the buffer holding the tuple and TRUE is returned. The caller
1832  * must unpin the buffer when done with the tuple.
1833  *
1834  * If the tuple is not found (ie, item number references a deleted slot),
1835  * then tuple->t_data is set to NULL and FALSE is returned.
1836  *
1837  * If the tuple is found but fails the time qual check, then FALSE is returned
1838  * but tuple->t_data is left pointing to the tuple.
1839  *
1840  * keep_buf determines what is done with the buffer in the FALSE-result cases.
1841  * When the caller specifies keep_buf = true, we retain the pin on the buffer
1842  * and return it in *userbuf (so the caller must eventually unpin it); when
1843  * keep_buf = false, the pin is released and *userbuf is set to InvalidBuffer.
1844  *
1845  * stats_relation is the relation to charge the heap_fetch operation against
1846  * for statistical purposes. (This could be the heap rel itself, an
1847  * associated index, or NULL to not count the fetch at all.)
1848  *
1849  * heap_fetch does not follow HOT chains: only the exact TID requested will
1850  * be fetched.
1851  *
1852  * It is somewhat inconsistent that we ereport() on invalid block number but
1853  * return false on invalid item number. There are a couple of reasons though.
1854  * One is that the caller can relatively easily check the block number for
1855  * validity, but cannot check the item number without reading the page
1856  * himself. Another is that when we are following a t_ctid link, we can be
1857  * reasonably confident that the page number is valid (since VACUUM shouldn't
1858  * truncate off the destination page without having killed the referencing
1859  * tuple first), but the item number might well not be good.
1860  */
1861 bool
1862 heap_fetch(Relation relation,
 1863  Snapshot snapshot,
1864  HeapTuple tuple,
1865  Buffer *userbuf,
1866  bool keep_buf,
1867  Relation stats_relation)
1868 {
1869  ItemPointer tid = &(tuple->t_self);
1870  ItemId lp;
1871  Buffer buffer;
1872  Page page;
1873  OffsetNumber offnum;
1874  bool valid;
1875 
1876  /*
1877  * Fetch and pin the appropriate page of the relation.
1878  */
1879  buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid));
1880 
1881  /*
1882  * Need share lock on buffer to examine tuple commit status.
1883  */
1884  LockBuffer(buffer, BUFFER_LOCK_SHARE);
1885  page = BufferGetPage(buffer);
1886  TestForOldSnapshot(snapshot, relation, page);
1887 
1888  /*
1889  * We'd better check for an out-of-range offnum, in case items have been
1890  * removed (e.g. by VACUUM) since the TID was obtained.
1891  */
1892  offnum = ItemPointerGetOffsetNumber(tid);
1893  if (offnum < FirstOffsetNumber || offnum > PageGetMaxOffsetNumber(page))
1894  {
1895  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
1896  if (keep_buf)
1897  *userbuf = buffer;
1898  else
1899  {
1900  ReleaseBuffer(buffer);
1901  *userbuf = InvalidBuffer;
1902  }
1903  tuple->t_data = NULL;
1904  return false;
1905  }
1906 
1907  /*
1908  * get the item line pointer corresponding to the requested tid
1909  */
1910  lp = PageGetItemId(page, offnum);
1911 
1912  /*
1913  * Must check for deleted tuple.
1914  */
1915  if (!ItemIdIsNormal(lp))
1916  {
1917  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
1918  if (keep_buf)
1919  *userbuf = buffer;
1920  else
1921  {
1922  ReleaseBuffer(buffer);
1923  *userbuf = InvalidBuffer;
1924  }
1925  tuple->t_data = NULL;
1926  return false;
1927  }
1928 
1929  /*
1930  * fill in *tuple fields
1931  */
1932  tuple->t_data = (HeapTupleHeader) PageGetItem(page, lp);
1933  tuple->t_len = ItemIdGetLength(lp);
1934  tuple->t_tableOid = RelationGetRelid(relation);
1935 
1936  /*
1937  * check time qualification of tuple, then release lock
1938  */
1939  valid = HeapTupleSatisfiesVisibility(tuple, snapshot, buffer);
1940 
1941  if (valid)
1942  PredicateLockTuple(relation, tuple, snapshot);
1943 
1944  CheckForSerializableConflictOut(valid, relation, tuple, buffer, snapshot);
1945 
1946  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
1947 
1948  if (valid)
1949  {
1950  /*
1951  * All checks passed, so return the tuple as valid. Caller is now
1952  * responsible for releasing the buffer.
1953  */
1954  *userbuf = buffer;
1955 
1956  /* Count the successful fetch against appropriate rel, if any */
1957  if (stats_relation != NULL)
1958  pgstat_count_heap_fetch(stats_relation);
1959 
1960  return true;
1961  }
1962 
1963  /* Tuple failed time qual, but maybe caller wants to see it anyway. */
1964  if (keep_buf)
1965  *userbuf = buffer;
1966  else
1967  {
1968  ReleaseBuffer(buffer);
1969  *userbuf = InvalidBuffer;
1970  }
1971 
1972  return false;
1973 }
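/*
 * Editor's illustrative sketch (not part of heapam.c): fetching one tuple by
 * TID with heap_fetch().  On a TRUE result the caller owns a pin on *userbuf
 * and must ReleaseBuffer() it; here keep_buf = false, so nothing is retained
 * on failure.  example_fetch_exists and its arguments are hypothetical.
 */
#if 0
static bool
example_fetch_exists(Relation rel, Snapshot snapshot, ItemPointer tid)
{
	HeapTupleData tuple;
	Buffer		buffer;

	tuple.t_self = *tid;		/* heap_fetch reads the TID from t_self */
	if (!heap_fetch(rel, snapshot, &tuple, &buffer, false, NULL))
		return false;			/* not found, or failed the snapshot test */

	/* ... inspect tuple.t_data while the buffer pin is held ... */
	ReleaseBuffer(buffer);
	return true;
}
#endif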
1974 
1975 /*
1976  * heap_hot_search_buffer - search HOT chain for tuple satisfying snapshot
1977  *
1978  * On entry, *tid is the TID of a tuple (either a simple tuple, or the root
1979  * of a HOT chain), and buffer is the buffer holding this tuple. We search
1980  * for the first chain member satisfying the given snapshot. If one is
1981  * found, we update *tid to reference that tuple's offset number, and
1982  * return TRUE. If no match, return FALSE without modifying *tid.
1983  *
1984  * heapTuple is a caller-supplied buffer. When a match is found, we return
1985  * the tuple here, in addition to updating *tid. If no match is found, the
1986  * contents of this buffer on return are undefined.
1987  *
1988  * If all_dead is not NULL, we check non-visible tuples to see if they are
1989  * globally dead; *all_dead is set TRUE if all members of the HOT chain
1990  * are vacuumable, FALSE if not.
1991  *
1992  * Unlike heap_fetch, the caller must already have pin and (at least) share
1993  * lock on the buffer; it is still pinned/locked at exit. Also unlike
1994  * heap_fetch, we do not report any pgstats count; caller may do so if wanted.
1995  */
1996 bool
1997 heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer,
1998  Snapshot snapshot, HeapTuple heapTuple,
1999  bool *all_dead, bool first_call)
2000 {
2001  Page dp = (Page) BufferGetPage(buffer);
2002  TransactionId prev_xmax = InvalidTransactionId;
2003  OffsetNumber offnum;
2004  bool at_chain_start;
2005  bool valid;
2006  bool skip;
2007 
2008  /* If this is not the first call, previous call returned a (live!) tuple */
2009  if (all_dead)
2010  *all_dead = first_call;
2011 
2012  Assert(TransactionIdIsValid(RecentGlobalXmin));
2013 
2014  Assert(ItemPointerGetBlockNumber(tid) == BufferGetBlockNumber(buffer));
2015  offnum = ItemPointerGetOffsetNumber(tid);
2016  at_chain_start = first_call;
2017  skip = !first_call;
2018 
2019  heapTuple->t_self = *tid;
2020 
2021  /* Scan through possible multiple members of HOT-chain */
2022  for (;;)
2023  {
2024  ItemId lp;
2025 
2026  /* check for bogus TID */
2027  if (offnum < FirstOffsetNumber || offnum > PageGetMaxOffsetNumber(dp))
2028  break;
2029 
2030  lp = PageGetItemId(dp, offnum);
2031 
2032  /* check for unused, dead, or redirected items */
2033  if (!ItemIdIsNormal(lp))
2034  {
2035  /* We should only see a redirect at start of chain */
2036  if (ItemIdIsRedirected(lp) && at_chain_start)
2037  {
2038  /* Follow the redirect */
2039  offnum = ItemIdGetRedirect(lp);
2040  at_chain_start = false;
2041  continue;
2042  }
2043  /* else must be end of chain */
2044  break;
2045  }
2046 
2047  heapTuple->t_data = (HeapTupleHeader) PageGetItem(dp, lp);
2048  heapTuple->t_len = ItemIdGetLength(lp);
2049  heapTuple->t_tableOid = RelationGetRelid(relation);
2050  ItemPointerSetOffsetNumber(&heapTuple->t_self, offnum);
2051 
2052  /*
2053  * Shouldn't see a HEAP_ONLY tuple at chain start.
2054  */
2055  if (at_chain_start && HeapTupleIsHeapOnly(heapTuple))
2056  break;
2057 
2058  /*
2059  * The xmin should match the previous xmax value, else chain is
2060  * broken.
2061  */
2062  if (TransactionIdIsValid(prev_xmax) &&
2063  !TransactionIdEquals(prev_xmax,
2064  HeapTupleHeaderGetXmin(heapTuple->t_data)))
2065  break;
2066 
2067  /*
2068  * When first_call is true (and thus, skip is initially false) we'll
2069  * return the first tuple we find. But on later passes, heapTuple
2070  * will initially be pointing to the tuple we returned last time.
2071  * Returning it again would be incorrect (and would loop forever), so
2072  * we skip it and return the next match we find.
2073  */
2074  if (!skip)
2075  {
2076  /*
2077  * For the benefit of logical decoding, have t_self point at the
2078  * element of the HOT chain we're currently investigating instead
2079  * of the root tuple of the HOT chain. This is important because
2080  * the *Satisfies routine for historical mvcc snapshots needs the
2081  * correct tid to decide about the visibility in some cases.
2082  */
2083  ItemPointerSet(&(heapTuple->t_self), BufferGetBlockNumber(buffer), offnum);
2084 
2085  /* If it's visible per the snapshot, we must return it */
2086  valid = HeapTupleSatisfiesVisibility(heapTuple, snapshot, buffer);
2087  CheckForSerializableConflictOut(valid, relation, heapTuple,
2088  buffer, snapshot);
2089  /* reset to original, non-redirected, tid */
2090  heapTuple->t_self = *tid;
2091 
2092  if (valid)
2093  {
2094  ItemPointerSetOffsetNumber(tid, offnum);
2095  PredicateLockTuple(relation, heapTuple, snapshot);
2096  if (all_dead)
2097  *all_dead = false;
2098  return true;
2099  }
2100  }
2101  skip = false;
2102 
2103  /*
2104  * If we can't see it, maybe no one else can either. At caller
2105  * request, check whether all chain members are dead to all
2106  * transactions.
2107  */
2108  if (all_dead && *all_dead &&
2109  !HeapTupleIsSurelyDead(heapTuple, RecentGlobalXmin))
2110  *all_dead = false;
2111 
2112  /*
2113  * Check to see if HOT chain continues past this tuple; if so fetch
2114  * the next offnum and loop around.
2115  */
2116  if (HeapTupleIsHotUpdated(heapTuple))
2117  {
2118  Assert(ItemPointerGetBlockNumber(&heapTuple->t_data->t_ctid) ==
2119  ItemPointerGetBlockNumber(tid));
2120  offnum = ItemPointerGetOffsetNumber(&heapTuple->t_data->t_ctid);
2121  at_chain_start = false;
2122  prev_xmax = HeapTupleHeaderGetUpdateXid(heapTuple->t_data);
2123  }
2124  else
2125  break; /* end of chain */
2126  }
2127 
2128  return false;
2129 }
2130 
2131 /*
2132  * heap_hot_search - search HOT chain for tuple satisfying snapshot
2133  *
2134  * This has the same API as heap_hot_search_buffer, except that the caller
2135  * does not provide the buffer containing the page, rather we access it
2136  * locally.
2137  */
2138 bool
2139 heap_hot_search(ItemPointer tid, Relation relation, Snapshot snapshot,
2140  bool *all_dead)
2141 {
2142  bool result;
2143  Buffer buffer;
2144  HeapTupleData heapTuple;
2145 
2146  buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid));
2147  LockBuffer(buffer, BUFFER_LOCK_SHARE);
2148  result = heap_hot_search_buffer(tid, relation, buffer, snapshot,
2149  &heapTuple, all_dead, true);
2150  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
2151  ReleaseBuffer(buffer);
2152  return result;
2153 }
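/*
 * Editor's illustrative sketch (not part of heapam.c): heap_hot_search()
 * resolves a root TID through its HOT chain, which is how a caller can ask
 * "is any version of this row visible under this snapshot?" without holding
 * the buffer itself.  example_tid_is_visible is hypothetical.
 */
#if 0
static bool
example_tid_is_visible(Relation rel, Snapshot snapshot, ItemPointer tid)
{
	bool		all_dead;

	/* *tid is updated to the matching chain member when TRUE is returned */
	return heap_hot_search(tid, rel, snapshot, &all_dead);
}
#endif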
2154 
2155 /*
2156  * heap_get_latest_tid - get the latest tid of a specified tuple
2157  *
2158  * Actually, this gets the latest version that is visible according to
2159  * the passed snapshot. You can pass SnapshotDirty to get the very latest,
2160  * possibly uncommitted version.
2161  *
2162  * *tid is both an input and an output parameter: it is updated to
2163  * show the latest version of the row. Note that it will not be changed
2164  * if no version of the row passes the snapshot test.
2165  */
2166 void
2167 heap_get_latest_tid(Relation relation,
2168  Snapshot snapshot,
2169  ItemPointer tid)
2170 {
2171  BlockNumber blk;
2172  ItemPointerData ctid;
2173  TransactionId priorXmax;
2174 
2175  /* this is to avoid Assert failures on bad input */
2176  if (!ItemPointerIsValid(tid))
2177  return;
2178 
2179  /*
2180  * Since this can be called with user-supplied TID, don't trust the input
2181  * too much. (RelationGetNumberOfBlocks is an expensive check, so we
2182  * don't check t_ctid links again this way. Note that it would not do to
2183  * call it just once and save the result, either.)
2184  */
2185  blk = ItemPointerGetBlockNumber(tid);
2186  if (blk >= RelationGetNumberOfBlocks(relation))
2187  elog(ERROR, "block number %u is out of range for relation \"%s\"",
2188  blk, RelationGetRelationName(relation));
2189 
2190  /*
2191  * Loop to chase down t_ctid links. At top of loop, ctid is the tuple we
2192  * need to examine, and *tid is the TID we will return if ctid turns out
2193  * to be bogus.
2194  *
2195  * Note that we will loop until we reach the end of the t_ctid chain.
2196  * Depending on the snapshot passed, there might be at most one visible
2197  * version of the row, but we don't try to optimize for that.
2198  */
2199  ctid = *tid;
2200  priorXmax = InvalidTransactionId; /* cannot check first XMIN */
2201  for (;;)
2202  {
2203  Buffer buffer;
2204  Page page;
2205  OffsetNumber offnum;
2206  ItemId lp;
2207  HeapTupleData tp;
2208  bool valid;
2209 
2210  /*
2211  * Read, pin, and lock the page.
2212  */
2213  buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(&ctid));
2214  LockBuffer(buffer, BUFFER_LOCK_SHARE);
2215  page = BufferGetPage(buffer);
2216  TestForOldSnapshot(snapshot, relation, page);
2217 
2218  /*
2219  * Check for bogus item number. This is not treated as an error
2220  * condition because it can happen while following a t_ctid link. We
2221  * just assume that the prior tid is OK and return it unchanged.
2222  */
2223  offnum = ItemPointerGetOffsetNumber(&ctid);
2224  if (offnum < FirstOffsetNumber || offnum > PageGetMaxOffsetNumber(page))
2225  {
2226  UnlockReleaseBuffer(buffer);
2227  break;
2228  }
2229  lp = PageGetItemId(page, offnum);
2230  if (!ItemIdIsNormal(lp))
2231  {
2232  UnlockReleaseBuffer(buffer);
2233  break;
2234  }
2235 
2236  /* OK to access the tuple */
2237  tp.t_self = ctid;
2238  tp.t_data = (HeapTupleHeader) PageGetItem(page, lp);
2239  tp.t_len = ItemIdGetLength(lp);
2240  tp.t_tableOid = RelationGetRelid(relation);
2241 
2242  /*
2243  * After following a t_ctid link, we might arrive at an unrelated
2244  * tuple. Check for XMIN match.
2245  */
2246  if (TransactionIdIsValid(priorXmax) &&
2247  !TransactionIdEquals(HeapTupleHeaderGetXmin(tp.t_data), priorXmax))
2248  {
2249  UnlockReleaseBuffer(buffer);
2250  break;
2251  }
2252 
2253  /*
2254  * Check time qualification of tuple; if visible, set it as the new
2255  * result candidate.
2256  */
2257  valid = HeapTupleSatisfiesVisibility(&tp, snapshot, buffer);
2258  CheckForSerializableConflictOut(valid, relation, &tp, buffer, snapshot);
2259  if (valid)
2260  *tid = ctid;
2261 
2262  /*
2263  * If there's a valid t_ctid link, follow it, else we're done.
2264  */
2265  if ((tp.t_data->t_infomask & HEAP_XMAX_INVALID) ||
2266  HeapTupleHeaderIsOnlyLocked(tp.t_data) ||
2267  ItemPointerEquals(&tp.t_self, &tp.t_data->t_ctid))
2268  {
2269  UnlockReleaseBuffer(buffer);
2270  break;
2271  }
2272 
2273  ctid = tp.t_data->t_ctid;
2274  priorXmax = HeapTupleHeaderGetUpdateXid(tp.t_data);
2275  UnlockReleaseBuffer(buffer);
2276  } /* end of loop */
2277 }
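/*
 * Editor's illustrative sketch (not part of heapam.c): chasing a row to its
 * newest visible version.  Because *tid is both input and output, the caller
 * passes a copy of the starting TID and reads the (possibly updated) value
 * back.  example_latest_tid is hypothetical.
 */
#if 0
static ItemPointerData
example_latest_tid(Relation rel, Snapshot snapshot, ItemPointer start)
{
	ItemPointerData tid = *start;

	/* with SnapshotDirty this follows the chain to the very latest version */
	heap_get_latest_tid(rel, snapshot, &tid);
	return tid;
}
#endif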
2278 
2279 
2280 /*
2281  * UpdateXmaxHintBits - update tuple hint bits after xmax transaction ends
2282  *
2283  * This is called after we have waited for the XMAX transaction to terminate.
2284  * If the transaction aborted, we guarantee the XMAX_INVALID hint bit will
2285  * be set on exit. If the transaction committed, we set the XMAX_COMMITTED
2286  * hint bit if possible --- but beware that that may not yet be possible,
2287  * if the transaction committed asynchronously.
2288  *
2289  * Note that if the transaction was a locker only, we set HEAP_XMAX_INVALID
2290  * even if it commits.
2291  *
2292  * Hence callers should look only at XMAX_INVALID.
2293  *
2294  * Note this is not allowed for tuples whose xmax is a multixact.
2295  */
2296 static void
2297 UpdateXmaxHintBits(HeapTupleHeader tuple, Buffer buffer, TransactionId xid)
2298 {
2299  Assert(TransactionIdIsValid(xid));
2300  Assert(!(tuple->t_infomask & HEAP_XMAX_IS_MULTI));
2301 
2302  if (!(tuple->t_infomask & (HEAP_XMAX_COMMITTED | HEAP_XMAX_INVALID)))
2303  {
2304  if (!HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask) &&
2305  TransactionIdDidCommit(xid))
2306  HeapTupleSetHintBits(tuple, buffer, HEAP_XMAX_COMMITTED,
2307  xid);
2308  else
2309  HeapTupleSetHintBits(tuple, buffer, HEAP_XMAX_INVALID,
2310  InvalidTransactionId);
2311  }
2312 }
2313 
2314 
2315 /*
2316  * GetBulkInsertState - prepare status object for a bulk insert
2317  */
2318 BulkInsertState
2319 GetBulkInsertState(void)
2320 {
2321  BulkInsertState bistate;
2322 
2323  bistate = (BulkInsertState) palloc(sizeof(BulkInsertStateData));
2324  bistate->strategy = GetAccessStrategy(BAS_BULKWRITE);
2325  bistate->current_buf = InvalidBuffer;
2326  return bistate;
2327 }
2328 
2329 /*
2330  * FreeBulkInsertState - clean up after finishing a bulk insert
2331  */
2332 void
2333 FreeBulkInsertState(BulkInsertState bistate)
2334 {
2335  if (bistate->current_buf != InvalidBuffer)
2336  ReleaseBuffer(bistate->current_buf);
2337  FreeAccessStrategy(bistate->strategy);
2338  pfree(bistate);
2339 }
2340 
2341 /*
2342  * ReleaseBulkInsertStatePin - release a buffer currently held in bistate
2343  */
2344 void
2345 ReleaseBulkInsertStatePin(BulkInsertState bistate)
2346 {
2347  if (bistate->current_buf != InvalidBuffer)
2348  ReleaseBuffer(bistate->current_buf);
2349  bistate->current_buf = InvalidBuffer;
2350 }
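/*
 * Editor's illustrative sketch (not part of heapam.c): the bulk-insert state
 * is created once, passed to every heap_insert() of a batch (so the pinned
 * target buffer and the buffer-ring strategy are reused), and freed at the
 * end.  example_bulk_load, tuples and ntuples are hypothetical.
 */
#if 0
static void
example_bulk_load(Relation rel, HeapTuple *tuples, int ntuples)
{
	BulkInsertState bistate = GetBulkInsertState();
	CommandId	cid = GetCurrentCommandId(true);
	int			i;

	for (i = 0; i < ntuples; i++)
		heap_insert(rel, tuples[i], cid, 0, bistate);

	FreeBulkInsertState(bistate);
}
#endif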
2351 
2352 
2353 /*
2354  * heap_insert - insert tuple into a heap
2355  *
2356  * The new tuple is stamped with current transaction ID and the specified
2357  * command ID.
2358  *
2359  * If the HEAP_INSERT_SKIP_WAL option is specified, the new tuple is not
2360  * logged in WAL, even for a non-temp relation. Safe usage of this behavior
2361  * requires that we arrange that all new tuples go into new pages not
2362  * containing any tuples from other transactions, and that the relation gets
2363  * fsync'd before commit. (See also heap_sync() comments)
2364  *
2365  * The HEAP_INSERT_SKIP_FSM option is passed directly to
2366  * RelationGetBufferForTuple, which see for more info.
2367  *
2368  * HEAP_INSERT_FROZEN should only be specified for inserts into
2369  * relfilenodes created during the current subtransaction and when
2370  * there are no prior snapshots or pre-existing portals open.
2371  * This causes rows to be frozen, which is an MVCC violation and
2372  * requires explicit options chosen by user.
2373  *
2374  * HEAP_INSERT_IS_SPECULATIVE is used on so-called "speculative insertions",
2375  * which can be backed out afterwards without aborting the whole transaction.
2376  * Other sessions can wait for the speculative insertion to be confirmed,
2377  * turning it into a regular tuple, or aborted, as if it never existed.
2378  * Speculatively inserted tuples behave as "value locks" of short duration,
2379  * used to implement INSERT .. ON CONFLICT.
2380  *
2381  * Note that most of these options will be applied when inserting into the
2382  * heap's TOAST table, too, if the tuple requires any out-of-line data. Only
2383  * HEAP_INSERT_IS_SPECULATIVE is explicitly ignored, as the toast data does
2384  * not partake in speculative insertion.
2385  *
2386  * The BulkInsertState object (if any; bistate can be NULL for default
2387  * behavior) is also just passed through to RelationGetBufferForTuple.
2388  *
2389  * The return value is the OID assigned to the tuple (either here or by the
2390  * caller), or InvalidOid if no OID. The header fields of *tup are updated
2391  * to match the stored tuple; in particular tup->t_self receives the actual
2392  * TID where the tuple was stored. But note that any toasting of fields
2393  * within the tuple data is NOT reflected into *tup.
2394  */
2395 Oid
2396 heap_insert(Relation relation, HeapTuple tup, CommandId cid,
2397  int options, BulkInsertState bistate)
2398 {
2399  TransactionId xid = GetCurrentTransactionId();
2400  HeapTuple heaptup;
2401  Buffer buffer;
2402  Buffer vmbuffer = InvalidBuffer;
2403  bool all_visible_cleared = false;
2404 
2405  /*
2406  * Fill in tuple header fields, assign an OID, and toast the tuple if
2407  * necessary.
2408  *
2409  * Note: below this point, heaptup is the data we actually intend to store
2410  * into the relation; tup is the caller's original untoasted data.
2411  */
2412  heaptup = heap_prepare_insert(relation, tup, xid, cid, options);
2413 
2414  /*
2415  * Find buffer to insert this tuple into. If the page is all visible,
2416  * this will also pin the requisite visibility map page.
2417  */
2418  buffer = RelationGetBufferForTuple(relation, heaptup->t_len,
2419  InvalidBuffer, options, bistate,
2420  &vmbuffer, NULL);
2421 
2422  /*
2423  * We're about to do the actual insert -- but check for conflict first, to
2424  * avoid possibly having to roll back work we've just done.
2425  *
2426  * This is safe without a recheck as long as there is no possibility of
2427  * another process scanning the page between this check and the insert
2428  * being visible to the scan (i.e., an exclusive buffer content lock is
2429  * continuously held from this point until the tuple insert is visible).
2430  *
2431  * For a heap insert, we only need to check for table-level SSI locks. Our
2432  * new tuple can't possibly conflict with existing tuple locks, and heap
2433  * page locks are only consolidated versions of tuple locks; they do not
2434  * lock "gaps" as index page locks do. So we don't need to specify a
2435  * buffer when making the call, which makes for a faster check.
2436  */
2437  CheckForSerializableConflictIn(relation, NULL, InvalidBuffer);
2438 
2439  /* NO EREPORT(ERROR) from here till changes are logged */
2440  START_CRIT_SECTION();
2441 
2442  RelationPutHeapTuple(relation, buffer, heaptup,
2443  (options & HEAP_INSERT_SPECULATIVE) != 0);
2444 
2445  if (PageIsAllVisible(BufferGetPage(buffer)))
2446  {
2447  all_visible_cleared = true;
2448  PageClearAllVisible(BufferGetPage(buffer));
2449  visibilitymap_clear(relation,
2450  ItemPointerGetBlockNumber(&(heaptup->t_self)),
2451  vmbuffer, VISIBILITYMAP_VALID_BITS);
2452  }
2453 
2454  /*
2455  * XXX Should we set PageSetPrunable on this page ?
2456  *
2457  * The inserting transaction may eventually abort thus making this tuple
2458  * DEAD and hence available for pruning. Though we don't want to optimize
2459  * for aborts, if no other tuple in this page is UPDATEd/DELETEd, the
2460  * aborted tuple will never be pruned until next vacuum is triggered.
2461  *
2462  * If you do add PageSetPrunable here, add it in heap_xlog_insert too.
2463  */
2464 
2465  MarkBufferDirty(buffer);
2466 
2467  /* XLOG stuff */
2468  if (!(options & HEAP_INSERT_SKIP_WAL) && RelationNeedsWAL(relation))
2469  {
2470  xl_heap_insert xlrec;
2471  xl_heap_header xlhdr;
2472  XLogRecPtr recptr;
2473  Page page = BufferGetPage(buffer);
2474  uint8 info = XLOG_HEAP_INSERT;
2475  int bufflags = 0;
2476 
2477  /*
2478  * If this is a catalog, we need to transmit combocids to properly
2479  * decode, so log that as well.
2480  */
2481  if (RelationIsAccessibleInLogicalDecoding(relation))
2482  log_heap_new_cid(relation, heaptup);
2483 
2484  /*
2485  * If this is the single and first tuple on page, we can reinit the
2486  * page instead of restoring the whole thing. Set flag, and hide
2487  * buffer references from XLogInsert.
2488  */
2489  if (ItemPointerGetOffsetNumber(&(heaptup->t_self)) == FirstOffsetNumber &&
2490  PageGetMaxOffsetNumber(page) == FirstOffsetNumber)
2491  {
2492  info |= XLOG_HEAP_INIT_PAGE;
2493  bufflags |= REGBUF_WILL_INIT;
2494  }
2495 
2496  xlrec.offnum = ItemPointerGetOffsetNumber(&heaptup->t_self);
2497  xlrec.flags = 0;
2498  if (all_visible_cleared)
2499  xlrec.flags |= XLH_INSERT_ALL_VISIBLE_CLEARED;
2500  if (options & HEAP_INSERT_SPECULATIVE)
2501  xlrec.flags |= XLH_INSERT_IS_SPECULATIVE;
2502  Assert(ItemPointerGetBlockNumber(&heaptup->t_self) == BufferGetBlockNumber(buffer));
2503 
2504  /*
2505  * For logical decoding, we need the tuple even if we're doing a full
2506  * page write, so make sure it's included even if we take a full-page
2507  * image. (XXX We could alternatively store a pointer into the FPW).
2508  */
2509  if (RelationIsLogicallyLogged(relation))
2510  {
2511  xlrec.flags |= XLH_INSERT_CONTAINS_NEW_TUPLE;
2512  bufflags |= REGBUF_KEEP_DATA;
2513  }
2514 
2515  XLogBeginInsert();
2516  XLogRegisterData((char *) &xlrec, SizeOfHeapInsert);
2517 
2518  xlhdr.t_infomask2 = heaptup->t_data->t_infomask2;
2519  xlhdr.t_infomask = heaptup->t_data->t_infomask;
2520  xlhdr.t_hoff = heaptup->t_data->t_hoff;
2521 
2522  /*
2523  * note we mark xlhdr as belonging to buffer; if XLogInsert decides to
2524  * write the whole page to the xlog, we don't need to store
2525  * xl_heap_header in the xlog.
2526  */
2527  XLogRegisterBuffer(0, buffer, REGBUF_STANDARD | bufflags);
2528  XLogRegisterBufData(0, (char *) &xlhdr, SizeOfHeapHeader);
2529  /* PG73FORMAT: write bitmap [+ padding] [+ oid] + data */
2530  XLogRegisterBufData(0,
2531  (char *) heaptup->t_data + SizeofHeapTupleHeader,
2532  heaptup->t_len - SizeofHeapTupleHeader);
2533 
2534  /* filtering by origin on a row level is much more efficient */
2535  XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN);
2536 
2537  recptr = XLogInsert(RM_HEAP_ID, info);
2538 
2539  PageSetLSN(page, recptr);
2540  }
2541 
2542  END_CRIT_SECTION();
2543 
2544  UnlockReleaseBuffer(buffer);
2545  if (vmbuffer != InvalidBuffer)
2546  ReleaseBuffer(vmbuffer);
2547 
2548  /*
2549  * If tuple is cachable, mark it for invalidation from the caches in case
2550  * we abort. Note it is OK to do this after releasing the buffer, because
2551  * the heaptup data structure is all in local memory, not in the shared
2552  * buffer.
2553  */
2554  CacheInvalidateHeapTuple(relation, heaptup, NULL);
2555 
2556  /* Note: speculative insertions are counted too, even if aborted later */
2557  pgstat_count_heap_insert(relation, 1);
2558 
2559  /*
2560  * If heaptup is a private copy, release it. Don't forget to copy t_self
2561  * back to the caller's image, too.
2562  */
2563  if (heaptup != tup)
2564  {
2565  tup->t_self = heaptup->t_self;
2566  heap_freetuple(heaptup);
2567  }
2568 
2569  return HeapTupleGetOid(tup);
2570 }
2571 
2572 /*
2573  * Subroutine for heap_insert(). Prepares a tuple for insertion. This sets the
2574  * tuple header fields, assigns an OID, and toasts the tuple if necessary.
2575  * Returns a toasted version of the tuple if it was toasted, or the original
2576  * tuple if not. Note that in any case, the header fields are also set in
2577  * the original tuple.
2578  */
2579 static HeapTuple
2580 heap_prepare_insert(Relation relation, HeapTuple tup, TransactionId xid,
2581  CommandId cid, int options)
2582 {
2583  /*
2584  * For now, parallel operations are required to be strictly read-only.
2585  * Unlike heap_update() and heap_delete(), an insert should never create a
2586  * combo CID, so it might be possible to relax this restriction, but not
2587  * without more thought and testing.
2588  */
2589  if (IsInParallelMode())
2590  ereport(ERROR,
2591  (errcode(ERRCODE_INVALID_TRANSACTION_STATE),
2592  errmsg("cannot insert tuples during a parallel operation")));
2593 
2594  if (relation->rd_rel->relhasoids)
2595  {
2596 #ifdef NOT_USED
2597  /* this is redundant with an Assert in HeapTupleSetOid */
2598  Assert(tup->t_data->t_infomask & HEAP_HASOID);
2599 #endif
2600 
2601  /*
2602  * If the object id of this tuple has already been assigned, trust the
2603  * caller. There are a couple of ways this can happen. At initial db
2604  * creation, the backend program sets oids for tuples. When we define
2605  * an index, we set the oid. Finally, in the future, we may allow
2606  * users to set their own object ids in order to support a persistent
2607  * object store (objects need to contain pointers to one another).
2608  */
2609  if (!OidIsValid(HeapTupleGetOid(tup)))
2610  HeapTupleSetOid(tup, GetNewOid(relation));
2611  }
2612  else
2613  {
2614  /* check there is no space for an OID */
2615  Assert(!(tup->t_data->t_infomask & HEAP_HASOID));
2616  }
2617 
2618  tup->t_data->t_infomask &= ~(HEAP_XACT_MASK);
2619  tup->t_data->t_infomask2 &= ~(HEAP2_XACT_MASK);
2620  tup->t_data->t_infomask |= HEAP_XMAX_INVALID;
2621  HeapTupleHeaderSetXmin(tup->t_data, xid);
2622  if (options & HEAP_INSERT_FROZEN)
2623  HeapTupleHeaderSetXminFrozen(tup->t_data);
2624 
2625  HeapTupleHeaderSetCmin(tup->t_data, cid);
2626  HeapTupleHeaderSetXmax(tup->t_data, 0); /* for cleanliness */
2627  tup->t_tableOid = RelationGetRelid(relation);
2628 
2629  /*
2630  * If the new tuple is too big for storage or contains already toasted
2631  * out-of-line attributes from some other relation, invoke the toaster.
2632  */
2633  if (relation->rd_rel->relkind != RELKIND_RELATION &&
2634  relation->rd_rel->relkind != RELKIND_MATVIEW)
2635  {
2636  /* toast table entries should never be recursively toasted */
2637  Assert(!HeapTupleHasExternal(tup));
2638  return tup;
2639  }
2640  else if (HeapTupleHasExternal(tup) || tup->t_len > TOAST_TUPLE_THRESHOLD)
2641  return toast_insert_or_update(relation, tup, NULL, options);
2642  else
2643  return tup;
2644 }
2645 
2646 /*
2647  * heap_multi_insert - insert multiple tuples into a heap
2648  *
2649  * This is like heap_insert(), but inserts multiple tuples in one operation.
2650  * That's faster than calling heap_insert() in a loop, because when multiple
2651  * tuples can be inserted on a single page, we can write just a single WAL
2652  * record covering all of them, and only need to lock/unlock the page once.
2653  *
2654  * Note: this leaks memory into the current memory context. You can create a
2655  * temporary context before calling this, if that's a problem.
2656  */
2657 void
2658 heap_multi_insert(Relation relation, HeapTuple *tuples, int ntuples,
2659  CommandId cid, int options, BulkInsertState bistate)
2660 {
2661  TransactionId xid = GetCurrentTransactionId();
2662  HeapTuple *heaptuples;
2663  int i;
2664  int ndone;
2665  char *scratch = NULL;
2666  Page page;
2667  bool needwal;
2668  Size saveFreeSpace;
2669  bool need_tuple_data = RelationIsLogicallyLogged(relation);
2670  bool need_cids = RelationIsAccessibleInLogicalDecoding(relation);
2671 
2672  needwal = !(options & HEAP_INSERT_SKIP_WAL) && RelationNeedsWAL(relation);
2673  saveFreeSpace = RelationGetTargetPageFreeSpace(relation,
2674  HEAP_DEFAULT_FILLFACTOR);
2675 
2676  /* Toast and set header data in all the tuples */
2677  heaptuples = palloc(ntuples * sizeof(HeapTuple));
2678  for (i = 0; i < ntuples; i++)
2679  heaptuples[i] = heap_prepare_insert(relation, tuples[i],
2680  xid, cid, options);
2681 
2682  /*
2683  * Allocate some memory to use for constructing the WAL record. Using
2684  * palloc() within a critical section is not safe, so we allocate this
2685  * beforehand.
2686  */
2687  if (needwal)
2688  scratch = palloc(BLCKSZ);
2689 
2690  /*
2691  * We're about to do the actual inserts -- but check for conflict first,
2692  * to minimize the possibility of having to roll back work we've just
2693  * done.
2694  *
2695  * A check here does not definitively prevent a serialization anomaly;
2696  * that check MUST be done at least past the point of acquiring an
2697  * exclusive buffer content lock on every buffer that will be affected,
2698  * and MAY be done after all inserts are reflected in the buffers and
2699  * those locks are released; otherwise there is a race condition. Since
2700  * multiple buffers can be locked and unlocked in the loop below, and it
2701  * would not be feasible to identify and lock all of those buffers before
2702  * the loop, we must do a final check at the end.
2703  *
2704  * The check here could be omitted with no loss of correctness; it is
2705  * present strictly as an optimization.
2706  *
2707  * For heap inserts, we only need to check for table-level SSI locks. Our
2708  * new tuples can't possibly conflict with existing tuple locks, and heap
2709  * page locks are only consolidated versions of tuple locks; they do not
2710  * lock "gaps" as index page locks do. So we don't need to specify a
2711  * buffer when making the call, which makes for a faster check.
2712  */
2713  CheckForSerializableConflictIn(relation, NULL, InvalidBuffer);
2714 
2715  ndone = 0;
2716  while (ndone < ntuples)
2717  {
2718  Buffer buffer;
2719  Buffer vmbuffer = InvalidBuffer;
2720  bool all_visible_cleared = false;
2721  int nthispage;
2722 
2723  CHECK_FOR_INTERRUPTS();
2724 
2725  /*
2726  * Find buffer where at least the next tuple will fit. If the page is
2727  * all-visible, this will also pin the requisite visibility map page.
2728  */
2729  buffer = RelationGetBufferForTuple(relation, heaptuples[ndone]->t_len,
2730  InvalidBuffer, options, bistate,
2731  &vmbuffer, NULL);
2732  page = BufferGetPage(buffer);
2733 
2734  /* NO EREPORT(ERROR) from here till changes are logged */
2735  START_CRIT_SECTION();
2736 
2737  /*
2738  * RelationGetBufferForTuple has ensured that the first tuple fits.
2739  * Put that on the page, and then as many other tuples as fit.
2740  */
2741  RelationPutHeapTuple(relation, buffer, heaptuples[ndone], false);
2742  for (nthispage = 1; ndone + nthispage < ntuples; nthispage++)
2743  {
2744  HeapTuple heaptup = heaptuples[ndone + nthispage];
2745 
2746  if (PageGetHeapFreeSpace(page) < MAXALIGN(heaptup->t_len) + saveFreeSpace)
2747  break;
2748 
2749  RelationPutHeapTuple(relation, buffer, heaptup, false);
2750 
2751  /*
2752  * We don't use heap_multi_insert for catalog tuples yet, but
2753  * better be prepared...
2754  */
2755  if (needwal && need_cids)
2756  log_heap_new_cid(relation, heaptup);
2757  }
2758 
2759  if (PageIsAllVisible(page))
2760  {
2761  all_visible_cleared = true;
2762  PageClearAllVisible(page);
2763  visibilitymap_clear(relation,
2764  BufferGetBlockNumber(buffer),
2765  vmbuffer, VISIBILITYMAP_VALID_BITS);
2766  }
2767 
2768  /*
2769  * XXX Should we set PageSetPrunable on this page ? See heap_insert()
2770  */
2771 
2772  MarkBufferDirty(buffer);
2773 
2774  /* XLOG stuff */
2775  if (needwal)
2776  {
2777  XLogRecPtr recptr;
2778  xl_heap_multi_insert *xlrec;
2779  uint8 info = XLOG_HEAP2_MULTI_INSERT;
2780  char *tupledata;
2781  int totaldatalen;
2782  char *scratchptr = scratch;
2783  bool init;
2784  int bufflags = 0;
2785 
2786  /*
2787  * If the page was previously empty, we can reinit the page
2788  * instead of restoring the whole thing.
2789  */
2790  init = (ItemPointerGetOffsetNumber(&(heaptuples[ndone]->t_self)) == FirstOffsetNumber &&
2791  PageGetMaxOffsetNumber(page) == FirstOffsetNumber + nthispage - 1);
2792 
2793  /* allocate xl_heap_multi_insert struct from the scratch area */
2794  xlrec = (xl_heap_multi_insert *) scratchptr;
2795  scratchptr += SizeOfHeapMultiInsert;
2796 
2797  /*
2798  * Allocate the offsets array, unless we're reinitializing the page;
2799  * in that case the tuples are stored in order starting at
2800  * FirstOffsetNumber and we don't need to store the offsets
2801  * explicitly.
2802  */
2803  if (!init)
2804  scratchptr += nthispage * sizeof(OffsetNumber);
2805 
2806  /* the rest of the scratch space is used for tuple data */
2807  tupledata = scratchptr;
2808 
2809  xlrec->flags = all_visible_cleared ? XLH_INSERT_ALL_VISIBLE_CLEARED : 0;
2810  xlrec->ntuples = nthispage;
2811 
2812  /*
2813  * Write out an xl_multi_insert_tuple and the tuple data itself
2814  * for each tuple.
2815  */
2816  for (i = 0; i < nthispage; i++)
2817  {
2818  HeapTuple heaptup = heaptuples[ndone + i];
2819  xl_multi_insert_tuple *tuphdr;
2820  int datalen;
2821 
2822  if (!init)
2823  xlrec->offsets[i] = ItemPointerGetOffsetNumber(&heaptup->t_self);
2824  /* xl_multi_insert_tuple needs two-byte alignment. */
2825  tuphdr = (xl_multi_insert_tuple *) SHORTALIGN(scratchptr);
2826  scratchptr = ((char *) tuphdr) + SizeOfMultiInsertTuple;
2827 
2828  tuphdr->t_infomask2 = heaptup->t_data->t_infomask2;
2829  tuphdr->t_infomask = heaptup->t_data->t_infomask;
2830  tuphdr->t_hoff = heaptup->t_data->t_hoff;
2831 
2832  /* write bitmap [+ padding] [+ oid] + data */
2833  datalen = heaptup->t_len - SizeofHeapTupleHeader;
2834  memcpy(scratchptr,
2835  (char *) heaptup->t_data + SizeofHeapTupleHeader,
2836  datalen);
2837  tuphdr->datalen = datalen;
2838  scratchptr += datalen;
2839  }
2840  totaldatalen = scratchptr - tupledata;
2841  Assert((scratchptr - scratch) < BLCKSZ);
2842 
2843  if (need_tuple_data)
2844  xlrec->flags |= XLH_INSERT_CONTAINS_NEW_TUPLE;
2845 
2846  /*
2847  * Signal that this is the last xl_heap_multi_insert record
2848  * emitted by this call to heap_multi_insert(). Needed for logical
2849  * decoding so it knows when to cleanup temporary data.
2850  */
2851  if (ndone + nthispage == ntuples)
2852  xlrec->flags |= XLH_INSERT_LAST_IN_MULTI;
2853 
2854  if (init)
2855  {
2856  info |= XLOG_HEAP_INIT_PAGE;
2857  bufflags |= REGBUF_WILL_INIT;
2858  }
2859 
2860  /*
2861  * If we're doing logical decoding, include the new tuple data
2862  * even if we take a full-page image of the page.
2863  */
2864  if (need_tuple_data)
2865  bufflags |= REGBUF_KEEP_DATA;
2866 
2867  XLogBeginInsert();
2868  XLogRegisterData((char *) xlrec, tupledata - scratch);
2869  XLogRegisterBuffer(0, buffer, REGBUF_STANDARD | bufflags);
2870 
2871  XLogRegisterBufData(0, tupledata, totaldatalen);
2872 
2873  /* filtering by origin on a row level is much more efficient */
2874  XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN);
2875 
2876  recptr = XLogInsert(RM_HEAP2_ID, info);
2877 
2878  PageSetLSN(page, recptr);
2879  }
2880 
2881  END_CRIT_SECTION();
2882 
2883  UnlockReleaseBuffer(buffer);
2884  if (vmbuffer != InvalidBuffer)
2885  ReleaseBuffer(vmbuffer);
2886 
2887  ndone += nthispage;
2888  }
2889 
2890  /*
2891  * We're done with the actual inserts. Check for conflicts again, to
2892  * ensure that all rw-conflicts in to these inserts are detected. Without
2893  * this final check, a sequential scan of the heap may have locked the
2894  * table after the "before" check, missing one opportunity to detect the
2895  * conflict, and then scanned the table before the new tuples were there,
2896  * missing the other chance to detect the conflict.
2897  *
2898  * For heap inserts, we only need to check for table-level SSI locks. Our
2899  * new tuples can't possibly conflict with existing tuple locks, and heap
2900  * page locks are only consolidated versions of tuple locks; they do not
2901  * lock "gaps" as index page locks do. So we don't need to specify a
2902  * buffer when making the call.
2903  */
2904  CheckForSerializableConflictIn(relation, NULL, InvalidBuffer);
2905 
2906  /*
2907  * If tuples are cachable, mark them for invalidation from the caches in
2908  * case we abort. Note it is OK to do this after releasing the buffer,
2909  * because the heaptuples data structure is all in local memory, not in
2910  * the shared buffer.
2911  */
2912  if (IsCatalogRelation(relation))
2913  {
2914  for (i = 0; i < ntuples; i++)
2915  CacheInvalidateHeapTuple(relation, heaptuples[i], NULL);
2916  }
2917 
2918  /*
2919  * Copy t_self fields back to the caller's original tuples. This does
2920  * nothing for untoasted tuples (tuples[i] == heaptuples[i]), but it's
2921  * probably faster to always copy than check.
2922  */
2923  for (i = 0; i < ntuples; i++)
2924  tuples[i]->t_self = heaptuples[i]->t_self;
2925 
2926  pgstat_count_heap_insert(relation, ntuples);
2927 }
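/*
 * Editor's illustrative sketch (not part of heapam.c): COPY-style callers
 * hand a whole batch to heap_multi_insert() so that tuples sharing a page
 * also share one WAL record.  example_multi_insert is hypothetical.
 */
#if 0
static void
example_multi_insert(Relation rel, HeapTuple *tuples, int ntuples)
{
	/*
	 * Callers normally run this in a short-lived memory context, since
	 * heap_multi_insert() leaks memory into the current context (see the
	 * comment above).
	 */
	heap_multi_insert(rel, tuples, ntuples,
					  GetCurrentCommandId(true), 0, NULL);
}
#endif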
2928 
2929 /*
2930  * simple_heap_insert - insert a tuple
2931  *
2932  * Currently, this routine differs from heap_insert only in supplying
2933  * a default command ID and not allowing access to the speedup options.
2934  *
2935  * This should be used rather than using heap_insert directly in most places
2936  * where we are modifying system catalogs.
2937  */
2938 Oid
2939 simple_heap_insert(Relation relation, HeapTuple tup)
2940 {
2941  return heap_insert(relation, tup, GetCurrentCommandId(true), 0, NULL);
2942 }
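/*
 * Editor's illustrative sketch (not part of heapam.c): catalog-modifying code
 * typically builds a tuple with heap_form_tuple() and stores it with
 * simple_heap_insert(), handling index maintenance separately.
 * example_insert_row, values and nulls are hypothetical.
 */
#if 0
static Oid
example_insert_row(Relation rel, Datum *values, bool *nulls)
{
	HeapTuple	tup = heap_form_tuple(RelationGetDescr(rel), values, nulls);
	Oid			oid = simple_heap_insert(rel, tup);

	/* caller is still responsible for index updates and freeing the tuple */
	heap_freetuple(tup);
	return oid;
}
#endif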
2943 
2944 /*
2945  * Given infomask/infomask2, compute the bits that must be saved in the
2946  * "infobits" field of xl_heap_delete, xl_heap_update, xl_heap_lock,
2947  * xl_heap_lock_updated WAL records.
2948  *
2949  * See fix_infomask_from_infobits.
2950  */
2951 static uint8
2952 compute_infobits(uint16 infomask, uint16 infomask2)
2953 {
2954  return
2955  ((infomask & HEAP_XMAX_IS_MULTI) != 0 ? XLHL_XMAX_IS_MULTI : 0) |
2956  ((infomask & HEAP_XMAX_LOCK_ONLY) != 0 ? XLHL_XMAX_LOCK_ONLY : 0) |
2957  ((infomask & HEAP_XMAX_EXCL_LOCK) != 0 ? XLHL_XMAX_EXCL_LOCK : 0) |
2958  /* note we ignore HEAP_XMAX_SHR_LOCK here */
2959  ((infomask & HEAP_XMAX_KEYSHR_LOCK) != 0 ? XLHL_XMAX_KEYSHR_LOCK : 0) |
2960  ((infomask2 & HEAP_KEYS_UPDATED) != 0 ?
2961  XLHL_KEYS_UPDATED : 0);
2962 }
2963 
2964 /*
2965  * Given two versions of the same t_infomask for a tuple, compare them and
2966  * return whether the relevant status for a tuple Xmax has changed. This is
2967  * used after a buffer lock has been released and reacquired: we want to ensure
2968  * that the tuple state continues to be the same it was when we previously
2969  * examined it.
2970  *
2971  * Note the Xmax field itself must be compared separately.
2972  */
2973 static inline bool
2974 xmax_infomask_changed(uint16 new_infomask, uint16 old_infomask)
2975 {
2976  const uint16 interesting =
2977  HEAP_XMAX_IS_MULTI | HEAP_XMAX_LOCK_ONLY | HEAP_XMAX_INVALID;
2978 
2979  if ((new_infomask & interesting) != (old_infomask & interesting))
2980  return true;
2981 
2982  return false;
2983 }
2984 
2985 /*
2986  * heap_delete - delete a tuple
2987  *
2988  * NB: do not call this directly unless you are prepared to deal with
2989  * concurrent-update conditions. Use simple_heap_delete instead.
2990  *
2991  * relation - table to be modified (caller must hold suitable lock)
2992  * tid - TID of tuple to be deleted
2993  * cid - delete command ID (used for visibility test, and stored into
2994  * cmax if successful)
2995  * crosscheck - if not InvalidSnapshot, also check tuple against this
2996  * wait - true if should wait for any conflicting update to commit/abort
2997  * hufd - output parameter, filled in failure cases (see below)
2998  *
2999  * Normal, successful return value is HeapTupleMayBeUpdated, which
3000  * actually means we did delete it. Failure return codes are
3001  * HeapTupleSelfUpdated, HeapTupleUpdated, or HeapTupleBeingUpdated
3002  * (the last only possible if wait == false).
3003  *
3004  * In the failure cases, the routine fills *hufd with the tuple's t_ctid,
3005  * t_xmax (resolving a possible MultiXact, if necessary), and t_cmax
3006  * (the last only for HeapTupleSelfUpdated, since we
3007  * cannot obtain cmax from a combocid generated by another transaction).
3008  * See comments for struct HeapUpdateFailureData for additional info.
3009  */
3010 HTSU_Result
3011 heap_delete(Relation relation, ItemPointer tid,
3012  CommandId cid, Snapshot crosscheck, bool wait,
3013  HeapUpdateFailureData *hufd)
3014 {
3015  HTSU_Result result;
3016  TransactionId xid = GetCurrentTransactionId();
3017  ItemId lp;
3018  HeapTupleData tp;
3019  Page page;
3020  BlockNumber block;
3021  Buffer buffer;
3022  Buffer vmbuffer = InvalidBuffer;
3023  TransactionId new_xmax;
3024  uint16 new_infomask,
3025  new_infomask2;
3026  bool have_tuple_lock = false;
3027  bool iscombo;
3028  bool all_visible_cleared = false;
3029  HeapTuple old_key_tuple = NULL; /* replica identity of the tuple */
3030  bool old_key_copied = false;
3031 
3032  Assert(ItemPointerIsValid(tid));
3033 
3034  /*
3035  * Forbid this during a parallel operation, lest it allocate a combocid.
3036  * Other workers might need that combocid for visibility checks, and we
3037  * have no provision for broadcasting it to them.
3038  */
3039  if (IsInParallelMode())
3040  ereport(ERROR,
3041  (errcode(ERRCODE_INVALID_TRANSACTION_STATE),
3042  errmsg("cannot delete tuples during a parallel operation")));
3043 
3044  block = ItemPointerGetBlockNumber(tid);
3045  buffer = ReadBuffer(relation, block);
3046  page = BufferGetPage(buffer);
3047 
3048  /*
3049  * Before locking the buffer, pin the visibility map page if it appears to
3050  * be necessary. Since we haven't got the lock yet, someone else might be
3051  * in the middle of changing this, so we'll need to recheck after we have
3052  * the lock.
3053  */
3054  if (PageIsAllVisible(page))
3055  visibilitymap_pin(relation, block, &vmbuffer);
3056 
3057  LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3058 
3059  /*
3060  * If we didn't pin the visibility map page and the page has become all
3061  * visible while we were busy locking the buffer, we'll have to unlock and
3062  * re-lock, to avoid holding the buffer lock across an I/O. That's a bit
3063  * unfortunate, but hopefully shouldn't happen often.
3064  */
3065  if (vmbuffer == InvalidBuffer && PageIsAllVisible(page))
3066  {
3067  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3068  visibilitymap_pin(relation, block, &vmbuffer);
3069  LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3070  }
3071 
3072  lp = PageGetItemId(page, ItemPointerGetOffsetNumber(tid));
3073  Assert(ItemIdIsNormal(lp));
3074 
3075  tp.t_tableOid = RelationGetRelid(relation);
3076  tp.t_data = (HeapTupleHeader) PageGetItem(page, lp);
3077  tp.t_len = ItemIdGetLength(lp);
3078  tp.t_self = *tid;
3079 
3080 l1:
3081  result = HeapTupleSatisfiesUpdate(&tp, cid, buffer);
3082 
3083  if (result == HeapTupleInvisible)
3084  {
3085  UnlockReleaseBuffer(buffer);
3086  ereport(ERROR,
3087  (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
3088  errmsg("attempted to delete invisible tuple")));
3089  }
3090  else if (result == HeapTupleBeingUpdated && wait)
3091  {
3092  TransactionId xwait;
3093  uint16 infomask;
3094 
3095  /* must copy state data before unlocking buffer */
3096  xwait = HeapTupleHeaderGetRawXmax(tp.t_data);
3097  infomask = tp.t_data->t_infomask;
3098 
3099  /*
3100  * Sleep until concurrent transaction ends -- except when there's a
3101  * single locker and it's our own transaction. Note we don't care
3102  * which lock mode the locker has, because we need the strongest one.
3103  *
3104  * Before sleeping, we need to acquire tuple lock to establish our
3105  * priority for the tuple (see heap_lock_tuple). LockTuple will
3106  * release us when we are next-in-line for the tuple.
3107  *
3108  * If we are forced to "start over" below, we keep the tuple lock;
3109  * this arranges that we stay at the head of the line while rechecking
3110  * tuple state.
3111  */
3112  if (infomask & HEAP_XMAX_IS_MULTI)
3113  {
3114  /* wait for multixact */
3115  if (DoesMultiXactIdConflict((MultiXactId) xwait, infomask,
3116  LockTupleExclusive))
3117  {
3118  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3119 
3120  /* acquire tuple lock, if necessary */
3121  heap_acquire_tuplock(relation, &(tp.t_self), LockTupleExclusive,
3122  LockWaitBlock, &have_tuple_lock);
3123 
3124  /* wait for multixact */
3125  MultiXactIdWait((MultiXactId) xwait, MultiXactStatusUpdate, infomask,
3126  relation, &(tp.t_self), XLTW_Delete,
3127  NULL);
3128  LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3129 
3130  /*
3131  * If xwait had just locked the tuple then some other xact
3132  * could update this tuple before we get to this point. Check
3133  * for xmax change, and start over if so.
3134  */
3135  if (xmax_infomask_changed(tp.t_data->t_infomask, infomask) ||
3136  !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tp.t_data),
3137  xwait))
3138  goto l1;
3139  }
3140 
3141  /*
3142  * You might think the multixact is necessarily done here, but not
3143  * so: it could have surviving members, namely our own xact or
3144  * other subxacts of this backend. It is legal for us to delete
3145  * the tuple in either case, however (the latter case is
3146  * essentially a situation of upgrading our former shared lock to
3147  * exclusive). We don't bother changing the on-disk hint bits
3148  * since we are about to overwrite the xmax altogether.
3149  */
3150  }
3151  else if (!TransactionIdIsCurrentTransactionId(xwait))
3152  {
3153  /*
3154  * Wait for regular transaction to end; but first, acquire tuple
3155  * lock.
3156  */
3157  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3158  heap_acquire_tuplock(relation, &(tp.t_self), LockTupleExclusive,
3159  LockWaitBlock, &have_tuple_lock);
3160  XactLockTableWait(xwait, relation, &(tp.t_self), XLTW_Delete);
3161  LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3162 
3163  /*
3164  * xwait is done, but if xwait had just locked the tuple then some
3165  * other xact could update this tuple before we get to this point.
3166  * Check for xmax change, and start over if so.
3167  */
3168  if (xmax_infomask_changed(tp.t_data->t_infomask, infomask) ||
3169  !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tp.t_data),
3170  xwait))
3171  goto l1;
3172 
3173  /* Otherwise check if it committed or aborted */
3174  UpdateXmaxHintBits(tp.t_data, buffer, xwait);
3175  }
3176 
3177  /*
3178  * We may overwrite if previous xmax aborted, or if it committed but
3179  * only locked the tuple without updating it.
3180  */
3181  if ((tp.t_data->t_infomask & HEAP_XMAX_INVALID) ||
3182  HEAP_XMAX_IS_LOCKED_ONLY(tp.t_data->t_infomask) ||
3183  HeapTupleHeaderIsOnlyLocked(tp.t_data))
3184  result = HeapTupleMayBeUpdated;
3185  else
3186  result = HeapTupleUpdated;
3187  }
3188 
3189  if (crosscheck != InvalidSnapshot && result == HeapTupleMayBeUpdated)
3190  {
3191  /* Perform additional check for transaction-snapshot mode RI updates */
3192  if (!HeapTupleSatisfiesVisibility(&tp, crosscheck, buffer))
3193  result = HeapTupleUpdated;
3194  }
3195 
3196  if (result != HeapTupleMayBeUpdated)
3197  {
3198  Assert(result == HeapTupleSelfUpdated ||
3199  result == HeapTupleUpdated ||
3200  result == HeapTupleBeingUpdated);
3201  Assert(!(tp.t_data->t_infomask & HEAP_XMAX_INVALID));
3202  hufd->ctid = tp.t_data->t_ctid;
3203  hufd->xmax = HeapTupleHeaderGetUpdateXid(tp.t_data);
3204  if (result == HeapTupleSelfUpdated)
3205  hufd->cmax = HeapTupleHeaderGetCmax(tp.t_data);
3206  else
3207  hufd->cmax = InvalidCommandId;
3208  UnlockReleaseBuffer(buffer);
3209  if (have_tuple_lock)
3210  UnlockTupleTuplock(relation, &(tp.t_self), LockTupleExclusive);
3211  if (vmbuffer != InvalidBuffer)
3212  ReleaseBuffer(vmbuffer);
3213  return result;
3214  }
3215 
3216  /*
3217  * We're about to do the actual delete -- check for conflict first, to
3218  * avoid possibly having to roll back work we've just done.
3219  *
3220  * This is safe without a recheck as long as there is no possibility of
3221  * another process scanning the page between this check and the delete
3222  * being visible to the scan (i.e., an exclusive buffer content lock is
3223  * continuously held from this point until the tuple delete is visible).
3224  */
3225  CheckForSerializableConflictIn(relation, &tp, buffer);
3226 
3227  /* replace cid with a combo cid if necessary */
3228  HeapTupleHeaderAdjustCmax(tp.t_data, &cid, &iscombo);
3229 
3230  /*
3231  * Compute replica identity tuple before entering the critical section so
3232  * we don't PANIC upon a memory allocation failure.
3233  */
3234  old_key_tuple = ExtractReplicaIdentity(relation, &tp, true, &old_key_copied);
3235 
3236  /*
3237  * If this is the first possibly-multixact-able operation in the current
3238  * transaction, set my per-backend OldestMemberMXactId setting. We can be
3239  * certain that the transaction will never become a member of any older
3240  * MultiXactIds than that. (We have to do this even if we end up just
3241  * using our own TransactionId below, since some other backend could
3242  * incorporate our XID into a MultiXact immediately afterwards.)
3243  */
3244  MultiXactIdSetOldestMember();
3245 
3246  compute_new_xmax_infomask(HeapTupleHeaderGetRawXmax(tp.t_data),
3247  tp.t_data->t_infomask, tp.t_data->t_infomask2,
3248  xid, LockTupleExclusive, true,
3249  &new_xmax, &new_infomask, &new_infomask2);
3250 
3251  START_CRIT_SECTION();
3252 
3253  /*
3254  * If this transaction commits, the tuple will become DEAD sooner or
3255  * later. Set flag that this page is a candidate for pruning once our xid
3256  * falls below the OldestXmin horizon. If the transaction finally aborts,
3257  * the subsequent page pruning will be a no-op and the hint will be
3258  * cleared.
3259  */
3260  PageSetPrunable(page, xid);
3261 
3262  if (PageIsAllVisible(page))
3263  {
3264  all_visible_cleared = true;
3265  PageClearAllVisible(page);
3266  visibilitymap_clear(relation, BufferGetBlockNumber(buffer),
3267  vmbuffer, VISIBILITYMAP_VALID_BITS);
3268  }
3269 
3270  /* store transaction information of xact deleting the tuple */
3271  tp.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
3272  tp.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
3273  tp.t_data->t_infomask |= new_infomask;
3274  tp.t_data->t_infomask2 |= new_infomask2;
3275  HeapTupleHeaderClearHotUpdated(tp.t_data);
3276  HeapTupleHeaderSetXmax(tp.t_data, new_xmax);
3277  HeapTupleHeaderSetCmax(tp.t_data, cid, iscombo);
3278  /* Make sure there is no forward chain link in t_ctid */
3279  tp.t_data->t_ctid = tp.t_self;
3280 
3281  MarkBufferDirty(buffer);
3282 
3283  /*
3284  * XLOG stuff
3285  *
3286  * NB: heap_abort_speculative() uses the same xlog record and replay
3287  * routines.
3288  */
3289  if (RelationNeedsWAL(relation))
3290  {
3291  xl_heap_delete xlrec;
3292  XLogRecPtr recptr;
3293 
3294  /* For logical decode we need combocids to properly decode the catalog */
3295  if (RelationIsAccessibleInLogicalDecoding(relation))
3296  log_heap_new_cid(relation, &tp);
3297 
3298  xlrec.flags = all_visible_cleared ? XLH_DELETE_ALL_VISIBLE_CLEARED : 0;
3299  xlrec.infobits_set = compute_infobits(tp.t_data->t_infomask,
3300  tp.t_data->t_infomask2);
3301  xlrec.offnum = ItemPointerGetOffsetNumber(&tp.t_self);
3302  xlrec.xmax = new_xmax;
3303 
3304  if (old_key_tuple != NULL)
3305  {
3306  if (relation->rd_rel->relreplident == REPLICA_IDENTITY_FULL)
3307  xlrec.flags |= XLH_DELETE_CONTAINS_OLD_TUPLE;
3308  else
3309  xlrec.flags |= XLH_DELETE_CONTAINS_OLD_KEY;
3310  }
3311 
3312  XLogBeginInsert();
3313  XLogRegisterData((char *) &xlrec, SizeOfHeapDelete);
3314 
3315  XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
3316 
3317  /*
3318  * Log replica identity of the deleted tuple if there is one
3319  */
3320  if (old_key_tuple != NULL)
3321  {
3322  xl_heap_header xlhdr;
3323 
3324  xlhdr.t_infomask2 = old_key_tuple->t_data->t_infomask2;
3325  xlhdr.t_infomask = old_key_tuple->t_data->t_infomask;
3326  xlhdr.t_hoff = old_key_tuple->t_data->t_hoff;
3327 
3328  XLogRegisterData((char *) &xlhdr, SizeOfHeapHeader);
3329  XLogRegisterData((char *) old_key_tuple->t_data
3330  + SizeofHeapTupleHeader,
3331  old_key_tuple->t_len
3332  - SizeofHeapTupleHeader);
3333  }
3334 
3335  /* filtering by origin on a row level is much more efficient */
3336  XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN);
3337 
3338  recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_DELETE);
3339 
3340  PageSetLSN(page, recptr);
3341  }
3342 
3343  END_CRIT_SECTION();
3344 
3345  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3346 
3347  if (vmbuffer != InvalidBuffer)
3348  ReleaseBuffer(vmbuffer);
3349 
3350  /*
3351  * If the tuple has toasted out-of-line attributes, we need to delete
3352  * those items too. We have to do this before releasing the buffer
3353  * because we need to look at the contents of the tuple, but it's OK to
3354  * release the content lock on the buffer first.
3355  */
3356  if (relation->rd_rel->relkind != RELKIND_RELATION &&
3357  relation->rd_rel->relkind != RELKIND_MATVIEW)
3358  {
3359  /* toast table entries should never be recursively toasted */
3360  Assert(!HeapTupleHasExternal(&tp));
3361  }
3362  else if (HeapTupleHasExternal(&tp))
3363  toast_delete(relation, &tp, false);
3364 
3365  /*
3366  * Mark tuple for invalidation from system caches at next command
3367  * boundary. We have to do this before releasing the buffer because we
3368  * need to look at the contents of the tuple.
3369  */
3370  CacheInvalidateHeapTuple(relation, &tp, NULL);
3371 
3372  /* Now we can release the buffer */
3373  ReleaseBuffer(buffer);
3374 
3375  /*
3376  * Release the lmgr tuple lock, if we had it.
3377  */
3378  if (have_tuple_lock)
3379  UnlockTupleTuplock(relation, &(tp.t_self), LockTupleExclusive);
3380 
3381  pgstat_count_heap_delete(relation);
3382 
3383  if (old_key_tuple != NULL && old_key_copied)
3384  heap_freetuple(old_key_tuple);
3385 
3386  return HeapTupleMayBeUpdated;
3387 }
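/*
 * Editor's illustrative sketch (not part of heapam.c): a caller that is
 * prepared for concurrent updates inspects the HTSU_Result and the
 * HeapUpdateFailureData filled in by heap_delete().  simple_heap_delete()
 * below is the in-tree caller that instead treats every failure as an error.
 * example_try_delete is hypothetical.
 */
#if 0
static bool
example_try_delete(Relation rel, ItemPointer tid)
{
	HeapUpdateFailureData hufd;
	HTSU_Result result;

	result = heap_delete(rel, tid, GetCurrentCommandId(true),
						 InvalidSnapshot, true /* wait */ , &hufd);
	switch (result)
	{
		case HeapTupleMayBeUpdated:
			return true;		/* deleted */
		case HeapTupleSelfUpdated:
			return false;		/* already deleted by this command */
		case HeapTupleUpdated:
			/* hufd.ctid names the successor version, hufd.xmax its updater */
			return false;
		default:
			elog(ERROR, "unexpected heap_delete status: %u", result);
			return false;		/* keep compiler quiet */
	}
}
#endif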
3388 
3389 /*
3390  * simple_heap_delete - delete a tuple
3391  *
3392  * This routine may be used to delete a tuple when concurrent updates of
3393  * the target tuple are not expected (for example, because we have a lock
3394  * on the relation associated with the tuple). Any failure is reported
3395  * via ereport().
3396  */
3397 void
3398 simple_heap_delete(Relation relation, ItemPointer tid)
3399 {
3400  HTSU_Result result;
3401  HeapUpdateFailureData hufd;
3402 
3403  result = heap_delete(relation, tid,
3404  GetCurrentCommandId(true), InvalidSnapshot,
3405  true /* wait for commit */ ,
3406  &hufd);
3407  switch (result)
3408  {
3409  case HeapTupleSelfUpdated:
3410  /* Tuple was already updated in current command? */
3411  elog(ERROR, "tuple already updated by self");
3412  break;
3413 
3414  case HeapTupleMayBeUpdated:
3415  /* done successfully */
3416  break;
3417 
3418  case HeapTupleUpdated:
3419  elog(ERROR, "tuple concurrently updated");
3420  break;
3421 
3422  default:
3423  elog(ERROR, "unrecognized heap_delete status: %u", result);
3424  break;
3425  }
3426 }
3427 
3428 /*
3429  * heap_update - replace a tuple
3430  *
3431  * NB: do not call this directly unless you are prepared to deal with
3432  * concurrent-update conditions. Use simple_heap_update instead.
3433  *
3434  * relation - table to be modified (caller must hold suitable lock)
3435  * otid - TID of old tuple to be replaced
3436  * newtup - newly constructed tuple data to store
3437  * cid - update command ID (used for visibility test, and stored into
3438  * cmax/cmin if successful)
3439  * crosscheck - if not InvalidSnapshot, also check old tuple against this
3440  * wait - true if should wait for any conflicting update to commit/abort
3441  * hufd - output parameter, filled in failure cases (see below)
3442  * lockmode - output parameter, filled with lock mode acquired on tuple
3443  *
3444  * Normal, successful return value is HeapTupleMayBeUpdated, which
3445  * actually means we *did* update it. Failure return codes are
3446  * HeapTupleSelfUpdated, HeapTupleUpdated, or HeapTupleBeingUpdated
3447  * (the last only possible if wait == false).
3448  *
3449  * On success, the header fields of *newtup are updated to match the new
3450  * stored tuple; in particular, newtup->t_self is set to the TID where the
3451  * new tuple was inserted, and its HEAP_ONLY_TUPLE flag is set iff a HOT
3452  * update was done. However, any TOAST changes in the new tuple's
3453  * data are not reflected into *newtup.
3454  *
3455  * In the failure cases, the routine fills *hufd with the tuple's t_ctid,
3456  * t_xmax (resolving a possible MultiXact, if necessary), and t_cmax
3457  * (the last only for HeapTupleSelfUpdated, since we
3458  * cannot obtain cmax from a combocid generated by another transaction).
3459  * See comments for struct HeapUpdateFailureData for additional info.
3460  */
3461 HTSU_Result
3462 heap_update(Relation relation, ItemPointer otid, HeapTuple newtup,
3463  CommandId cid, Snapshot crosscheck, bool wait,
3464  HeapUpdateFailureData *hufd, LockTupleMode *lockmode)
3465 {
3466  HTSU_Result result;
3467  TransactionId xid = GetCurrentTransactionId();
3468  Bitmapset *hot_attrs;
3469  Bitmapset *key_attrs;
3470  Bitmapset *id_attrs;
3471  Bitmapset *interesting_attrs;
3472  Bitmapset *modified_attrs;
3473  ItemId lp;
3474  HeapTupleData oldtup;
3475  HeapTuple heaptup;
3476  HeapTuple old_key_tuple = NULL;
3477  bool old_key_copied = false;
3478  Page page;
3479  BlockNumber block;
3480  MultiXactStatus mxact_status;
3481  Buffer buffer,
3482  newbuf,
3483  vmbuffer = InvalidBuffer,
3484  vmbuffer_new = InvalidBuffer;
3485  bool need_toast;
3486  Size newtupsize,
3487  pagefree;
3488  bool have_tuple_lock = false;
3489  bool iscombo;
3490  bool use_hot_update = false;
3491  bool hot_attrs_checked = false;
3492  bool key_intact;
3493  bool all_visible_cleared = false;
3494  bool all_visible_cleared_new = false;
3495  bool checked_lockers;
3496  bool locker_remains;
3497  TransactionId xmax_new_tuple,
3498  xmax_old_tuple;
3499  uint16 infomask_old_tuple,
3500  infomask2_old_tuple,
3501  infomask_new_tuple,
3502  infomask2_new_tuple;
3503 
3504  Assert(ItemPointerIsValid(otid));
3505 
3506  /*
3507  * Forbid this during a parallel operation, lest it allocate a combocid.
3508  * Other workers might need that combocid for visibility checks, and we
3509  * have no provision for broadcasting it to them.
3510  */
3511  if (IsInParallelMode())
3512  ereport(ERROR,
3513  (errcode(ERRCODE_INVALID_TRANSACTION_STATE),
3514  errmsg("cannot update tuples during a parallel operation")));
3515 
3516  /*
3517  * Fetch the list of attributes to be checked for various operations.
3518  *
3519  * For HOT considerations, this is wasted effort if we fail to update or
3520  * have to put the new tuple on a different page. But we must compute the
3521  * list before obtaining buffer lock --- in the worst case, if we are
3522  * doing an update on one of the relevant system catalogs, we could
3523  * deadlock if we try to fetch the list later. In any case, the relcache
3524  * caches the data so this is usually pretty cheap.
3525  *
3526  * We also need columns used by the replica identity and columns that are
3527  * considered the "key" of rows in the table.
3528  *
3529  * Note that we get copies of each bitmap, so we need not worry about
3530  * relcache flush happening midway through.
3531  */
3532  hot_attrs = RelationGetIndexAttrBitmap(relation, INDEX_ATTR_BITMAP_ALL);
3533  key_attrs = RelationGetIndexAttrBitmap(relation, INDEX_ATTR_BITMAP_KEY);
3534  id_attrs = RelationGetIndexAttrBitmap(relation,
3535  INDEX_ATTR_BITMAP_IDENTITY_KEY);
3536 
3537 
3538  block = ItemPointerGetBlockNumber(otid);
3539  buffer = ReadBuffer(relation, block);
3540  page = BufferGetPage(buffer);
3541 
3542  interesting_attrs = NULL;
3543 
3544  /*
3545  * If the page is already full, there is hardly any chance of doing a HOT
3546  * update on this page. It might be wasteful effort to look for index
3547  * column updates only to later reject HOT updates for lack of space in
3548  * the same page. So we are conservative and only fetch hot_attrs if the
3549  * page is not already full. Since we are already holding a pin on the
3550  * buffer, there is no chance that the buffer can get cleaned up
3551  * concurrently and even if that was possible, in the worst case we lose a
3552  * chance to do a HOT update.
3553  */
3554  if (!PageIsFull(page))
3555  {
3556  interesting_attrs = bms_add_members(interesting_attrs, hot_attrs);
3557  hot_attrs_checked = true;
3558  }
3559  interesting_attrs = bms_add_members(interesting_attrs, key_attrs);
3560  interesting_attrs = bms_add_members(interesting_attrs, id_attrs);
3561 
3562  /*
3563  * Before locking the buffer, pin the visibility map page if it appears to
3564  * be necessary. Since we haven't got the lock yet, someone else might be
3565  * in the middle of changing this, so we'll need to recheck after we have
3566  * the lock.
3567  */
3568  if (PageIsAllVisible(page))
3569  visibilitymap_pin(relation, block, &vmbuffer);
3570 
3571  LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3572 
3573  lp = PageGetItemId(page, ItemPointerGetOffsetNumber(otid));
3574  Assert(ItemIdIsNormal(lp));
3575 
3576  /*
3577  * Fill in enough data in oldtup for HeapDetermineModifiedColumns to work
3578  * properly.
3579  */
3580  oldtup.t_tableOid = RelationGetRelid(relation);
3581  oldtup.t_data = (HeapTupleHeader) PageGetItem(page, lp);
3582  oldtup.t_len = ItemIdGetLength(lp);
3583  oldtup.t_self = *otid;
3584 
3585  /* the new tuple is ready, except for this: */
3586  newtup->t_tableOid = RelationGetRelid(relation);
3587 
3588  /* Fill in OID for newtup */
3589  if (relation->rd_rel->relhasoids)
3590  {
3591 #ifdef NOT_USED
3592  /* this is redundant with an Assert in HeapTupleSetOid */
3593  Assert(newtup->t_data->t_infomask & HEAP_HASOID);
3594 #endif
3595  HeapTupleSetOid(newtup, HeapTupleGetOid(&oldtup));
3596  }
3597  else
3598  {
3599  /* check there is no space for an OID */
3600  Assert(!(newtup->t_data->t_infomask & HEAP_HASOID));
3601  }
3602 
3603  /* Determine columns modified by the update. */
3604  modified_attrs = HeapDetermineModifiedColumns(relation, interesting_attrs,
3605  &oldtup, newtup);
3606 
3607  /*
3608  * If we're not updating any "key" column, we can grab a weaker lock type.
3609  * This allows for more concurrency when we are running simultaneously
3610  * with foreign key checks.
3611  *
3612  * Note that if a column gets detoasted while executing the update, but
3613  * the value ends up being the same, this test will fail and we will use
3614  * the stronger lock. This is acceptable; the important case to optimize
3615  * is updates that don't manipulate key columns, not those that
3616  * serendipitously arrive at the same key values.
3617  */
3618  if (!bms_overlap(modified_attrs, key_attrs))
3619  {
3620  *lockmode = LockTupleNoKeyExclusive;
3621  mxact_status = MultiXactStatusNoKeyUpdate;
3622  key_intact = true;
3623 
3624  /*
3625  * If this is the first possibly-multixact-able operation in the
3626  * current transaction, set my per-backend OldestMemberMXactId
3627  * setting. We can be certain that the transaction will never become a
3628  * member of any older MultiXactIds than that. (We have to do this
3629  * even if we end up just using our own TransactionId below, since
3630  * some other backend could incorporate our XID into a MultiXact
3631  * immediately afterwards.)
3632  */
3633  MultiXactIdSetOldestMember();
3634  }
3635  else
3636  {
3637  *lockmode = LockTupleExclusive;
3638  mxact_status = MultiXactStatusUpdate;
3639  key_intact = false;
3640  }
3641 
3642  /*
3643  * Note: beyond this point, use oldtup not otid to refer to old tuple.
3644  * otid may very well point at newtup->t_self, which we will overwrite
3645  * with the new tuple's location, so there's great risk of confusion if we
3646  * use otid anymore.
3647  */
3648 
3649 l2:
3650  checked_lockers = false;
3651  locker_remains = false;
3652  result = HeapTupleSatisfiesUpdate(&oldtup, cid, buffer);
3653 
3654  /* see below about the "no wait" case */
3655  Assert(result != HeapTupleBeingUpdated || wait);
3656 
3657  if (result == HeapTupleInvisible)
3658  {
3659  UnlockReleaseBuffer(buffer);
3660  ereport(ERROR,
3661  (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
3662  errmsg("attempted to update invisible tuple")));
3663  }
3664  else if (result == HeapTupleBeingUpdated && wait)
3665  {
3666  TransactionId xwait;
3667  uint16 infomask;
3668  bool can_continue = false;
3669 
3670  /*
3671  * XXX note that we don't consider the "no wait" case here. This
3672  * isn't a problem currently because no caller uses that case, but it
3673  * should be fixed if such a caller is introduced. It wasn't a
3674  * problem previously because this code would always wait, but now
3675  * that some tuple locks do not conflict with one of the lock modes we
3676  * use, it is possible that this case is interesting to handle
3677  * specially.
3678  *
3679  * This may cause failures with third-party code that calls
3680  * heap_update directly.
3681  */
3682 
3683  /* must copy state data before unlocking buffer */
3684  xwait = HeapTupleHeaderGetRawXmax(oldtup.t_data);
3685  infomask = oldtup.t_data->t_infomask;
3686 
3687  /*
3688  * Now we have to do something about the existing locker. If it's a
3689  * multi, sleep on it; we might be awakened before it is completely
3690  * gone (or even not sleep at all in some cases); we need to preserve
3691  * it as locker, unless it is gone completely.
3692  *
3693  * If it's not a multi, we need to check for sleeping conditions
3694  * before actually going to sleep. If the update doesn't conflict
3695  * with the locks, we just continue without sleeping (but making sure
3696  * it is preserved).
3697  *
3698  * Before sleeping, we need to acquire tuple lock to establish our
3699  * priority for the tuple (see heap_lock_tuple). LockTuple will
3700  * release us when we are next-in-line for the tuple. Note we must
3701  * not acquire the tuple lock until we're sure we're going to sleep;
3702  * otherwise we're open for race conditions with other transactions
3703  * holding the tuple lock which sleep on us.
3704  *
3705  * If we are forced to "start over" below, we keep the tuple lock;
3706  * this arranges that we stay at the head of the line while rechecking
3707  * tuple state.
3708  */
3709  if (infomask & HEAP_XMAX_IS_MULTI)
3710  {
3711  TransactionId update_xact;
3712  int remain;
3713 
3714  if (DoesMultiXactIdConflict((MultiXactId) xwait, infomask,
3715  *lockmode))
3716  {
3717  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3718 
3719  /* acquire tuple lock, if necessary */
3720  heap_acquire_tuplock(relation, &(oldtup.t_self), *lockmode,
3721  LockWaitBlock, &have_tuple_lock);
3722 
3723  /* wait for multixact */
3724  MultiXactIdWait((MultiXactId) xwait, mxact_status, infomask,
3725  relation, &oldtup.t_self, XLTW_Update,
3726  &remain);
3727  checked_lockers = true;
3728  locker_remains = remain != 0;
3729  LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3730 
3731  /*
3732  * If xwait had just locked the tuple then some other xact
3733  * could update this tuple before we get to this point. Check
3734  * for xmax change, and start over if so.
3735  */
3736  if (xmax_infomask_changed(oldtup.t_data->t_infomask,
3737  infomask) ||
3738  !TransactionIdEquals(HeapTupleHeaderGetRawXmax(oldtup.t_data),
3739  xwait))
3740  goto l2;
3741  }
3742 
3743  /*
3744  * Note that the multixact may not be done by now. It could have
3745  * surviving members; our own xact or other subxacts of this
3746  * backend, and also any other concurrent transaction that locked
3747  * the tuple with KeyShare if we only got TupleLockUpdate. If
3748  * this is the case, we have to be careful to mark the updated
3749  * tuple with the surviving members in Xmax.
3750  *
3751  * Note that there could have been another update in the
3752  * MultiXact. In that case, we need to check whether it committed
3753  * or aborted. If it aborted we are safe to update it again;
3754  * otherwise there is an update conflict, and we have to return
3755  * HeapTupleUpdated below.
3756  *
3757  * In the LockTupleExclusive case, we still need to preserve the
3758  * surviving members: those would include the tuple locks we had
3759  * before this one, which are important to keep in case this
3760  * subxact aborts.
3761  */
3762  if (!HEAP_XMAX_IS_LOCKED_ONLY(oldtup.t_data->t_infomask))
3763  update_xact = HeapTupleGetUpdateXid(oldtup.t_data);
3764  else
3765  update_xact = InvalidTransactionId;
3766 
3767  /*
3768  * There was no UPDATE in the MultiXact; or it aborted. No
3769  * TransactionIdIsInProgress() call needed here, since we called
3770  * MultiXactIdWait() above.
3771  */
3772  if (!TransactionIdIsValid(update_xact) ||
3773  TransactionIdDidAbort(update_xact))
3774  can_continue = true;
3775  }
3776  else if (TransactionIdIsCurrentTransactionId(xwait))
3777  {
3778  /*
3779  * The only locker is ourselves; we can avoid grabbing the tuple
3780  * lock here, but must preserve our locking information.
3781  */
3782  checked_lockers = true;
3783  locker_remains = true;
3784  can_continue = true;
3785  }
3786  else if (HEAP_XMAX_IS_KEYSHR_LOCKED(infomask) && key_intact)
3787  {
3788  /*
3789  * If it's just a key-share locker, and we're not changing the key
3790  * columns, we don't need to wait for it to end; but we need to
3791  * preserve it as locker.
3792  */
3793  checked_lockers = true;
3794  locker_remains = true;
3795  can_continue = true;
3796  }
3797  else
3798  {
3799  /*
3800  * Wait for regular transaction to end; but first, acquire tuple
3801  * lock.
3802  */
3803  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3804  heap_acquire_tuplock(relation, &(oldtup.t_self), *lockmode,
3805  LockWaitBlock, &have_tuple_lock);
3806  XactLockTableWait(xwait, relation, &oldtup.t_self,
3807  XLTW_Update);
3808  checked_lockers = true;
3809  LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3810 
3811  /*
3812  * xwait is done, but if xwait had just locked the tuple then some
3813  * other xact could update this tuple before we get to this point.
3814  * Check for xmax change, and start over if so.
3815  */
3816  if (xmax_infomask_changed(oldtup.t_data->t_infomask, infomask) ||
3817  !TransactionIdEquals(xwait,
3818  HeapTupleHeaderGetRawXmax(oldtup.t_data)))
3819  goto l2;
3820 
3821  /* Otherwise check if it committed or aborted */
3822  UpdateXmaxHintBits(oldtup.t_data, buffer, xwait);
3823  if (oldtup.t_data->t_infomask & HEAP_XMAX_INVALID)
3824  can_continue = true;
3825  }
3826 
3827  result = can_continue ? HeapTupleMayBeUpdated : HeapTupleUpdated;
3828  }
3829 
3830  if (crosscheck != InvalidSnapshot && result == HeapTupleMayBeUpdated)
3831  {
3832  /* Perform additional check for transaction-snapshot mode RI updates */
3833  if (!HeapTupleSatisfiesVisibility(&oldtup, crosscheck, buffer))
3834  result = HeapTupleUpdated;
3835  }
3836 
3837  if (result != HeapTupleMayBeUpdated)
3838  {
3839  Assert(result == HeapTupleSelfUpdated ||
3840  result == HeapTupleUpdated ||
3841  result == HeapTupleBeingUpdated);
3842  Assert(!(oldtup.t_data->t_infomask & HEAP_XMAX_INVALID));
3843  hufd->ctid = oldtup.t_data->t_ctid;
3844  hufd->xmax = HeapTupleHeaderGetUpdateXid(oldtup.t_data);
3845  if (result == HeapTupleSelfUpdated)
3846  hufd->cmax = HeapTupleHeaderGetCmax(oldtup.t_data);
3847  else
3848  hufd->cmax = InvalidCommandId;
3849  UnlockReleaseBuffer(buffer);
3850  if (have_tuple_lock)
3851  UnlockTupleTuplock(relation, &(oldtup.t_self), *lockmode);
3852  if (vmbuffer != InvalidBuffer)
3853  ReleaseBuffer(vmbuffer);
3854  bms_free(hot_attrs);
3855  bms_free(key_attrs);
3856  bms_free(id_attrs);
3857  bms_free(modified_attrs);
3858  bms_free(interesting_attrs);
3859  return result;
3860  }
3861 
3862  /*
3863  * If we didn't pin the visibility map page and the page has become all
3864  * visible while we were busy locking the buffer, or during some
3865  * subsequent window during which we had it unlocked, we'll have to unlock
3866  * and re-lock, to avoid holding the buffer lock across an I/O. That's a
3867  * bit unfortunate, especially since we'll now have to recheck whether the
3868  * tuple has been locked or updated under us, but hopefully it won't
3869  * happen very often.
3870  */
3871  if (vmbuffer == InvalidBuffer && PageIsAllVisible(page))
3872  {
3873  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3874  visibilitymap_pin(relation, block, &vmbuffer);
3875  LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3876  goto l2;
3877  }
3878 
3879  /* Fill in transaction status data */
3880 
3881  /*
3882  * If the tuple we're updating is locked, we need to preserve the locking
3883  * info in the old tuple's Xmax. Prepare a new Xmax value for this.
3884  */
3885  compute_new_xmax_infomask(HeapTupleHeaderGetRawXmax(oldtup.t_data),
3886  oldtup.t_data->t_infomask,
3887  oldtup.t_data->t_infomask2,
3888  xid, *lockmode, true,
3889  &xmax_old_tuple, &infomask_old_tuple,
3890  &infomask2_old_tuple);
3891 
3892  /*
3893  * And also prepare an Xmax value for the new copy of the tuple. If there
3894  * was no xmax previously, or there was one but all lockers are now gone,
3895  * then use InvalidXid; otherwise, get the xmax from the old tuple. (In
3896  * rare cases that might also be InvalidXid and yet not have the
3897  * HEAP_XMAX_INVALID bit set; that's fine.)
3898  */
3899  if ((oldtup.t_data->t_infomask & HEAP_XMAX_INVALID) ||
3900  HEAP_XMAX_IS_LOCKED_ONLY(oldtup.t_data->t_infomask) ||
3901  (checked_lockers && !locker_remains))
3902  xmax_new_tuple = InvalidTransactionId;
3903  else
3904  xmax_new_tuple = HeapTupleHeaderGetRawXmax(oldtup.t_data);
3905 
3906  if (!TransactionIdIsValid(xmax_new_tuple))
3907  {
3908  infomask_new_tuple = HEAP_XMAX_INVALID;
3909  infomask2_new_tuple = 0;
3910  }
3911  else
3912  {
3913  /*
3914  * If we found a valid Xmax for the new tuple, then the infomask bits
3915  * to use on the new tuple depend on what was there on the old one.
3916  * Note that since we're doing an update, the only possibility is that
3917  * the lockers had FOR KEY SHARE lock.
3918  */
3919  if (oldtup.t_data->t_infomask & HEAP_XMAX_IS_MULTI)
3920  {
3921  GetMultiXactIdHintBits(xmax_new_tuple, &infomask_new_tuple,
3922  &infomask2_new_tuple);
3923  }
3924  else
3925  {
3926  infomask_new_tuple = HEAP_XMAX_KEYSHR_LOCK | HEAP_XMAX_LOCK_ONLY;
3927  infomask2_new_tuple = 0;
3928  }
3929  }
3930 
3931  /*
3932  * Prepare the new tuple with the appropriate initial values of Xmin and
3933  * Xmax, as well as initial infomask bits as computed above.
3934  */
3935  newtup->t_data->t_infomask &= ~(HEAP_XACT_MASK);
3936  newtup->t_data->t_infomask2 &= ~(HEAP2_XACT_MASK);
3937  HeapTupleHeaderSetXmin(newtup->t_data, xid);
3938  HeapTupleHeaderSetCmin(newtup->t_data, cid);
3939  newtup->t_data->t_infomask |= HEAP_UPDATED | infomask_new_tuple;
3940  newtup->t_data->t_infomask2 |= infomask2_new_tuple;
3941  HeapTupleHeaderSetXmax(newtup->t_data, xmax_new_tuple);
3942 
3943  /*
3944  * Replace cid with a combo cid if necessary. Note that we already put
3945  * the plain cid into the new tuple.
3946  */
3947  HeapTupleHeaderAdjustCmax(oldtup.t_data, &cid, &iscombo);
3948 
3949  /*
3950  * If the toaster needs to be activated, OR if the new tuple will not fit
3951  * on the same page as the old, then we need to release the content lock
3952  * (but not the pin!) on the old tuple's buffer while we are off doing
3953  * TOAST and/or table-file-extension work. We must mark the old tuple to
3954  * show that it's locked, else other processes may try to update it
3955  * themselves.
3956  *
3957  * We need to invoke the toaster if there are already any out-of-line
3958  * toasted values present, or if the new tuple is over-threshold.
3959  */
3960  if (relation->rd_rel->relkind != RELKIND_RELATION &&
3961  relation->rd_rel->relkind != RELKIND_MATVIEW)
3962  {
3963  /* toast table entries should never be recursively toasted */
3964  Assert(!HeapTupleHasExternal(&oldtup));
3965  Assert(!HeapTupleHasExternal(newtup));
3966  need_toast = false;
3967  }
3968  else
3969  need_toast = (HeapTupleHasExternal(&oldtup) ||
3970  HeapTupleHasExternal(newtup) ||
3971  newtup->t_len > TOAST_TUPLE_THRESHOLD);
3972 
3973  pagefree = PageGetHeapFreeSpace(page);
3974 
3975  newtupsize = MAXALIGN(newtup->t_len);
3976 
3977  if (need_toast || newtupsize > pagefree)
3978  {
3979  TransactionId xmax_lock_old_tuple;
3980  uint16 infomask_lock_old_tuple,
3981  infomask2_lock_old_tuple;
3982  bool cleared_all_frozen = false;
3983 
3984  /*
3985  * To prevent concurrent sessions from updating the tuple, we have to
3986  * temporarily mark it locked, while we release the lock.
3987  *
3988  * To satisfy the rule that any xid potentially appearing in a buffer
3989  * written out to disk must first be covered by WAL, we unfortunately
3990  * have to WAL log this temporary modification. We can reuse xl_heap_lock for this
3991  * purpose. If we crash/error before following through with the
3992  * actual update, xmax will be of an aborted transaction, allowing
3993  * other sessions to proceed.
3994  */
3995 
3996  /*
3997  * Compute xmax / infomask appropriate for locking the tuple. This has
3998  * to be done separately from the lock, because the potentially
3999  * created multixact would otherwise be wrong.
4000  */
4001  compute_new_xmax_infomask(HeapTupleHeaderGetRawXmax(oldtup.t_data),
4002  oldtup.t_data->t_infomask,
4003  oldtup.t_data->t_infomask2,
4004  xid, *lockmode, false,
4005  &xmax_lock_old_tuple, &infomask_lock_old_tuple,
4006  &infomask2_lock_old_tuple);
4007 
4008  Assert(HEAP_XMAX_IS_LOCKED_ONLY(infomask_lock_old_tuple));
4009 
4010  START_CRIT_SECTION();
4011 
4012  /* Clear obsolete visibility flags ... */
4013  oldtup.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
4014  oldtup.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
4015  HeapTupleClearHotUpdated(&oldtup);
4016  /* ... and store info about transaction updating this tuple */
4017  Assert(TransactionIdIsValid(xmax_lock_old_tuple));
4018  HeapTupleHeaderSetXmax(oldtup.t_data, xmax_lock_old_tuple);
4019  oldtup.t_data->t_infomask |= infomask_lock_old_tuple;
4020  oldtup.t_data->t_infomask2 |= infomask2_lock_old_tuple;
4021  HeapTupleHeaderSetCmax(oldtup.t_data, cid, iscombo);
4022 
4023  /* temporarily make it look not-updated, but locked */
4024  oldtup.t_data->t_ctid = oldtup.t_self;
4025 
4026  /*
4027  * Clear all-frozen bit on visibility map if needed. We could
4028  * immediately reset ALL_VISIBLE, but given that the WAL logging
4029  * overhead would be unchanged, that doesn't seem necessarily
4030  * worthwhile.
4031  */
4032  if (PageIsAllVisible(BufferGetPage(buffer)) &&
4033  visibilitymap_clear(relation, block, vmbuffer,
4034  VISIBILITYMAP_ALL_FROZEN))
4035  cleared_all_frozen = true;
4036 
4037  MarkBufferDirty(buffer);
4038 
4039  if (RelationNeedsWAL(relation))
4040  {
4041  xl_heap_lock xlrec;
4042  XLogRecPtr recptr;
4043 
4044  XLogBeginInsert();
4045  XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
4046 
4047  xlrec.offnum = ItemPointerGetOffsetNumber(&oldtup.t_self);
4048  xlrec.locking_xid = xmax_lock_old_tuple;
4049  xlrec.infobits_set = compute_infobits(oldtup.t_data->t_infomask,
4050  oldtup.t_data->t_infomask2);
4051  xlrec.flags =
4052  cleared_all_frozen ? XLH_LOCK_ALL_FROZEN_CLEARED : 0;
4053  XLogRegisterData((char *) &xlrec, SizeOfHeapLock);
4054  recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_LOCK);
4055  PageSetLSN(page, recptr);
4056  }
4057 
4058  END_CRIT_SECTION();
4059 
4060  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
4061 
4062  /*
4063  * Let the toaster do its thing, if needed.
4064  *
4065  * Note: below this point, heaptup is the data we actually intend to
4066  * store into the relation; newtup is the caller's original untoasted
4067  * data.
4068  */
4069  if (need_toast)
4070  {
4071  /* Note we always use WAL and FSM during updates */
4072  heaptup = toast_insert_or_update(relation, newtup, &oldtup, 0);
4073  newtupsize = MAXALIGN(heaptup->t_len);
4074  }
4075  else
4076  heaptup = newtup;
4077 
4078  /*
4079  * Now, do we need a new page for the tuple, or not? This is a bit
4080  * tricky since someone else could have added tuples to the page while
4081  * we weren't looking. We have to recheck the available space after
4082  * reacquiring the buffer lock. But don't bother to do that if the
4083  * former amount of free space is still not enough; it's unlikely
4084  * there's more free now than before.
4085  *
4086  * What's more, if we need to get a new page, we will need to acquire
4087  * buffer locks on both old and new pages. To avoid deadlock against
4088  * some other backend trying to get the same two locks in the other
4089  * order, we must be consistent about the order we get the locks in.
4090  * We use the rule "lock the lower-numbered page of the relation
4091  * first". To implement this, we must do RelationGetBufferForTuple
4092  * while not holding the lock on the old page, and we must rely on it
4093  * to get the locks on both pages in the correct order.
4094  */
4095  if (newtupsize > pagefree)
4096  {
4097  /* Assume there's no chance to put heaptup on same page. */
4098  newbuf = RelationGetBufferForTuple(relation, heaptup->t_len,
4099  buffer, 0, NULL,
4100  &vmbuffer_new, &vmbuffer);
4101  }
4102  else
4103  {
4104  /* Re-acquire the lock on the old tuple's page. */
4105  LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
4106  /* Re-check using the up-to-date free space */
4107  pagefree = PageGetHeapFreeSpace(page);
4108  if (newtupsize > pagefree)
4109  {
4110  /*
4111  * Rats, it doesn't fit anymore. We must now unlock and
4112  * relock to avoid deadlock. Fortunately, this path should
4113  * seldom be taken.
4114  */
4115  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
4116  newbuf = RelationGetBufferForTuple(relation, heaptup->t_len,
4117  buffer, 0, NULL,
4118  &vmbuffer_new, &vmbuffer);
4119  }
4120  else
4121  {
4122  /* OK, it fits here, so we're done. */
4123  newbuf = buffer;
4124  }
4125  }
4126  }
4127  else
4128  {
4129  /* No TOAST work needed, and it'll fit on same page */
4130  newbuf = buffer;
4131  heaptup = newtup;
4132  }
4133 
4134  /*
4135  * We're about to do the actual update -- check for conflict first, to
4136  * avoid possibly having to roll back work we've just done.
4137  *
4138  * This is safe without a recheck as long as there is no possibility of
4139  * another process scanning the pages between this check and the update
4140  * being visible to the scan (i.e., exclusive buffer content lock(s) are
4141  * continuously held from this point until the tuple update is visible).
4142  *
4143  * For the new tuple the only check needed is at the relation level, but
4144  * since both tuples are in the same relation and the check for oldtup
4145  * will include checking the relation level, there is no benefit to a
4146  * separate check for the new tuple.
4147  */
4148  CheckForSerializableConflictIn(relation, &oldtup, buffer);
4149 
4150  /*
4151  * At this point newbuf and buffer are both pinned and locked, and newbuf
4152  * has enough space for the new tuple. If they are the same buffer, only
4153  * one pin is held.
4154  */
4155 
4156  if (newbuf == buffer)
4157  {
4158  /*
4159  * Since the new tuple is going into the same page, we might be able
4160  * to do a HOT update. Check if any of the index columns have been
4161  * changed. If the page was already full, we may have skipped checking
4162  * for index columns, and also can't do a HOT update.
4163  */
4164  if (hot_attrs_checked && !bms_overlap(modified_attrs, hot_attrs))
4165  use_hot_update = true;
4166  }
4167  else
4168  {
4169  /* Set a hint that the old page could use prune/defrag */
4170  PageSetFull(page);
4171  }
4172 
4173  /*
4174  * Compute replica identity tuple before entering the critical section so
4175  * we don't PANIC upon a memory allocation failure.
4176  * ExtractReplicaIdentity() will return NULL if nothing needs to be
4177  * logged.
4178  */
4179  old_key_tuple = ExtractReplicaIdentity(relation, &oldtup,
4180  bms_overlap(modified_attrs, id_attrs),
4181  &old_key_copied);
4182 
4183  /* NO EREPORT(ERROR) from here till changes are logged */
4184  START_CRIT_SECTION();
4185 
4186  /*
4187  * If this transaction commits, the old tuple will become DEAD sooner or
4188  * later. Set flag that this page is a candidate for pruning once our xid
4189  * falls below the OldestXmin horizon. If the transaction finally aborts,
4190  * the subsequent page pruning will be a no-op and the hint will be
4191  * cleared.
4192  *
4193  * XXX Should we set hint on newbuf as well? If the transaction aborts,
4194  * there would be a prunable tuple in the newbuf; but for now we choose
4195  * not to optimize for aborts. Note that heap_xlog_update must be kept in
4196  * sync if this decision changes.
4197  */
4198  PageSetPrunable(page, xid);
4199 
4200  if (use_hot_update)
4201  {
4202  /* Mark the old tuple as HOT-updated */
4203  HeapTupleSetHotUpdated(&oldtup);
4204  /* And mark the new tuple as heap-only */
4205  HeapTupleSetHeapOnly(heaptup);
4206  /* Mark the caller's copy too, in case different from heaptup */
4207  HeapTupleSetHeapOnly(newtup);
4208  }
4209  else
4210  {
4211  /* Make sure tuples are correctly marked as not-HOT */
4212  HeapTupleClearHotUpdated(&oldtup);
4213  HeapTupleClearHeapOnly(heaptup);
4214  HeapTupleClearHeapOnly(newtup);
4215  }
4216 
4217  RelationPutHeapTuple(relation, newbuf, heaptup, false); /* insert new tuple */
4218 
4219 
4220  /* Clear obsolete visibility flags, possibly set by ourselves above... */
4221  oldtup.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
4222  oldtup.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
4223  /* ... and store info about transaction updating this tuple */
4224  Assert(TransactionIdIsValid(xmax_old_tuple));
4225  HeapTupleHeaderSetXmax(oldtup.t_data, xmax_old_tuple);
4226  oldtup.t_data->t_infomask |= infomask_old_tuple;
4227  oldtup.t_data->t_infomask2 |= infomask2_old_tuple;
4228  HeapTupleHeaderSetCmax(oldtup.t_data, cid, iscombo);
4229 
4230  /* record address of new tuple in t_ctid of old one */
4231  oldtup.t_data->t_ctid = heaptup->t_self;
4232 
4233  /* clear PD_ALL_VISIBLE flags, reset all visibilitymap bits */
4234  if (PageIsAllVisible(BufferGetPage(buffer)))
4235  {
4236  all_visible_cleared = true;
4237  PageClearAllVisible(BufferGetPage(buffer));
4238  visibilitymap_clear(relation, BufferGetBlockNumber(buffer),
4239  vmbuffer, VISIBILITYMAP_VALID_BITS);
4240  }
4241  if (newbuf != buffer && PageIsAllVisible(BufferGetPage(newbuf)))
4242  {
4243  all_visible_cleared_new = true;
4244  PageClearAllVisible(BufferGetPage(newbuf));
4245  visibilitymap_clear(relation, BufferGetBlockNumber(newbuf),
4246  vmbuffer_new, VISIBILITYMAP_VALID_BITS);
4247  }
4248 
4249  if (newbuf != buffer)
4250  MarkBufferDirty(newbuf);
4251  MarkBufferDirty(buffer);
4252 
4253  /* XLOG stuff */
4254  if (RelationNeedsWAL(relation))
4255  {
4256  XLogRecPtr recptr;
4257 
4258  /*
4259  * For logical decoding we need combocids to properly decode the
4260  * catalog.
4261  */
4262  if (RelationIsAccessibleInLogicalDecoding(relation))
4263  {
4264  log_heap_new_cid(relation, &oldtup);
4265  log_heap_new_cid(relation, heaptup);
4266  }
4267 
4268  recptr = log_heap_update(relation, buffer,
4269  newbuf, &oldtup, heaptup,
4270  old_key_tuple,
4271  all_visible_cleared,
4272  all_visible_cleared_new);
4273  if (newbuf != buffer)
4274  {
4275  PageSetLSN(BufferGetPage(newbuf), recptr);
4276  }
4277  PageSetLSN(BufferGetPage(buffer), recptr);
4278  }
4279 
4280  END_CRIT_SECTION();
4281 
4282  if (newbuf != buffer)
4283  LockBuffer(newbuf, BUFFER_LOCK_UNLOCK);
4284  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
4285 
4286  /*
4287  * Mark old tuple for invalidation from system caches at next command
4288  * boundary, and mark the new tuple for invalidation in case we abort. We
4289  * have to do this before releasing the buffer because oldtup is in the
4290  * buffer. (heaptup is all in local memory, but it's necessary to process
4291  * both tuple versions in one call to inval.c so we can avoid redundant
4292  * sinval messages.)
4293  */
4294  CacheInvalidateHeapTuple(relation, &oldtup, heaptup);
4295 
4296  /* Now we can release the buffer(s) */
4297  if (newbuf != buffer)
4298  ReleaseBuffer(newbuf);
4299  ReleaseBuffer(buffer);
4300  if (BufferIsValid(vmbuffer_new))
4301  ReleaseBuffer(vmbuffer_new);
4302  if (BufferIsValid(vmbuffer))
4303  ReleaseBuffer(vmbuffer);
4304 
4305  /*
4306  * Release the lmgr tuple lock, if we had it.
4307  */
4308  if (have_tuple_lock)
4309  UnlockTupleTuplock(relation, &(oldtup.t_self), *lockmode);
4310 
4311  pgstat_count_heap_update(relation, use_hot_update);
4312 
4313  /*
4314  * If heaptup is a private copy, release it. Don't forget to copy t_self
4315  * back to the caller's image, too.
4316  */
4317  if (heaptup != newtup)
4318  {
4319  newtup->t_self = heaptup->t_self;
4320  heap_freetuple(heaptup);
4321  }
4322 
4323  if (old_key_tuple != NULL && old_key_copied)
4324  heap_freetuple(old_key_tuple);
4325 
4326  bms_free(hot_attrs);
4327  bms_free(key_attrs);
4328  bms_free(id_attrs);
4329  bms_free(modified_attrs);
4330  bms_free(interesting_attrs);
4331 
4332  return HeapTupleMayBeUpdated;
4333 }
4334 
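As a minimal sketch (not part of heapam.c; the helper name example_update_tuple and its error handling are illustrative assumptions), a direct caller of heap_update would typically inspect the returned HTSU_Result and, on HeapTupleUpdated, use the ctid/xmax reported in HeapUpdateFailureData to re-fetch the successor version, much as simple_heap_update below handles the same codes by raising an error:

static void
example_update_tuple(Relation relation, ItemPointer otid, HeapTuple newtup)
{
	HTSU_Result result;
	HeapUpdateFailureData hufd;
	LockTupleMode lockmode;

	result = heap_update(relation, otid, newtup,
						 GetCurrentCommandId(true), InvalidSnapshot,
						 true /* wait for concurrent updaters */ ,
						 &hufd, &lockmode);

	if (result == HeapTupleUpdated)
	{
		/*
		 * Concurrent update: hufd.ctid points at the successor version and
		 * hufd.xmax identifies the updating transaction; a real caller
		 * would re-fetch and re-verify that version before retrying.
		 */
	}
	else if (result != HeapTupleMayBeUpdated)
		elog(ERROR, "unexpected heap_update status: %u", result);
}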
4335 /*
4336  * Check if the specified attribute's value is same in both given tuples.
4337  * Subroutine for HeapDetermineModifiedColumns.
4338  */
4339 static bool
4340 heap_tuple_attr_equals(TupleDesc tupdesc, int attrnum,
4341  HeapTuple tup1, HeapTuple tup2)
4342 {
4343  Datum value1,
4344  value2;
4345  bool isnull1,
4346  isnull2;
4347  Form_pg_attribute att;
4348 
4349  /*
4350  * If it's a whole-tuple reference, say "not equal". It's not really
4351  * worth supporting this case, since it could only succeed after a no-op
4352  * update, which is hardly a case worth optimizing for.
4353  */
4354  if (attrnum == 0)
4355  return false;
4356 
4357  /*
4358  * Likewise, automatically say "not equal" for any system attribute other
4359  * than OID and tableOID; we cannot expect these to be consistent in a HOT
4360  * chain, or even to be set correctly yet in the new tuple.
4361  */
4362  if (attrnum < 0)
4363  {
4364  if (attrnum != ObjectIdAttributeNumber &&
4365  attrnum != TableOidAttributeNumber)
4366  return false;
4367  }
4368 
4369  /*
4370  * Extract the corresponding values. XXX this is pretty inefficient if
4371  * there are many indexed columns. Should HeapDetermineModifiedColumns do
4372  * a single heap_deform_tuple call on each tuple, instead? But that
4373  * doesn't work for system columns ...
4374  */
4375  value1 = heap_getattr(tup1, attrnum, tupdesc, &isnull1);
4376  value2 = heap_getattr(tup2, attrnum, tupdesc, &isnull2);
4377 
4378  /*
4379  * If one value is NULL and the other is not, then they are certainly not
4380  * equal
4381  */
4382  if (isnull1 != isnull2)
4383  return false;
4384 
4385  /*
4386  * If both are NULL, they can be considered equal.
4387  */
4388  if (isnull1)
4389  return true;
4390 
4391  /*
4392  * We do simple binary comparison of the two datums. This may be overly
4393  * strict because there can be multiple binary representations for the
4394  * same logical value. But we should be OK as long as there are no false
4395  * positives. Using a type-specific equality operator is messy because
4396  * there could be multiple notions of equality in different operator
4397  * classes; furthermore, we cannot safely invoke user-defined functions
4398  * while holding exclusive buffer lock.
4399  */
4400  if (attrnum <= 0)
4401  {
4402  /* The only allowed system columns are OIDs, so do this */
4403  return (DatumGetObjectId(value1) == DatumGetObjectId(value2));
4404  }
4405  else
4406  {
4407  Assert(attrnum <= tupdesc->natts);
4408  att = tupdesc->attrs[attrnum - 1];
4409  return datumIsEqual(value1, value2, att->attbyval, att->attlen);
4410  }
4411 }
4412 
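A brief illustration of the binary comparison used above (example_binary_equal_text is a hypothetical helper, not part of this file): for a pass-by-reference, variable-length type such as text, datumIsEqual is called with typByVal = false and typLen = -1, so two logically equal values with different stored representations may compare as unequal, which is acceptable here because only false negatives can occur.

static bool
example_binary_equal_text(Datum d1, Datum d2)
{
	/* text: pass-by-reference (typbyval = false), varlena (typlen = -1) */
	return datumIsEqual(d1, d2, false, -1);
}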
4413 /*
4414  * Check which columns are being updated.
4415  *
4416  * Given an updated tuple, determine (and return into the output bitmapset),
4417  * from those listed as interesting, the set of columns that changed.
4418  *
4419  * The input bitmapset is destructively modified; that is OK since this is
4420  * invoked at most once in heap_update.
4421  */
4422 static Bitmapset *
4423 HeapDetermineModifiedColumns(Relation relation, Bitmapset *interesting_cols,
4424  HeapTuple oldtup, HeapTuple newtup)
4425 {
4426  int attnum;
4427  Bitmapset *modified = NULL;
4428 
4429  while ((attnum = bms_first_member(interesting_cols)) >= 0)
4430  {
4431  attnum += FirstLowInvalidHeapAttributeNumber;
4432 
4433  if (!heap_tuple_attr_equals(RelationGetDescr(relation),
4434  attnum, oldtup, newtup))
4435  modified = bms_add_member(modified,
4436  attnum - FirstLowInvalidHeapAttributeNumber);
4437  }
4438 
4439  return modified;
4440 }
4441 
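The bitmapsets handled above store attribute numbers offset by FirstLowInvalidHeapAttributeNumber so that system attributes (which have negative attnums) fit in a Bitmapset. A hypothetical helper showing the same convention when testing membership:

static bool
example_attr_is_member(Bitmapset *attrs, AttrNumber attnum)
{
	/* offset mirrors the bms_add_member() call in HeapDetermineModifiedColumns */
	return bms_is_member(attnum - FirstLowInvalidHeapAttributeNumber, attrs);
}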
4442 /*
4443  * simple_heap_update - replace a tuple
4444  *
4445  * This routine may be used to update a tuple when concurrent updates of
4446  * the target tuple are not expected (for example, because we have a lock
4447  * on the relation associated with the tuple). Any failure is reported
4448  * via ereport().
4449  */
4450 void
4451 simple_heap_update(Relation relation, ItemPointer otid, HeapTuple tup)
4452 {
4453  HTSU_Result result;
4454  HeapUpdateFailureData hufd;
4455  LockTupleMode lockmode;
4456 
4457  result = heap_update(relation, otid, tup,
4458  GetCurrentCommandId(true), InvalidSnapshot,
4459  true /* wait for commit */ ,
4460  &hufd, &lockmode);
4461  switch (result)
4462  {
4463  case HeapTupleSelfUpdated:
4464  /* Tuple was already updated in current command? */
4465  elog(ERROR, "tuple already updated by self");
4466  break;
4467 
4468  case HeapTupleMayBeUpdated:
4469  /* done successfully */
4470  break;
4471 
4472  case HeapTupleUpdated:
4473  elog(ERROR, "tuple concurrently updated");
4474  break;
4475 
4476  default:
4477  elog(ERROR, "unrecognized heap_update status: %u", result);
4478  break;
4479  }
4480 }
4481 
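A minimal usage sketch, assuming the caller holds a lock on the relation strong enough to rule out concurrent updates of the target row; index maintenance is not shown and must be handled separately by the caller. The helper name example_overwrite_tuple is illustrative only.

static void
example_overwrite_tuple(Relation rel, HeapTuple newtup)
{
	/* newtup->t_self identifies the existing tuple version to replace */
	simple_heap_update(rel, &newtup->t_self, newtup);
	/* on return, newtup->t_self has been set to the new version's TID */
}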
4482 
4483 /*
4484  * Return the MultiXactStatus corresponding to the given tuple lock mode.
4485  */
4486 static MultiXactStatus
4487 get_mxact_status_for_lock(LockTupleMode mode, bool is_update)
4488 {
4489  int retval;
4490 
4491  if (is_update)
4492  retval = tupleLockExtraInfo[mode].updstatus;
4493  else
4494  retval = tupleLockExtraInfo[mode].lockstatus;
4495 
4496  if (retval == -1)
4497  elog(ERROR, "invalid lock tuple mode %d/%s", mode,
4498  is_update ? "true" : "false");
4499 
4500  return (MultiXactStatus) retval;
4501 }
4502 
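For example (a hedged, illustrative check, not code from this file), the mapping is consistent with the choices made in heap_update above: a no-key update corresponds to MultiXactStatusNoKeyUpdate, whereas the same lock mode taken purely as a row lock maps to a non-updating status; invalid combinations are reported through the elog above.

static void
example_status_mapping(void)
{
	/* matches the mxact_status chosen in heap_update when the key is intact */
	if (get_mxact_status_for_lock(LockTupleNoKeyExclusive, true) !=
		MultiXactStatusNoKeyUpdate)
		elog(ERROR, "unexpected lock-mode to multixact-status mapping");
}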
4503 /*
4504  * heap_lock_tuple - lock a tuple in shared or exclusive mode
4505  *
4506  * Note that this acquires a buffer pin, which the caller must release.
4507  *
4508  * Input parameters:
4509  * relation: relation containing tuple (caller must hold suitable lock)
4510  * tuple->t_self: TID of tuple to lock (rest of struct need not be valid)
4511  * cid: current command ID (used for visibility test, and stored into
4512  * tuple's cmax if lock is successful)
4513  * mode: indicates if shared or exclusive tuple lock is desired
4514  * wait_policy: what to do if tuple lock is not available
4515  * follow_updates: if true, follow the update chain to also lock descendant
4516  * tuples.
4517  *
4518  * Output parameters:
4519  * *tuple: all fields filled in
4520  * *buffer: set to buffer holding tuple (pinned but not locked at exit)
4521  * *hufd: filled in failure cases (see below)
4522  *
4523  * Function result may be:
4524  * HeapTupleMayBeUpdated: lock was successfully acquired
4525  * HeapTupleInvisible: lock failed because tuple was never visible to us
4526  * HeapTupleSelfUpdated: lock failed because tuple updated by self
4527  * HeapTupleUpdated: lock failed because tuple updated by other xact
4528  * HeapTupleWouldBlock: lock couldn't be acquired and wait_policy is skip
4529  *
4530  * In the failure cases other than HeapTupleInvisible, the routine fills
4531  * *hufd with the tuple's t_ctid, t_xmax (resolving a possible MultiXact,
4532  * if necessary), and t_cmax (the last only for HeapTupleSelfUpdated,
4533  * since we cannot obtain cmax from a combocid generated by another
4534  * transaction).
4535  * See comments for struct HeapUpdateFailureData for additional info.
4536  *
4537  * See README.tuplock for a thorough explanation of this mechanism.
4538  */
4539 HTSU_Result
4540 heap_lock_tuple(Relation relation, HeapTuple tuple,
4541  CommandId cid, LockTupleMode mode, LockWaitPolicy wait_policy,
4542  bool follow_updates,
4543  Buffer *buffer, HeapUpdateFailureData *hufd)
4544 {
4545  HTSU_Result result;
4546  ItemPointer tid = &(tuple->t_self);
4547  ItemId lp;
4548  Page page;
4549  Buffer vmbuffer = InvalidBuffer;
4550  BlockNumber block;
4551  TransactionId xid,
4552  xmax;
4553  uint16 old_infomask,
4554  new_infomask,
4555  new_infomask2;
4556  bool first_time = true;
4557  bool have_tuple_lock = false;
4558  bool cleared_all_frozen = false;
4559 
4560  *buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid));
4561  block = ItemPointerGetBlockNumber(tid);
4562 
4563  /*
4564  * Before locking the buffer, pin the visibility map page if it appears to
4565  * be necessary. Since we haven't got the lock yet, someone else might be
4566  * in the middle of changing this, so we'll need to recheck after we have
4567  * the lock.
4568  */
4569  if (PageIsAllVisible(BufferGetPage(*buffer)))
4570  visibilitymap_pin(relation, block, &vmbuffer);
4571 
4572  LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4573 
4574  page = BufferGetPage(*buffer);
4575  lp = PageGetItemId(page, ItemPointerGetOffsetNumber(tid));
4576  Assert(ItemIdIsNormal(lp));
4577 
4578  tuple->t_data = (HeapTupleHeader) PageGetItem(page, lp);
4579  tuple->t_len = ItemIdGetLength(lp);
4580  tuple->t_tableOid = RelationGetRelid(relation);
4581 
4582 l3:
4583  result = HeapTupleSatisfiesUpdate(tuple, cid, *buffer);
4584 
4585  if (result == HeapTupleInvisible)
4586  {
4587  /*
4588  * This is possible, but only when locking a tuple for ON CONFLICT
4589  * UPDATE. We return this value here rather than throwing an error in
4590  * order to give that case the opportunity to throw a more specific
4591  * error.
4592  */
4593  result = HeapTupleInvisible;
4594  goto out_locked;
4595  }
4596  else if (result == HeapTupleBeingUpdated || result == HeapTupleUpdated)
4597  {
4598  TransactionId xwait;
4599  uint16 infomask;
4600  uint16 infomask2;
4601  bool require_sleep;
4602  ItemPointerData t_ctid;
4603 
4604  /* must copy state data before unlocking buffer */
4605  xwait = HeapTupleHeaderGetRawXmax(tuple->t_data);
4606  infomask = tuple->t_data->t_infomask;
4607  infomask2 = tuple->t_data->t_infomask2;
4608  ItemPointerCopy(&tuple->t_data->t_ctid, &t_ctid);
4609 
4610  LockBuffer(*buffer, BUFFER_LOCK_UNLOCK);
4611 
4612  /*
4613  * If any subtransaction of the current top transaction already holds
4614  * a lock as strong as or stronger than what we're requesting, we
4615  * effectively hold the desired lock already. We *must* succeed
4616  * without trying to take the tuple lock, else we will deadlock
4617  * against anyone wanting to acquire a stronger lock.
4618  *
4619  * Note we only do this the first time we loop on the HTSU result;
4620  * there is no point in testing in subsequent passes, because
4621  * evidently our own transaction cannot have acquired a new lock after
4622  * the first time we checked.
4623  */
4624  if (first_time)
4625  {
4626  first_time = false;
4627 
4628  if (infomask & HEAP_XMAX_IS_MULTI)
4629  {
4630  int i;
4631  int nmembers;
4632  MultiXactMember *members;
4633 
4634  /*
4635  * We don't need to allow old multixacts here; if that had
4636  * been the case, HeapTupleSatisfiesUpdate would have returned
4637  * MayBeUpdated and we wouldn't be here.
4638  */
4639  nmembers =
4640  GetMultiXactIdMembers(xwait, &members, false,
4641  HEAP_XMAX_IS_LOCKED_ONLY(infomask));
4642 
4643  for (i = 0; i < nmembers; i++)
4644  {
4645  /* only consider members of our own transaction */
4646  if (!TransactionIdIsCurrentTransactionId(members[i].xid))
4647  continue;
4648 
4649  if (TUPLOCK_from_mxstatus(members[i].status) >= mode)
4650  {
4651  pfree(members);
4652  result = HeapTupleMayBeUpdated;
4653  goto out_unlocked;
4654  }
4655  }
4656 
4657  if (members)
4658  pfree(members);
4659  }
4660  else if (TransactionIdIsCurrentTransactionId(xwait))
4661  {
4662  switch (mode)
4663  {
4664  case LockTupleKeyShare:
4665  Assert(HEAP_XMAX_IS_KEYSHR_LOCKED(infomask) ||
4666  HEAP_XMAX_IS_SHR_LOCKED(infomask) ||
4667  HEAP_XMAX_IS_EXCL_LOCKED(infomask));
4668  result = HeapTupleMayBeUpdated;
4669  goto out_unlocked;
4670  case LockTupleShare:
4671  if (HEAP_XMAX_IS_SHR_LOCKED(infomask) ||
4672  HEAP_XMAX_IS_EXCL_LOCKED(infomask))
4673  {
4674  result = HeapTupleMayBeUpdated;
4675  goto out_unlocked;
4676  }
4677  break;
4678  case LockTupleNoKeyExclusive:
4679  if (HEAP_XMAX_IS_EXCL_LOCKED(infomask))
4680  {
4681  result = HeapTupleMayBeUpdated;
4682  goto out_unlocked;
4683  }
4684  break;
4685  case LockTupleExclusive:
4686  if (HEAP_XMAX_IS_EXCL_LOCKED(infomask) &&
4687  infomask2 & HEAP_KEYS_UPDATED)
4688  {
4689  result = HeapTupleMayBeUpdated;
4690  goto out_unlocked;
4691  }
4692  break;
4693  }
4694  }
4695  }
4696 
4697  /*
4698  * Initially assume that we will have to wait for the locking
4699  * transaction(s) to finish. We check various cases below in which
4700  * this can be turned off.
4701  */
4702  require_sleep = true;
4703  if (mode == LockTupleKeyShare)
4704  {
4705  /*
4706  * If we're requesting KeyShare, and there's no update present, we
4707  * don't need to wait. Even if there is an update, we can still
4708  * continue if the key hasn't been modified.
4709  *
4710  * However, if there are updates, we need to walk the update chain
4711  * to mark future versions of the row as locked, too. That way,
4712  * if somebody deletes that future version, we're protected
4713  * against the key going away. This locking of future versions
4714  * could block momentarily, if a concurrent transaction is
4715  * deleting a key; or it could return a value to the effect that
4716  * the transaction deleting the key has already committed. So we
4717  * do this before re-locking the buffer; otherwise this would be
4718  * prone to deadlocks.
4719  *
4720  * Note that the TID we're locking was grabbed before we unlocked
4721  * the buffer. For it to change while we're not looking, the
4722  * other properties we're testing for below after re-locking the
4723  * buffer would also change, in which case we would restart this
4724  * loop above.
4725  */
4726  if (!(infomask2 & HEAP_KEYS_UPDATED))
4727  {
4728  bool updated;
4729 
4730  updated = !HEAP_XMAX_IS_LOCKED_ONLY(infomask);
4731 
4732  /*
4733  * If there are updates, follow the update chain; bail out if
4734  * that cannot be done.
4735  */
4736  if (follow_updates && updated)
4737  {
4738  HTSU_Result res;
4739 
4740  res = heap_lock_updated_tuple(relation, tuple, &t_ctid,
4741  GetCurrentTransactionId(),
4742  mode);
4743  if (res != HeapTupleMayBeUpdated)
4744  {
4745  result = res;
4746  /* recovery code expects to have buffer lock held */
4747  LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4748  goto failed;
4749  }
4750  }
4751 
4752  LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4753 
4754  /*
4755  * Make sure it's still an appropriate lock, else start over.
4756  * Also, if it wasn't updated before we released the lock, but
4757  * is updated now, we start over too; the reason is that we
4758  * now need to follow the update chain to lock the new
4759  * versions.
4760  */
4761  if (!HeapTupleHeaderIsOnlyLocked(tuple->t_data) &&
4762  ((tuple->t_data->t_infomask2 & HEAP_KEYS_UPDATED) ||
4763  !updated))
4764  goto l3;
4765 
4766  /* Things look okay, so we can skip sleeping */
4767  require_sleep = false;
4768 
4769  /*
4770  * Note we allow Xmax to change here; other updaters/lockers
4771  * could have modified it before we grabbed the buffer lock.
4772  * However, this is not a problem, because with the recheck we
4773  * just did we ensure that they still don't conflict with the
4774  * lock we want.
4775  */
4776  }
4777  }
4778  else if (mode == LockTupleShare)
4779  {
4780  /*
4781  * If we're requesting Share, we can similarly avoid sleeping if
4782  * there's no update and no exclusive lock present.
4783  */
4784  if (HEAP_XMAX_IS_LOCKED_ONLY(infomask) &&
4785  !HEAP_XMAX_IS_EXCL_LOCKED(infomask))
4786  {
4787  LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4788 
4789  /*
4790  * Make sure it's still an appropriate lock, else start over.
4791  * See above about allowing xmax to change.
4792  */
4793  if (!HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_data->t_infomask) ||
4794  HEAP_XMAX_IS_EXCL_LOCKED(tuple->t_data->t_infomask))
4795  goto l3;
4796  require_sleep = false;
4797  }
4798  }
4799  else if (mode == LockTupleNoKeyExclusive)
4800  {
4801  /*
4802  * If we're requesting NoKeyExclusive, we might also be able to
4803  * avoid sleeping; just ensure that there is no conflicting lock
4804  * already acquired.
4805  */
4806  if (infomask & HEAP_XMAX_IS_MULTI)
4807  {
4808  if (!DoesMultiXactIdConflict((MultiXactId) xwait, infomask,
4809  mode))
4810  {
4811  /*
4812  * No conflict, but if the xmax changed under us in the
4813  * meantime, start over.
4814  */
4815  LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4816  if (xmax_infomask_changed(tuple->t_data->t_infomask, infomask) ||
4817  !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple->t_data),
4818  xwait))
4819  goto l3;
4820 
4821  /* otherwise, we're good */
4822  require_sleep = false;
4823  }
4824  }
4825  else if (HEAP_XMAX_IS_KEYSHR_LOCKED(infomask))
4826  {
4827  LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4828 
4829  /* if the xmax changed in the meantime, start over */
4830  if (xmax_infomask_changed(tuple->t_data->t_infomask, infomask) ||
4831  !TransactionIdEquals(
4832  HeapTupleHeaderGetRawXmax(tuple->t_data),
4833  xwait))
4834  goto l3;
4835  /* otherwise, we're good */
4836  require_sleep = false;
4837  }
4838  }
4839 
4840  /*
4841  * As a check independent from those above, we can also avoid sleeping
4842  * if the current transaction is the sole locker of the tuple. Note
4843  * that the strength of the lock already held is irrelevant; this is
4844  * not about recording the lock in Xmax (which will be done regardless
4845  * of this optimization, below). Also, note that the cases where we
4846  * hold a lock stronger than we are requesting are already handled
4847  * above by not doing anything.
4848  *
4849  * Note we only deal with the non-multixact case here; MultiXactIdWait
4850  * is well equipped to deal with this situation on its own.
4851  */
4852  if (require_sleep && !(infomask & HEAP_XMAX_IS_MULTI) &&
4853  TransactionIdIsCurrentTransactionId(xwait))
4854  {
4855  /* ... but if the xmax changed in the meantime, start over */
4856  LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4857  if (xmax_infomask_changed(tuple->t_data->t_infomask, infomask) ||
4858  !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple->t_data),
4859  xwait))
4860  goto l3;
4861 
4862  require_sleep = false;
4863  }
4864 
4865  /*
4866  * Time to sleep on the other transaction/multixact, if necessary.
4867  *
4868  * If the other transaction is an update that's already committed,
4869  * then sleeping cannot possibly do any good: if we're required to
4870  * sleep, get out to raise an error instead.
4871  *
4872  * By here, we either have already acquired the buffer exclusive lock,
4873  * or we must wait for the locking transaction or multixact; so below
4874  * we ensure that we grab buffer lock after the sleep.
4875  */
4876  if (require_sleep && result == HeapTupleUpdated)
4877  {
4878  LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4879  goto failed;
4880  }
4881  else if (require_sleep)
4882  {
4883  /*
4884  * Acquire tuple lock to establish our priority for the tuple, or
4885  * die trying. LockTuple will release us when we are next-in-line
4886  * for the tuple. We must do this even if we are share-locking.
4887  *
4888  * If we are forced to "start over" below, we keep the tuple lock;
4889  * this arranges that we stay at the head of the line while
4890  * rechecking tuple state.
4891  */
4892  if (!heap_acquire_tuplock(relation, tid, mode, wait_policy,
4893  &have_tuple_lock))
4894  {
4895  /*
4896  * This can only happen if wait_policy is Skip and the lock
4897  * couldn't be obtained.
4898  */
4899  result = HeapTupleWouldBlock;
4900  /* recovery code expects to have buffer lock held */
4901  LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4902  goto failed;
4903  }
4904 
4905  if (infomask & HEAP_XMAX_IS_MULTI)
4906  {
4907  MultiXactStatus status = get_mxact_status_for_lock(mode, false);
4908 
4909  /* We only ever lock tuples, never update them */
4910  if (status >= MultiXactStatusNoKeyUpdate)
4911  elog(ERROR, "invalid lock mode in heap_lock_tuple");
4912 
4913  /* wait for multixact to end, or die trying */
4914  switch (wait_policy)
4915  {
4916  case LockWaitBlock:
4917  MultiXactIdWait((MultiXactId) xwait, status, infomask,
4918  relation, &tuple->t_self, XLTW_Lock, NULL);
4919  break;
4920  case LockWaitSkip:
4921  if (!ConditionalMultiXactIdWait((MultiXactId) xwait,
4922  status, infomask, relation,
4923  NULL))
4924  {
4925  result = HeapTupleWouldBlock;
4926  /* recovery code expects to have buffer lock held */
4927  LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4928  goto failed;
4929  }
4930  break;
4931  case LockWaitError:
4932  if (!ConditionalMultiXactIdWait((MultiXactId) xwait,
4933  status, infomask, relation,
4934  NULL))
4935  ereport(ERROR,
4936  (errcode(ERRCODE_LOCK_NOT_AVAILABLE),
4937  errmsg("could not obtain lock on row in relation \"%s\"",
4938  RelationGetRelationName(relation))));
4939 
4940  break;
4941  }
4942 
4943  /*
4944  * Of course, the multixact might not be done here: if we're
4945  * requesting a light lock mode, other transactions with light
4946  * locks could still be alive, as well as locks owned by our
4947  * own xact or other subxacts of this backend. We need to
4948  * preserve the surviving MultiXact members. Note that it
4949  * isn't absolutely necessary in the latter case, but doing so
4950  * is simpler.
4951  */
4952  }
4953  else
4954  {
4955  /* wait for regular transaction to end, or die trying */
4956  switch (wait_policy)
4957  {
4958  case LockWaitBlock:
4959  XactLockTableWait(xwait, relation, &tuple->t_self,
4960  XLTW_Lock);
4961  break;
4962  case LockWaitSkip:
4963  if (!ConditionalXactLockTableWait(xwait))
4964  {
4965  result = HeapTupleWouldBlock;
4966  /* recovery code expects to have buffer lock held */
4967  LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4968  goto failed;
4969  }
4970  break;
4971  case LockWaitError:
4972  if (!ConditionalXactLockTableWait(xwait))
4973  ereport(ERROR,
4974  (errcode(ERRCODE_LOCK_NOT_AVAILABLE),
4975  errmsg("could not obtain lock on row in relation \"%s\"",
4976  RelationGetRelationName(relation))));
4977  break;
4978  }
4979  }
4980 
4981  /* if there are updates, follow the update chain */
4982  if (follow_updates && !HEAP_XMAX_IS_LOCKED_ONLY(infomask))
4983  {
4984  HTSU_Result res;
4985 
4986  res = heap_lock_updated_tuple(relation, tuple, &t_ctid,
4987  GetCurrentTransactionId(),
4988  mode);
4989  if (res != HeapTupleMayBeUpdated)
4990  {
4991  result = res;
4992  /* recovery code expects to have buffer lock held */
4993  LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4994  goto failed;
4995  }
4996  }
4997 
4998  LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4999 
5000  /*
5001  * xwait is done, but if xwait had just locked the tuple then some
5002  * other xact could update this tuple before we get to this point.
5003  * Check for xmax change, and start over if so.
5004  */
5005  if (xmax_infomask_changed(tuple->t_data->t_infomask, infomask) ||
5006  !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple->t_data),
5007  xwait))
5008  goto l3;
5009 
5010  if (!(infomask & HEAP_XMAX_IS_MULTI))
5011  {
5012  /*
5013  * Otherwise check if it committed or aborted. Note we cannot
5014  * be here if the tuple was only locked by somebody who didn't
5015  * conflict with us; that would have been handled above. So
5016  * that transaction must necessarily be gone by now. But
5017  * don't check for this in the multixact case, because some
5018  * locker transactions might still be running.
5019  */
5020  UpdateXmaxHintBits(tuple->t_data, *buffer, xwait);
5021  }
5022  }
5023 
5024  /* By here, we're certain that we hold buffer exclusive lock again */
5025 
5026  /*
5027  * We may lock if previous xmax aborted, or if it committed but only
5028  * locked the tuple without updating it; or if we didn't have to wait
5029  * at all for whatever reason.
5030  */
5031  if (!require_sleep ||
5032  (tuple->t_data->t_infomask & HEAP_XMAX_INVALID) ||
5033  HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_data->t_infomask) ||
5034  HeapTupleHeaderIsOnlyLocked(tuple->t_data))
5035  result = HeapTupleMayBeUpdated;
5036  else
5037  result = HeapTupleUpdated;
5038  }
5039 
5040 failed:
5041  if (result != HeapTupleMayBeUpdated)
5042  {
5043  Assert(result == HeapTupleSelfUpdated || result == HeapTupleUpdated ||
5044  result == HeapTupleWouldBlock);
5045  Assert(!(tuple->t_data->t_infomask & HEAP_XMAX_INVALID));
5046  hufd->ctid = tuple->t_data->t_ctid;
5047  hufd->xmax = HeapTupleHeaderGetUpdateXid(tuple->t_data);
5048  if (result == HeapTupleSelfUpdated)
5049  hufd->cmax = HeapTupleHeaderGetCmax(tuple->t_data);
5050  else
5051  hufd->cmax = InvalidCommandId;
5052  goto out_locked;
5053  }
5054 
5055  /*
5056  * If we didn't pin the visibility map page and the page has become all
5057  * visible while we were busy locking the buffer, or during some
5058  * subsequent window during which we had it unlocked, we'll have to unlock
5059  * and re-lock, to avoid holding the buffer lock across I/O. That's a bit
5060  * unfortunate, especially since we'll now have to recheck whether the
5061  * tuple has been locked or updated under us, but hopefully it won't
5062  * happen very often.
5063  */
5064  if (vmbuffer == InvalidBuffer && PageIsAllVisible(page))
5065  {
5066  LockBuffer(*buffer, BUFFER_LOCK_UNLOCK);
5067  visibilitymap_pin(relation, block, &vmbuffer);
5068  LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
5069  goto l3;
5070  }
5071 
5072  xmax = HeapTupleHeaderGetRawXmax(tuple->t_data);
5073  old_infomask = tuple->t_data->t_infomask;
5074 
5075  /*
5076  * If this is the first possibly-multixact-able operation in the current
5077  * transaction, set my per-backend OldestMemberMXactId setting. We can be
5078  * certain that the transaction will never become a member of any older
5079  * MultiXactIds than that. (We have to do this even if we end up just
5080  * using our own TransactionId below, since some other backend could
5081  * incorporate our XID into a MultiXact immediately afterwards.)
5082  */
5083  MultiXactIdSetOldestMember();
5084 
5085  /*
5086  * Compute the new xmax and infomask to store into the tuple. Note we do
5087  * not modify the tuple just yet, because that would leave it in the wrong
5088  * state if multixact.c elogs.
5089  */
5090  compute_new_xmax_infomask(xmax, old_infomask, tuple->t_data->t_infomask2,
5091  GetCurrentTransactionId(), mode, false,
5092  &xid, &new_infomask, &new_infomask2);
5093 
5094  START_CRIT_SECTION();
5095 
5096  /*
5097  * Store transaction information of xact locking the tuple.
5098  *
5099  * Note: Cmax is meaningless in this context, so don't set it; this avoids
5100  * possibly generating a useless combo CID. Moreover, if we're locking a
5101  * previously updated tuple, it's important to preserve the Cmax.
5102  *
5103  * Also reset the HOT UPDATE bit, but only if there's no update; otherwise
5104  * we would break the HOT chain.
5105  */
5106  tuple->t_data->t_infomask &= ~HEAP_XMAX_BITS;
5107  tuple->t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
5108  tuple->t_data->t_infomask |= new_infomask;
5109  tuple->t_data->t_infomask2 |= new_infomask2;
5110  if (HEAP_XMAX_IS_LOCKED_ONLY(new_infomask))
5111  HeapTupleHeaderClearHotUpdated(tuple->t_data);
5112  HeapTupleHeaderSetXmax(tuple->t_data, xid);
5113 
5114  /*
5115  * Make sure there is no forward chain link in t_ctid. Note that in the
5116  * cases where the tuple has been updated, we must not overwrite t_ctid,
5117  * because it was set by the updater. Moreover, if the tuple has been
5118  * updated, we need to follow the update chain to lock the new versions of
5119  * the tuple as well.
5120  */
5121  if (HEAP_XMAX_IS_LOCKED_ONLY(new_infomask))
5122  tuple->t_data->t_ctid = *tid;
5123 
5124  /* Clear only the all-frozen bit on visibility map if needed */
5125  if (PageIsAllVisible(page) &&
5126  visibilitymap_clear(relation, block, vmbuffer,
5128  cleared_all_frozen = true;
5129 
5130 
5131  MarkBufferDirty(*buffer);
5132 
5133  /*
5134  * XLOG stuff. You might think that we don't need an XLOG record because
5135  * there is no state change worth restoring after a crash. You would be
5136  * wrong however: we have just written either a TransactionId or a
5137  * MultiXactId that may never have been seen on disk before, and we need
5138  * to make sure that there are XLOG entries covering those ID numbers.
5139  * Else the same IDs might be re-used after a crash, which would be
5140  * disastrous if this page made it to disk before the crash. Essentially
5141  * we have to enforce the WAL log-before-data rule even in this case.
5142  * (Also, in a PITR log-shipping or 2PC environment, we have to have XLOG
5143  * entries for everything anyway.)
5144  */
5145  if (RelationNeedsWAL(relation))
5146  {
5147  xl_heap_lock xlrec;
5148  XLogRecPtr recptr;
5149 
5150  XLogBeginInsert();
5151  XLogRegisterBuffer(0, *buffer, REGBUF_STANDARD);
5152 
5153  xlrec.offnum = ItemPointerGetOffsetNumber(&tuple->t_self);
5154  xlrec.locking_xid = xid;
5155  xlrec.infobits_set = compute_infobits(new_infomask,
5156  tuple->t_data->t_infomask2);
5157  xlrec.flags = cleared_all_frozen ? XLH_LOCK_ALL_FROZEN_CLEARED : 0;
5158  XLogRegisterData((char *) &xlrec, SizeOfHeapLock);
5159 
5160  /* we don't decode row locks atm, so no need to log the origin */
5161 
5162  recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_LOCK);
5163 
5164  PageSetLSN(page, recptr);
5165  }
5166 
5167  END_CRIT_SECTION();
5168 
5169  result = HeapTupleMayBeUpdated;
5170 
5171 out_locked:
5172  LockBuffer(*buffer, BUFFER_LOCK_UNLOCK);
5173 
5174 out_unlocked:
5175  if (BufferIsValid(vmbuffer))
5176  ReleaseBuffer(vmbuffer);
5177 
5178  /*
5179  * Don't update the visibility map here. Locking a tuple doesn't change
5180  * visibility info.
5181  */
5182 
5183  /*
5184  * Now that we have successfully marked the tuple as locked, we can
5185  * release the lmgr tuple lock, if we had it.
5186  */
5187  if (have_tuple_lock)
5188  UnlockTupleTuplock(relation, tid, mode);
5189 
5190  return result;
5191 }
5192 
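A minimal caller sketch (the function example_lock_row and the particular lock mode are illustrative assumptions): locking a known TID in exclusive mode with SKIP LOCKED semantics via LockWaitSkip, and releasing the buffer pin that heap_lock_tuple leaves with the caller, as documented above.

static bool
example_lock_row(Relation relation, ItemPointer tid)
{
	HeapTupleData tuple;
	Buffer		buffer;
	HeapUpdateFailureData hufd;
	HTSU_Result result;

	tuple.t_self = *tid;		/* only t_self need be valid on input */
	result = heap_lock_tuple(relation, &tuple,
							 GetCurrentCommandId(true),
							 LockTupleExclusive, LockWaitSkip,
							 false /* don't follow the update chain */ ,
							 &buffer, &hufd);
	ReleaseBuffer(buffer);		/* the pin is the caller's responsibility */

	/* HeapTupleWouldBlock means a conflicting lock is held elsewhere */
	return (result == HeapTupleMayBeUpdated);
}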
5193 /*
5194  * Acquire heavyweight lock on the given tuple, in preparation for acquiring
5195  * its normal, Xmax-based tuple lock.
5196  *
5197  * have_tuple_lock is an input and output parameter: on input, it indicates
5198  * whether the lock has previously been acquired (and this function does
5199  * nothing in that case). If this function returns success, have_tuple_lock
5200  * has been flipped to true.
5201  *
5202  * Returns false if it was unable to obtain the lock; this can only happen if
5203  * wait_policy is Skip.
5204  */
5205 static bool
5206 heap_acquire_tuplock(Relation relation, ItemPointer tid, LockTupleMode mode,
5207  LockWaitPolicy wait_policy, bool *have_tuple_lock)
5208 {
5209  if (*have_tuple_lock)
5210  return true;
5211 
5212  switch (wait_policy)
5213  {
5214  case LockWaitBlock:
5215  LockTupleTuplock(relation, tid, mode);
5216  break;
5217 
5218  case LockWaitSkip:
5219  if (!ConditionalLockTupleTuplock(relation, tid, mode))
5220  return false;
5221  break;
5222 
5223  case LockWaitError:
5224  if (!ConditionalLockTupleTuplock(relation, tid, mode))
5225  ereport(ERROR,
5226  (errcode(ERRCODE_LOCK_NOT_AVAILABLE),
5227  errmsg("could not obtain lock on row in relation \"%s\"",
5228  RelationGetRelationName(relation))));
5229  break;
5230  }
5231  *have_tuple_lock = true;
5232 
5233  return true;
5234 }
5235 
5236 /*
5237  * Given an original set of Xmax and infomask, and a transaction (identified by
5238  * add_to_xmax) acquiring a new lock of some mode, compute the new Xmax and
5239  * corresponding infomasks to use on the tuple.
5240  *
5241  * Note that this might have side effects such as creating a new MultiXactId.
5242  *
5243  * Most callers will have called HeapTupleSatisfiesUpdate before this function;
5244  * that will have set the HEAP_XMAX_INVALID bit if the xmax was a MultiXactId
5245  * but it was not running anymore. There is a race condition, which is that the
5246  * MultiXactId may have finished since then, but that uncommon case is handled
5247  * either here, or within MultiXactIdExpand.
5248  *
5249  * There is a similar race condition possible when the old xmax was a regular
5250  * TransactionId. We test TransactionIdIsInProgress again just to narrow the
5251  * window, but it's still possible to end up creating an unnecessary
5252  * MultiXactId. Fortunately this is harmless.
5253  */
5254 static void
5255 compute_new_xmax_infomask(TransactionId xmax, uint16 old_infomask,
5256  uint16 old_infomask2, TransactionId add_to_xmax,
5257  LockTupleMode mode, bool is_update,
5258  TransactionId *result_xmax, uint16 *result_infomask,
5259  uint16 *result_infomask2)
5260 {
5261  TransactionId new_xmax;
5262  uint16 new_infomask,
5263  new_infomask2;
5264 
5265  Assert(TransactionIdIsCurrentTransactionId(add_to_xmax));
5266 
5267 l5:
5268  new_infomask = 0;
5269  new_infomask2 = 0;
5270  if (old_infomask & HEAP_XMAX_INVALID)
5271  {
5272  /*
5273  * No previous locker; we just insert our own TransactionId.
5274  *
5275  * Note that it's critical that this case be the first one checked,
5276  * because there are several blocks below that come back to this one
5277  * to implement certain optimizations; old_infomask might contain
5278  * other dirty bits in those cases, but we don't really care.
5279  */
5280  if (is_update)
5281  {
5282  new_xmax = add_to_xmax;
5283  if (mode == LockTupleExclusive)
5284  new_infomask2 |= HEAP_KEYS_UPDATED;
5285  }
5286  else
5287  {
5288  new_infomask |= HEAP_XMAX_LOCK_ONLY;
5289  switch (mode)
5290  {
5291  case LockTupleKeyShare:
5292  new_xmax = add_to_xmax;
5293  new_infomask |= HEAP_XMAX_KEYSHR_LOCK;
5294  break;
5295  case LockTupleShare:
5296  new_xmax = add_to_xmax;
5297  new_infomask |= HEAP_XMAX_SHR_LOCK;
5298  break;
5299  case LockTupleNoKeyExclusive:
5300  new_xmax = add_to_xmax;
5301  new_infomask |= HEAP_XMAX_EXCL_LOCK;
5302  break;
5303  case LockTupleExclusive:
5304  new_xmax = add_to_xmax;
5305  new_infomask |= HEAP_XMAX_EXCL_LOCK;
5306  new_infomask2 |= HEAP_KEYS_UPDATED;
5307  break;
5308  default:
5309  new_xmax = InvalidTransactionId; /* silence compiler */
5310  elog(ERROR, "invalid lock mode");
5311  }
5312  }
5313  }
5314  else if (old_infomask & HEAP_XMAX_IS_MULTI)
5315  {
5316  MultiXactStatus new_status;
5317 
5318  /*
5319  * Currently we don't allow XMAX_COMMITTED to be set for multis, so
5320  * cross-check.
5321  */
5322  Assert(!(old_infomask & HEAP_XMAX_COMMITTED));
5323 
5324  /*
5325  * A multixact together with LOCK_ONLY set but neither lock bit set
5326  * (i.e. a pg_upgraded share locked tuple) cannot possibly be running
5327  * anymore. This check is critical for databases upgraded by
5328  * pg_upgrade; both MultiXactIdIsRunning and MultiXactIdExpand assume
5329  * that such multis are never passed.
5330  */
5331  if (HEAP_LOCKED_UPGRADED(old_infomask))
5332  {
5333  old_infomask &= ~HEAP_XMAX_IS_MULTI;
5334  old_infomask |= HEAP_XMAX_INVALID;
5335  goto l5;
5336  }
5337 
5338  /*
5339  * If the XMAX is already a MultiXactId, then we need to expand it to
5340  * include add_to_xmax; but if all the members were lockers and are
5341  * all gone, we can do away with the IS_MULTI bit and just set
5342  * add_to_xmax as the only locker/updater. If all lockers are gone
5343  * and we have an updater that aborted, we can also do without a
5344  * multi.
5345  *
5346  * The cost of doing GetMultiXactIdMembers would be paid by
5347  * MultiXactIdExpand if we weren't to do this, so this check is not
5348  * incurring extra work anyhow.
5349  */
5350  if (!MultiXactIdIsRunning(xmax, HEAP_XMAX_IS_LOCKED_ONLY(old_infomask)))
5351  {
5352  if (HEAP_XMAX_IS_LOCKED_ONLY(old_infomask) ||
5353  !TransactionIdDidCommit(MultiXactIdGetUpdateXid(xmax,
5354  old_infomask)))
5355  {
5356  /*
5357  * Reset these bits and restart; otherwise fall through to
5358  * create a new multi below.
5359  */
5360  old_infomask &= ~HEAP_XMAX_IS_MULTI;
5361  old_infomask |= HEAP_XMAX_INVALID;
5362  goto l5;
5363  }
5364  }
5365 
5366  new_status = get_mxact_status_for_lock(mode, is_update);
5367 
5368  new_xmax = MultiXactIdExpand((MultiXactId) xmax, add_to_xmax,
5369  new_status);
5370  GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2);
5371  }
5372  else if (old_infomask & HEAP_XMAX_COMMITTED)
5373  {
5374  /*
5375  * It's a committed update, so we need to preserve it as the updater of
5376  * the tuple.
5377  */
5378  MultiXactStatus status;
5379  MultiXactStatus new_status;
5380 
5381  if (old_infomask2 & HEAP_KEYS_UPDATED)
5382  status = MultiXactStatusUpdate;
5383  else
5384  status = MultiXactStatusNoKeyUpdate;
5385 
5386  new_status = get_mxact_status_for_lock(mode, is_update);
5387 
5388  /*
5389  * since it's not running, it's obviously impossible for the old
5390  * updater to be identical to the current one, so we need not check
5391  * for that case as we do in the block above.
5392  */
5393  new_xmax = MultiXactIdCreate(xmax, status, add_to_xmax, new_status);
5394  GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2);
5395  }
5396  else if (TransactionIdIsInProgress(xmax))
5397  {
5398  /*
5399  * If the XMAX is a valid, in-progress TransactionId, then we need to
5400  * create a new MultiXactId that includes both the old locker or
5401  * updater and our own TransactionId.
5402  */
5403  MultiXactStatus new_status;
5404  MultiXactStatus old_status;
5405  LockTupleMode old_mode;
5406 
5407  if (HEAP_XMAX_IS_LOCKED_ONLY(old_infomask))
5408  {
5409  if (HEAP_XMAX_IS_KEYSHR_LOCKED(old_infomask))
5410  old_status = MultiXactStatusForKeyShare;
5411  else if (HEAP_XMAX_IS_SHR_LOCKED(old_infomask))
5412  old_status = MultiXactStatusForShare;
5413  else if (HEAP_XMAX_IS_EXCL_LOCKED(old_infomask))
5414  {
5415  if (old_infomask2 & HEAP_KEYS_UPDATED)
5416  old_status = MultiXactStatusForUpdate;
5417  else
5418  old_status = MultiXactStatusForNoKeyUpdate;
5419  }
5420  else
5421  {
5422  /*
5423  * LOCK_ONLY can be present alone only when a page has been
5424  * upgraded by pg_upgrade. But in that case,
5425  * TransactionIdIsInProgress() should have returned false. We
5426  * assume it's no longer locked in this case.
5427  */
5428  elog(WARNING, "LOCK_ONLY found for Xid in progress %u", xmax);
5429  old_infomask |= HEAP_XMAX_INVALID;
5430  old_infomask &= ~HEAP_XMAX_LOCK_ONLY;
5431  goto l5;
5432  }
5433  }
5434  else
5435  {
5436  /* it's an update, but which kind? */
5437  if (old_infomask2 & HEAP_KEYS_UPDATED)
5438  old_status = MultiXactStatusUpdate;
5439  else
5440  old_status = MultiXactStatusNoKeyUpdate;
5441  }
5442 
5443  old_mode = TUPLOCK_from_mxstatus(old_status);
5444 
5445  /*
5446  * If the lock to be acquired is for the same TransactionId as the
5447  * existing lock, there's an optimization possible: consider only the
5448  * strongest of both locks as the only one present, and restart.
5449  */
5450  if (xmax == add_to_xmax)
5451  {
5452  /*
5453  * Note that it's not possible for the original tuple to be
5454  * updated: we wouldn't be here because the tuple would have been
5455  * invisible and we wouldn't try to update it. As a subtlety,
5456  * this code can also run when traversing an update chain to lock
5457  * future versions of a tuple. But we wouldn't be here either,
5458  * because the add_to_xmax would be different from the original
5459  * updater.
5460  */
5461  Assert(HEAP_XMAX_IS_LOCKED_ONLY(old_infomask));
5462 
5463  /* acquire the strongest of both */
5464  if (mode < old_mode)
5465  mode = old_mode;
5466  /* mustn't touch is_update */
5467 
5468  old_infomask |= HEAP_XMAX_INVALID;
5469  goto l5;
5470  }
5471 
5472  /* otherwise, just fall back to creating a new multixact */
5473  new_status = get_mxact_status_for_lock(mode, is_update);
5474  new_xmax = MultiXactIdCreate(xmax, old_status,
5475  add_to_xmax, new_status);
5476  GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2);
5477  }
5478  else if (!HEAP_XMAX_IS_LOCKED_ONLY(old_infomask) &&
5479  TransactionIdDidCommit(xmax))
5480  {
5481  /*
5482  * It's a committed update, so we need to preserve it as the updater of
5483  * the tuple.
5484  */
5485  MultiXactStatus status;
5486  MultiXactStatus new_status;
5487 
5488  if (old_infomask2 & HEAP_KEYS_UPDATED)
5489  status = MultiXactStatusUpdate;
5490  else
5491  status = MultiXactStatusNoKeyUpdate;
5492 
5493  new_status = get_mxact_status_for_lock(mode, is_update);
5494 
5495  /*
5496  * since it's not running, it's obviously impossible for the old
5497  * updater to be identical to the current one, so we need not check
5498  * for that case as we do in the block above.
5499  */
5500  new_xmax = MultiXactIdCreate(xmax, status, add_to_xmax, new_status);
5501  GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2);
5502  }
5503  else
5504  {
5505  /*
5506  * Can get here iff the locking/updating transaction was running when
5507  * the infomask was extracted from the tuple, but finished before
5508  * TransactionIdIsInProgress got to run. Deal with it as if there was
5509  * no locker at all in the first place.
5510  */
5511  old_infomask |= HEAP_XMAX_INVALID;
5512  goto l5;
5513  }
5514 
5515  *result_infomask = new_infomask;
5516  *result_infomask2 = new_infomask2;
5517  *result_xmax = new_xmax;
5518 }
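
Editor's note: for the simplest branch of compute_new_xmax_infomask (old_infomask has HEAP_XMAX_INVALID set and the caller is only locking, not updating), the resulting infomask is a pure function of the requested lock mode. The sketch below models just that lock-mode switch with invented DEMO_* constants standing in for the HEAP_* macros; it is an illustration, not a drop-in replacement.

#include <stdint.h>
#include <stdio.h>

/* Invented stand-ins for the HEAP_XMAX_* bits; not the real values. */
#define DEMO_XMAX_LOCK_ONLY    0x0080
#define DEMO_XMAX_KEYSHR_LOCK  0x0010
#define DEMO_XMAX_EXCL_LOCK    0x0040
#define DEMO_XMAX_SHR_LOCK     (DEMO_XMAX_EXCL_LOCK | DEMO_XMAX_KEYSHR_LOCK)
#define DEMO_KEYS_UPDATED      0x2000	/* infomask2 bit */

typedef enum
{
	DemoKeyShare,
	DemoShare,
	DemoNoKeyExclusive,
	DemoExclusive
} DemoLockMode;

/*
 * Mirror of the "no previous locker, lock only" case: the requested lock
 * mode alone determines the xmax lock bits to set on the tuple.
 */
static void
demo_lockonly_bits(DemoLockMode mode, uint16_t *infomask, uint16_t *infomask2)
{
	*infomask = DEMO_XMAX_LOCK_ONLY;
	*infomask2 = 0;

	switch (mode)
	{
		case DemoKeyShare:
			*infomask |= DEMO_XMAX_KEYSHR_LOCK;
			break;
		case DemoShare:
			*infomask |= DEMO_XMAX_SHR_LOCK;
			break;
		case DemoNoKeyExclusive:
			*infomask |= DEMO_XMAX_EXCL_LOCK;
			break;
		case DemoExclusive:
			*infomask |= DEMO_XMAX_EXCL_LOCK;
			*infomask2 |= DEMO_KEYS_UPDATED;
			break;
	}
}

int
main(void)
{
	uint16_t	infomask;
	uint16_t	infomask2;

	demo_lockonly_bits(DemoExclusive, &infomask, &infomask2);
	printf("infomask=0x%04x infomask2=0x%04x\n", infomask, infomask2);
	return 0;
}
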
5519 
5520 /*
5521  * Subroutine for heap_lock_updated_tuple_rec.
5522  *
5523  * Given a hypothetical multixact status held by the transaction identified
5524  * with the given xid, does the current transaction need to wait, fail, or can
5525  * it continue if it wanted to acquire a lock of the given mode? "needwait"
5526  * is set to true if waiting is necessary; if it can continue, then
5527  * HeapTupleMayBeUpdated is returned. If the lock is already held by the
5528  * current transaction, return HeapTupleSelfUpdated. In case of a conflict
5529  * with another transaction, a different HeapTupleSatisfiesUpdate return code
5530  * is returned.
5531  *
5532  * The held status is said to be hypothetical because it might correspond to a
5533  * lock held by a single Xid, i.e. not a real MultiXactId; we express it this
5534  * way for simplicity of API.
5535  */
5536 static HTSU_Result
5537 test_lockmode_for_conflict(MultiXactStatus status, TransactionId xid,
5538  LockTupleMode mode, bool *needwait)
5539 {
5540  MultiXactStatus wantedstatus;
5541 
5542  *needwait = false;
5543  wantedstatus = get_mxact_status_for_lock(mode, false);
5544 
5545  /*
5546  * Note: we *must* check TransactionIdIsInProgress before
5547  * TransactionIdDidAbort/Commit; see comment at top of tqual.c for an
5548  * explanation.
5549  */
5550  if (TransactionIdIsCurrentTransactionId(xid))
5551  {
5552  /*
5553  * The tuple has already been locked by our own transaction. This is
5554  * very rare but can happen if multiple transactions are trying to
5555  * lock an ancient version of the same tuple.
5556  */
5557  return HeapTupleSelfUpdated;
5558  }
5559  else if (TransactionIdIsInProgress(xid))
5560  {
5561  /*
5562  * If the locking transaction is running, what we do depends on
5563  * whether the lock modes conflict: if they do, then we must wait for
5564  * it to finish; otherwise we can fall through to lock this tuple
5565  * version without waiting.
5566  */
5567  if (DoLockModesConflict(LOCKMODE_from_mxstatus(status),
5568  LOCKMODE_from_mxstatus(wantedstatus)))
5569  {
5570  *needwait = true;
5571  }
5572 
5573  /*
5574  * If we set needwait above, then this value doesn't matter;
5575  * otherwise, this value signals to caller that it's okay to proceed.
5576  */
5577  return HeapTupleMayBeUpdated;
5578  }
5579  else if (TransactionIdDidAbort(xid))
5580  return HeapTupleMayBeUpdated;
5581  else if (TransactionIdDidCommit(xid))
5582  {
5583  /*
5584  * The other transaction committed. If it was only a locker, then the
5585  * lock is completely gone now and we can return success; but if it
5586  * was an update, then what we do depends on whether the two lock
5587  * modes conflict. If they conflict, then we must report an error to the
5588  * caller. But if they don't, we can fall through to allow the current
5589  * transaction to lock the tuple.
5590  *
5591  * Note: the reason we worry about ISUPDATE here is because as soon as
5592  * a transaction ends, all its locks are gone and meaningless, and
5593  * thus we can ignore them; whereas its updates persist. In the
5594  * TransactionIdIsInProgress case, above, we don't need to check
5595  * because we know the lock is still "alive" and thus a conflict always
5596  * needs to be checked.
5597  */
5598  if (!ISUPDATE_from_mxstatus(status))
5599  return HeapTupleMayBeUpdated;
5600 
5601  if (DoLockModesConflict(LOCKMODE_from_mxstatus(status),
5602  LOCKMODE_from_mxstatus(wantedstatus)))
5603  /* bummer */
5604  return HeapTupleUpdated;
5605 
5606  return HeapTupleMayBeUpdated;
5607  }
5608 
5609  /* Not in progress, not aborted, not committed -- must have crashed */
5610  return HeapTupleMayBeUpdated;
5611 }
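
Editor's note: test_lockmode_for_conflict reduces to a small outcome table over the holder's transaction state, whether it was an update, and whether the lock modes conflict. The standalone sketch below models that table; the DemoHolderState/DemoOutcome enums and demo_decide are invented stand-ins for the transaction-status checks and the HTSU_Result codes, not PostgreSQL APIs.

#include <stdbool.h>
#include <stdio.h>

typedef enum
{
	HOLDER_IS_SELF,				/* our own transaction holds it */
	HOLDER_IN_PROGRESS,			/* another transaction, still running */
	HOLDER_ABORTED,				/* it aborted */
	HOLDER_COMMITTED,			/* it committed */
	HOLDER_CRASHED				/* none of the above: treat as gone */
} DemoHolderState;

typedef enum
{
	DEMO_MAY_PROCEED,			/* analogous to HeapTupleMayBeUpdated */
	DEMO_SELF_LOCKED,			/* analogous to HeapTupleSelfUpdated */
	DEMO_CONFLICT				/* analogous to HeapTupleUpdated */
} DemoOutcome;

/*
 * Outcome table modelled on the function above: "was_update" says whether
 * the old holder updated the tuple (its effects outlive it), and
 * "modes_conflict" whether its lock mode clashes with the wanted one.
 * *needwait is set only for a conflicting, still-running holder.
 */
static DemoOutcome
demo_decide(DemoHolderState holder, bool was_update, bool modes_conflict,
			bool *needwait)
{
	*needwait = false;

	switch (holder)
	{
		case HOLDER_IS_SELF:
			return DEMO_SELF_LOCKED;
		case HOLDER_IN_PROGRESS:
			if (modes_conflict)
				*needwait = true;
			return DEMO_MAY_PROCEED;
		case HOLDER_ABORTED:
			return DEMO_MAY_PROCEED;	/* its locks and updates are void */
		case HOLDER_COMMITTED:
			/* only a committed update can still conflict; locks die with it */
			return (was_update && modes_conflict) ? DEMO_CONFLICT : DEMO_MAY_PROCEED;
		case HOLDER_CRASHED:
			break;
	}
	return DEMO_MAY_PROCEED;
}

int
main(void)
{
	bool		needwait;

	printf("committed conflicting update -> %d\n",
		   demo_decide(HOLDER_COMMITTED, true, true, &needwait));
	printf("in-progress conflicting locker -> %d (needwait=%d)\n",
		   demo_decide(HOLDER_IN_PROGRESS, false, true, &needwait), needwait);
	return 0;
}
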
5612 
5613 
5614 /*
5615  * Recursive part of heap_lock_updated_tuple
5616  *
5617  * Fetch the tuple pointed to by tid in rel, and mark it as locked by the given
5618  * xid with the given mode; if this tuple is updated, recurse to lock the new
5619  * version as well.
5620  */
5621 static HTSU_Result
5622 heap_lock_updated_tuple_rec(Relation rel, ItemPointer tid, TransactionId xid,
5623  LockTupleMode mode)
5624 {
5625  HTSU_Result result;
5626  ItemPointerData tupid;
5627  HeapTupleData mytup;
5628  Buffer buf;
5629  uint16 new_infomask,
5630  new_infomask2,
5631  old_infomask,
5632  old_infomask2;
5633  TransactionId xmax,
5634  new_xmax;
5635  TransactionId priorXmax = InvalidTransactionId;
5636  bool cleared_all_frozen = false;
5637  Buffer vmbuffer = InvalidBuffer;
5638  BlockNumber block;
5639 
5640  ItemPointerCopy(tid, &tupid);
5641 
5642  for (;;)
5643  {
5644  new_infomask = 0;
5645  new_xmax = InvalidTransactionId;
5646  block = ItemPointerGetBlockNumber(&tupid);
5647  ItemPointerCopy(&tupid, &(mytup.t_self));
5648 
5649  if (!heap_fetch(rel, SnapshotAny, &mytup, &buf, false, NULL))
5650  {
5651  /*
5652  * if we fail to find the updated version of the tuple, it's
5653  * because it was vacuumed/pruned away after its creator
5654  * transaction aborted. So behave as if we got to the end of the
5655  * chain, and there's no further tuple to lock: return success to
5656  * caller.
5657  */
5658  return HeapTupleMayBeUpdated;
5659  }
5660 
5661 l4:
5662  CHECK_FOR_INTERRUPTS();
5663 
5664  /*
5665  * Before locking the buffer, pin the visibility map page if it
5666  * appears to be necessary. Since we haven't got the lock yet,
5667  * someone else might be in the middle of changing this, so we'll need
5668  * to recheck after we have the lock.
5669  */
5670  if (PageIsAllVisible(BufferGetPage(buf)))
5671  visibilitymap_pin(rel, block, &vmbuffer);
5672  else
5673  vmbuffer = InvalidBuffer;
5674 
5675  LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
5676 
5677  /*
5678  * If we didn't pin the visibility map page and the page has become
5679  * all visible while we were busy locking the buffer, we'll have to
5680  * unlock and re-lock, to avoid holding the buffer lock across I/O.
5681  * That's a bit unfortunate, but hopefully shouldn't happen often.
5682  */
5683  if (vmbuffer == InvalidBuffer && PageIsAllVisible(BufferGetPage(buf)))
5684  {
5685  LockBuffer(buf, BUFFER_LOCK_UNLOCK);
5686  visibilitymap_pin(rel, block, &vmbuffer);
5687  LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
5688  }
5689 
5690  /*
5691  * Check the tuple XMIN against prior XMAX, if any. If we reached the
5692  * end of the chain, we're done, so return success.
5693  */
5694  if (TransactionIdIsValid(priorXmax) &&
5695  !TransactionIdEquals(HeapTupleHeaderGetXmin(mytup.t_data),
5696  priorXmax))
5697  {
5698  result = HeapTupleMayBeUpdated;
5699  goto out_locked;
5700  }
5701 
5702  /*
5703  * Also check Xmin: if this tuple was created by an aborted
5704  * (sub)transaction, then we already locked the last live one in the
5705  * chain, thus we're done, so return success.
5706  */
5707  if (TransactionIdDidAbort(HeapTupleHeaderGetXmin(mytup.t_data)))
5708  {
5709  UnlockReleaseBuffer(buf);
5710  return HeapTupleMayBeUpdated;
5711  }
5712 
5713  old_infomask = mytup.t_data->t_infomask;
5714  old_infomask2 = mytup.t_data->t_infomask2;
5715  xmax = HeapTupleHeaderGetRawXmax(mytup.t_data);
5716 
5717  /*
5718  * If this tuple version has been updated or locked by some concurrent
5719  * transaction(s), what we do depends on whether our lock mode
5720  * conflicts with what those other transactions hold, and also on the
5721  * status of them.
5722  */
5723  if (!(old_infomask & HEAP_XMAX_INVALID))
5724  {
5725  TransactionId rawxmax;
5726  bool needwait;
5727 
5728  rawxmax = HeapTupleHeaderGetRawXmax(mytup.t_data);
5729  if (old_infomask & HEAP_XMAX_IS_MULTI)
5730  {
5731  int nmembers;
5732  int i;
5733  MultiXactMember *members;
5734 
5735  /*
5736  * We don't need a test for pg_upgrade'd tuples: this is only
5737  * applied to tuples after the first in an update chain. Said
5738  * first tuple in the chain may well be locked-in-9.2-and-
5739  * pg_upgraded, but that one was already locked by our caller,
5740  * not us; and any subsequent ones cannot be because our
5741  * caller must necessarily have obtained a snapshot later than
5742  * the pg_upgrade itself.
5743  */
5744  Assert(!HEAP_LOCKED_UPGRADED(mytup.t_data->t_infomask));
5745 
5746  nmembers = GetMultiXactIdMembers(rawxmax, &members, false,
5747  HEAP_XMAX_IS_LOCKED_ONLY(old_infomask));
5748  for (i = 0; i < nmembers; i++)
5749  {
5750  result = test_lockmode_for_conflict(members[i].status,
5751  members[i].xid,
5752  mode, &needwait);
5753 
5754  /*
5755  * If the tuple was already locked by ourselves in a
5756  * previous iteration of this (say heap_lock_tuple was
5757  * forced to restart the locking loop because of a change
5758  * in xmax), then we hold the lock already on this tuple
5759  * version and we don't need to do anything; and this is
5760  * not an error condition either. We just need to skip
5761  * this tuple and continue locking the next version in the
5762  * update chain.
5763  */
5764  if (result == HeapTupleSelfUpdated)
5765  {
5766  pfree(members);
5767  goto next;
5768  }
5769 
5770  if (needwait)
5771  {
5772  LockBuffer(buf, BUFFER_LOCK_UNLOCK);
5773  XactLockTableWait(members[i].xid, rel,
5774  &mytup.t_self,
5775  XLTW_LockUpdated);
5776  pfree(members);
5777  goto l4;
5778  }
5779  if (result != HeapTupleMayBeUpdated)
5780  {
5781  pfree(members);
5782  goto out_locked;
5783  }
5784  }
5785  if (members)
5786  pfree(members);
5787  }
5788  else
5789  {
5790  MultiXactStatus status;
5791 
5792  /*
5793  * For a non-multi Xmax, we first need to compute the
5794  * corresponding MultiXactStatus by using the infomask bits.
5795  */
5796  if (HEAP_XMAX_IS_LOCKED_ONLY(old_infomask))
5797  {
5798  if (HEAP_XMAX_IS_KEYSHR_LOCKED(old_infomask))
5799  status = MultiXactStatusForKeyShare;
5800  else if (HEAP_XMAX_IS_SHR_LOCKED(old_infomask))
5801  status = MultiXactStatusForShare;
5802  else if (HEAP_XMAX_IS_EXCL_LOCKED(old_infomask))
5803  {
5804  if (old_infomask2 & HEAP_KEYS_UPDATED)
5805  status = MultiXactStatusForUpdate;
5806  else
5807  status = MultiXactStatusForNoKeyUpdate;
5808  }
5809  else
5810  {
5811  /*
5812  * LOCK_ONLY present alone (a pg_upgraded tuple marked
5813  * as share-locked in the old cluster) shouldn't be
5814  * seen in the middle of an update chain.
5815  */
5816  elog(ERROR, "invalid lock status in tuple");
5817  }
5818  }
5819  else
5820  {
5821  /* it's an update, but which kind? */
5822  if (old_infomask2 & HEAP_KEYS_UPDATED)
5823  status = MultiXactStatusUpdate;
5824  else
5825  status = MultiXactStatusNoKeyUpdate;
5826  }
5827 
5828  result = test_lockmode_for_conflict(status, rawxmax, mode,
5829  &needwait);
5830 
5831  /*
5832  * If the tuple was already locked by ourselves in a previous
5833  * iteration of this (say heap_lock_tuple was forced to
5834  * restart the locking loop because of a change in xmax), then
5835  * we hold the lock already on this tuple version and we don't
5836  * need to do anything; and this is not an error condition
5837  * either. We just need to skip this tuple and continue
5838  * locking the next version in the update chain.
5839  */
5840  if (result == HeapTupleSelfUpdated)
5841  goto next;
5842 
5843  if (needwait)
5844  {
5845  LockBuffer(buf, BUFFER_LOCK_UNLOCK);
5846  XactLockTableWait(rawxmax, rel, &mytup.t_self,
5847