heapam.c
1 /*-------------------------------------------------------------------------
2  *
3  * heapam.c
4  * heap access method code
5  *
6  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  *
10  * IDENTIFICATION
11  * src/backend/access/heap/heapam.c
12  *
13  *
14  * INTERFACE ROUTINES
15  * relation_open - open any relation by relation OID
16  * relation_openrv - open any relation specified by a RangeVar
17  * relation_close - close any relation
18  * heap_open - open a heap relation by relation OID
19  * heap_openrv - open a heap relation specified by a RangeVar
20  * heap_close - (now just a macro for relation_close)
21  * heap_beginscan - begin relation scan
22  * heap_rescan - restart a relation scan
23  * heap_endscan - end relation scan
24  * heap_getnext - retrieve next tuple in scan
25  * heap_fetch - retrieve tuple with given tid
26  * heap_insert - insert tuple into a relation
27  * heap_multi_insert - insert multiple tuples into a relation
28  * heap_delete - delete a tuple from a relation
29  * heap_update - replace a tuple in a relation with another tuple
30  * heap_sync - sync heap, for when no WAL has been written
31  *
32  * NOTES
33  * This file contains the heap_ routines which implement
34  * the POSTGRES heap access method used for all POSTGRES
35  * relations.
36  *
37  *-------------------------------------------------------------------------
38  */
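/*
 * Usage sketch (illustrative only): a caller typically chains the interface
 * routines listed above roughly as follows, assuming a valid relation OID
 * "my_relid" and an active snapshot supplied by the executor:
 *
 *     Relation     rel  = heap_open(my_relid, AccessShareLock);
 *     HeapScanDesc scan = heap_beginscan(rel, GetActiveSnapshot(), 0, NULL);
 *     HeapTuple    tup;
 *
 *     while ((tup = heap_getnext(scan, ForwardScanDirection)) != NULL)
 *     {
 *         // "tup" points into a pinned buffer; it is only valid until the
 *         // next heap_getnext()/heap_endscan() call
 *     }
 *     heap_endscan(scan);
 *     heap_close(rel, AccessShareLock);
 */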
39 #include "postgres.h"
40 
41 #include "access/bufmask.h"
42 #include "access/heapam.h"
43 #include "access/heapam_xlog.h"
44 #include "access/hio.h"
45 #include "access/multixact.h"
46 #include "access/parallel.h"
47 #include "access/relscan.h"
48 #include "access/sysattr.h"
49 #include "access/transam.h"
50 #include "access/tuptoaster.h"
51 #include "access/valid.h"
52 #include "access/visibilitymap.h"
53 #include "access/xact.h"
54 #include "access/xlog.h"
55 #include "access/xloginsert.h"
56 #include "access/xlogutils.h"
57 #include "catalog/catalog.h"
58 #include "catalog/namespace.h"
59 #include "miscadmin.h"
60 #include "pgstat.h"
61 #include "storage/bufmgr.h"
62 #include "storage/freespace.h"
63 #include "storage/lmgr.h"
64 #include "storage/predicate.h"
65 #include "storage/procarray.h"
66 #include "storage/smgr.h"
67 #include "storage/spin.h"
68 #include "storage/standby.h"
69 #include "utils/datum.h"
70 #include "utils/inval.h"
71 #include "utils/lsyscache.h"
72 #include "utils/relcache.h"
73 #include "utils/snapmgr.h"
74 #include "utils/syscache.h"
75 #include "utils/tqual.h"
76 
77 
78 /* GUC variable */
79 bool synchronize_seqscans = true;
80 
81 
82 static HeapScanDesc heap_beginscan_internal(Relation relation,
83  Snapshot snapshot,
84  int nkeys, ScanKey key,
85  ParallelHeapScanDesc parallel_scan,
86  bool allow_strat,
87  bool allow_sync,
88  bool allow_pagemode,
89  bool is_bitmapscan,
90  bool is_samplescan,
91  bool temp_snap);
93 static HeapTuple heap_prepare_insert(Relation relation, HeapTuple tup,
94  TransactionId xid, CommandId cid, int options);
95 static XLogRecPtr log_heap_update(Relation reln, Buffer oldbuf,
96  Buffer newbuf, HeapTuple oldtup,
97  HeapTuple newtup, HeapTuple old_key_tup,
98  bool all_visible_cleared, bool new_all_visible_cleared);
99 static void HeapSatisfiesHOTandKeyUpdate(Relation relation,
100  Bitmapset *hot_attrs,
101  Bitmapset *key_attrs, Bitmapset *id_attrs,
102  bool *satisfies_hot, bool *satisfies_key,
103  bool *satisfies_id,
104  HeapTuple oldtup, HeapTuple newtup);
105 static bool heap_acquire_tuplock(Relation relation, ItemPointer tid,
106  LockTupleMode mode, LockWaitPolicy wait_policy,
107  bool *have_tuple_lock);
108 static void compute_new_xmax_infomask(TransactionId xmax, uint16 old_infomask,
109  uint16 old_infomask2, TransactionId add_to_xmax,
110  LockTupleMode mode, bool is_update,
111  TransactionId *result_xmax, uint16 *result_infomask,
112  uint16 *result_infomask2);
113 static HTSU_Result heap_lock_updated_tuple(Relation rel, HeapTuple tuple,
114  ItemPointer ctid, TransactionId xid,
115  LockTupleMode mode);
116 static void GetMultiXactIdHintBits(MultiXactId multi, uint16 *new_infomask,
117  uint16 *new_infomask2);
118 static TransactionId MultiXactIdGetUpdateXid(TransactionId xmax,
119  uint16 t_infomask);
120 static bool DoesMultiXactIdConflict(MultiXactId multi, uint16 infomask,
121  LockTupleMode lockmode);
122 static void MultiXactIdWait(MultiXactId multi, MultiXactStatus status, uint16 infomask,
123  Relation rel, ItemPointer ctid, XLTW_Oper oper,
124  int *remaining);
125 static bool ConditionalMultiXactIdWait(MultiXactId multi, MultiXactStatus status,
126  uint16 infomask, Relation rel, int *remaining);
127 static XLogRecPtr log_heap_new_cid(Relation relation, HeapTuple tup);
128 static HeapTuple ExtractReplicaIdentity(Relation rel, HeapTuple tup, bool key_modified,
129  bool *copy);
130 
131 
132 /*
133  * Each tuple lock mode has a corresponding heavyweight lock, and one or two
134  * corresponding MultiXactStatuses (one to merely lock tuples, another one to
135  * update them). This table (and the macros below) helps us determine the
136  * heavyweight lock mode and MultiXactStatus values to use for any particular
137  * tuple lock strength.
138  *
139  * Don't look at lockstatus/updstatus directly! Use get_mxact_status_for_lock
140  * instead.
141  */
142 static const struct
143 {
144  LOCKMODE hwlock;
145  int lockstatus;
146  int updstatus;
147 }
148 
149  tupleLockExtraInfo[MaxLockTupleMode + 1] =
150 {
151  { /* LockTupleKeyShare */
152  AccessShareLock,
153  MultiXactStatusForKeyShare,
154  -1 /* KeyShare does not allow updating tuples */
155  },
156  { /* LockTupleShare */
157  RowShareLock,
158  MultiXactStatusForShare,
159  -1 /* Share does not allow updating tuples */
160  },
161  { /* LockTupleNoKeyExclusive */
162  ExclusiveLock,
163  MultiXactStatusForNoKeyUpdate,
164  MultiXactStatusNoKeyUpdate
165  },
166  { /* LockTupleExclusive */
167  AccessExclusiveLock,
168  MultiXactStatusForUpdate,
169  MultiXactStatusUpdate
170  }
171 };
172 
173 /* Get the LOCKMODE for a given MultiXactStatus */
174 #define LOCKMODE_from_mxstatus(status) \
175  (tupleLockExtraInfo[TUPLOCK_from_mxstatus((status))].hwlock)
176 
177 /*
178  * Acquire heavyweight locks on tuples, using a LockTupleMode strength value.
179  * This is more readable than having every caller translate it to lock.h's
180  * LOCKMODE.
181  */
182 #define LockTupleTuplock(rel, tup, mode) \
183  LockTuple((rel), (tup), tupleLockExtraInfo[mode].hwlock)
184 #define UnlockTupleTuplock(rel, tup, mode) \
185  UnlockTuple((rel), (tup), tupleLockExtraInfo[mode].hwlock)
186 #define ConditionalLockTupleTuplock(rel, tup, mode) \
187  ConditionalLockTuple((rel), (tup), tupleLockExtraInfo[mode].hwlock)
188 
189 /*
190  * This table maps tuple lock strength values for each particular
191  * MultiXactStatus value.
192  */
193 static const int MultiXactStatusLock[MaxMultiXactStatus + 1] =
194 {
195  LockTupleKeyShare, /* ForKeyShare */
196  LockTupleShare, /* ForShare */
197  LockTupleNoKeyExclusive, /* ForNoKeyUpdate */
198  LockTupleExclusive, /* ForUpdate */
199  LockTupleNoKeyExclusive, /* NoKeyUpdate */
200  LockTupleExclusive /* Update */
201 };
202 
203 /* Get the LockTupleMode for a given MultiXactStatus */
204 #define TUPLOCK_from_mxstatus(status) \
205  (MultiXactStatusLock[(status)])
206 
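/*
 * Worked example (illustrative only): a FOR SHARE tuple lock request maps
 * through the two tables above as
 *
 *     TUPLOCK_from_mxstatus(MultiXactStatusForShare)  == LockTupleShare
 *     LOCKMODE_from_mxstatus(MultiXactStatusForShare) == RowShareLock
 *
 * and tupleLockExtraInfo[LockTupleShare].updstatus is -1, since a share lock
 * never updates the tuple.
 */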
207 /* ----------------------------------------------------------------
208  * heap support routines
209  * ----------------------------------------------------------------
210  */
211 
212 /* ----------------
213  * initscan - scan code common to heap_beginscan and heap_rescan
214  * ----------------
215  */
216 static void
217 initscan(HeapScanDesc scan, ScanKey key, bool keep_startblock)
218 {
219  bool allow_strat;
220  bool allow_sync;
221 
222  /*
223  * Determine the number of blocks we have to scan.
224  *
225  * It is sufficient to do this once at scan start, since any tuples added
226  * while the scan is in progress will be invisible to my snapshot anyway.
227  * (That is not true when using a non-MVCC snapshot. However, we couldn't
228  * guarantee to return tuples added after scan start anyway, since they
229  * might go into pages we already scanned. To guarantee consistent
230  * results for a non-MVCC snapshot, the caller must hold some higher-level
231  * lock that ensures the interesting tuple(s) won't change.)
232  */
233  if (scan->rs_parallel != NULL)
234  scan->rs_nblocks = scan->rs_parallel->phs_nblocks;
235  else
236  scan->rs_nblocks = RelationGetNumberOfBlocks(scan->rs_rd);
237 
238  /*
239  * If the table is large relative to NBuffers, use a bulk-read access
240  * strategy and enable synchronized scanning (see syncscan.c). Although
241  * the thresholds for these features could be different, we make them the
242  * same so that there are only two behaviors to tune rather than four.
243  * (However, some callers need to be able to disable one or both of these
244  * behaviors, independently of the size of the table; also there is a GUC
245  * variable that can disable synchronized scanning.)
246  *
247  * Note that heap_parallelscan_initialize has a very similar test; if you
248  * change this, consider changing that one, too.
249  */
250  if (!RelationUsesLocalBuffers(scan->rs_rd) &&
251  scan->rs_nblocks > NBuffers / 4)
252  {
253  allow_strat = scan->rs_allow_strat;
254  allow_sync = scan->rs_allow_sync;
255  }
256  else
257  allow_strat = allow_sync = false;
258 
259  if (allow_strat)
260  {
261  /* During a rescan, keep the previous strategy object. */
262  if (scan->rs_strategy == NULL)
263  scan->rs_strategy = GetAccessStrategy(BAS_BULKREAD);
264  }
265  else
266  {
267  if (scan->rs_strategy != NULL)
268  FreeAccessStrategy(scan->rs_strategy);
269  scan->rs_strategy = NULL;
270  }
271 
272  if (scan->rs_parallel != NULL)
273  {
274  /* For parallel scan, believe whatever ParallelHeapScanDesc says. */
275  scan->rs_syncscan = scan->rs_parallel->phs_syncscan;
276  }
277  else if (keep_startblock)
278  {
279  /*
280  * When rescanning, we want to keep the previous startblock setting,
281  * so that rewinding a cursor doesn't generate surprising results.
282  * Reset the active syncscan setting, though.
283  */
284  scan->rs_syncscan = (allow_sync && synchronize_seqscans);
285  }
286  else if (allow_sync && synchronize_seqscans)
287  {
288  scan->rs_syncscan = true;
289  scan->rs_startblock = ss_get_location(scan->rs_rd, scan->rs_nblocks);
290  }
291  else
292  {
293  scan->rs_syncscan = false;
294  scan->rs_startblock = 0;
295  }
296 
297  scan->rs_numblocks = InvalidBlockNumber;
298  scan->rs_inited = false;
299  scan->rs_ctup.t_data = NULL;
300  ItemPointerSetInvalid(&scan->rs_ctup.t_self);
301  scan->rs_cbuf = InvalidBuffer;
302  scan->rs_cblock = InvalidBlockNumber;
303 
304  /* page-at-a-time fields are always invalid when not rs_inited */
305 
306  /*
307  * copy the scan key, if appropriate
308  */
309  if (key != NULL)
310  memcpy(scan->rs_key, key, scan->rs_nkeys * sizeof(ScanKeyData));
311 
312  /*
313  * Currently, we don't have a stats counter for bitmap heap scans (but the
314  * underlying bitmap index scans will be counted) or sample scans (we only
315  * update stats for tuple fetches there)
316  */
317  if (!scan->rs_bitmapscan && !scan->rs_samplescan)
318  pgstat_count_heap_scan(scan->rs_rd);
319 }
320 
321 /*
322  * heap_setscanlimits - restrict range of a heapscan
323  *
324  * startBlk is the page to start at
325  * numBlks is number of pages to scan (InvalidBlockNumber means "all")
326  */
327 void
328 heap_setscanlimits(HeapScanDesc scan, BlockNumber startBlk, BlockNumber numBlks)
329 {
330  Assert(!scan->rs_inited); /* else too late to change */
331  Assert(!scan->rs_syncscan); /* else rs_startblock is significant */
332 
333  /* Check startBlk is valid (but allow case of zero blocks...) */
334  Assert(startBlk == 0 || startBlk < scan->rs_nblocks);
335 
336  scan->rs_startblock = startBlk;
337  scan->rs_numblocks = numBlks;
338 }
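/*
 * Usage sketch (illustrative only): to honor the Asserts above, the limits
 * must be installed before the first heap_getnext() and the scan must have
 * been started with syncscan disabled, e.g.
 *
 *     HeapScanDesc scan = heap_beginscan_strat(rel, snapshot, 0, NULL,
 *                                              true,   // allow_strat
 *                                              false); // allow_sync
 *     heap_setscanlimits(scan, 0, 32);   // scan only blocks 0..31
 *
 * "rel" and "snapshot" are assumed to be a heap Relation and a registered
 * Snapshot already held by the caller.
 */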
339 
340 /*
341  * heapgetpage - subroutine for heapgettup()
342  *
343  * This routine reads and pins the specified page of the relation.
344  * In page-at-a-time mode it performs additional work, namely determining
345  * which tuples on the page are visible.
346  */
347 void
348 heapgetpage(HeapScanDesc scan, BlockNumber page)
349 {
350  Buffer buffer;
351  Snapshot snapshot;
352  Page dp;
353  int lines;
354  int ntup;
355  OffsetNumber lineoff;
356  ItemId lpp;
357  bool all_visible;
358 
359  Assert(page < scan->rs_nblocks);
360 
361  /* release previous scan buffer, if any */
362  if (BufferIsValid(scan->rs_cbuf))
363  {
364  ReleaseBuffer(scan->rs_cbuf);
365  scan->rs_cbuf = InvalidBuffer;
366  }
367 
368  /*
369  * Be sure to check for interrupts at least once per page. Checks at
370  * higher code levels won't be able to stop a seqscan that encounters many
371  * pages' worth of consecutive dead tuples.
372  */
373  CHECK_FOR_INTERRUPTS();
374 
375  /* read page using selected strategy */
376  scan->rs_cbuf = ReadBufferExtended(scan->rs_rd, MAIN_FORKNUM, page,
377  RBM_NORMAL, scan->rs_strategy);
378  scan->rs_cblock = page;
379 
380  if (!scan->rs_pageatatime)
381  return;
382 
383  buffer = scan->rs_cbuf;
384  snapshot = scan->rs_snapshot;
385 
386  /*
387  * Prune and repair fragmentation for the whole page, if possible.
388  */
389  heap_page_prune_opt(scan->rs_rd, buffer);
390 
391  /*
392  * We must hold share lock on the buffer content while examining tuple
393  * visibility. Afterwards, however, the tuples we have found to be
394  * visible are guaranteed good as long as we hold the buffer pin.
395  */
396  LockBuffer(buffer, BUFFER_LOCK_SHARE);
397 
398  dp = BufferGetPage(buffer);
399  TestForOldSnapshot(snapshot, scan->rs_rd, dp);
400  lines = PageGetMaxOffsetNumber(dp);
401  ntup = 0;
402 
403  /*
404  * If the all-visible flag indicates that all tuples on the page are
405  * visible to everyone, we can skip the per-tuple visibility tests.
406  *
407  * Note: In hot standby, a tuple that's already visible to all
408  * transactions in the master might still be invisible to a read-only
409  * transaction in the standby. We partly handle this problem by tracking
410  * the minimum xmin of visible tuples as the cut-off XID while marking a
411  * page all-visible on the master and WAL-logging that along with the visibility
412  * map SET operation. In hot standby, we wait for (or abort) all
413  * transactions that might not see one or more tuples on the
414  * page. That's how index-only scans work fine in hot standby. A crucial
415  * difference between index-only scans and heap scans is that the
416  * index-only scan completely relies on the visibility map, whereas a heap
417  * scan looks at the page-level PD_ALL_VISIBLE flag. We are not sure if
418  * the page-level flag can be trusted in the same way, because it might
419  * get propagated somehow without being explicitly WAL-logged, e.g. via a
420  * full page write. Until we can prove that beyond doubt, let's check each
421  * tuple for visibility the hard way.
422  */
423  all_visible = PageIsAllVisible(dp) && !snapshot->takenDuringRecovery;
424 
425  for (lineoff = FirstOffsetNumber, lpp = PageGetItemId(dp, lineoff);
426  lineoff <= lines;
427  lineoff++, lpp++)
428  {
429  if (ItemIdIsNormal(lpp))
430  {
431  HeapTupleData loctup;
432  bool valid;
433 
434  loctup.t_tableOid = RelationGetRelid(scan->rs_rd);
435  loctup.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
436  loctup.t_len = ItemIdGetLength(lpp);
437  ItemPointerSet(&(loctup.t_self), page, lineoff);
438 
439  if (all_visible)
440  valid = true;
441  else
442  valid = HeapTupleSatisfiesVisibility(&loctup, snapshot, buffer);
443 
444  CheckForSerializableConflictOut(valid, scan->rs_rd, &loctup,
445  buffer, snapshot);
446 
447  if (valid)
448  scan->rs_vistuples[ntup++] = lineoff;
449  }
450  }
451 
452  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
453 
454  Assert(ntup <= MaxHeapTuplesPerPage);
455  scan->rs_ntuples = ntup;
456 }
457 
458 /* ----------------
459  * heapgettup - fetch next heap tuple
460  *
461  * Initialize the scan if not already done; then advance to the next
462  * tuple as indicated by "dir"; return the next tuple in scan->rs_ctup,
463  * or set scan->rs_ctup.t_data = NULL if no more tuples.
464  *
465  * dir == NoMovementScanDirection means "re-fetch the tuple indicated
466  * by scan->rs_ctup".
467  *
468  * Note: the reason nkeys/key are passed separately, even though they are
469  * kept in the scan descriptor, is that the caller may not want us to check
470  * the scankeys.
471  *
472  * Note: when we fall off the end of the scan in either direction, we
473  * reset rs_inited. This means that a further request with the same
474  * scan direction will restart the scan, which is a bit odd, but a
475  * request with the opposite scan direction will start a fresh scan
476  * in the proper direction. The latter is required behavior for cursors,
477  * while the former case is generally undefined behavior in Postgres
478  * so we don't care too much.
479  * ----------------
480  */
481 static void
482 heapgettup(HeapScanDesc scan,
483  ScanDirection dir,
484  int nkeys,
485  ScanKey key)
486 {
487  HeapTuple tuple = &(scan->rs_ctup);
488  Snapshot snapshot = scan->rs_snapshot;
489  bool backward = ScanDirectionIsBackward(dir);
490  BlockNumber page;
491  bool finished;
492  Page dp;
493  int lines;
494  OffsetNumber lineoff;
495  int linesleft;
496  ItemId lpp;
497 
498  /*
499  * calculate next starting lineoff, given scan direction
500  */
501  if (ScanDirectionIsForward(dir))
502  {
503  if (!scan->rs_inited)
504  {
505  /*
506  * return null immediately if relation is empty
507  */
508  if (scan->rs_nblocks == 0 || scan->rs_numblocks == 0)
509  {
510  Assert(!BufferIsValid(scan->rs_cbuf));
511  tuple->t_data = NULL;
512  return;
513  }
514  if (scan->rs_parallel != NULL)
515  {
516  page = heap_parallelscan_nextpage(scan);
517 
518  /* Other processes might have already finished the scan. */
519  if (page == InvalidBlockNumber)
520  {
521  Assert(!BufferIsValid(scan->rs_cbuf));
522  tuple->t_data = NULL;
523  return;
524  }
525  }
526  else
527  page = scan->rs_startblock; /* first page */
528  heapgetpage(scan, page);
529  lineoff = FirstOffsetNumber; /* first offnum */
530  scan->rs_inited = true;
531  }
532  else
533  {
534  /* continue from previously returned page/tuple */
535  page = scan->rs_cblock; /* current page */
536  lineoff = /* next offnum */
537  OffsetNumberNext(ItemPointerGetOffsetNumber(&(tuple->t_self)));
538  }
539 
540  LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);
541 
542  dp = BufferGetPage(scan->rs_cbuf);
543  TestForOldSnapshot(snapshot, scan->rs_rd, dp);
544  lines = PageGetMaxOffsetNumber(dp);
545  /* page and lineoff now reference the physically next tid */
546 
547  linesleft = lines - lineoff + 1;
548  }
549  else if (backward)
550  {
551  /* backward parallel scan not supported */
552  Assert(scan->rs_parallel == NULL);
553 
554  if (!scan->rs_inited)
555  {
556  /*
557  * return null immediately if relation is empty
558  */
559  if (scan->rs_nblocks == 0 || scan->rs_numblocks == 0)
560  {
561  Assert(!BufferIsValid(scan->rs_cbuf));
562  tuple->t_data = NULL;
563  return;
564  }
565 
566  /*
567  * Disable reporting to syncscan logic in a backwards scan; it's
568  * not very likely anyone else is doing the same thing at the same
569  * time, and much more likely that we'll just bollix things for
570  * forward scanners.
571  */
572  scan->rs_syncscan = false;
573  /* start from last page of the scan */
574  if (scan->rs_startblock > 0)
575  page = scan->rs_startblock - 1;
576  else
577  page = scan->rs_nblocks - 1;
578  heapgetpage(scan, page);
579  }
580  else
581  {
582  /* continue from previously returned page/tuple */
583  page = scan->rs_cblock; /* current page */
584  }
585 
586  LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);
587 
588  dp = BufferGetPage(scan->rs_cbuf);
589  TestForOldSnapshot(snapshot, scan->rs_rd, dp);
590  lines = PageGetMaxOffsetNumber(dp);
591 
592  if (!scan->rs_inited)
593  {
594  lineoff = lines; /* final offnum */
595  scan->rs_inited = true;
596  }
597  else
598  {
599  lineoff = /* previous offnum */
600  OffsetNumberPrev(ItemPointerGetOffsetNumber(&(tuple->t_self)));
601  }
602  /* page and lineoff now reference the physically previous tid */
603 
604  linesleft = lineoff;
605  }
606  else
607  {
608  /*
609  * ``no movement'' scan direction: refetch prior tuple
610  */
611  if (!scan->rs_inited)
612  {
613  Assert(!BufferIsValid(scan->rs_cbuf));
614  tuple->t_data = NULL;
615  return;
616  }
617 
618  page = ItemPointerGetBlockNumber(&(tuple->t_self));
619  if (page != scan->rs_cblock)
620  heapgetpage(scan, page);
621 
622  /* Since the tuple was previously fetched, needn't lock page here */
623  dp = BufferGetPage(scan->rs_cbuf);
624  TestForOldSnapshot(snapshot, scan->rs_rd, dp);
625  lineoff = ItemPointerGetOffsetNumber(&(tuple->t_self));
626  lpp = PageGetItemId(dp, lineoff);
627  Assert(ItemIdIsNormal(lpp));
628 
629  tuple->t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
630  tuple->t_len = ItemIdGetLength(lpp);
631 
632  return;
633  }
634 
635  /*
636  * advance the scan until we find a qualifying tuple or run out of stuff
637  * to scan
638  */
639  lpp = PageGetItemId(dp, lineoff);
640  for (;;)
641  {
642  while (linesleft > 0)
643  {
644  if (ItemIdIsNormal(lpp))
645  {
646  bool valid;
647 
648  tuple->t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
649  tuple->t_len = ItemIdGetLength(lpp);
650  ItemPointerSet(&(tuple->t_self), page, lineoff);
651 
652  /*
653  * if current tuple qualifies, return it.
654  */
655  valid = HeapTupleSatisfiesVisibility(tuple,
656  snapshot,
657  scan->rs_cbuf);
658 
659  CheckForSerializableConflictOut(valid, scan->rs_rd, tuple,
660  scan->rs_cbuf, snapshot);
661 
662  if (valid && key != NULL)
663  HeapKeyTest(tuple, RelationGetDescr(scan->rs_rd),
664  nkeys, key, valid);
665 
666  if (valid)
667  {
668  LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK);
669  return;
670  }
671  }
672 
673  /*
674  * otherwise move to the next item on the page
675  */
676  --linesleft;
677  if (backward)
678  {
679  --lpp; /* move back in this page's ItemId array */
680  --lineoff;
681  }
682  else
683  {
684  ++lpp; /* move forward in this page's ItemId array */
685  ++lineoff;
686  }
687  }
688 
689  /*
690  * if we get here, it means we've exhausted the items on this page and
691  * it's time to move to the next.
692  */
693  LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK);
694 
695  /*
696  * advance to next/prior page and detect end of scan
697  */
698  if (backward)
699  {
700  finished = (page == scan->rs_startblock) ||
701  (scan->rs_numblocks != InvalidBlockNumber ? --scan->rs_numblocks == 0 : false);
702  if (page == 0)
703  page = scan->rs_nblocks;
704  page--;
705  }
706  else if (scan->rs_parallel != NULL)
707  {
708  page = heap_parallelscan_nextpage(scan);
709  finished = (page == InvalidBlockNumber);
710  }
711  else
712  {
713  page++;
714  if (page >= scan->rs_nblocks)
715  page = 0;
716  finished = (page == scan->rs_startblock) ||
717  (scan->rs_numblocks != InvalidBlockNumber ? --scan->rs_numblocks == 0 : false);
718 
719  /*
720  * Report our new scan position for synchronization purposes. We
721  * don't do that when moving backwards, however. That would just
722  * mess up any other forward-moving scanners.
723  *
724  * Note: we do this before checking for end of scan so that the
725  * final state of the position hint is back at the start of the
726  * rel. That's not strictly necessary, but otherwise when you run
727  * the same query multiple times the starting position would shift
728  * a little bit backwards on every invocation, which is confusing.
729  * We don't guarantee any specific ordering in general, though.
730  */
731  if (scan->rs_syncscan)
732  ss_report_location(scan->rs_rd, page);
733  }
734 
735  /*
736  * return NULL if we've exhausted all the pages
737  */
738  if (finished)
739  {
740  if (BufferIsValid(scan->rs_cbuf))
741  ReleaseBuffer(scan->rs_cbuf);
742  scan->rs_cbuf = InvalidBuffer;
743  scan->rs_cblock = InvalidBlockNumber;
744  tuple->t_data = NULL;
745  scan->rs_inited = false;
746  return;
747  }
748 
749  heapgetpage(scan, page);
750 
751  LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);
752 
753  dp = BufferGetPage(scan->rs_cbuf);
754  TestForOldSnapshot(snapshot, scan->rs_rd, dp);
755  lines = PageGetMaxOffsetNumber((Page) dp);
756  linesleft = lines;
757  if (backward)
758  {
759  lineoff = lines;
760  lpp = PageGetItemId(dp, lines);
761  }
762  else
763  {
764  lineoff = FirstOffsetNumber;
765  lpp = PageGetItemId(dp, FirstOffsetNumber);
766  }
767  }
768 }
769 
770 /* ----------------
771  * heapgettup_pagemode - fetch next heap tuple in page-at-a-time mode
772  *
773  * Same API as heapgettup, but used in page-at-a-time mode
774  *
775  * The internal logic is much the same as heapgettup's too, but there are some
776  * differences: we do not take the buffer content lock (that only needs to
777  * happen inside heapgetpage), and we iterate through just the tuples listed
778  * in rs_vistuples[] rather than all tuples on the page. Notice that
779  * lineindex is 0-based, where the corresponding loop variable lineoff in
780  * heapgettup is 1-based.
781  * ----------------
782  */
783 static void
784 heapgettup_pagemode(HeapScanDesc scan,
785  ScanDirection dir,
786  int nkeys,
787  ScanKey key)
788 {
789  HeapTuple tuple = &(scan->rs_ctup);
790  bool backward = ScanDirectionIsBackward(dir);
791  BlockNumber page;
792  bool finished;
793  Page dp;
794  int lines;
795  int lineindex;
796  OffsetNumber lineoff;
797  int linesleft;
798  ItemId lpp;
799 
800  /*
801  * calculate next starting lineindex, given scan direction
802  */
803  if (ScanDirectionIsForward(dir))
804  {
805  if (!scan->rs_inited)
806  {
807  /*
808  * return null immediately if relation is empty
809  */
810  if (scan->rs_nblocks == 0 || scan->rs_numblocks == 0)
811  {
812  Assert(!BufferIsValid(scan->rs_cbuf));
813  tuple->t_data = NULL;
814  return;
815  }
816  if (scan->rs_parallel != NULL)
817  {
818  page = heap_parallelscan_nextpage(scan);
819 
820  /* Other processes might have already finished the scan. */
821  if (page == InvalidBlockNumber)
822  {
823  Assert(!BufferIsValid(scan->rs_cbuf));
824  tuple->t_data = NULL;
825  return;
826  }
827  }
828  else
829  page = scan->rs_startblock; /* first page */
830  heapgetpage(scan, page);
831  lineindex = 0;
832  scan->rs_inited = true;
833  }
834  else
835  {
836  /* continue from previously returned page/tuple */
837  page = scan->rs_cblock; /* current page */
838  lineindex = scan->rs_cindex + 1;
839  }
840 
841  dp = BufferGetPage(scan->rs_cbuf);
842  TestForOldSnapshot(scan->rs_snapshot, scan->rs_rd, dp);
843  lines = scan->rs_ntuples;
844  /* page and lineindex now reference the next visible tid */
845 
846  linesleft = lines - lineindex;
847  }
848  else if (backward)
849  {
850  /* backward parallel scan not supported */
851  Assert(scan->rs_parallel == NULL);
852 
853  if (!scan->rs_inited)
854  {
855  /*
856  * return null immediately if relation is empty
857  */
858  if (scan->rs_nblocks == 0 || scan->rs_numblocks == 0)
859  {
860  Assert(!BufferIsValid(scan->rs_cbuf));
861  tuple->t_data = NULL;
862  return;
863  }
864 
865  /*
866  * Disable reporting to syncscan logic in a backwards scan; it's
867  * not very likely anyone else is doing the same thing at the same
868  * time, and much more likely that we'll just bollix things for
869  * forward scanners.
870  */
871  scan->rs_syncscan = false;
872  /* start from last page of the scan */
873  if (scan->rs_startblock > 0)
874  page = scan->rs_startblock - 1;
875  else
876  page = scan->rs_nblocks - 1;
877  heapgetpage(scan, page);
878  }
879  else
880  {
881  /* continue from previously returned page/tuple */
882  page = scan->rs_cblock; /* current page */
883  }
884 
885  dp = BufferGetPage(scan->rs_cbuf);
886  TestForOldSnapshot(scan->rs_snapshot, scan->rs_rd, dp);
887  lines = scan->rs_ntuples;
888 
889  if (!scan->rs_inited)
890  {
891  lineindex = lines - 1;
892  scan->rs_inited = true;
893  }
894  else
895  {
896  lineindex = scan->rs_cindex - 1;
897  }
898  /* page and lineindex now reference the previous visible tid */
899 
900  linesleft = lineindex + 1;
901  }
902  else
903  {
904  /*
905  * ``no movement'' scan direction: refetch prior tuple
906  */
907  if (!scan->rs_inited)
908  {
909  Assert(!BufferIsValid(scan->rs_cbuf));
910  tuple->t_data = NULL;
911  return;
912  }
913 
914  page = ItemPointerGetBlockNumber(&(tuple->t_self));
915  if (page != scan->rs_cblock)
916  heapgetpage(scan, page);
917 
918  /* Since the tuple was previously fetched, needn't lock page here */
919  dp = BufferGetPage(scan->rs_cbuf);
920  TestForOldSnapshot(scan->rs_snapshot, scan->rs_rd, dp);
921  lineoff = ItemPointerGetOffsetNumber(&(tuple->t_self));
922  lpp = PageGetItemId(dp, lineoff);
923  Assert(ItemIdIsNormal(lpp));
924 
925  tuple->t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
926  tuple->t_len = ItemIdGetLength(lpp);
927 
928  /* check that rs_cindex is in sync */
929  Assert(scan->rs_cindex < scan->rs_ntuples);
930  Assert(lineoff == scan->rs_vistuples[scan->rs_cindex]);
931 
932  return;
933  }
934 
935  /*
936  * advance the scan until we find a qualifying tuple or run out of stuff
937  * to scan
938  */
939  for (;;)
940  {
941  while (linesleft > 0)
942  {
943  lineoff = scan->rs_vistuples[lineindex];
944  lpp = PageGetItemId(dp, lineoff);
945  Assert(ItemIdIsNormal(lpp));
946 
947  tuple->t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
948  tuple->t_len = ItemIdGetLength(lpp);
949  ItemPointerSet(&(tuple->t_self), page, lineoff);
950 
951  /*
952  * if current tuple qualifies, return it.
953  */
954  if (key != NULL)
955  {
956  bool valid;
957 
958  HeapKeyTest(tuple, RelationGetDescr(scan->rs_rd),
959  nkeys, key, valid);
960  if (valid)
961  {
962  scan->rs_cindex = lineindex;
963  return;
964  }
965  }
966  else
967  {
968  scan->rs_cindex = lineindex;
969  return;
970  }
971 
972  /*
973  * otherwise move to the next item on the page
974  */
975  --linesleft;
976  if (backward)
977  --lineindex;
978  else
979  ++lineindex;
980  }
981 
982  /*
983  * if we get here, it means we've exhausted the items on this page and
984  * it's time to move to the next.
985  */
986  if (backward)
987  {
988  finished = (page == scan->rs_startblock) ||
989  (scan->rs_numblocks != InvalidBlockNumber ? --scan->rs_numblocks == 0 : false);
990  if (page == 0)
991  page = scan->rs_nblocks;
992  page--;
993  }
994  else if (scan->rs_parallel != NULL)
995  {
996  page = heap_parallelscan_nextpage(scan);
997  finished = (page == InvalidBlockNumber);
998  }
999  else
1000  {
1001  page++;
1002  if (page >= scan->rs_nblocks)
1003  page = 0;
1004  finished = (page == scan->rs_startblock) ||
1005  (scan->rs_numblocks != InvalidBlockNumber ? --scan->rs_numblocks == 0 : false);
1006 
1007  /*
1008  * Report our new scan position for synchronization purposes. We
1009  * don't do that when moving backwards, however. That would just
1010  * mess up any other forward-moving scanners.
1011  *
1012  * Note: we do this before checking for end of scan so that the
1013  * final state of the position hint is back at the start of the
1014  * rel. That's not strictly necessary, but otherwise when you run
1015  * the same query multiple times the starting position would shift
1016  * a little bit backwards on every invocation, which is confusing.
1017  * We don't guarantee any specific ordering in general, though.
1018  */
1019  if (scan->rs_syncscan)
1020  ss_report_location(scan->rs_rd, page);
1021  }
1022 
1023  /*
1024  * return NULL if we've exhausted all the pages
1025  */
1026  if (finished)
1027  {
1028  if (BufferIsValid(scan->rs_cbuf))
1029  ReleaseBuffer(scan->rs_cbuf);
1030  scan->rs_cbuf = InvalidBuffer;
1031  scan->rs_cblock = InvalidBlockNumber;
1032  tuple->t_data = NULL;
1033  scan->rs_inited = false;
1034  return;
1035  }
1036 
1037  heapgetpage(scan, page);
1038 
1039  dp = BufferGetPage(scan->rs_cbuf);
1040  TestForOldSnapshot(scan->rs_snapshot, scan->rs_rd, dp);
1041  lines = scan->rs_ntuples;
1042  linesleft = lines;
1043  if (backward)
1044  lineindex = lines - 1;
1045  else
1046  lineindex = 0;
1047  }
1048 }
1049 
1050 
1051 #if defined(DISABLE_COMPLEX_MACRO)
1052 /*
1053  * This is formatted so oddly so that the correspondence to the macro
1054  * definition in access/htup_details.h is maintained.
1055  */
1056 Datum
1057 fastgetattr(HeapTuple tup, int attnum, TupleDesc tupleDesc,
1058  bool *isnull)
1059 {
1060  return (
1061  (attnum) > 0 ?
1062  (
1063  (*(isnull) = false),
1064  HeapTupleNoNulls(tup) ?
1065  (
1066  (tupleDesc)->attrs[(attnum) - 1]->attcacheoff >= 0 ?
1067  (
1068  fetchatt((tupleDesc)->attrs[(attnum) - 1],
1069  (char *) (tup)->t_data + (tup)->t_data->t_hoff +
1070  (tupleDesc)->attrs[(attnum) - 1]->attcacheoff)
1071  )
1072  :
1073  nocachegetattr((tup), (attnum), (tupleDesc))
1074  )
1075  :
1076  (
1077  att_isnull((attnum) - 1, (tup)->t_data->t_bits) ?
1078  (
1079  (*(isnull) = true),
1080  (Datum) NULL
1081  )
1082  :
1083  (
1084  nocachegetattr((tup), (attnum), (tupleDesc))
1085  )
1086  )
1087  )
1088  :
1089  (
1090  (Datum) NULL
1091  )
1092  );
1093 }
1094 #endif /* defined(DISABLE_COMPLEX_MACRO) */
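/*
 * Usage sketch (illustrative only): callers normally reach this code through
 * the heap_getattr()/fastgetattr() macros in access/htup_details.h, e.g. to
 * read the second attribute of a tuple returned by heap_getnext():
 *
 *     TupleDesc tupdesc = RelationGetDescr(rel);
 *     bool      isnull;
 *     Datum     value = heap_getattr(tup, 2, tupdesc, &isnull);
 *
 *     if (!isnull)
 *         elog(DEBUG1, "att2 = %d", DatumGetInt32(value));
 *
 * "rel" and "tup" are assumed to come from an open relation and a live scan,
 * and the DatumGetInt32() conversion assumes the column is of type int4.
 */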
1095 
1096 
1097 /* ----------------------------------------------------------------
1098  * heap access method interface
1099  * ----------------------------------------------------------------
1100  */
1101 
1102 /* ----------------
1103  * relation_open - open any relation by relation OID
1104  *
1105  * If lockmode is not "NoLock", the specified kind of lock is
1106  * obtained on the relation. (Generally, NoLock should only be
1107  * used if the caller knows it has some appropriate lock on the
1108  * relation already.)
1109  *
1110  * An error is raised if the relation does not exist.
1111  *
1112  * NB: a "relation" is anything with a pg_class entry. The caller is
1113  * expected to check whether the relkind is something it can handle.
1114  * ----------------
1115  */
1116 Relation
1117 relation_open(Oid relationId, LOCKMODE lockmode)
1118 {
1119  Relation r;
1120 
1121  Assert(lockmode >= NoLock && lockmode < MAX_LOCKMODES);
1122 
1123  /* Get the lock before trying to open the relcache entry */
1124  if (lockmode != NoLock)
1125  LockRelationOid(relationId, lockmode);
1126 
1127  /* The relcache does all the real work... */
1128  r = RelationIdGetRelation(relationId);
1129 
1130  if (!RelationIsValid(r))
1131  elog(ERROR, "could not open relation with OID %u", relationId);
1132 
1133  /* Make note that we've accessed a temporary relation */
1134  if (RelationUsesLocalBuffers(r))
1135  MyXactAccessedTempRel = true;
1136 
1137  pgstat_initstats(r);
1138 
1139  return r;
1140 }
1141 
1142 /* ----------------
1143  * try_relation_open - open any relation by relation OID
1144  *
1145  * Same as relation_open, except return NULL instead of failing
1146  * if the relation does not exist.
1147  * ----------------
1148  */
1149 Relation
1150 try_relation_open(Oid relationId, LOCKMODE lockmode)
1151 {
1152  Relation r;
1153 
1154  Assert(lockmode >= NoLock && lockmode < MAX_LOCKMODES);
1155 
1156  /* Get the lock first */
1157  if (lockmode != NoLock)
1158  LockRelationOid(relationId, lockmode);
1159 
1160  /*
1161  * Now that we have the lock, probe to see if the relation really exists
1162  * or not.
1163  */
1164  if (!SearchSysCacheExists1(RELOID, ObjectIdGetDatum(relationId)))
1165  {
1166  /* Release useless lock */
1167  if (lockmode != NoLock)
1168  UnlockRelationOid(relationId, lockmode);
1169 
1170  return NULL;
1171  }
1172 
1173  /* Should be safe to do a relcache load */
1174  r = RelationIdGetRelation(relationId);
1175 
1176  if (!RelationIsValid(r))
1177  elog(ERROR, "could not open relation with OID %u", relationId);
1178 
1179  /* Make note that we've accessed a temporary relation */
1180  if (RelationUsesLocalBuffers(r))
1181  MyXactAccessedTempRel = true;
1182 
1183  pgstat_initstats(r);
1184 
1185  return r;
1186 }
1187 
1188 /* ----------------
1189  * relation_openrv - open any relation specified by a RangeVar
1190  *
1191  * Same as relation_open, but the relation is specified by a RangeVar.
1192  * ----------------
1193  */
1194 Relation
1195 relation_openrv(const RangeVar *relation, LOCKMODE lockmode)
1196 {
1197  Oid relOid;
1198 
1199  /*
1200  * Check for shared-cache-inval messages before trying to open the
1201  * relation. This is needed even if we already hold a lock on the
1202  * relation, because GRANT/REVOKE are executed without taking any lock on
1203  * the target relation, and we want to be sure we see current ACL
1204  * information. We can skip this if asked for NoLock, on the assumption
1205  * that such a call is not the first one in the current command, and so we
1206  * should be reasonably up-to-date already. (XXX this all could stand to
1207  * be redesigned, but for the moment we'll keep doing this like it's been
1208  * done historically.)
1209  */
1210  if (lockmode != NoLock)
1211  AcceptInvalidationMessages();
1212 
1213  /* Look up and lock the appropriate relation using namespace search */
1214  relOid = RangeVarGetRelid(relation, lockmode, false);
1215 
1216  /* Let relation_open do the rest */
1217  return relation_open(relOid, NoLock);
1218 }
1219 
1220 /* ----------------
1221  * relation_openrv_extended - open any relation specified by a RangeVar
1222  *
1223  * Same as relation_openrv, but with an additional missing_ok argument
1224  * allowing a NULL return rather than an error if the relation is not
1225  * found. (Note that some other causes, such as permissions problems,
1226  * will still result in an ereport.)
1227  * ----------------
1228  */
1229 Relation
1230 relation_openrv_extended(const RangeVar *relation, LOCKMODE lockmode,
1231  bool missing_ok)
1232 {
1233  Oid relOid;
1234 
1235  /*
1236  * Check for shared-cache-inval messages before trying to open the
1237  * relation. See comments in relation_openrv().
1238  */
1239  if (lockmode != NoLock)
1240  AcceptInvalidationMessages();
1241 
1242  /* Look up and lock the appropriate relation using namespace search */
1243  relOid = RangeVarGetRelid(relation, lockmode, missing_ok);
1244 
1245  /* Return NULL on not-found */
1246  if (!OidIsValid(relOid))
1247  return NULL;
1248 
1249  /* Let relation_open do the rest */
1250  return relation_open(relOid, NoLock);
1251 }
1252 
1253 /* ----------------
1254  * relation_close - close any relation
1255  *
1256  * If lockmode is not "NoLock", we then release the specified lock.
1257  *
1258  * Note that it is often sensible to hold a lock beyond relation_close;
1259  * in that case, the lock is released automatically at xact end.
1260  * ----------------
1261  */
1262 void
1263 relation_close(Relation relation, LOCKMODE lockmode)
1264 {
1265  LockRelId relid = relation->rd_lockInfo.lockRelId;
1266 
1267  Assert(lockmode >= NoLock && lockmode < MAX_LOCKMODES);
1268 
1269  /* The relcache does the real work... */
1270  RelationClose(relation);
1271 
1272  if (lockmode != NoLock)
1273  UnlockRelationId(&relid, lockmode);
1274 }
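/*
 * Usage sketch (illustrative only): a common pattern is to drop the relcache
 * reference early while keeping the lock until transaction end:
 *
 *     Relation rel = relation_open(my_relid, RowExclusiveLock);
 *     // ... work with rel ...
 *     relation_close(rel, NoLock);   // lock is held until xact end
 *
 * "my_relid" is a caller-supplied OID; passing RowExclusiveLock to
 * relation_close() instead would release the lock immediately.
 */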
1275 
1276 
1277 /* ----------------
1278  * heap_open - open a heap relation by relation OID
1279  *
1280  * This is essentially relation_open plus check that the relation
1281  * is not an index nor a composite type. (The caller should also
1282  * check that it's not a view or foreign table before assuming it has
1283  * storage.)
1284  * ----------------
1285  */
1286 Relation
1287 heap_open(Oid relationId, LOCKMODE lockmode)
1288 {
1289  Relation r;
1290 
1291  r = relation_open(relationId, lockmode);
1292 
1293  if (r->rd_rel->relkind == RELKIND_INDEX)
1294  ereport(ERROR,
1295  (errcode(ERRCODE_WRONG_OBJECT_TYPE),
1296  errmsg("\"%s\" is an index",
1297  RelationGetRelationName(r))));
1298  else if (r->rd_rel->relkind == RELKIND_COMPOSITE_TYPE)
1299  ereport(ERROR,
1300  (errcode(ERRCODE_WRONG_OBJECT_TYPE),
1301  errmsg("\"%s\" is a composite type",
1302  RelationGetRelationName(r))));
1303 
1304  return r;
1305 }
1306 
1307 /* ----------------
1308  * heap_openrv - open a heap relation specified
1309  * by a RangeVar node
1310  *
1311  * As above, but relation is specified by a RangeVar.
1312  * ----------------
1313  */
1314 Relation
1315 heap_openrv(const RangeVar *relation, LOCKMODE lockmode)
1316 {
1317  Relation r;
1318 
1319  r = relation_openrv(relation, lockmode);
1320 
1321  if (r->rd_rel->relkind == RELKIND_INDEX)
1322  ereport(ERROR,
1323  (errcode(ERRCODE_WRONG_OBJECT_TYPE),
1324  errmsg("\"%s\" is an index",
1325  RelationGetRelationName(r))));
1326  else if (r->rd_rel->relkind == RELKIND_COMPOSITE_TYPE)
1327  ereport(ERROR,
1328  (errcode(ERRCODE_WRONG_OBJECT_TYPE),
1329  errmsg("\"%s\" is a composite type",
1330  RelationGetRelationName(r))));
1331 
1332  return r;
1333 }
1334 
1335 /* ----------------
1336  * heap_openrv_extended - open a heap relation specified
1337  * by a RangeVar node
1338  *
1339  * As above, but optionally return NULL instead of failing for
1340  * relation-not-found.
1341  * ----------------
1342  */
1343 Relation
1344 heap_openrv_extended(const RangeVar *relation, LOCKMODE lockmode,
1345  bool missing_ok)
1346 {
1347  Relation r;
1348 
1349  r = relation_openrv_extended(relation, lockmode, missing_ok);
1350 
1351  if (r)
1352  {
1353  if (r->rd_rel->relkind == RELKIND_INDEX)
1354  ereport(ERROR,
1355  (errcode(ERRCODE_WRONG_OBJECT_TYPE),
1356  errmsg("\"%s\" is an index",
1357  RelationGetRelationName(r))));
1358  else if (r->rd_rel->relkind == RELKIND_COMPOSITE_TYPE)
1359  ereport(ERROR,
1360  (errcode(ERRCODE_WRONG_OBJECT_TYPE),
1361  errmsg("\"%s\" is a composite type",
1362  RelationGetRelationName(r))));
1363  }
1364 
1365  return r;
1366 }
1367 
1368 
1369 /* ----------------
1370  * heap_beginscan - begin relation scan
1371  *
1372  * heap_beginscan is the "standard" case.
1373  *
1374  * heap_beginscan_catalog differs in setting up its own temporary snapshot.
1375  *
1376  * heap_beginscan_strat offers an extended API that lets the caller control
1377  * whether a nondefault buffer access strategy can be used, and whether
1378  * syncscan can be chosen (possibly resulting in the scan not starting from
1379  * block zero). Both of these default to TRUE with plain heap_beginscan.
1380  *
1381  * heap_beginscan_bm is an alternative entry point for setting up a
1382  * HeapScanDesc for a bitmap heap scan. Although that scan technology is
1383  * really quite unlike a standard seqscan, there is just enough commonality
1384  * to make it worth using the same data structure.
1385  *
1386  * heap_beginscan_sampling is an alternative entry point for setting up a
1387  * HeapScanDesc for a TABLESAMPLE scan. As with bitmap scans, it's worth
1388  * using the same data structure although the behavior is rather different.
1389  * In addition to the options offered by heap_beginscan_strat, this call
1390  * also allows control of whether page-mode visibility checking is used.
1391  * ----------------
1392  */
1393 HeapScanDesc
1394 heap_beginscan(Relation relation, Snapshot snapshot,
1395  int nkeys, ScanKey key)
1396 {
1397  return heap_beginscan_internal(relation, snapshot, nkeys, key, NULL,
1398  true, true, true, false, false, false);
1399 }
1400 
1401 HeapScanDesc
1402 heap_beginscan_catalog(Relation relation, int nkeys, ScanKey key)
1403 {
1404  Oid relid = RelationGetRelid(relation);
1405  Snapshot snapshot = RegisterSnapshot(GetCatalogSnapshot(relid));
1406 
1407  return heap_beginscan_internal(relation, snapshot, nkeys, key, NULL,
1408  true, true, true, false, false, true);
1409 }
1410 
1411 HeapScanDesc
1412 heap_beginscan_strat(Relation relation, Snapshot snapshot,
1413  int nkeys, ScanKey key,
1414  bool allow_strat, bool allow_sync)
1415 {
1416  return heap_beginscan_internal(relation, snapshot, nkeys, key, NULL,
1417  allow_strat, allow_sync, true,
1418  false, false, false);
1419 }
1420 
1421 HeapScanDesc
1422 heap_beginscan_bm(Relation relation, Snapshot snapshot,
1423  int nkeys, ScanKey key)
1424 {
1425  return heap_beginscan_internal(relation, snapshot, nkeys, key, NULL,
1426  false, false, true, true, false, false);
1427 }
1428 
1429 HeapScanDesc
1430 heap_beginscan_sampling(Relation relation, Snapshot snapshot,
1431  int nkeys, ScanKey key,
1432  bool allow_strat, bool allow_sync, bool allow_pagemode)
1433 {
1434  return heap_beginscan_internal(relation, snapshot, nkeys, key, NULL,
1435  allow_strat, allow_sync, allow_pagemode,
1436  false, true, false);
1437 }
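/*
 * Usage sketch (illustrative only): heap_beginscan_catalog() with a scan key,
 * here matching pg_class rows by relname (Anum_pg_class_relname, F_NAMEEQ and
 * BTEqualStrategyNumber come from catalog/pg_class.h, utils/fmgroids.h and
 * access/stratnum.h):
 *
 *     ScanKeyData  key;
 *     HeapTuple    tup;
 *     HeapScanDesc scan;
 *
 *     ScanKeyInit(&key, Anum_pg_class_relname, BTEqualStrategyNumber,
 *                 F_NAMEEQ, CStringGetDatum("my_table"));
 *     scan = heap_beginscan_catalog(pg_class_rel, 1, &key);
 *     while ((tup = heap_getnext(scan, ForwardScanDirection)) != NULL)
 *     {
 *         // only tuples passing the key and the catalog snapshot arrive here
 *     }
 *     heap_endscan(scan);
 *
 * "pg_class_rel" is assumed to be pg_class opened by the caller; the
 * temporary catalog snapshot is unregistered by heap_endscan().
 */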
1438 
1439 static HeapScanDesc
1440 heap_beginscan_internal(Relation relation, Snapshot snapshot,
1441  int nkeys, ScanKey key,
1442  ParallelHeapScanDesc parallel_scan,
1443  bool allow_strat,
1444  bool allow_sync,
1445  bool allow_pagemode,
1446  bool is_bitmapscan,
1447  bool is_samplescan,
1448  bool temp_snap)
1449 {
1450  HeapScanDesc scan;
1451 
1452  /*
1453  * increment relation ref count while scanning relation
1454  *
1455  * This is just to make really sure the relcache entry won't go away while
1456  * the scan has a pointer to it. Caller should be holding the rel open
1457  * anyway, so this is redundant in all normal scenarios...
1458  */
1459  RelationIncrementReferenceCount(relation);
1460 
1461  /*
1462  * allocate and initialize scan descriptor
1463  */
1464  scan = (HeapScanDesc) palloc(sizeof(HeapScanDescData));
1465 
1466  scan->rs_rd = relation;
1467  scan->rs_snapshot = snapshot;
1468  scan->rs_nkeys = nkeys;
1469  scan->rs_bitmapscan = is_bitmapscan;
1470  scan->rs_samplescan = is_samplescan;
1471  scan->rs_strategy = NULL; /* set in initscan */
1472  scan->rs_allow_strat = allow_strat;
1473  scan->rs_allow_sync = allow_sync;
1474  scan->rs_temp_snap = temp_snap;
1475  scan->rs_parallel = parallel_scan;
1476 
1477  /*
1478  * we can use page-at-a-time mode if it's an MVCC-safe snapshot
1479  */
1480  scan->rs_pageatatime = allow_pagemode && IsMVCCSnapshot(snapshot);
1481 
1482  /*
1483  * For a seqscan in a serializable transaction, acquire a predicate lock
1484  * on the entire relation. This is required not only to lock all the
1485  * matching tuples, but also to conflict with new insertions into the
1486  * table. In an indexscan, we take page locks on the index pages covering
1487  * the range specified in the scan qual, but in a heap scan there is
1488  * nothing more fine-grained to lock. A bitmap scan is a different story,
1489  * there we have already scanned the index and locked the index pages
1490  * covering the predicate. But in that case we still have to lock any
1491  * matching heap tuples.
1492  */
1493  if (!is_bitmapscan)
1494  PredicateLockRelation(relation, snapshot);
1495 
1496  /* we only need to set this up once */
1497  scan->rs_ctup.t_tableOid = RelationGetRelid(relation);
1498 
1499  /*
1500  * we do this here instead of in initscan() because heap_rescan also calls
1501  * initscan() and we don't want to allocate memory again
1502  */
1503  if (nkeys > 0)
1504  scan->rs_key = (ScanKey) palloc(sizeof(ScanKeyData) * nkeys);
1505  else
1506  scan->rs_key = NULL;
1507 
1508  initscan(scan, key, false);
1509 
1510  return scan;
1511 }
1512 
1513 /* ----------------
1514  * heap_rescan - restart a relation scan
1515  * ----------------
1516  */
1517 void
1518 heap_rescan(HeapScanDesc scan,
1519  ScanKey key)
1520 {
1521  /*
1522  * unpin scan buffers
1523  */
1524  if (BufferIsValid(scan->rs_cbuf))
1525  ReleaseBuffer(scan->rs_cbuf);
1526 
1527  /*
1528  * reinitialize scan descriptor
1529  */
1530  initscan(scan, key, true);
1531 
1532  /*
1533  * reset parallel scan, if present
1534  */
1535  if (scan->rs_parallel != NULL)
1536  {
1537  ParallelHeapScanDesc parallel_scan;
1538 
1539  /*
1540  * Caller is responsible for making sure that all workers have
1541  * finished the scan before calling this, so it really shouldn't be
1542  * necessary to acquire the mutex at all. We acquire it anyway, just
1543  * to be tidy.
1544  */
1545  parallel_scan = scan->rs_parallel;
1546  SpinLockAcquire(&parallel_scan->phs_mutex);
1547  parallel_scan->phs_cblock = parallel_scan->phs_startblock;
1548  SpinLockRelease(&parallel_scan->phs_mutex);
1549  }
1550 }
1551 
1552 /* ----------------
1553  * heap_rescan_set_params - restart a relation scan after changing params
1554  *
1555  * This call allows changing the buffer strategy, syncscan, and pagemode
1556  * options before starting a fresh scan. Note that although the actual use
1557  * of syncscan might change (effectively, enabling or disabling reporting),
1558  * the previously selected startblock will be kept.
1559  * ----------------
1560  */
1561 void
1562 heap_rescan_set_params(HeapScanDesc scan, ScanKey key,
1563  bool allow_strat, bool allow_sync, bool allow_pagemode)
1564 {
1565  /* adjust parameters */
1566  scan->rs_allow_strat = allow_strat;
1567  scan->rs_allow_sync = allow_sync;
1568  scan->rs_pageatatime = allow_pagemode && IsMVCCSnapshot(scan->rs_snapshot);
1569  /* ... and rescan */
1570  heap_rescan(scan, key);
1571 }
1572 
1573 /* ----------------
1574  * heap_endscan - end relation scan
1575  *
1576  * See how to integrate with index scans.
1577  * Check handling of reldesc caching.
1578  * ----------------
1579  */
1580 void
1581 heap_endscan(HeapScanDesc scan)
1582 {
1583  /* Note: no locking manipulations needed */
1584 
1585  /*
1586  * unpin scan buffers
1587  */
1588  if (BufferIsValid(scan->rs_cbuf))
1589  ReleaseBuffer(scan->rs_cbuf);
1590 
1591  /*
1592  * decrement relation reference count and free scan descriptor storage
1593  */
1594  RelationDecrementReferenceCount(scan->rs_rd);
1595 
1596  if (scan->rs_key)
1597  pfree(scan->rs_key);
1598 
1599  if (scan->rs_strategy != NULL)
1600  FreeAccessStrategy(scan->rs_strategy);
1601 
1602  if (scan->rs_temp_snap)
1603  UnregisterSnapshot(scan->rs_snapshot);
1604 
1605  pfree(scan);
1606 }
1607 
1608 /* ----------------
1609  * heap_parallelscan_estimate - estimate storage for ParallelHeapScanDesc
1610  *
1611  * Sadly, this doesn't reduce to a constant, because the size required
1612  * to serialize the snapshot can vary.
1613  * ----------------
1614  */
1615 Size
1616 heap_parallelscan_estimate(Snapshot snapshot)
1617 {
1618  return add_size(offsetof(ParallelHeapScanDescData, phs_snapshot_data),
1619  EstimateSnapshotSpace(snapshot));
1620 }
1621 
1622 /* ----------------
1623  * heap_parallelscan_initialize - initialize ParallelHeapScanDesc
1624  *
1625  * Must allow as many bytes of shared memory as returned by
1626  * heap_parallelscan_estimate. Call this just once in the leader
1627  * process; then, individual workers attach via heap_beginscan_parallel.
1628  * ----------------
1629  */
1630 void
1631 heap_parallelscan_initialize(ParallelHeapScanDesc target, Relation relation,
1632  Snapshot snapshot)
1633 {
1634  target->phs_relid = RelationGetRelid(relation);
1635  target->phs_nblocks = RelationGetNumberOfBlocks(relation);
1636  /* compare phs_syncscan initialization to similar logic in initscan */
1637  target->phs_syncscan = synchronize_seqscans &&
1638  !RelationUsesLocalBuffers(relation) &&
1639  target->phs_nblocks > NBuffers / 4;
1640  SpinLockInit(&target->phs_mutex);
1641  target->phs_cblock = InvalidBlockNumber;
1642  target->phs_startblock = InvalidBlockNumber;
1643  SerializeSnapshot(snapshot, target->phs_snapshot_data);
1644 }
1645 
1646 /* ----------------
1647  * heap_beginscan_parallel - join a parallel scan
1648  *
1649  * Caller must hold a suitable lock on the correct relation.
1650  * ----------------
1651  */
1652 HeapScanDesc
1653 heap_beginscan_parallel(Relation relation, ParallelHeapScanDesc parallel_scan)
1654 {
1655  Snapshot snapshot;
1656 
1657  Assert(RelationGetRelid(relation) == parallel_scan->phs_relid);
1658  snapshot = RestoreSnapshot(parallel_scan->phs_snapshot_data);
1659  RegisterSnapshot(snapshot);
1660 
1661  return heap_beginscan_internal(relation, snapshot, 0, NULL, parallel_scan,
1662  true, true, true, false, false, true);
1663 }
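/*
 * Workflow sketch (illustrative only): the leader sizes and fills the shared
 * descriptor, then every participant attaches to it:
 *
 *     // leader, with "relation" and "snapshot" for the parallel query
 *     Size sz = heap_parallelscan_estimate(snapshot);
 *     ParallelHeapScanDesc pscan = (ParallelHeapScanDesc) shm_toc_allocate(toc, sz);
 *     heap_parallelscan_initialize(pscan, relation, snapshot);
 *
 *     // each participant, leader included
 *     HeapScanDesc scan = heap_beginscan_parallel(relation, pscan);
 *     // ... heap_getnext() until NULL, then heap_endscan(scan) ...
 *
 * "toc"/shm_toc_allocate() stand in for however the caller places the
 * descriptor in dynamic shared memory.
 */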
1664 
1665 /* ----------------
1666  * heap_parallelscan_nextpage - get the next page to scan
1667  *
1668  * Get the next page to scan. Even if there are no pages left to scan,
1669  * another backend could have grabbed a page to scan and not yet finished
1670  * looking at it, so it doesn't follow that the scan is done when the
1671  * first backend gets an InvalidBlockNumber return.
1672  * ----------------
1673  */
1674 static BlockNumber
1675 heap_parallelscan_nextpage(HeapScanDesc scan)
1676 {
1677  BlockNumber page = InvalidBlockNumber;
1678  BlockNumber sync_startpage = InvalidBlockNumber;
1679  BlockNumber report_page = InvalidBlockNumber;
1680  ParallelHeapScanDesc parallel_scan;
1681 
1682  Assert(scan->rs_parallel);
1683  parallel_scan = scan->rs_parallel;
1684 
1685 retry:
1686  /* Grab the spinlock. */
1687  SpinLockAcquire(&parallel_scan->phs_mutex);
1688 
1689  /*
1690  * If the scan's startblock has not yet been initialized, we must do so
1691  * now. If this is not a synchronized scan, we just start at block 0, but
1692  * if it is a synchronized scan, we must get the starting position from
1693  * the synchronized scan machinery. We can't hold the spinlock while
1694  * doing that, though, so release the spinlock, get the information we
1695  * need, and retry. If nobody else has initialized the scan in the
1696  * meantime, we'll fill in the value we fetched on the second time
1697  * through.
1698  */
1699  if (parallel_scan->phs_startblock == InvalidBlockNumber)
1700  {
1701  if (!parallel_scan->phs_syncscan)
1702  parallel_scan->phs_startblock = 0;
1703  else if (sync_startpage != InvalidBlockNumber)
1704  parallel_scan->phs_startblock = sync_startpage;
1705  else
1706  {
1707  SpinLockRelease(&parallel_scan->phs_mutex);
1708  sync_startpage = ss_get_location(scan->rs_rd, scan->rs_nblocks);
1709  goto retry;
1710  }
1711  parallel_scan->phs_cblock = parallel_scan->phs_startblock;
1712  }
1713 
1714  /*
1715  * The current block number is the next one that needs to be scanned,
1716  * unless it's InvalidBlockNumber already, in which case there are no more
1717  * blocks to scan. After remembering the current value, we must advance
1718  * it so that the next call to this function returns the next block to be
1719  * scanned.
1720  */
1721  page = parallel_scan->phs_cblock;
1722  if (page != InvalidBlockNumber)
1723  {
1724  parallel_scan->phs_cblock++;
1725  if (parallel_scan->phs_cblock >= scan->rs_nblocks)
1726  parallel_scan->phs_cblock = 0;
1727  if (parallel_scan->phs_cblock == parallel_scan->phs_startblock)
1728  {
1729  parallel_scan->phs_cblock = InvalidBlockNumber;
1730  report_page = parallel_scan->phs_startblock;
1731  }
1732  }
1733 
1734  /* Release the lock. */
1735  SpinLockRelease(&parallel_scan->phs_mutex);
1736 
1737  /*
1738  * Report scan location. Normally, we report the current page number.
1739  * When we reach the end of the scan, though, we report the starting page,
1740  * not the ending page, just so the starting positions for later scans
1741  * don't slew backwards. We only report the position at the end of the
1742  * scan once, though: subsequent callers will report nothing, since
1743  * they will have page == InvalidBlockNumber.
1744  */
1745  if (scan->rs_syncscan)
1746  {
1747  if (report_page == InvalidBlockNumber)
1748  report_page = page;
1749  if (report_page != InvalidBlockNumber)
1750  ss_report_location(scan->rs_rd, report_page);
1751  }
1752 
1753  return page;
1754 }
1755 
1756 /* ----------------
1757  * heap_getnext - retrieve next tuple in scan
1758  *
1759  * Fix to work with index relations.
1760  * We don't return the buffer anymore, but you can get it from the
1761  * returned HeapTuple.
1762  * ----------------
1763  */
1764 
1765 #ifdef HEAPDEBUGALL
1766 #define HEAPDEBUG_1 \
1767  elog(DEBUG2, "heap_getnext([%s,nkeys=%d],dir=%d) called", \
1768  RelationGetRelationName(scan->rs_rd), scan->rs_nkeys, (int) direction)
1769 #define HEAPDEBUG_2 \
1770  elog(DEBUG2, "heap_getnext returning EOS")
1771 #define HEAPDEBUG_3 \
1772  elog(DEBUG2, "heap_getnext returning tuple")
1773 #else
1774 #define HEAPDEBUG_1
1775 #define HEAPDEBUG_2
1776 #define HEAPDEBUG_3
1777 #endif /* !defined(HEAPDEBUGALL) */
1778 
1779 
1780 HeapTuple
1781 heap_getnext(HeapScanDesc scan, ScanDirection direction)
1782 {
1783  /* Note: no locking manipulations needed */
1784 
1785  HEAPDEBUG_1; /* heap_getnext( info ) */
1786 
1787  if (scan->rs_pageatatime)
1788  heapgettup_pagemode(scan, direction,
1789  scan->rs_nkeys, scan->rs_key);
1790  else
1791  heapgettup(scan, direction, scan->rs_nkeys, scan->rs_key);
1792 
1793  if (scan->rs_ctup.t_data == NULL)
1794  {
1795  HEAPDEBUG_2; /* heap_getnext returning EOS */
1796  return NULL;
1797  }
1798 
1799  /*
1800  * if we get here it means we have a new current scan tuple, so point to
1801  * the proper return buffer and return the tuple.
1802  */
1803  HEAPDEBUG_3; /* heap_getnext returning tuple */
1804 
1805  pgstat_count_heap_getnext(scan->rs_rd);
1806 
1807  return &(scan->rs_ctup);
1808 }
1809 
1810 /*
1811  * heap_fetch - retrieve tuple with given tid
1812  *
1813  * On entry, tuple->t_self is the TID to fetch. We pin the buffer holding
1814  * the tuple, fill in the remaining fields of *tuple, and check the tuple
1815  * against the specified snapshot.
1816  *
1817  * If successful (tuple found and passes snapshot time qual), then *userbuf
1818  * is set to the buffer holding the tuple and TRUE is returned. The caller
1819  * must unpin the buffer when done with the tuple.
1820  *
1821  * If the tuple is not found (ie, item number references a deleted slot),
1822  * then tuple->t_data is set to NULL and FALSE is returned.
1823  *
1824  * If the tuple is found but fails the time qual check, then FALSE is returned
1825  * but tuple->t_data is left pointing to the tuple.
1826  *
1827  * keep_buf determines what is done with the buffer in the FALSE-result cases.
1828  * When the caller specifies keep_buf = true, we retain the pin on the buffer
1829  * and return it in *userbuf (so the caller must eventually unpin it); when
1830  * keep_buf = false, the pin is released and *userbuf is set to InvalidBuffer.
1831  *
1832  * stats_relation is the relation to charge the heap_fetch operation against
1833  * for statistical purposes. (This could be the heap rel itself, an
1834  * associated index, or NULL to not count the fetch at all.)
1835  *
1836  * heap_fetch does not follow HOT chains: only the exact TID requested will
1837  * be fetched.
1838  *
1839  * It is somewhat inconsistent that we ereport() on invalid block number but
1840  * return false on invalid item number. There are a couple of reasons though.
1841  * One is that the caller can relatively easily check the block number for
1842  * validity, but cannot check the item number without reading the page
1843  * himself. Another is that when we are following a t_ctid link, we can be
1844  * reasonably confident that the page number is valid (since VACUUM shouldn't
1845  * truncate off the destination page without having killed the referencing
1846  * tuple first), but the item number might well not be good.
1847  */
1848 bool
1849 heap_fetch(Relation relation,
1850  Snapshot snapshot,
1851  HeapTuple tuple,
1852  Buffer *userbuf,
1853  bool keep_buf,
1854  Relation stats_relation)
1855 {
1856  ItemPointer tid = &(tuple->t_self);
1857  ItemId lp;
1858  Buffer buffer;
1859  Page page;
1860  OffsetNumber offnum;
1861  bool valid;
1862 
1863  /*
1864  * Fetch and pin the appropriate page of the relation.
1865  */
1866  buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid));
1867 
1868  /*
1869  * Need share lock on buffer to examine tuple commit status.
1870  */
1871  LockBuffer(buffer, BUFFER_LOCK_SHARE);
1872  page = BufferGetPage(buffer);
1873  TestForOldSnapshot(snapshot, relation, page);
1874 
1875  /*
1876  * We'd better check for an out-of-range offnum, in case VACUUM has removed
1877  * items since the TID was obtained.
1878  */
1879  offnum = ItemPointerGetOffsetNumber(tid);
1880  if (offnum < FirstOffsetNumber || offnum > PageGetMaxOffsetNumber(page))
1881  {
1882  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
1883  if (keep_buf)
1884  *userbuf = buffer;
1885  else
1886  {
1887  ReleaseBuffer(buffer);
1888  *userbuf = InvalidBuffer;
1889  }
1890  tuple->t_data = NULL;
1891  return false;
1892  }
1893 
1894  /*
1895  * get the item line pointer corresponding to the requested tid
1896  */
1897  lp = PageGetItemId(page, offnum);
1898 
1899  /*
1900  * Must check for deleted tuple.
1901  */
1902  if (!ItemIdIsNormal(lp))
1903  {
1904  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
1905  if (keep_buf)
1906  *userbuf = buffer;
1907  else
1908  {
1909  ReleaseBuffer(buffer);
1910  *userbuf = InvalidBuffer;
1911  }
1912  tuple->t_data = NULL;
1913  return false;
1914  }
1915 
1916  /*
1917  * fill in *tuple fields
1918  */
1919  tuple->t_data = (HeapTupleHeader) PageGetItem(page, lp);
1920  tuple->t_len = ItemIdGetLength(lp);
1921  tuple->t_tableOid = RelationGetRelid(relation);
1922 
1923  /*
1924  * check time qualification of tuple, then release lock
1925  */
1926  valid = HeapTupleSatisfiesVisibility(tuple, snapshot, buffer);
1927 
1928  if (valid)
1929  PredicateLockTuple(relation, tuple, snapshot);
1930 
1931  CheckForSerializableConflictOut(valid, relation, tuple, buffer, snapshot);
1932 
1933  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
1934 
1935  if (valid)
1936  {
1937  /*
1938  * All checks passed, so return the tuple as valid. Caller is now
1939  * responsible for releasing the buffer.
1940  */
1941  *userbuf = buffer;
1942 
1943  /* Count the successful fetch against appropriate rel, if any */
1944  if (stats_relation != NULL)
1945  pgstat_count_heap_fetch(stats_relation);
1946 
1947  return true;
1948  }
1949 
1950  /* Tuple failed time qual, but maybe caller wants to see it anyway. */
1951  if (keep_buf)
1952  *userbuf = buffer;
1953  else
1954  {
1955  ReleaseBuffer(buffer);
1956  *userbuf = InvalidBuffer;
1957  }
1958 
1959  return false;
1960 }
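
Editor's illustration (not part of heapam.c): the pin-handling contract described above, shown from a hypothetical caller's side with keep_buf = false and no stats relation.

/* Hypothetical caller: fetch one tuple by TID and inspect it. */
static bool
fetch_and_inspect(Relation rel, ItemPointer tid, Snapshot snapshot)
{
	HeapTupleData tuple;
	Buffer		buffer;

	tuple.t_self = *tid;		/* heap_fetch takes the target TID from t_self */
	if (!heap_fetch(rel, snapshot, &tuple, &buffer, false, NULL))
		return false;			/* not found or failed the time qual; no pin held */

	/* ... examine tuple.t_data while the buffer pin is held ... */

	ReleaseBuffer(buffer);		/* on success the caller must drop the pin */
	return true;
}
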
1961 
1962 /*
1963  * heap_hot_search_buffer - search HOT chain for tuple satisfying snapshot
1964  *
1965  * On entry, *tid is the TID of a tuple (either a simple tuple, or the root
1966  * of a HOT chain), and buffer is the buffer holding this tuple. We search
1967  * for the first chain member satisfying the given snapshot. If one is
1968  * found, we update *tid to reference that tuple's offset number, and
1969  * return TRUE. If no match, return FALSE without modifying *tid.
1970  *
1971  * heapTuple is a caller-supplied buffer. When a match is found, we return
1972  * the tuple here, in addition to updating *tid. If no match is found, the
1973  * contents of this buffer on return are undefined.
1974  *
1975  * If all_dead is not NULL, we check non-visible tuples to see if they are
1976  * globally dead; *all_dead is set TRUE if all members of the HOT chain
1977  * are vacuumable, FALSE if not.
1978  *
1979  * Unlike heap_fetch, the caller must already have pin and (at least) share
1980  * lock on the buffer; it is still pinned/locked at exit. Also unlike
1981  * heap_fetch, we do not report any pgstats count; caller may do so if wanted.
1982  */
1983 bool
1984 heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer,
1985  Snapshot snapshot, HeapTuple heapTuple,
1986  bool *all_dead, bool first_call)
1987 {
1988  Page dp = (Page) BufferGetPage(buffer);
1989  TransactionId prev_xmax = InvalidTransactionId;
1990  OffsetNumber offnum;
1991  bool at_chain_start;
1992  bool valid;
1993  bool skip;
1994 
1995  /* If this is not the first call, previous call returned a (live!) tuple */
1996  if (all_dead)
1997  *all_dead = first_call;
1998 
1999  Assert(TransactionIdIsValid(RecentGlobalXmin));
2000 
2001  Assert(ItemPointerGetBlockNumber(tid) == BufferGetBlockNumber(buffer));
2002  offnum = ItemPointerGetOffsetNumber(tid);
2003  at_chain_start = first_call;
2004  skip = !first_call;
2005 
2006  heapTuple->t_self = *tid;
2007 
2008  /* Scan through possible multiple members of HOT-chain */
2009  for (;;)
2010  {
2011  ItemId lp;
2012 
2013  /* check for bogus TID */
2014  if (offnum < FirstOffsetNumber || offnum > PageGetMaxOffsetNumber(dp))
2015  break;
2016 
2017  lp = PageGetItemId(dp, offnum);
2018 
2019  /* check for unused, dead, or redirected items */
2020  if (!ItemIdIsNormal(lp))
2021  {
2022  /* We should only see a redirect at start of chain */
2023  if (ItemIdIsRedirected(lp) && at_chain_start)
2024  {
2025  /* Follow the redirect */
2026  offnum = ItemIdGetRedirect(lp);
2027  at_chain_start = false;
2028  continue;
2029  }
2030  /* else must be end of chain */
2031  break;
2032  }
2033 
2034  heapTuple->t_data = (HeapTupleHeader) PageGetItem(dp, lp);
2035  heapTuple->t_len = ItemIdGetLength(lp);
2036  heapTuple->t_tableOid = RelationGetRelid(relation);
2037  ItemPointerSetOffsetNumber(&heapTuple->t_self, offnum);
2038 
2039  /*
2040  * Shouldn't see a HEAP_ONLY tuple at chain start.
2041  */
2042  if (at_chain_start && HeapTupleIsHeapOnly(heapTuple))
2043  break;
2044 
2045  /*
2046  * The xmin should match the previous xmax value, else chain is
2047  * broken.
2048  */
2049  if (TransactionIdIsValid(prev_xmax) &&
2050  !TransactionIdEquals(prev_xmax,
2051  HeapTupleHeaderGetXmin(heapTuple->t_data)))
2052  break;
2053 
2054  /*
2055  * When first_call is true (and thus, skip is initially false) we'll
2056  * return the first tuple we find. But on later passes, heapTuple
2057  * will initially be pointing to the tuple we returned last time.
2058  * Returning it again would be incorrect (and would loop forever), so
2059  * we skip it and return the next match we find.
2060  */
2061  if (!skip)
2062  {
2063  /*
2064  * For the benefit of logical decoding, have t_self point at the
2065  * element of the HOT chain we're currently investigating instead
2066  * of the root tuple of the HOT chain. This is important because
2067  * the *Satisfies routine for historical mvcc snapshots needs the
2068  * correct tid to decide about the visibility in some cases.
2069  */
2070  ItemPointerSet(&(heapTuple->t_self), BufferGetBlockNumber(buffer), offnum);
2071 
2072  /* If it's visible per the snapshot, we must return it */
2073  valid = HeapTupleSatisfiesVisibility(heapTuple, snapshot, buffer);
2074  CheckForSerializableConflictOut(valid, relation, heapTuple,
2075  buffer, snapshot);
2076  /* reset to original, non-redirected, tid */
2077  heapTuple->t_self = *tid;
2078 
2079  if (valid)
2080  {
2081  ItemPointerSetOffsetNumber(tid, offnum);
2082  PredicateLockTuple(relation, heapTuple, snapshot);
2083  if (all_dead)
2084  *all_dead = false;
2085  return true;
2086  }
2087  }
2088  skip = false;
2089 
2090  /*
2091  * If we can't see it, maybe no one else can either. At caller
2092  * request, check whether all chain members are dead to all
2093  * transactions.
2094  */
2095  if (all_dead && *all_dead &&
2096  !HeapTupleIsSurelyDead(heapTuple, RecentGlobalXmin))
2097  *all_dead = false;
2098 
2099  /*
2100  * Check to see if HOT chain continues past this tuple; if so fetch
2101  * the next offnum and loop around.
2102  */
2103  if (HeapTupleIsHotUpdated(heapTuple))
2104  {
2105  Assert(ItemPointerGetBlockNumber(&heapTuple->t_data->t_ctid) ==
2106  ItemPointerGetBlockNumber(tid));
2107  offnum = ItemPointerGetOffsetNumber(&heapTuple->t_data->t_ctid);
2108  at_chain_start = false;
2109  prev_xmax = HeapTupleHeaderGetUpdateXid(heapTuple->t_data);
2110  }
2111  else
2112  break; /* end of chain */
2113  }
2114 
2115  return false;
2116 }
2117 
2118 /*
2119  * heap_hot_search - search HOT chain for tuple satisfying snapshot
2120  *
2121  * This has the same API as heap_hot_search_buffer, except that the caller
2122  * does not provide the buffer containing the page, rather we access it
2123  * locally.
2124  */
2125 bool
2126 heap_hot_search(ItemPointer tid, Relation relation, Snapshot snapshot,
2127  bool *all_dead)
2128 {
2129  bool result;
2130  Buffer buffer;
2131  HeapTupleData heapTuple;
2132 
2133  buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid));
2134  LockBuffer(buffer, BUFFER_LOCK_SHARE);
2135  result = heap_hot_search_buffer(tid, relation, buffer, snapshot,
2136  &heapTuple, all_dead, true);
2137  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
2138  ReleaseBuffer(buffer);
2139  return result;
2140 }
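
Editor's illustration (not part of heapam.c): how a caller might use heap_hot_search and its all_dead flag; the function name and the DEBUG2 message are assumptions for the sketch.

/* Hypothetical caller: test whether any member of a HOT chain is visible. */
static bool
tid_is_visible(Relation rel, ItemPointer tid, Snapshot snapshot)
{
	bool		all_dead;

	/* On success, *tid is updated to the visible chain member's offset. */
	if (heap_hot_search(tid, rel, snapshot, &all_dead))
		return true;

	if (all_dead)
		elog(DEBUG2, "whole HOT chain at (%u,%u) is dead",
			 ItemPointerGetBlockNumber(tid),
			 ItemPointerGetOffsetNumber(tid));
	return false;
}
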
2141 
2142 /*
2143  * heap_get_latest_tid - get the latest tid of a specified tuple
2144  *
2145  * Actually, this gets the latest version that is visible according to
2146  * the passed snapshot. You can pass SnapshotDirty to get the very latest,
2147  * possibly uncommitted version.
2148  *
2149  * *tid is both an input and an output parameter: it is updated to
2150  * show the latest version of the row. Note that it will not be changed
2151  * if no version of the row passes the snapshot test.
2152  */
2153 void
2154 heap_get_latest_tid(Relation relation,
2155  Snapshot snapshot,
2156  ItemPointer tid)
2157 {
2158  BlockNumber blk;
2159  ItemPointerData ctid;
2160  TransactionId priorXmax;
2161 
2162  /* this is to avoid Assert failures on bad input */
2163  if (!ItemPointerIsValid(tid))
2164  return;
2165 
2166  /*
2167  * Since this can be called with user-supplied TID, don't trust the input
2168  * too much. (RelationGetNumberOfBlocks is an expensive check, so we
2169  * don't check t_ctid links again this way. Note that it would not do to
2170  * call it just once and save the result, either.)
2171  */
2172  blk = ItemPointerGetBlockNumber(tid);
2173  if (blk >= RelationGetNumberOfBlocks(relation))
2174  elog(ERROR, "block number %u is out of range for relation \"%s\"",
2175  blk, RelationGetRelationName(relation));
2176 
2177  /*
2178  * Loop to chase down t_ctid links. At top of loop, ctid is the tuple we
2179  * need to examine, and *tid is the TID we will return if ctid turns out
2180  * to be bogus.
2181  *
2182  * Note that we will loop until we reach the end of the t_ctid chain.
2183  * Depending on the snapshot passed, there might be at most one visible
2184  * version of the row, but we don't try to optimize for that.
2185  */
2186  ctid = *tid;
2187  priorXmax = InvalidTransactionId; /* cannot check first XMIN */
2188  for (;;)
2189  {
2190  Buffer buffer;
2191  Page page;
2192  OffsetNumber offnum;
2193  ItemId lp;
2194  HeapTupleData tp;
2195  bool valid;
2196 
2197  /*
2198  * Read, pin, and lock the page.
2199  */
2200  buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(&ctid));
2201  LockBuffer(buffer, BUFFER_LOCK_SHARE);
2202  page = BufferGetPage(buffer);
2203  TestForOldSnapshot(snapshot, relation, page);
2204 
2205  /*
2206  * Check for bogus item number. This is not treated as an error
2207  * condition because it can happen while following a t_ctid link. We
2208  * just assume that the prior tid is OK and return it unchanged.
2209  */
2210  offnum = ItemPointerGetOffsetNumber(&ctid);
2211  if (offnum < FirstOffsetNumber || offnum > PageGetMaxOffsetNumber(page))
2212  {
2213  UnlockReleaseBuffer(buffer);
2214  break;
2215  }
2216  lp = PageGetItemId(page, offnum);
2217  if (!ItemIdIsNormal(lp))
2218  {
2219  UnlockReleaseBuffer(buffer);
2220  break;
2221  }
2222 
2223  /* OK to access the tuple */
2224  tp.t_self = ctid;
2225  tp.t_data = (HeapTupleHeader) PageGetItem(page, lp);
2226  tp.t_len = ItemIdGetLength(lp);
2227  tp.t_tableOid = RelationGetRelid(relation);
2228 
2229  /*
2230  * After following a t_ctid link, we might arrive at an unrelated
2231  * tuple. Check for XMIN match.
2232  */
2233  if (TransactionIdIsValid(priorXmax) &&
2234  !TransactionIdEquals(HeapTupleHeaderGetXmin(tp.t_data), priorXmax))
2235  {
2236  UnlockReleaseBuffer(buffer);
2237  break;
2238  }
2239 
2240  /*
2241  * Check time qualification of tuple; if visible, set it as the new
2242  * result candidate.
2243  */
2244  valid = HeapTupleSatisfiesVisibility(&tp, snapshot, buffer);
2245  CheckForSerializableConflictOut(valid, relation, &tp, buffer, snapshot);
2246  if (valid)
2247  *tid = ctid;
2248 
2249  /*
2250  * If there's a valid t_ctid link, follow it, else we're done.
2251  */
2252  if ((tp.t_data->t_infomask & HEAP_XMAX_INVALID) ||
2253  HeapTupleHeaderIsOnlyLocked(tp.t_data) ||
2254  ItemPointerEquals(&tp.t_self, &tp.t_data->t_ctid))
2255  {
2256  UnlockReleaseBuffer(buffer);
2257  break;
2258  }
2259 
2260  ctid = tp.t_data->t_ctid;
2261  priorXmax = HeapTupleHeaderGetUpdateXid(tp.t_data);
2262  UnlockReleaseBuffer(buffer);
2263  } /* end of loop */
2264 }
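
Editor's illustration (not part of heapam.c): the in/out behavior of the tid argument described above, sketched from a hypothetical caller.

/* Hypothetical caller: chase a row to its latest visible version. */
static ItemPointerData
latest_version_of(Relation rel, ItemPointerData tid, Snapshot snapshot)
{
	/* tid is a local copy here, so the caller's own TID is left untouched */
	heap_get_latest_tid(rel, snapshot, &tid);
	return tid;					/* unchanged if no version passed the snapshot test */
}
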
2265 
2266 
2267 /*
2268  * UpdateXmaxHintBits - update tuple hint bits after xmax transaction ends
2269  *
2270  * This is called after we have waited for the XMAX transaction to terminate.
2271  * If the transaction aborted, we guarantee the XMAX_INVALID hint bit will
2272  * be set on exit. If the transaction committed, we set the XMAX_COMMITTED
2273  * hint bit if possible --- but beware that that may not yet be possible,
2274  * if the transaction committed asynchronously.
2275  *
2276  * Note that if the transaction was a locker only, we set HEAP_XMAX_INVALID
2277  * even if it commits.
2278  *
2279  * Hence callers should look only at XMAX_INVALID.
2280  *
2281  * Note this is not allowed for tuples whose xmax is a multixact.
2282  */
2283 static void
2284 UpdateXmaxHintBits(HeapTupleHeader tuple, Buffer buffer, TransactionId xid)
2285 {
2286  Assert(TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple), xid));
2287  Assert(!(tuple->t_infomask & HEAP_XMAX_IS_MULTI));
2288 
2289  if (!(tuple->t_infomask & (HEAP_XMAX_COMMITTED | HEAP_XMAX_INVALID)))
2290  {
2291  if (!HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask) &&
2292  TransactionIdDidCommit(xid))
2293  HeapTupleSetHintBits(tuple, buffer, HEAP_XMAX_COMMITTED,
2294  xid);
2295  else
2296  HeapTupleSetHintBits(tuple, buffer, HEAP_XMAX_INVALID,
2297  InvalidTransactionId);
2298  }
2299 }
2300 
2301 
2302 /*
2303  * GetBulkInsertState - prepare status object for a bulk insert
2304  */
2305 BulkInsertState
2306 GetBulkInsertState(void)
2307 {
2308  BulkInsertState bistate;
2309 
2310  bistate = (BulkInsertState) palloc(sizeof(BulkInsertStateData));
2311  bistate->strategy = GetAccessStrategy(BAS_BULKWRITE);
2312  bistate->current_buf = InvalidBuffer;
2313  return bistate;
2314 }
2315 
2316 /*
2317  * FreeBulkInsertState - clean up after finishing a bulk insert
2318  */
2319 void
2320 FreeBulkInsertState(BulkInsertState bistate)
2321 {
2322  if (bistate->current_buf != InvalidBuffer)
2323  ReleaseBuffer(bistate->current_buf);
2324  FreeAccessStrategy(bistate->strategy);
2325  pfree(bistate);
2326 }
2327 
2328 /*
2329  * ReleaseBulkInsertStatePin - release a buffer currently held in bistate
2330  */
2331 void
2332 ReleaseBulkInsertStatePin(BulkInsertState bistate)
2333 {
2334  if (bistate->current_buf != InvalidBuffer)
2335  ReleaseBuffer(bistate->current_buf);
2336  bistate->current_buf = InvalidBuffer;
2337 }
2338 
2339 
2340 /*
2341  * heap_insert - insert tuple into a heap
2342  *
2343  * The new tuple is stamped with current transaction ID and the specified
2344  * command ID.
2345  *
2346  * If the HEAP_INSERT_SKIP_WAL option is specified, the new tuple is not
2347  * logged in WAL, even for a non-temp relation. Safe usage of this behavior
2348  * requires that we arrange that all new tuples go into new pages not
2349  * containing any tuples from other transactions, and that the relation gets
2350  * fsync'd before commit. (See also heap_sync() comments)
2351  *
2352  * The HEAP_INSERT_SKIP_FSM option is passed directly to
2353  * RelationGetBufferForTuple, which see for more info.
2354  *
2355  * HEAP_INSERT_FROZEN should only be specified for inserts into
2356  * relfilenodes created during the current subtransaction and when
2357  * there are no prior snapshots or pre-existing portals open.
2358  * This causes rows to be frozen, which is an MVCC violation and
2359  * requires explicit options chosen by user.
2360  *
2361  * HEAP_INSERT_IS_SPECULATIVE is used on so-called "speculative insertions",
2362  * which can be backed out afterwards without aborting the whole transaction.
2363  * Other sessions can wait for the speculative insertion to be confirmed,
2364  * turning it into a regular tuple, or aborted, as if it never existed.
2365  * Speculatively inserted tuples behave as "value locks" of short duration,
2366  * used to implement INSERT .. ON CONFLICT.
2367  *
2368  * Note that most of these options will be applied when inserting into the
2369  * heap's TOAST table, too, if the tuple requires any out-of-line data. Only
2370  * HEAP_INSERT_IS_SPECULATIVE is explicitly ignored, as the toast data does
2371  * not partake in speculative insertion.
2372  *
2373  * The BulkInsertState object (if any; bistate can be NULL for default
2374  * behavior) is also just passed through to RelationGetBufferForTuple.
2375  *
2376  * The return value is the OID assigned to the tuple (either here or by the
2377  * caller), or InvalidOid if no OID. The header fields of *tup are updated
2378  * to match the stored tuple; in particular tup->t_self receives the actual
2379  * TID where the tuple was stored. But note that any toasting of fields
2380  * within the tuple data is NOT reflected into *tup.
2381  */
2382 Oid
2383 heap_insert(Relation relation, HeapTuple tup, CommandId cid,
2384  int options, BulkInsertState bistate)
2385 {
2386  TransactionId xid = GetCurrentTransactionId();
2387  HeapTuple heaptup;
2388  Buffer buffer;
2389  Buffer vmbuffer = InvalidBuffer;
2390  bool all_visible_cleared = false;
2391 
2392  /*
2393  * Fill in tuple header fields, assign an OID, and toast the tuple if
2394  * necessary.
2395  *
2396  * Note: below this point, heaptup is the data we actually intend to store
2397  * into the relation; tup is the caller's original untoasted data.
2398  */
2399  heaptup = heap_prepare_insert(relation, tup, xid, cid, options);
2400 
2401  /*
2402  * Find buffer to insert this tuple into. If the page is all visible,
2403  * this will also pin the requisite visibility map page.
2404  */
2405  buffer = RelationGetBufferForTuple(relation, heaptup->t_len,
2406  InvalidBuffer, options, bistate,
2407  &vmbuffer, NULL);
2408 
2409  /*
2410  * We're about to do the actual insert -- but check for conflict first, to
2411  * avoid possibly having to roll back work we've just done.
2412  *
2413  * This is safe without a recheck as long as there is no possibility of
2414  * another process scanning the page between this check and the insert
2415  * being visible to the scan (i.e., an exclusive buffer content lock is
2416  * continuously held from this point until the tuple insert is visible).
2417  *
2418  * For a heap insert, we only need to check for table-level SSI locks. Our
2419  * new tuple can't possibly conflict with existing tuple locks, and heap
2420  * page locks are only consolidated versions of tuple locks; they do not
2421  * lock "gaps" as index page locks do. So we don't need to specify a
2422  * buffer when making the call, which makes for a faster check.
2423  */
2424  CheckForSerializableConflictIn(relation, NULL, InvalidBuffer);
2425 
2426  /* NO EREPORT(ERROR) from here till changes are logged */
2427  START_CRIT_SECTION();
2428 
2429  RelationPutHeapTuple(relation, buffer, heaptup,
2430  (options & HEAP_INSERT_SPECULATIVE) != 0);
2431 
2432  if (PageIsAllVisible(BufferGetPage(buffer)))
2433  {
2434  all_visible_cleared = true;
2435  PageClearAllVisible(BufferGetPage(buffer));
2436  visibilitymap_clear(relation,
2437  ItemPointerGetBlockNumber(&(heaptup->t_self)),
2438  vmbuffer, VISIBILITYMAP_VALID_BITS);
2439  }
2440 
2441  /*
2442  * XXX Should we set PageSetPrunable on this page ?
2443  *
2444  * The inserting transaction may eventually abort thus making this tuple
2445  * DEAD and hence available for pruning. Though we don't want to optimize
2446  * for aborts, if no other tuple in this page is UPDATEd/DELETEd, the
2447  * aborted tuple will never be pruned until next vacuum is triggered.
2448  *
2449  * If you do add PageSetPrunable here, add it in heap_xlog_insert too.
2450  */
2451 
2452  MarkBufferDirty(buffer);
2453 
2454  /* XLOG stuff */
2455  if (!(options & HEAP_INSERT_SKIP_WAL) && RelationNeedsWAL(relation))
2456  {
2457  xl_heap_insert xlrec;
2458  xl_heap_header xlhdr;
2459  XLogRecPtr recptr;
2460  Page page = BufferGetPage(buffer);
2461  uint8 info = XLOG_HEAP_INSERT;
2462  int bufflags = 0;
2463 
2464  /*
2465  * If this is a catalog, we need to transmit combocids to properly
2466  * decode, so log that as well.
2467  */
2468  if (RelationIsAccessibleInLogicalDecoding(relation))
2469  log_heap_new_cid(relation, heaptup);
2470 
2471  /*
2472  * If this is the single and first tuple on page, we can reinit the
2473  * page instead of restoring the whole thing. Set flag, and hide
2474  * buffer references from XLogInsert.
2475  */
2476  if (ItemPointerGetOffsetNumber(&(heaptup->t_self)) == FirstOffsetNumber &&
2477  PageGetMaxOffsetNumber(page) == FirstOffsetNumber)
2478  {
2479  info |= XLOG_HEAP_INIT_PAGE;
2480  bufflags |= REGBUF_WILL_INIT;
2481  }
2482 
2483  xlrec.offnum = ItemPointerGetOffsetNumber(&heaptup->t_self);
2484  xlrec.flags = 0;
2485  if (all_visible_cleared)
2486  xlrec.flags |= XLH_INSERT_ALL_VISIBLE_CLEARED;
2487  if (options & HEAP_INSERT_SPECULATIVE)
2488  xlrec.flags |= XLH_INSERT_IS_SPECULATIVE;
2489  Assert(ItemPointerGetBlockNumber(&heaptup->t_self) == BufferGetBlockNumber(buffer));
2490 
2491  /*
2492  * For logical decoding, we need the tuple even if we're doing a full
2493  * page write, so make sure it's included even if we take a full-page
2494  * image. (XXX We could alternatively store a pointer into the FPW).
2495  */
2496  if (RelationIsLogicallyLogged(relation))
2497  {
2498  xlrec.flags |= XLH_INSERT_CONTAINS_NEW_TUPLE;
2499  bufflags |= REGBUF_KEEP_DATA;
2500  }
2501 
2502  XLogBeginInsert();
2503  XLogRegisterData((char *) &xlrec, SizeOfHeapInsert);
2504 
2505  xlhdr.t_infomask2 = heaptup->t_data->t_infomask2;
2506  xlhdr.t_infomask = heaptup->t_data->t_infomask;
2507  xlhdr.t_hoff = heaptup->t_data->t_hoff;
2508 
2509  /*
2510  * note we mark xlhdr as belonging to buffer; if XLogInsert decides to
2511  * write the whole page to the xlog, we don't need to store
2512  * xl_heap_header in the xlog.
2513  */
2514  XLogRegisterBuffer(0, buffer, REGBUF_STANDARD | bufflags);
2515  XLogRegisterBufData(0, (char *) &xlhdr, SizeOfHeapHeader);
2516  /* PG73FORMAT: write bitmap [+ padding] [+ oid] + data */
2517  XLogRegisterBufData(0,
2518  (char *) heaptup->t_data + SizeofHeapTupleHeader,
2519  heaptup->t_len - SizeofHeapTupleHeader);
2520 
2521  /* filtering by origin on a row level is much more efficient */
2522  XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN);
2523 
2524  recptr = XLogInsert(RM_HEAP_ID, info);
2525 
2526  PageSetLSN(page, recptr);
2527  }
2528 
2529  END_CRIT_SECTION();
2530 
2531  UnlockReleaseBuffer(buffer);
2532  if (vmbuffer != InvalidBuffer)
2533  ReleaseBuffer(vmbuffer);
2534 
2535  /*
2536  * If tuple is cachable, mark it for invalidation from the caches in case
2537  * we abort. Note it is OK to do this after releasing the buffer, because
2538  * the heaptup data structure is all in local memory, not in the shared
2539  * buffer.
2540  */
2541  CacheInvalidateHeapTuple(relation, heaptup, NULL);
2542 
2543  /* Note: speculative insertions are counted too, even if aborted later */
2544  pgstat_count_heap_insert(relation, 1);
2545 
2546  /*
2547  * If heaptup is a private copy, release it. Don't forget to copy t_self
2548  * back to the caller's image, too.
2549  */
2550  if (heaptup != tup)
2551  {
2552  tup->t_self = heaptup->t_self;
2553  heap_freetuple(heaptup);
2554  }
2555 
2556  return HeapTupleGetOid(tup);
2557 }
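
Editor's illustration (not part of heapam.c): a plain insert with default options, using heap_form_tuple and a default command ID; index maintenance and WAL-skipping options are deliberately left out, and the function name is an assumption for the sketch.

/* Hypothetical caller: insert one tuple built from values/nulls arrays. */
static Oid
insert_one_row(Relation rel, TupleDesc tupdesc, Datum *values, bool *nulls)
{
	HeapTuple	tup = heap_form_tuple(tupdesc, values, nulls);
	Oid			oid;

	oid = heap_insert(rel, tup, GetCurrentCommandId(true), 0, NULL);

	/* t_self now holds the TID where the tuple was stored */
	elog(DEBUG2, "inserted at (%u,%u)",
		 ItemPointerGetBlockNumber(&tup->t_self),
		 ItemPointerGetOffsetNumber(&tup->t_self));

	heap_freetuple(tup);
	return oid;
}
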
2558 
2559 /*
2560  * Subroutine for heap_insert(). Prepares a tuple for insertion. This sets the
2561  * tuple header fields, assigns an OID, and toasts the tuple if necessary.
2562  * Returns a toasted version of the tuple if it was toasted, or the original
2563  * tuple if not. Note that in any case, the header fields are also set in
2564  * the original tuple.
2565  */
2566 static HeapTuple
2567 heap_prepare_insert(Relation relation, HeapTuple tup, TransactionId xid,
2568  CommandId cid, int options)
2569 {
2570  /*
2571  * For now, parallel operations are required to be strictly read-only.
2572  * Unlike heap_update() and heap_delete(), an insert should never create a
2573  * combo CID, so it might be possible to relax this restriction, but not
2574  * without more thought and testing.
2575  */
2576  if (IsInParallelMode())
2577  ereport(ERROR,
2578  (errcode(ERRCODE_INVALID_TRANSACTION_STATE),
2579  errmsg("cannot insert tuples during a parallel operation")));
2580 
2581  if (relation->rd_rel->relhasoids)
2582  {
2583 #ifdef NOT_USED
2584  /* this is redundant with an Assert in HeapTupleSetOid */
2585  Assert(tup->t_data->t_infomask & HEAP_HASOID);
2586 #endif
2587 
2588  /*
2589  * If the object id of this tuple has already been assigned, trust the
2590  * caller. There are a couple of ways this can happen. At initial db
2591  * creation, the backend program sets oids for tuples. When we define
2592  * an index, we set the oid. Finally, in the future, we may allow
2593  * users to set their own object ids in order to support a persistent
2594  * object store (objects need to contain pointers to one another).
2595  */
2596  if (!OidIsValid(HeapTupleGetOid(tup)))
2597  HeapTupleSetOid(tup, GetNewOid(relation));
2598  }
2599  else
2600  {
2601  /* check that there is no space for an OID */
2602  Assert(!(tup->t_data->t_infomask & HEAP_HASOID));
2603  }
2604 
2605  tup->t_data->t_infomask &= ~(HEAP_XACT_MASK);
2606  tup->t_data->t_infomask2 &= ~(HEAP2_XACT_MASK);
2607  tup->t_data->t_infomask |= HEAP_XMAX_INVALID;
2608  HeapTupleHeaderSetXmin(tup->t_data, xid);
2609  if (options & HEAP_INSERT_FROZEN)
2610  HeapTupleHeaderSetXminFrozen(tup->t_data);
2611 
2612  HeapTupleHeaderSetCmin(tup->t_data, cid);
2613  HeapTupleHeaderSetXmax(tup->t_data, 0); /* for cleanliness */
2614  tup->t_tableOid = RelationGetRelid(relation);
2615 
2616  /*
2617  * If the new tuple is too big for storage or contains already toasted
2618  * out-of-line attributes from some other relation, invoke the toaster.
2619  */
2620  if (relation->rd_rel->relkind != RELKIND_RELATION &&
2621  relation->rd_rel->relkind != RELKIND_MATVIEW)
2622  {
2623  /* toast table entries should never be recursively toasted */
2624  Assert(!HeapTupleHasExternal(tup));
2625  return tup;
2626  }
2627  else if (HeapTupleHasExternal(tup) || tup->t_len > TOAST_TUPLE_THRESHOLD)
2628  return toast_insert_or_update(relation, tup, NULL, options);
2629  else
2630  return tup;
2631 }
2632 
2633 /*
2634  * heap_multi_insert - insert multiple tuples into a heap
2635  *
2636  * This is like heap_insert(), but inserts multiple tuples in one operation.
2637  * That's faster than calling heap_insert() in a loop, because when multiple
2638  * tuples can be inserted on a single page, we can write just a single WAL
2639  * record covering all of them, and only need to lock/unlock the page once.
2640  *
2641  * Note: this leaks memory into the current memory context. You can create a
2642  * temporary context before calling this, if that's a problem.
2643  */
2644 void
2645 heap_multi_insert(Relation relation, HeapTuple *tuples, int ntuples,
2646  CommandId cid, int options, BulkInsertState bistate)
2647 {
2648  TransactionId xid = GetCurrentTransactionId();
2649  HeapTuple *heaptuples;
2650  int i;
2651  int ndone;
2652  char *scratch = NULL;
2653  Page page;
2654  bool needwal;
2655  Size saveFreeSpace;
2656  bool need_tuple_data = RelationIsLogicallyLogged(relation);
2657  bool need_cids = RelationIsAccessibleInLogicalDecoding(relation);
2658 
2659  needwal = !(options & HEAP_INSERT_SKIP_WAL) && RelationNeedsWAL(relation);
2660  saveFreeSpace = RelationGetTargetPageFreeSpace(relation,
2661  HEAP_DEFAULT_FILLFACTOR);
2662 
2663  /* Toast and set header data in all the tuples */
2664  heaptuples = palloc(ntuples * sizeof(HeapTuple));
2665  for (i = 0; i < ntuples; i++)
2666  heaptuples[i] = heap_prepare_insert(relation, tuples[i],
2667  xid, cid, options);
2668 
2669  /*
2670  * Allocate some memory to use for constructing the WAL record. Using
2671  * palloc() within a critical section is not safe, so we allocate this
2672  * beforehand.
2673  */
2674  if (needwal)
2675  scratch = palloc(BLCKSZ);
2676 
2677  /*
2678  * We're about to do the actual inserts -- but check for conflict first,
2679  * to minimize the possibility of having to roll back work we've just
2680  * done.
2681  *
2682  * A check here does not definitively prevent a serialization anomaly;
2683  * that check MUST be done at least past the point of acquiring an
2684  * exclusive buffer content lock on every buffer that will be affected,
2685  * and MAY be done after all inserts are reflected in the buffers and
2686  * those locks are released; otherwise there is a race condition. Since
2687  * multiple buffers can be locked and unlocked in the loop below, and it
2688  * would not be feasible to identify and lock all of those buffers before
2689  * the loop, we must do a final check at the end.
2690  *
2691  * The check here could be omitted with no loss of correctness; it is
2692  * present strictly as an optimization.
2693  *
2694  * For heap inserts, we only need to check for table-level SSI locks. Our
2695  * new tuples can't possibly conflict with existing tuple locks, and heap
2696  * page locks are only consolidated versions of tuple locks; they do not
2697  * lock "gaps" as index page locks do. So we don't need to specify a
2698  * buffer when making the call, which makes for a faster check.
2699  */
2700  CheckForSerializableConflictIn(relation, NULL, InvalidBuffer);
2701 
2702  ndone = 0;
2703  while (ndone < ntuples)
2704  {
2705  Buffer buffer;
2706  Buffer vmbuffer = InvalidBuffer;
2707  bool all_visible_cleared = false;
2708  int nthispage;
2709 
2710  CHECK_FOR_INTERRUPTS();
2711 
2712  /*
2713  * Find buffer where at least the next tuple will fit. If the page is
2714  * all-visible, this will also pin the requisite visibility map page.
2715  */
2716  buffer = RelationGetBufferForTuple(relation, heaptuples[ndone]->t_len,
2717  InvalidBuffer, options, bistate,
2718  &vmbuffer, NULL);
2719  page = BufferGetPage(buffer);
2720 
2721  /* NO EREPORT(ERROR) from here till changes are logged */
2722  START_CRIT_SECTION();
2723 
2724  /*
2725  * RelationGetBufferForTuple has ensured that the first tuple fits.
2726  * Put that on the page, and then as many other tuples as fit.
2727  */
2728  RelationPutHeapTuple(relation, buffer, heaptuples[ndone], false);
2729  for (nthispage = 1; ndone + nthispage < ntuples; nthispage++)
2730  {
2731  HeapTuple heaptup = heaptuples[ndone + nthispage];
2732 
2733  if (PageGetHeapFreeSpace(page) < MAXALIGN(heaptup->t_len) + saveFreeSpace)
2734  break;
2735 
2736  RelationPutHeapTuple(relation, buffer, heaptup, false);
2737 
2738  /*
2739  * We don't use heap_multi_insert for catalog tuples yet, but
2740  * better be prepared...
2741  */
2742  if (needwal && need_cids)
2743  log_heap_new_cid(relation, heaptup);
2744  }
2745 
2746  if (PageIsAllVisible(page))
2747  {
2748  all_visible_cleared = true;
2749  PageClearAllVisible(page);
2750  visibilitymap_clear(relation,
2751  BufferGetBlockNumber(buffer),
2752  vmbuffer, VISIBILITYMAP_VALID_BITS);
2753  }
2754 
2755  /*
2756  * XXX Should we set PageSetPrunable on this page ? See heap_insert()
2757  */
2758 
2759  MarkBufferDirty(buffer);
2760 
2761  /* XLOG stuff */
2762  if (needwal)
2763  {
2764  XLogRecPtr recptr;
2765  xl_heap_multi_insert *xlrec;
2766  uint8 info = XLOG_HEAP2_MULTI_INSERT;
2767  char *tupledata;
2768  int totaldatalen;
2769  char *scratchptr = scratch;
2770  bool init;
2771  int bufflags = 0;
2772 
2773  /*
2774  * If the page was previously empty, we can reinit the page
2775  * instead of restoring the whole thing.
2776  */
2777  init = (ItemPointerGetOffsetNumber(&(heaptuples[ndone]->t_self)) == FirstOffsetNumber &&
2778  PageGetMaxOffsetNumber(page) == FirstOffsetNumber + nthispage - 1);
2779 
2780  /* allocate xl_heap_multi_insert struct from the scratch area */
2781  xlrec = (xl_heap_multi_insert *) scratchptr;
2782  scratchptr += SizeOfHeapMultiInsert;
2783 
2784  /*
2785  * Allocate offsets array. Unless we're reinitializing the page,
2786  * in that case the tuples are stored in order starting at
2787  * FirstOffsetNumber and we don't need to store the offsets
2788  * explicitly.
2789  */
2790  if (!init)
2791  scratchptr += nthispage * sizeof(OffsetNumber);
2792 
2793  /* the rest of the scratch space is used for tuple data */
2794  tupledata = scratchptr;
2795 
2796  xlrec->flags = all_visible_cleared ? XLH_INSERT_ALL_VISIBLE_CLEARED : 0;
2797  xlrec->ntuples = nthispage;
2798 
2799  /*
2800  * Write out an xl_multi_insert_tuple and the tuple data itself
2801  * for each tuple.
2802  */
2803  for (i = 0; i < nthispage; i++)
2804  {
2805  HeapTuple heaptup = heaptuples[ndone + i];
2806  xl_multi_insert_tuple *tuphdr;
2807  int datalen;
2808 
2809  if (!init)
2810  xlrec->offsets[i] = ItemPointerGetOffsetNumber(&heaptup->t_self);
2811  /* xl_multi_insert_tuple needs two-byte alignment. */
2812  tuphdr = (xl_multi_insert_tuple *) SHORTALIGN(scratchptr);
2813  scratchptr = ((char *) tuphdr) + SizeOfMultiInsertTuple;
2814 
2815  tuphdr->t_infomask2 = heaptup->t_data->t_infomask2;
2816  tuphdr->t_infomask = heaptup->t_data->t_infomask;
2817  tuphdr->t_hoff = heaptup->t_data->t_hoff;
2818 
2819  /* write bitmap [+ padding] [+ oid] + data */
2820  datalen = heaptup->t_len - SizeofHeapTupleHeader;
2821  memcpy(scratchptr,
2822  (char *) heaptup->t_data + SizeofHeapTupleHeader,
2823  datalen);
2824  tuphdr->datalen = datalen;
2825  scratchptr += datalen;
2826  }
2827  totaldatalen = scratchptr - tupledata;
2828  Assert((scratchptr - scratch) < BLCKSZ);
2829 
2830  if (need_tuple_data)
2831  xlrec->flags |= XLH_INSERT_CONTAINS_NEW_TUPLE;
2832 
2833  /*
2834  * Signal that this is the last xl_heap_multi_insert record
2835  * emitted by this call to heap_multi_insert(). Needed for logical
2836  * decoding so it knows when to cleanup temporary data.
2837  */
2838  if (ndone + nthispage == ntuples)
2839  xlrec->flags |= XLH_INSERT_LAST_IN_MULTI;
2840 
2841  if (init)
2842  {
2843  info |= XLOG_HEAP_INIT_PAGE;
2844  bufflags |= REGBUF_WILL_INIT;
2845  }
2846 
2847  /*
2848  * If we're doing logical decoding, include the new tuple data
2849  * even if we take a full-page image of the page.
2850  */
2851  if (need_tuple_data)
2852  bufflags |= REGBUF_KEEP_DATA;
2853 
2854  XLogBeginInsert();
2855  XLogRegisterData((char *) xlrec, tupledata - scratch);
2856  XLogRegisterBuffer(0, buffer, REGBUF_STANDARD | bufflags);
2857 
2858  XLogRegisterBufData(0, tupledata, totaldatalen);
2859 
2860  /* filtering by origin on a row level is much more efficient */
2861  XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN);
2862 
2863  recptr = XLogInsert(RM_HEAP2_ID, info);
2864 
2865  PageSetLSN(page, recptr);
2866  }
2867 
2868  END_CRIT_SECTION();
2869 
2870  UnlockReleaseBuffer(buffer);
2871  if (vmbuffer != InvalidBuffer)
2872  ReleaseBuffer(vmbuffer);
2873 
2874  ndone += nthispage;
2875  }
2876 
2877  /*
2878  * We're done with the actual inserts. Check for conflicts again, to
2879  * ensure that all rw-conflicts in to these inserts are detected. Without
2880  * this final check, a sequential scan of the heap may have locked the
2881  * table after the "before" check, missing one opportunity to detect the
2882  * conflict, and then scanned the table before the new tuples were there,
2883  * missing the other chance to detect the conflict.
2884  *
2885  * For heap inserts, we only need to check for table-level SSI locks. Our
2886  * new tuples can't possibly conflict with existing tuple locks, and heap
2887  * page locks are only consolidated versions of tuple locks; they do not
2888  * lock "gaps" as index page locks do. So we don't need to specify a
2889  * buffer when making the call.
2890  */
2891  CheckForSerializableConflictIn(relation, NULL, InvalidBuffer);
2892 
2893  /*
2894  * If tuples are cachable, mark them for invalidation from the caches in
2895  * case we abort. Note it is OK to do this after releasing the buffer,
2896  * because the heaptuples data structure is all in local memory, not in
2897  * the shared buffer.
2898  */
2899  if (IsCatalogRelation(relation))
2900  {
2901  for (i = 0; i < ntuples; i++)
2902  CacheInvalidateHeapTuple(relation, heaptuples[i], NULL);
2903  }
2904 
2905  /*
2906  * Copy t_self fields back to the caller's original tuples. This does
2907  * nothing for untoasted tuples (tuples[i] == heaptuples[i]), but it's
2908  * probably faster to always copy than check.
2909  */
2910  for (i = 0; i < ntuples; i++)
2911  tuples[i]->t_self = heaptuples[i]->t_self;
2912 
2913  pgstat_count_heap_insert(relation, ntuples);
2914 }
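
Editor's illustration (not part of heapam.c): heap_multi_insert combined with a BulkInsertState, roughly as COPY-style bulk loaders use it; the HEAP_INSERT_SKIP_FSM choice and the function name are assumptions for the sketch.

/* Hypothetical caller: bulk-load an array of already-built tuples. */
static void
bulk_load(Relation rel, HeapTuple *tuples, int ntuples)
{
	BulkInsertState bistate = GetBulkInsertState();

	heap_multi_insert(rel, tuples, ntuples,
					  GetCurrentCommandId(true),
					  HEAP_INSERT_SKIP_FSM,		/* keep extending fresh pages */
					  bistate);

	FreeBulkInsertState(bistate);	/* releases the kept pin and the strategy */
}
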
2915 
2916 /*
2917  * simple_heap_insert - insert a tuple
2918  *
2919  * Currently, this routine differs from heap_insert only in supplying
2920  * a default command ID and not allowing access to the speedup options.
2921  *
2922  * This should be used rather than using heap_insert directly in most places
2923  * where we are modifying system catalogs.
2924  */
2925 Oid
2926 simple_heap_insert(Relation relation, HeapTuple tup)
2927 {
2928  return heap_insert(relation, tup, GetCurrentCommandId(true), 0, NULL);
2929 }
2930 
2931 /*
2932  * Given infomask/infomask2, compute the bits that must be saved in the
2933  * "infobits" field of xl_heap_delete, xl_heap_update, xl_heap_lock,
2934  * xl_heap_lock_updated WAL records.
2935  *
2936  * See fix_infomask_from_infobits.
2937  */
2938 static uint8
2939 compute_infobits(uint16 infomask, uint16 infomask2)
2940 {
2941  return
2942  ((infomask & HEAP_XMAX_IS_MULTI) != 0 ? XLHL_XMAX_IS_MULTI : 0) |
2943  ((infomask & HEAP_XMAX_LOCK_ONLY) != 0 ? XLHL_XMAX_LOCK_ONLY : 0) |
2944  ((infomask & HEAP_XMAX_EXCL_LOCK) != 0 ? XLHL_XMAX_EXCL_LOCK : 0) |
2945  /* note we ignore HEAP_XMAX_SHR_LOCK here */
2946  ((infomask & HEAP_XMAX_KEYSHR_LOCK) != 0 ? XLHL_XMAX_KEYSHR_LOCK : 0) |
2947  ((infomask2 & HEAP_KEYS_UPDATED) != 0 ?
2948  XLHL_KEYS_UPDATED : 0);
2949 }
2950 
2951 /*
2952  * Given two versions of the same t_infomask for a tuple, compare them and
2953  * return whether the relevant status for a tuple Xmax has changed. This is
2954  * used after a buffer lock has been released and reacquired: we want to ensure
2955  * that the tuple state continues to be the same it was when we previously
2956  * examined it.
2957  *
2958  * Note the Xmax field itself must be compared separately.
2959  */
2960 static inline bool
2961 xmax_infomask_changed(uint16 new_infomask, uint16 old_infomask)
2962 {
2963  const uint16 interesting =
2964  HEAP_XMAX_IS_MULTI | HEAP_LOCK_MASK | HEAP_XMAX_COMMITTED | HEAP_XMAX_INVALID;
2965 
2966  if ((new_infomask & interesting) != (old_infomask & interesting))
2967  return true;
2968 
2969  return false;
2970 }
2971 
2972 /*
2973  * heap_delete - delete a tuple
2974  *
2975  * NB: do not call this directly unless you are prepared to deal with
2976  * concurrent-update conditions. Use simple_heap_delete instead.
2977  *
2978  * relation - table to be modified (caller must hold suitable lock)
2979  * tid - TID of tuple to be deleted
2980  * cid - delete command ID (used for visibility test, and stored into
2981  * cmax if successful)
2982  * crosscheck - if not InvalidSnapshot, also check tuple against this
2983  * wait - true if should wait for any conflicting update to commit/abort
2984  * hufd - output parameter, filled in failure cases (see below)
2985  *
2986  * Normal, successful return value is HeapTupleMayBeUpdated, which
2987  * actually means we did delete it. Failure return codes are
2988  * HeapTupleSelfUpdated, HeapTupleUpdated, or HeapTupleBeingUpdated
2989  * (the last only possible if wait == false).
2990  *
2991  * In the failure cases, the routine fills *hufd with the tuple's t_ctid,
2992  * t_xmax (resolving a possible MultiXact, if necessary), and t_cmax
2993  * (the last only for HeapTupleSelfUpdated, since we
2994  * cannot obtain cmax from a combocid generated by another transaction).
2995  * See comments for struct HeapUpdateFailureData for additional info.
2996  */
2997 HTSU_Result
2998 heap_delete(Relation relation, ItemPointer tid,
2999  CommandId cid, Snapshot crosscheck, bool wait,
3000  HeapUpdateFailureData *hufd)
3001 {
3002  HTSU_Result result;
3003  TransactionId xid = GetCurrentTransactionId();
3004  ItemId lp;
3005  HeapTupleData tp;
3006  Page page;
3007  BlockNumber block;
3008  Buffer buffer;
3009  Buffer vmbuffer = InvalidBuffer;
3010  TransactionId new_xmax;
3011  uint16 new_infomask,
3012  new_infomask2;
3013  bool have_tuple_lock = false;
3014  bool iscombo;
3015  bool all_visible_cleared = false;
3016  HeapTuple old_key_tuple = NULL; /* replica identity of the tuple */
3017  bool old_key_copied = false;
3018 
3019  Assert(ItemPointerIsValid(tid));
3020 
3021  /*
3022  * Forbid this during a parallel operation, lest it allocate a combocid.
3023  * Other workers might need that combocid for visibility checks, and we
3024  * have no provision for broadcasting it to them.
3025  */
3026  if (IsInParallelMode())
3027  ereport(ERROR,
3028  (errcode(ERRCODE_INVALID_TRANSACTION_STATE),
3029  errmsg("cannot delete tuples during a parallel operation")));
3030 
3031  block = ItemPointerGetBlockNumber(tid);
3032  buffer = ReadBuffer(relation, block);
3033  page = BufferGetPage(buffer);
3034 
3035  /*
3036  * Before locking the buffer, pin the visibility map page if it appears to
3037  * be necessary. Since we haven't got the lock yet, someone else might be
3038  * in the middle of changing this, so we'll need to recheck after we have
3039  * the lock.
3040  */
3041  if (PageIsAllVisible(page))
3042  visibilitymap_pin(relation, block, &vmbuffer);
3043 
3044  LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3045 
3046  /*
3047  * If we didn't pin the visibility map page and the page has become all
3048  * visible while we were busy locking the buffer, we'll have to unlock and
3049  * re-lock, to avoid holding the buffer lock across an I/O. That's a bit
3050  * unfortunate, but hopefully shouldn't happen often.
3051  */
3052  if (vmbuffer == InvalidBuffer && PageIsAllVisible(page))
3053  {
3054  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3055  visibilitymap_pin(relation, block, &vmbuffer);
3056  LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3057  }
3058 
3059  lp = PageGetItemId(page, ItemPointerGetOffsetNumber(tid));
3060  Assert(ItemIdIsNormal(lp));
3061 
3062  tp.t_tableOid = RelationGetRelid(relation);
3063  tp.t_data = (HeapTupleHeader) PageGetItem(page, lp);
3064  tp.t_len = ItemIdGetLength(lp);
3065  tp.t_self = *tid;
3066 
3067 l1:
3068  result = HeapTupleSatisfiesUpdate(&tp, cid, buffer);
3069 
3070  if (result == HeapTupleInvisible)
3071  {
3072  UnlockReleaseBuffer(buffer);
3073  ereport(ERROR,
3074  (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
3075  errmsg("attempted to delete invisible tuple")));
3076  }
3077  else if (result == HeapTupleBeingUpdated && wait)
3078  {
3079  TransactionId xwait;
3080  uint16 infomask;
3081 
3082  /* must copy state data before unlocking buffer */
3083  xwait = HeapTupleHeaderGetRawXmax(tp.t_data);
3084  infomask = tp.t_data->t_infomask;
3085 
3086  /*
3087  * Sleep until concurrent transaction ends -- except when there's a
3088  * single locker and it's our own transaction. Note we don't care
3089  * which lock mode the locker has, because we need the strongest one.
3090  *
3091  * Before sleeping, we need to acquire tuple lock to establish our
3092  * priority for the tuple (see heap_lock_tuple). LockTuple will
3093  * release us when we are next-in-line for the tuple.
3094  *
3095  * If we are forced to "start over" below, we keep the tuple lock;
3096  * this arranges that we stay at the head of the line while rechecking
3097  * tuple state.
3098  */
3099  if (infomask & HEAP_XMAX_IS_MULTI)
3100  {
3101  /* wait for multixact */
3102  if (DoesMultiXactIdConflict((MultiXactId) xwait, infomask,
3103  LockTupleExclusive))
3104  {
3105  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3106 
3107  /* acquire tuple lock, if necessary */
3108  heap_acquire_tuplock(relation, &(tp.t_self), LockTupleExclusive,
3109  LockWaitBlock, &have_tuple_lock);
3110 
3111  /* wait for multixact */
3112  MultiXactIdWait((MultiXactId) xwait, MultiXactStatusUpdate, infomask,
3113  relation, &(tp.t_self), XLTW_Delete,
3114  NULL);
3115  LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3116 
3117  /*
3118  * If xwait had just locked the tuple then some other xact
3119  * could update this tuple before we get to this point. Check
3120  * for xmax change, and start over if so.
3121  */
3122  if (xmax_infomask_changed(tp.t_data->t_infomask, infomask) ||
3123  !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tp.t_data),
3124  xwait))
3125  goto l1;
3126  }
3127 
3128  /*
3129  * You might think the multixact is necessarily done here, but not
3130  * so: it could have surviving members, namely our own xact or
3131  * other subxacts of this backend. It is legal for us to delete
3132  * the tuple in either case, however (the latter case is
3133  * essentially a situation of upgrading our former shared lock to
3134  * exclusive). We don't bother changing the on-disk hint bits
3135  * since we are about to overwrite the xmax altogether.
3136  */
3137  }
3138  else if (!TransactionIdIsCurrentTransactionId(xwait))
3139  {
3140  /*
3141  * Wait for regular transaction to end; but first, acquire tuple
3142  * lock.
3143  */
3144  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3145  heap_acquire_tuplock(relation, &(tp.t_self), LockTupleExclusive,
3146  LockWaitBlock, &have_tuple_lock);
3147  XactLockTableWait(xwait, relation, &(tp.t_self), XLTW_Delete);
3148  LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3149 
3150  /*
3151  * xwait is done, but if xwait had just locked the tuple then some
3152  * other xact could update this tuple before we get to this point.
3153  * Check for xmax change, and start over if so.
3154  */
3155  if (xmax_infomask_changed(tp.t_data->t_infomask, infomask) ||
3156  !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tp.t_data),
3157  xwait))
3158  goto l1;
3159 
3160  /* Otherwise check if it committed or aborted */
3161  UpdateXmaxHintBits(tp.t_data, buffer, xwait);
3162  }
3163 
3164  /*
3165  * We may overwrite if previous xmax aborted, or if it committed but
3166  * only locked the tuple without updating it.
3167  */
3168  if ((tp.t_data->t_infomask & HEAP_XMAX_INVALID) ||
3169  HEAP_XMAX_IS_LOCKED_ONLY(tp.t_data->t_infomask) ||
3170  HeapTupleHeaderIsOnlyLocked(tp.t_data))
3171  result = HeapTupleMayBeUpdated;
3172  else
3173  result = HeapTupleUpdated;
3174  }
3175 
3176  if (crosscheck != InvalidSnapshot && result == HeapTupleMayBeUpdated)
3177  {
3178  /* Perform additional check for transaction-snapshot mode RI updates */
3179  if (!HeapTupleSatisfiesVisibility(&tp, crosscheck, buffer))
3180  result = HeapTupleUpdated;
3181  }
3182 
3183  if (result != HeapTupleMayBeUpdated)
3184  {
3185  Assert(result == HeapTupleSelfUpdated ||
3186  result == HeapTupleUpdated ||
3187  result == HeapTupleBeingUpdated);
3188  Assert(!(tp.t_data->t_infomask & HEAP_XMAX_INVALID));
3189  hufd->ctid = tp.t_data->t_ctid;
3190  hufd->xmax = HeapTupleHeaderGetUpdateXid(tp.t_data);
3191  if (result == HeapTupleSelfUpdated)
3192  hufd->cmax = HeapTupleHeaderGetCmax(tp.t_data);
3193  else
3194  hufd->cmax = InvalidCommandId;
3195  UnlockReleaseBuffer(buffer);
3196  if (have_tuple_lock)
3197  UnlockTupleTuplock(relation, &(tp.t_self), LockTupleExclusive);
3198  if (vmbuffer != InvalidBuffer)
3199  ReleaseBuffer(vmbuffer);
3200  return result;
3201  }
3202 
3203  /*
3204  * We're about to do the actual delete -- check for conflict first, to
3205  * avoid possibly having to roll back work we've just done.
3206  *
3207  * This is safe without a recheck as long as there is no possibility of
3208  * another process scanning the page between this check and the delete
3209  * being visible to the scan (i.e., an exclusive buffer content lock is
3210  * continuously held from this point until the tuple delete is visible).
3211  */
3212  CheckForSerializableConflictIn(relation, &tp, buffer);
3213 
3214  /* replace cid with a combo cid if necessary */
3215  HeapTupleHeaderAdjustCmax(tp.t_data, &cid, &iscombo);
3216 
3217  /*
3218  * Compute replica identity tuple before entering the critical section so
3219  * we don't PANIC upon a memory allocation failure.
3220  */
3221  old_key_tuple = ExtractReplicaIdentity(relation, &tp, true, &old_key_copied);
3222 
3223  /*
3224  * If this is the first possibly-multixact-able operation in the current
3225  * transaction, set my per-backend OldestMemberMXactId setting. We can be
3226  * certain that the transaction will never become a member of any older
3227  * MultiXactIds than that. (We have to do this even if we end up just
3228  * using our own TransactionId below, since some other backend could
3229  * incorporate our XID into a MultiXact immediately afterwards.)
3230  */
3231  MultiXactIdSetOldestMember();
3232 
3233  compute_new_xmax_infomask(HeapTupleHeaderGetRawXmax(tp.t_data),
3234  tp.t_data->t_infomask, tp.t_data->t_infomask2,
3235  xid, LockTupleExclusive, true,
3236  &new_xmax, &new_infomask, &new_infomask2);
3237 
3238  START_CRIT_SECTION();
3239 
3240  /*
3241  * If this transaction commits, the tuple will become DEAD sooner or
3242  * later. Set flag that this page is a candidate for pruning once our xid
3243  * falls below the OldestXmin horizon. If the transaction finally aborts,
3244  * the subsequent page pruning will be a no-op and the hint will be
3245  * cleared.
3246  */
3247  PageSetPrunable(page, xid);
3248 
3249  if (PageIsAllVisible(page))
3250  {
3251  all_visible_cleared = true;
3252  PageClearAllVisible(page);
3253  visibilitymap_clear(relation, BufferGetBlockNumber(buffer),
3254  vmbuffer, VISIBILITYMAP_VALID_BITS);
3255  }
3256 
3257  /* store transaction information of xact deleting the tuple */
3258  tp.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
3259  tp.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
3260  tp.t_data->t_infomask |= new_infomask;
3261  tp.t_data->t_infomask2 |= new_infomask2;
3262  HeapTupleHeaderClearHotUpdated(tp.t_data);
3263  HeapTupleHeaderSetXmax(tp.t_data, new_xmax);
3264  HeapTupleHeaderSetCmax(tp.t_data, cid, iscombo);
3265  /* Make sure there is no forward chain link in t_ctid */
3266  tp.t_data->t_ctid = tp.t_self;
3267 
3268  MarkBufferDirty(buffer);
3269 
3270  /*
3271  * XLOG stuff
3272  *
3273  * NB: heap_abort_speculative() uses the same xlog record and replay
3274  * routines.
3275  */
3276  if (RelationNeedsWAL(relation))
3277  {
3278  xl_heap_delete xlrec;
3279  XLogRecPtr recptr;
3280 
3281  /* For logical decode we need combocids to properly decode the catalog */
3282  if (RelationIsAccessibleInLogicalDecoding(relation))
3283  log_heap_new_cid(relation, &tp);
3284 
3285  xlrec.flags = all_visible_cleared ? XLH_DELETE_ALL_VISIBLE_CLEARED : 0;
3286  xlrec.infobits_set = compute_infobits(tp.t_data->t_infomask,
3287  tp.t_data->t_infomask2);
3288  xlrec.offnum = ItemPointerGetOffsetNumber(&tp.t_self);
3289  xlrec.xmax = new_xmax;
3290 
3291  if (old_key_tuple != NULL)
3292  {
3293  if (relation->rd_rel->relreplident == REPLICA_IDENTITY_FULL)
3294  xlrec.flags |= XLH_DELETE_CONTAINS_OLD_TUPLE;
3295  else
3296  xlrec.flags |= XLH_DELETE_CONTAINS_OLD_KEY;
3297  }
3298 
3299  XLogBeginInsert();
3300  XLogRegisterData((char *) &xlrec, SizeOfHeapDelete);
3301 
3302  XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
3303 
3304  /*
3305  * Log replica identity of the deleted tuple if there is one
3306  */
3307  if (old_key_tuple != NULL)
3308  {
3309  xl_heap_header xlhdr;
3310 
3311  xlhdr.t_infomask2 = old_key_tuple->t_data->t_infomask2;
3312  xlhdr.t_infomask = old_key_tuple->t_data->t_infomask;
3313  xlhdr.t_hoff = old_key_tuple->t_data->t_hoff;
3314 
3315  XLogRegisterData((char *) &xlhdr, SizeOfHeapHeader);
3316  XLogRegisterData((char *) old_key_tuple->t_data
3317  + SizeofHeapTupleHeader,
3318  old_key_tuple->t_len
3319  - SizeofHeapTupleHeader);
3320  }
3321 
3322  /* filtering by origin on a row level is much more efficient */
3323  XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN);
3324 
3325  recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_DELETE);
3326 
3327  PageSetLSN(page, recptr);
3328  }
3329 
3330  END_CRIT_SECTION();
3331 
3332  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3333 
3334  if (vmbuffer != InvalidBuffer)
3335  ReleaseBuffer(vmbuffer);
3336 
3337  /*
3338  * If the tuple has toasted out-of-line attributes, we need to delete
3339  * those items too. We have to do this before releasing the buffer
3340  * because we need to look at the contents of the tuple, but it's OK to
3341  * release the content lock on the buffer first.
3342  */
3343  if (relation->rd_rel->relkind != RELKIND_RELATION &&
3344  relation->rd_rel->relkind != RELKIND_MATVIEW)
3345  {
3346  /* toast table entries should never be recursively toasted */
3347  Assert(!HeapTupleHasExternal(&tp));
3348  }
3349  else if (HeapTupleHasExternal(&tp))
3350  toast_delete(relation, &tp, false);
3351 
3352  /*
3353  * Mark tuple for invalidation from system caches at next command
3354  * boundary. We have to do this before releasing the buffer because we
3355  * need to look at the contents of the tuple.
3356  */
3357  CacheInvalidateHeapTuple(relation, &tp, NULL);
3358 
3359  /* Now we can release the buffer */
3360  ReleaseBuffer(buffer);
3361 
3362  /*
3363  * Release the lmgr tuple lock, if we had it.
3364  */
3365  if (have_tuple_lock)
3366  UnlockTupleTuplock(relation, &(tp.t_self), LockTupleExclusive);
3367 
3368  pgstat_count_heap_delete(relation);
3369 
3370  if (old_key_tuple != NULL && old_key_copied)
3371  heap_freetuple(old_key_tuple);
3372 
3373  return HeapTupleMayBeUpdated;
3374 }
3375 
3376 /*
3377  * simple_heap_delete - delete a tuple
3378  *
3379  * This routine may be used to delete a tuple when concurrent updates of
3380  * the target tuple are not expected (for example, because we have a lock
3381  * on the relation associated with the tuple). Any failure is reported
3382  * via ereport().
3383  */
3384 void
3385 simple_heap_delete(Relation relation, ItemPointer tid)
3386 {
3387  HTSU_Result result;
3388  HeapUpdateFailureData hufd;
3389 
3390  result = heap_delete(relation, tid,
3391  GetCurrentCommandId(true), InvalidSnapshot,
3392  true /* wait for commit */ ,
3393  &hufd);
3394  switch (result)
3395  {
3396  case HeapTupleSelfUpdated:
3397  /* Tuple was already updated in current command? */
3398  elog(ERROR, "tuple already updated by self");
3399  break;
3400 
3401  case HeapTupleMayBeUpdated:
3402  /* done successfully */
3403  break;
3404 
3405  case HeapTupleUpdated:
3406  elog(ERROR, "tuple concurrently updated");
3407  break;
3408 
3409  default:
3410  elog(ERROR, "unrecognized heap_delete status: %u", result);
3411  break;
3412  }
3413 }
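
Editor's illustration (not part of heapam.c): simple_heap_delete driven from a scan, the usual pattern when no concurrent updates of the target rows are expected; the function name is an assumption for the sketch.

/* Hypothetical caller: delete every tuple a scan returns. */
static void
delete_all_returned(Relation rel, HeapScanDesc scan)
{
	HeapTuple	tuple;

	while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
	{
		/* concurrent updates are not expected, so the simple form suffices */
		simple_heap_delete(rel, &tuple->t_self);
	}
}
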
3414 
3415 /*
3416  * heap_update - replace a tuple
3417  *
3418  * NB: do not call this directly unless you are prepared to deal with
3419  * concurrent-update conditions. Use simple_heap_update instead.
3420  *
3421  * relation - table to be modified (caller must hold suitable lock)
3422  * otid - TID of old tuple to be replaced
3423  * newtup - newly constructed tuple data to store
3424  * cid - update command ID (used for visibility test, and stored into
3425  * cmax/cmin if successful)
3426  * crosscheck - if not InvalidSnapshot, also check old tuple against this
3427  * wait - true if should wait for any conflicting update to commit/abort
3428  * hufd - output parameter, filled in failure cases (see below)
3429  * lockmode - output parameter, filled with lock mode acquired on tuple
3430  *
3431  * Normal, successful return value is HeapTupleMayBeUpdated, which
3432  * actually means we *did* update it. Failure return codes are
3433  * HeapTupleSelfUpdated, HeapTupleUpdated, or HeapTupleBeingUpdated
3434  * (the last only possible if wait == false).
3435  *
3436  * On success, the header fields of *newtup are updated to match the new
3437  * stored tuple; in particular, newtup->t_self is set to the TID where the
3438  * new tuple was inserted, and its HEAP_ONLY_TUPLE flag is set iff a HOT
3439  * update was done. However, any TOAST changes in the new tuple's
3440  * data are not reflected into *newtup.
3441  *
3442  * In the failure cases, the routine fills *hufd with the tuple's t_ctid,
3443  * t_xmax (resolving a possible MultiXact, if necessary), and t_cmax
3444  * (the last only for HeapTupleSelfUpdated, since we
3445  * cannot obtain cmax from a combocid generated by another transaction).
3446  * See comments for struct HeapUpdateFailureData for additional info.
3447  */
3448 HTSU_Result
3449 heap_update(Relation relation, ItemPointer otid, HeapTuple newtup,
3450  CommandId cid, Snapshot crosscheck, bool wait,
3451  HeapUpdateFailureData *hufd, LockTupleMode *lockmode)
3452 {
3453  HTSU_Result result;
3454  TransactionId xid = GetCurrentTransactionId();
3455  Bitmapset *hot_attrs;
3456  Bitmapset *key_attrs;
3457  Bitmapset *id_attrs;
3458  ItemId lp;
3459  HeapTupleData oldtup;
3460  HeapTuple heaptup;
3461  HeapTuple old_key_tuple = NULL;
3462  bool old_key_copied = false;
3463  Page page;
3464  BlockNumber block;
3465  MultiXactStatus mxact_status;
3466  Buffer buffer,
3467  newbuf,
3468  vmbuffer = InvalidBuffer,
3469  vmbuffer_new = InvalidBuffer;
3470  bool need_toast;
3471  Size newtupsize,
3472  pagefree;
3473  bool have_tuple_lock = false;
3474  bool iscombo;
3475  bool satisfies_hot;
3476  bool satisfies_key;
3477  bool satisfies_id;
3478  bool use_hot_update = false;
3479  bool key_intact;
3480  bool all_visible_cleared = false;
3481  bool all_visible_cleared_new = false;
3482  bool checked_lockers;
3483  bool locker_remains;
3484  TransactionId xmax_new_tuple,
3485  xmax_old_tuple;
3486  uint16 infomask_old_tuple,
3487  infomask2_old_tuple,
3488  infomask_new_tuple,
3489  infomask2_new_tuple;
3490 
3491  Assert(ItemPointerIsValid(otid));
3492 
3493  /*
3494  * Forbid this during a parallel operation, lest it allocate a combocid.
3495  * Other workers might need that combocid for visibility checks, and we
3496  * have no provision for broadcasting it to them.
3497  */
3498  if (IsInParallelMode())
3499  ereport(ERROR,
3500  (errcode(ERRCODE_INVALID_TRANSACTION_STATE),
3501  errmsg("cannot update tuples during a parallel operation")));
3502 
3503  /*
3504  * Fetch the list of attributes to be checked for HOT update. This is
3505  * wasted effort if we fail to update or have to put the new tuple on a
3506  * different page. But we must compute the list before obtaining buffer
3507  * lock --- in the worst case, if we are doing an update on one of the
3508  * relevant system catalogs, we could deadlock if we try to fetch the list
3509  * later. In any case, the relcache caches the data so this is usually
3510  * pretty cheap.
3511  *
3512  * Note that we get a copy here, so we need not worry about relcache flush
3513  * happening midway through.
3514  */
3515  hot_attrs = RelationGetIndexAttrBitmap(relation, INDEX_ATTR_BITMAP_ALL);
3516  key_attrs = RelationGetIndexAttrBitmap(relation, INDEX_ATTR_BITMAP_KEY);
3517  id_attrs = RelationGetIndexAttrBitmap(relation,
3518  INDEX_ATTR_BITMAP_IDENTITY_KEY);
3519 
3520  block = ItemPointerGetBlockNumber(otid);
3521  buffer = ReadBuffer(relation, block);
3522  page = BufferGetPage(buffer);
3523 
3524  /*
3525  * Before locking the buffer, pin the visibility map page if it appears to
3526  * be necessary. Since we haven't got the lock yet, someone else might be
3527  * in the middle of changing this, so we'll need to recheck after we have
3528  * the lock.
3529  */
3530  if (PageIsAllVisible(page))
3531  visibilitymap_pin(relation, block, &vmbuffer);
3532 
3533  LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3534 
3535  lp = PageGetItemId(page, ItemPointerGetOffsetNumber(otid));
3536  Assert(ItemIdIsNormal(lp));
3537 
3538  /*
3539  * Fill in enough data in oldtup for HeapSatisfiesHOTandKeyUpdate to work
3540  * properly.
3541  */
3542  oldtup.t_tableOid = RelationGetRelid(relation);
3543  oldtup.t_data = (HeapTupleHeader) PageGetItem(page, lp);
3544  oldtup.t_len = ItemIdGetLength(lp);
3545  oldtup.t_self = *otid;
3546 
3547  /* the new tuple is ready, except for this: */
3548  newtup->t_tableOid = RelationGetRelid(relation);
3549 
3550  /* Fill in OID for newtup */
3551  if (relation->rd_rel->relhasoids)
3552  {
3553 #ifdef NOT_USED
3554  /* this is redundant with an Assert in HeapTupleSetOid */
3555  Assert(newtup->t_data->t_infomask & HEAP_HASOID);
3556 #endif
3557  HeapTupleSetOid(newtup, HeapTupleGetOid(&oldtup));
3558  }
3559  else
3560  {
3561  /* check that there is no space for an OID */
3562  Assert(!(newtup->t_data->t_infomask & HEAP_HASOID));
3563  }
3564 
3565  /*
3566  * If we're not updating any "key" column, we can grab a weaker lock type.
3567  * This allows for more concurrency when we are running simultaneously
3568  * with foreign key checks.
3569  *
3570  * Note that if a column gets detoasted while executing the update, but
3571  * the value ends up being the same, this test will fail and we will use
3572  * the stronger lock. This is acceptable; the important case to optimize
3573  * is updates that don't manipulate key columns, not those that
3574  * serendipitously arrive at the same key values.
3575  */
3576  HeapSatisfiesHOTandKeyUpdate(relation, hot_attrs, key_attrs, id_attrs,
3577  &satisfies_hot, &satisfies_key,
3578  &satisfies_id, &oldtup, newtup);
3579  if (satisfies_key)
3580  {
3581  *lockmode = LockTupleNoKeyExclusive;
3582  mxact_status = MultiXactStatusNoKeyUpdate;
3583  key_intact = true;
3584 
3585  /*
3586  * If this is the first possibly-multixact-able operation in the
3587  * current transaction, set my per-backend OldestMemberMXactId
3588  * setting. We can be certain that the transaction will never become a
3589  * member of any older MultiXactIds than that. (We have to do this
3590  * even if we end up just using our own TransactionId below, since
3591  * some other backend could incorporate our XID into a MultiXact
3592  * immediately afterwards.)
3593  */
3594  MultiXactIdSetOldestMember();
3595  }
3596  else
3597  {
3598  *lockmode = LockTupleExclusive;
3599  mxact_status = MultiXactStatusUpdate;
3600  key_intact = false;
3601  }
3602 
3603  /*
3604  * Note: beyond this point, use oldtup not otid to refer to old tuple.
3605  * otid may very well point at newtup->t_self, which we will overwrite
3606  * with the new tuple's location, so there's great risk of confusion if we
3607  * use otid anymore.
3608  */
3609 
3610 l2:
3611  checked_lockers = false;
3612  locker_remains = false;
3613  result = HeapTupleSatisfiesUpdate(&oldtup, cid, buffer);
3614 
3615  /* see below about the "no wait" case */
3616  Assert(result != HeapTupleBeingUpdated || wait);
3617 
3618  if (result == HeapTupleInvisible)
3619  {
3620  UnlockReleaseBuffer(buffer);
3621  ereport(ERROR,
3622  (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
3623  errmsg("attempted to update invisible tuple")));
3624  }
3625  else if (result == HeapTupleBeingUpdated && wait)
3626  {
3627  TransactionId xwait;
3628  uint16 infomask;
3629  bool can_continue = false;
3630 
3631  /*
3632  * XXX note that we don't consider the "no wait" case here. This
3633  * isn't a problem currently because no caller uses that case, but it
3634  * should be fixed if such a caller is introduced. It wasn't a
3635  * problem previously because this code would always wait, but now
3636  * that some tuple locks do not conflict with one of the lock modes we
3637  * use, it is possible that this case is interesting to handle
3638  * specially.
3639  *
3640  * This may cause failures with third-party code that calls
3641  * heap_update directly.
3642  */
3643 
3644  /* must copy state data before unlocking buffer */
3645  xwait = HeapTupleHeaderGetRawXmax(oldtup.t_data);
3646  infomask = oldtup.t_data->t_infomask;
3647 
3648  /*
3649  * Now we have to do something about the existing locker. If it's a
3650  * multi, sleep on it; we might be awakened before it is completely
3651  * gone (or even not sleep at all in some cases); we need to preserve
3652  * it as locker, unless it is gone completely.
3653  *
3654  * If it's not a multi, we need to check for sleeping conditions
3655  * before actually going to sleep. If the update doesn't conflict
3656  * with the locks, we just continue without sleeping (but making sure
3657  * it is preserved).
3658  *
3659  * Before sleeping, we need to acquire tuple lock to establish our
3660  * priority for the tuple (see heap_lock_tuple). LockTuple will
3661  * release us when we are next-in-line for the tuple. Note we must
3662  * not acquire the tuple lock until we're sure we're going to sleep;
3663  * otherwise we're open for race conditions with other transactions
3664  * holding the tuple lock which sleep on us.
3665  *
3666  * If we are forced to "start over" below, we keep the tuple lock;
3667  * this arranges that we stay at the head of the line while rechecking
3668  * tuple state.
3669  */
3670  if (infomask & HEAP_XMAX_IS_MULTI)
3671  {
3672  TransactionId update_xact;
3673  int remain;
3674 
3675  if (DoesMultiXactIdConflict((MultiXactId) xwait, infomask,
3676  *lockmode))
3677  {
3678  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3679 
3680  /* acquire tuple lock, if necessary */
3681  heap_acquire_tuplock(relation, &(oldtup.t_self), *lockmode,
3682  LockWaitBlock, &have_tuple_lock);
3683 
3684  /* wait for multixact */
3685  MultiXactIdWait((MultiXactId) xwait, mxact_status, infomask,
3686  relation, &oldtup.t_self, XLTW_Update,
3687  &remain);
3688  checked_lockers = true;
3689  locker_remains = remain != 0;
3690  LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3691 
3692  /*
3693  * If xwait had just locked the tuple then some other xact
3694  * could update this tuple before we get to this point. Check
3695  * for xmax change, and start over if so.
3696  */
3697  if (xmax_infomask_changed(oldtup.t_data->t_infomask,
3698  infomask) ||
3699  !TransactionIdEquals(HeapTupleHeaderGetRawXmax(oldtup.t_data),
3700  xwait))
3701  goto l2;
3702  }
3703 
3704  /*
3705  * Note that the multixact may not be done by now. It could have
3706  * surviving members; our own xact or other subxacts of this
3707  * backend, and also any other concurrent transaction that locked
3708  * the tuple with KeyShare if we only got TupleLockUpdate. If
3709  * this is the case, we have to be careful to mark the updated
3710  * tuple with the surviving members in Xmax.
3711  *
3712  * Note that there could have been another update in the
3713  * MultiXact. In that case, we need to check whether it committed
3714  * or aborted. If it aborted we are safe to update it again;
3715  * otherwise there is an update conflict, and we have to return
3716  * HeapTupleUpdated below.
3717  *
3718  * In the LockTupleExclusive case, we still need to preserve the
3719  * surviving members: those would include the tuple locks we had
3720  * before this one, which are important to keep in case this
3721  * subxact aborts.
3722  */
3723  if (!HEAP_XMAX_IS_LOCKED_ONLY(oldtup.t_data->t_infomask))
3724  update_xact = HeapTupleGetUpdateXid(oldtup.t_data);
3725  else
3726  update_xact = InvalidTransactionId;
3727 
3728  /*
3729  * There was no UPDATE in the MultiXact; or it aborted. No
3730  * TransactionIdIsInProgress() call needed here, since we called
3731  * MultiXactIdWait() above.
3732  */
3733  if (!TransactionIdIsValid(update_xact) ||
3734  TransactionIdDidAbort(update_xact))
3735  can_continue = true;
3736  }
3737  else if (TransactionIdIsCurrentTransactionId(xwait))
3738  {
3739  /*
3740  * The only locker is ourselves; we can avoid grabbing the tuple
3741  * lock here, but must preserve our locking information.
3742  */
3743  checked_lockers = true;
3744  locker_remains = true;
3745  can_continue = true;
3746  }
3747  else if (HEAP_XMAX_IS_KEYSHR_LOCKED(infomask) && key_intact)
3748  {
3749  /*
3750  * If it's just a key-share locker, and we're not changing the key
3751  * columns, we don't need to wait for it to end; but we need to
3752  * preserve it as locker.
3753  */
3754  checked_lockers = true;
3755  locker_remains = true;
3756  can_continue = true;
3757  }
3758  else
3759  {
3760  /*
3761  * Wait for regular transaction to end; but first, acquire tuple
3762  * lock.
3763  */
3764  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3765  heap_acquire_tuplock(relation, &(oldtup.t_self), *lockmode,
3766  LockWaitBlock, &have_tuple_lock);
3767  XactLockTableWait(xwait, relation, &oldtup.t_self,
3768  XLTW_Update);
3769  checked_lockers = true;
3770  LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3771 
3772  /*
3773  * xwait is done, but if xwait had just locked the tuple then some
3774  * other xact could update this tuple before we get to this point.
3775  * Check for xmax change, and start over if so.
3776  */
3777  if (xmax_infomask_changed(oldtup.t_data->t_infomask, infomask) ||
3778  !TransactionIdEquals(xwait,
3779  HeapTupleHeaderGetRawXmax(oldtup.t_data)))
3780  goto l2;
3781 
3782  /* Otherwise check if it committed or aborted */
3783  UpdateXmaxHintBits(oldtup.t_data, buffer, xwait);
3784  if (oldtup.t_data->t_infomask & HEAP_XMAX_INVALID)
3785  can_continue = true;
3786  }
3787 
3788  result = can_continue ? HeapTupleMayBeUpdated : HeapTupleUpdated;
3789  }
3790 
3791  if (crosscheck != InvalidSnapshot && result == HeapTupleMayBeUpdated)
3792  {
3793  /* Perform additional check for transaction-snapshot mode RI updates */
3794  if (!HeapTupleSatisfiesVisibility(&oldtup, crosscheck, buffer))
3795  result = HeapTupleUpdated;
3796  }
3797 
3798  if (result != HeapTupleMayBeUpdated)
3799  {
3800  Assert(result == HeapTupleSelfUpdated ||
3801  result == HeapTupleUpdated ||
3802  result == HeapTupleBeingUpdated);
3803  Assert(!(oldtup.t_data->t_infomask & HEAP_XMAX_INVALID));
3804  hufd->ctid = oldtup.t_data->t_ctid;
3805  hufd->xmax = HeapTupleHeaderGetUpdateXid(oldtup.t_data);
3806  if (result == HeapTupleSelfUpdated)
3807  hufd->cmax = HeapTupleHeaderGetCmax(oldtup.t_data);
3808  else
3809  hufd->cmax = InvalidCommandId;
3810  UnlockReleaseBuffer(buffer);
3811  if (have_tuple_lock)
3812  UnlockTupleTuplock(relation, &(oldtup.t_self), *lockmode);
3813  if (vmbuffer != InvalidBuffer)
3814  ReleaseBuffer(vmbuffer);
3815  bms_free(hot_attrs);
3816  bms_free(key_attrs);
3817  bms_free(id_attrs);
3818  return result;
3819  }
3820 
3821  /*
3822  * If we didn't pin the visibility map page and the page has become all
3823  * visible while we were busy locking the buffer, or during some
3824  * subsequent window during which we had it unlocked, we'll have to unlock
3825  * and re-lock, to avoid holding the buffer lock across an I/O. That's a
3826  * bit unfortunate, especially since we'll now have to recheck whether the
3827  * tuple has been locked or updated under us, but hopefully it won't
3828  * happen very often.
3829  */
3830  if (vmbuffer == InvalidBuffer && PageIsAllVisible(page))
3831  {
3832  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3833  visibilitymap_pin(relation, block, &vmbuffer);
3834  LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3835  goto l2;
3836  }
3837 
3838  /* Fill in transaction status data */
3839 
3840  /*
3841  * If the tuple we're updating is locked, we need to preserve the locking
3842  * info in the old tuple's Xmax. Prepare a new Xmax value for this.
3843  */
3844  compute_new_xmax_infomask(HeapTupleHeaderGetRawXmax(oldtup.t_data),
3845  oldtup.t_data->t_infomask,
3846  oldtup.t_data->t_infomask2,
3847  xid, *lockmode, true,
3848  &xmax_old_tuple, &infomask_old_tuple,
3849  &infomask2_old_tuple);
3850 
3851  /*
3852  * And also prepare an Xmax value for the new copy of the tuple. If there
3853  * was no xmax previously, or there was one but all lockers are now gone,
3854  * then use InvalidXid; otherwise, get the xmax from the old tuple. (In
3855  * rare cases that might also be InvalidXid and yet not have the
3856  * HEAP_XMAX_INVALID bit set; that's fine.)
3857  */
3858  if ((oldtup.t_data->t_infomask & HEAP_XMAX_INVALID) ||
3860  (checked_lockers && !locker_remains))
3861  xmax_new_tuple = InvalidTransactionId;
3862  else
3863  xmax_new_tuple = HeapTupleHeaderGetRawXmax(oldtup.t_data);
3864 
3865  if (!TransactionIdIsValid(xmax_new_tuple))
3866  {
3867  infomask_new_tuple = HEAP_XMAX_INVALID;
3868  infomask2_new_tuple = 0;
3869  }
3870  else
3871  {
3872  /*
3873  * If we found a valid Xmax for the new tuple, then the infomask bits
3874  * to use on the new tuple depend on what was there on the old one.
3875  * Note that since we're doing an update, the only possibility is that
3876  * the lockers had FOR KEY SHARE lock.
3877  */
3878  if (oldtup.t_data->t_infomask & HEAP_XMAX_IS_MULTI)
3879  {
3880  GetMultiXactIdHintBits(xmax_new_tuple, &infomask_new_tuple,
3881  &infomask2_new_tuple);
3882  }
3883  else
3884  {
3885  infomask_new_tuple = HEAP_XMAX_KEYSHR_LOCK | HEAP_XMAX_LOCK_ONLY;
3886  infomask2_new_tuple = 0;
3887  }
3888  }
3889 
3890  /*
3891  * Prepare the new tuple with the appropriate initial values of Xmin and
3892  * Xmax, as well as initial infomask bits as computed above.
3893  */
3894  newtup->t_data->t_infomask &= ~(HEAP_XACT_MASK);
3895  newtup->t_data->t_infomask2 &= ~(HEAP2_XACT_MASK);
3896  HeapTupleHeaderSetXmin(newtup->t_data, xid);
3897  HeapTupleHeaderSetCmin(newtup->t_data, cid);
3898  newtup->t_data->t_infomask |= HEAP_UPDATED | infomask_new_tuple;
3899  newtup->t_data->t_infomask2 |= infomask2_new_tuple;
3900  HeapTupleHeaderSetXmax(newtup->t_data, xmax_new_tuple);
3901 
3902  /*
3903  * Replace cid with a combo cid if necessary. Note that we already put
3904  * the plain cid into the new tuple.
3905  */
3906  HeapTupleHeaderAdjustCmax(oldtup.t_data, &cid, &iscombo);
3907 
3908  /*
3909  * If the toaster needs to be activated, OR if the new tuple will not fit
3910  * on the same page as the old, then we need to release the content lock
3911  * (but not the pin!) on the old tuple's buffer while we are off doing
3912  * TOAST and/or table-file-extension work. We must mark the old tuple to
3913  * show that it's locked, else other processes may try to update it
3914  * themselves.
3915  *
3916  * We need to invoke the toaster if there are already any out-of-line
3917  * toasted values present, or if the new tuple is over-threshold.
3918  */
3919  if (relation->rd_rel->relkind != RELKIND_RELATION &&
3920  relation->rd_rel->relkind != RELKIND_MATVIEW)
3921  {
3922  /* toast table entries should never be recursively toasted */
3923  Assert(!HeapTupleHasExternal(&oldtup));
3924  Assert(!HeapTupleHasExternal(newtup));
3925  need_toast = false;
3926  }
3927  else
3928  need_toast = (HeapTupleHasExternal(&oldtup) ||
3929  HeapTupleHasExternal(newtup) ||
3930  newtup->t_len > TOAST_TUPLE_THRESHOLD);
3931 
3932  pagefree = PageGetHeapFreeSpace(page);
3933 
3934  newtupsize = MAXALIGN(newtup->t_len);
3935 
3936  if (need_toast || newtupsize > pagefree)
3937  {
3938  TransactionId xmax_lock_old_tuple;
3939  uint16 infomask_lock_old_tuple,
3940  infomask2_lock_old_tuple;
3941  bool cleared_all_frozen = false;
3942 
3943  /*
3944  * To prevent concurrent sessions from updating the tuple, we have to
3945  * temporarily mark it locked, while we release the lock.
3946  *
3947  * To satisfy the rule that any xid potentially appearing in a buffer
3948  * written out to disk must first be WAL-logged, we unfortunately have to
3949  * WAL log this temporary modification. We can reuse xl_heap_lock for this
3950  * purpose. If we crash/error before following through with the
3951  * actual update, xmax will be of an aborted transaction, allowing
3952  * other sessions to proceed.
3953  */
3954 
3955  /*
3956  * Compute xmax / infomask appropriate for locking the tuple. This has
3957  * to be done separately from the lock, because the potentially
3958  * created multixact would otherwise be wrong.
3959  */
3960  compute_new_xmax_infomask(HeapTupleHeaderGetRawXmax(oldtup.t_data),
3961  oldtup.t_data->t_infomask,
3962  oldtup.t_data->t_infomask2,
3963  xid, *lockmode, false,
3964  &xmax_lock_old_tuple, &infomask_lock_old_tuple,
3965  &infomask2_lock_old_tuple);
3966 
3967  Assert(HEAP_XMAX_IS_LOCKED_ONLY(infomask_lock_old_tuple));
3968 
3969  START_CRIT_SECTION();
3970 
3971  /* Clear obsolete visibility flags ... */
3972  oldtup.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
3973  oldtup.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
3974  HeapTupleClearHotUpdated(&oldtup);
3975  /* ... and store info about transaction updating this tuple */
3976  Assert(TransactionIdIsValid(xmax_lock_old_tuple));
3977  HeapTupleHeaderSetXmax(oldtup.t_data, xmax_lock_old_tuple);
3978  oldtup.t_data->t_infomask |= infomask_lock_old_tuple;
3979  oldtup.t_data->t_infomask2 |= infomask2_lock_old_tuple;
3980  HeapTupleHeaderSetCmax(oldtup.t_data, cid, iscombo);
3981 
3982  /* temporarily make it look not-updated, but locked */
3983  oldtup.t_data->t_ctid = oldtup.t_self;
3984 
3985  /*
3986  * Clear all-frozen bit on visibility map if needed. We could
3987  * immediately reset ALL_VISIBLE, but given that the WAL logging
3988  * overhead would be unchanged, that doesn't seem necessarily
3989  * worthwhile.
3990  */
3991  if (PageIsAllVisible(BufferGetPage(buffer)) &&
3992  visibilitymap_clear(relation, block, vmbuffer,
3993  VISIBILITYMAP_ALL_FROZEN))
3994  cleared_all_frozen = true;
3995 
3996  MarkBufferDirty(buffer);
3997 
3998  if (RelationNeedsWAL(relation))
3999  {
4000  xl_heap_lock xlrec;
4001  XLogRecPtr recptr;
4002 
4003  XLogBeginInsert();
4004  XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
4005 
4006  xlrec.offnum = ItemPointerGetOffsetNumber(&oldtup.t_self);
4007  xlrec.locking_xid = xmax_lock_old_tuple;
4008  xlrec.infobits_set = compute_infobits(oldtup.t_data->t_infomask,
4009  oldtup.t_data->t_infomask2);
4010  xlrec.flags =
4011  cleared_all_frozen ? XLH_LOCK_ALL_FROZEN_CLEARED : 0;
4012  XLogRegisterData((char *) &xlrec, SizeOfHeapLock);
4013  recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_LOCK);
4014  PageSetLSN(page, recptr);
4015  }
4016 
4017  END_CRIT_SECTION();
4018 
4019  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
4020 
4021  /*
4022  * Let the toaster do its thing, if needed.
4023  *
4024  * Note: below this point, heaptup is the data we actually intend to
4025  * store into the relation; newtup is the caller's original untoasted
4026  * data.
4027  */
4028  if (need_toast)
4029  {
4030  /* Note we always use WAL and FSM during updates */
4031  heaptup = toast_insert_or_update(relation, newtup, &oldtup, 0);
4032  newtupsize = MAXALIGN(heaptup->t_len);
4033  }
4034  else
4035  heaptup = newtup;
4036 
4037  /*
4038  * Now, do we need a new page for the tuple, or not? This is a bit
4039  * tricky since someone else could have added tuples to the page while
4040  * we weren't looking. We have to recheck the available space after
4041  * reacquiring the buffer lock. But don't bother to do that if the
4042  * former amount of free space is still not enough; it's unlikely
4043  * there's more free now than before.
4044  *
4045  * What's more, if we need to get a new page, we will need to acquire
4046  * buffer locks on both old and new pages. To avoid deadlock against
4047  * some other backend trying to get the same two locks in the other
4048  * order, we must be consistent about the order we get the locks in.
4049  * We use the rule "lock the lower-numbered page of the relation
4050  * first". To implement this, we must do RelationGetBufferForTuple
4051  * while not holding the lock on the old page, and we must rely on it
4052  * to get the locks on both pages in the correct order.
4053  */
4054  if (newtupsize > pagefree)
4055  {
4056  /* Assume there's no chance to put heaptup on same page. */
4057  newbuf = RelationGetBufferForTuple(relation, heaptup->t_len,
4058  buffer, 0, NULL,
4059  &vmbuffer_new, &vmbuffer);
4060  }
4061  else
4062  {
4063  /* Re-acquire the lock on the old tuple's page. */
4064  LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
4065  /* Re-check using the up-to-date free space */
4066  pagefree = PageGetHeapFreeSpace(page);
4067  if (newtupsize > pagefree)
4068  {
4069  /*
4070  * Rats, it doesn't fit anymore. We must now unlock and
4071  * relock to avoid deadlock. Fortunately, this path should
4072  * seldom be taken.
4073  */
4074  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
4075  newbuf = RelationGetBufferForTuple(relation, heaptup->t_len,
4076  buffer, 0, NULL,
4077  &vmbuffer_new, &vmbuffer);
4078  }
4079  else
4080  {
4081  /* OK, it fits here, so we're done. */
4082  newbuf = buffer;
4083  }
4084  }
4085  }
4086  else
4087  {
4088  /* No TOAST work needed, and it'll fit on same page */
4089  newbuf = buffer;
4090  heaptup = newtup;
4091  }
4092 
4093  /*
4094  * We're about to do the actual update -- check for conflict first, to
4095  * avoid possibly having to roll back work we've just done.
4096  *
4097  * This is safe without a recheck as long as there is no possibility of
4098  * another process scanning the pages between this check and the update
4099  * being visible to the scan (i.e., exclusive buffer content lock(s) are
4100  * continuously held from this point until the tuple update is visible).
4101  *
4102  * For the new tuple the only check needed is at the relation level, but
4103  * since both tuples are in the same relation and the check for oldtup
4104  * will include checking the relation level, there is no benefit to a
4105  * separate check for the new tuple.
4106  */
4107  CheckForSerializableConflictIn(relation, &oldtup, buffer);
4108 
4109  /*
4110  * At this point newbuf and buffer are both pinned and locked, and newbuf
4111  * has enough space for the new tuple. If they are the same buffer, only
4112  * one pin is held.
4113  */
4114 
4115  if (newbuf == buffer)
4116  {
4117  /*
4118  * Since the new tuple is going into the same page, we might be able
4119  * to do a HOT update. Check if any of the index columns have been
4120  * changed. If not, then HOT update is possible.
4121  */
4122  if (satisfies_hot)
4123  use_hot_update = true;
4124  }
4125  else
4126  {
4127  /* Set a hint that the old page could use prune/defrag */
4128  PageSetFull(page);
4129  }
4130 
4131  /*
4132  * Compute replica identity tuple before entering the critical section so
4133  * we don't PANIC upon a memory allocation failure.
4134  * ExtractReplicaIdentity() will return NULL if nothing needs to be
4135  * logged.
4136  */
4137  old_key_tuple = ExtractReplicaIdentity(relation, &oldtup, !satisfies_id, &old_key_copied);
4138 
4139  /* NO EREPORT(ERROR) from here till changes are logged */
4140  START_CRIT_SECTION();
4141 
4142  /*
4143  * If this transaction commits, the old tuple will become DEAD sooner or
4144  * later. Set flag that this page is a candidate for pruning once our xid
4145  * falls below the OldestXmin horizon. If the transaction finally aborts,
4146  * the subsequent page pruning will be a no-op and the hint will be
4147  * cleared.
4148  *
4149  * XXX Should we set hint on newbuf as well? If the transaction aborts,
4150  * there would be a prunable tuple in the newbuf; but for now we choose
4151  * not to optimize for aborts. Note that heap_xlog_update must be kept in
4152  * sync if this decision changes.
4153  */
4154  PageSetPrunable(page, xid);
4155 
4156  if (use_hot_update)
4157  {
4158  /* Mark the old tuple as HOT-updated */
4159  HeapTupleSetHotUpdated(&oldtup);
4160  /* And mark the new tuple as heap-only */
4161  HeapTupleSetHeapOnly(heaptup);
4162  /* Mark the caller's copy too, in case different from heaptup */
4163  HeapTupleSetHeapOnly(newtup);
4164  }
4165  else
4166  {
4167  /* Make sure tuples are correctly marked as not-HOT */
4168  HeapTupleClearHotUpdated(&oldtup);
4169  HeapTupleClearHeapOnly(heaptup);
4170  HeapTupleClearHeapOnly(newtup);
4171  }
4172 
4173  RelationPutHeapTuple(relation, newbuf, heaptup, false); /* insert new tuple */
4174 
4175 
4176  /* Clear obsolete visibility flags, possibly set by ourselves above... */
4177  oldtup.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
4178  oldtup.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
4179  /* ... and store info about transaction updating this tuple */
4180  Assert(TransactionIdIsValid(xmax_old_tuple));
4181  HeapTupleHeaderSetXmax(oldtup.t_data, xmax_old_tuple);
4182  oldtup.t_data->t_infomask |= infomask_old_tuple;
4183  oldtup.t_data->t_infomask2 |= infomask2_old_tuple;
4184  HeapTupleHeaderSetCmax(oldtup.t_data, cid, iscombo);
4185 
4186  /* record address of new tuple in t_ctid of old one */
4187  oldtup.t_data->t_ctid = heaptup->t_self;
4188 
4189  /* clear PD_ALL_VISIBLE flags, reset all visibilitymap bits */
4190  if (PageIsAllVisible(BufferGetPage(buffer)))
4191  {
4192  all_visible_cleared = true;
4193  PageClearAllVisible(BufferGetPage(buffer));
4194  visibilitymap_clear(relation, BufferGetBlockNumber(buffer),
4195  vmbuffer, VISIBILITYMAP_VALID_BITS);
4196  }
4197  if (newbuf != buffer && PageIsAllVisible(BufferGetPage(newbuf)))
4198  {
4199  all_visible_cleared_new = true;
4200  PageClearAllVisible(BufferGetPage(newbuf));
4201  visibilitymap_clear(relation, BufferGetBlockNumber(newbuf),
4202  vmbuffer_new, VISIBILITYMAP_VALID_BITS);
4203  }
4204 
4205  if (newbuf != buffer)
4206  MarkBufferDirty(newbuf);
4207  MarkBufferDirty(buffer);
4208 
4209  /* XLOG stuff */
4210  if (RelationNeedsWAL(relation))
4211  {
4212  XLogRecPtr recptr;
4213 
4214  /*
4215  * For logical decoding we need combocids to properly decode the
4216  * catalog.
4217  */
4218  if (RelationIsAccessibleInLogicalDecoding(relation))
4219  {
4220  log_heap_new_cid(relation, &oldtup);
4221  log_heap_new_cid(relation, heaptup);
4222  }
4223 
4224  recptr = log_heap_update(relation, buffer,
4225  newbuf, &oldtup, heaptup,
4226  old_key_tuple,
4227  all_visible_cleared,
4228  all_visible_cleared_new);
4229  if (newbuf != buffer)
4230  {
4231  PageSetLSN(BufferGetPage(newbuf), recptr);
4232  }
4233  PageSetLSN(BufferGetPage(buffer), recptr);
4234  }
4235 
4236  END_CRIT_SECTION();
4237 
4238  if (newbuf != buffer)
4239  LockBuffer(newbuf, BUFFER_LOCK_UNLOCK);
4240  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
4241 
4242  /*
4243  * Mark old tuple for invalidation from system caches at next command
4244  * boundary, and mark the new tuple for invalidation in case we abort. We
4245  * have to do this before releasing the buffer because oldtup is in the
4246  * buffer. (heaptup is all in local memory, but it's necessary to process
4247  * both tuple versions in one call to inval.c so we can avoid redundant
4248  * sinval messages.)
4249  */
4250  CacheInvalidateHeapTuple(relation, &oldtup, heaptup);
4251 
4252  /* Now we can release the buffer(s) */
4253  if (newbuf != buffer)
4254  ReleaseBuffer(newbuf);
4255  ReleaseBuffer(buffer);
4256  if (BufferIsValid(vmbuffer_new))
4257  ReleaseBuffer(vmbuffer_new);
4258  if (BufferIsValid(vmbuffer))
4259  ReleaseBuffer(vmbuffer);
4260 
4261  /*
4262  * Release the lmgr tuple lock, if we had it.
4263  */
4264  if (have_tuple_lock)
4265  UnlockTupleTuplock(relation, &(oldtup.t_self), *lockmode);
4266 
4267  pgstat_count_heap_update(relation, use_hot_update);
4268 
4269  /*
4270  * If heaptup is a private copy, release it. Don't forget to copy t_self
4271  * back to the caller's image, too.
4272  */
4273  if (heaptup != newtup)
4274  {
4275  newtup->t_self = heaptup->t_self;
4276  heap_freetuple(heaptup);
4277  }
4278 
4279  if (old_key_tuple != NULL && old_key_copied)
4280  heap_freetuple(old_key_tuple);
4281 
4282  bms_free(hot_attrs);
4283  bms_free(key_attrs);
4284  bms_free(id_attrs);
4285 
4286  return HeapTupleMayBeUpdated;
4287 }
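
/*
 * Illustrative sketch (editor's addition, not part of heapam.c): one way a
 * caller might drive heap_update and interpret the HeapUpdateFailureData it
 * returns.  The wrapper name and the error text are hypothetical; the call
 * shape mirrors the one used by simple_heap_update() below.
 */
static void
example_update_or_report(Relation rel, ItemPointer otid, HeapTuple newtup)
{
	HTSU_Result result;
	HeapUpdateFailureData hufd;
	LockTupleMode lockmode;

	result = heap_update(rel, otid, newtup,
						 GetCurrentCommandId(true), InvalidSnapshot,
						 true /* wait */ , &hufd, &lockmode);

	if (result == HeapTupleUpdated)
	{
		/*
		 * Somebody else updated the row first; hufd.ctid points at the
		 * newer version and hufd.xmax identifies the updating transaction.
		 */
		elog(ERROR, "concurrent update; newer version at (%u,%u)",
			 ItemPointerGetBlockNumber(&hufd.ctid),
			 ItemPointerGetOffsetNumber(&hufd.ctid));
	}
	else if (result != HeapTupleMayBeUpdated)
		elog(ERROR, "unexpected heap_update status: %u", result);
}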
4288 
4289 /*
4290  * Check if the specified attribute's value is same in both given tuples.
4291  * Subroutine for HeapSatisfiesHOTandKeyUpdate.
4292  */
4293 static bool
4294 heap_tuple_attr_equals(TupleDesc tupdesc, int attrnum,
4295  HeapTuple tup1, HeapTuple tup2)
4296 {
4297  Datum value1,
4298  value2;
4299  bool isnull1,
4300  isnull2;
4301  Form_pg_attribute att;
4302 
4303  /*
4304  * If it's a whole-tuple reference, say "not equal". It's not really
4305  * worth supporting this case, since it could only succeed after a no-op
4306  * update, which is hardly a case worth optimizing for.
4307  */
4308  if (attrnum == 0)
4309  return false;
4310 
4311  /*
4312  * Likewise, automatically say "not equal" for any system attribute other
4313  * than OID and tableOID; we cannot expect these to be consistent in a HOT
4314  * chain, or even to be set correctly yet in the new tuple.
4315  */
4316  if (attrnum < 0)
4317  {
4318  if (attrnum != ObjectIdAttributeNumber &&
4319  attrnum != TableOidAttributeNumber)
4320  return false;
4321  }
4322 
4323  /*
4324  * Extract the corresponding values. XXX this is pretty inefficient if
4325  * there are many indexed columns. Should HeapSatisfiesHOTandKeyUpdate do
4326  * a single heap_deform_tuple call on each tuple, instead? But that
4327  * doesn't work for system columns ...
4328  */
4329  value1 = heap_getattr(tup1, attrnum, tupdesc, &isnull1);
4330  value2 = heap_getattr(tup2, attrnum, tupdesc, &isnull2);
4331 
4332  /*
4333  * If one value is NULL and other is not, then they are certainly not
4334  * equal
4335  */
4336  if (isnull1 != isnull2)
4337  return false;
4338 
4339  /*
4340  * If both are NULL, they can be considered equal.
4341  */
4342  if (isnull1)
4343  return true;
4344 
4345  /*
4346  * We do simple binary comparison of the two datums. This may be overly
4347  * strict because there can be multiple binary representations for the
4348  * same logical value. But we should be OK as long as there are no false
4349  * positives. Using a type-specific equality operator is messy because
4350  * there could be multiple notions of equality in different operator
4351  * classes; furthermore, we cannot safely invoke user-defined functions
4352  * while holding exclusive buffer lock.
4353  */
4354  if (attrnum <= 0)
4355  {
4356  /* The only allowed system columns are OIDs, so do this */
4357  return (DatumGetObjectId(value1) == DatumGetObjectId(value2));
4358  }
4359  else
4360  {
4361  Assert(attrnum <= tupdesc->natts);
4362  att = tupdesc->attrs[attrnum - 1];
4363  return datumIsEqual(value1, value2, att->attbyval, att->attlen);
4364  }
4365 }
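
/*
 * Illustrative sketch (editor's addition, not part of heapam.c): how the raw
 * binary comparison above behaves for a by-value and a by-reference datum.
 * datumIsEqual() compares pass-by-value datums directly and varlena datums
 * byte-by-byte, so two values with different physical representations can
 * compare "not equal" even when logically equal, which is the conservative
 * direction heap_tuple_attr_equals() needs.
 */
static void
example_binary_datum_compare(void)
{
	Datum		a = Int32GetDatum(42);
	Datum		b = Int32GetDatum(42);
	/* CStringGetTextDatum() comes from utils/builtins.h */
	Datum		s1 = CStringGetTextDatum("abc");
	Datum		s2 = CStringGetTextDatum("abc");

	/* by-value: attbyval = true, attlen = sizeof(int32) */
	Assert(datumIsEqual(a, b, true, sizeof(int32)));

	/* by-reference varlena: attbyval = false, attlen = -1 */
	Assert(datumIsEqual(s1, s2, false, -1));
}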
4366 
4367 /*
4368  * Check which columns are being updated.
4369  *
4370  * This simultaneously checks conditions for HOT updates, for FOR KEY
4371  * SHARE updates, and REPLICA IDENTITY concerns. Since much of the time they
4372  * will be checking very similar sets of columns, and doing the same tests on
4373  * them, it makes sense to optimize and do them together.
4374  *
4375  * We receive three bitmapsets comprising the three sets of columns we're
4376  * interested in. Note these are destructively modified; that is OK since
4377  * this is invoked at most once in heap_update.
4378  *
4379  * hot_result is set to TRUE if it's okay to do a HOT update (i.e. it does not
4380  * modify indexed columns); key_result is set to TRUE if the update does not
4381  * modify columns used in the key; id_result is set to TRUE if the update does
4382  * not modify columns in any index marked as the REPLICA IDENTITY.
4383  */
4384 static void
4385  HeapSatisfiesHOTandKeyUpdate(Relation relation, Bitmapset *hot_attrs,
4386  Bitmapset *key_attrs, Bitmapset *id_attrs,
4387  bool *satisfies_hot, bool *satisfies_key,
4388  bool *satisfies_id,
4389  HeapTuple oldtup, HeapTuple newtup)
4390 {
4391  int next_hot_attnum;
4392  int next_key_attnum;
4393  int next_id_attnum;
4394  bool hot_result = true;
4395  bool key_result = true;
4396  bool id_result = true;
4397 
4398  /* If REPLICA IDENTITY is set to FULL, id_attrs will be empty. */
4399  Assert(bms_is_subset(id_attrs, key_attrs));
4400  Assert(bms_is_subset(key_attrs, hot_attrs));
4401 
4402  /*
4403  * If one of these sets contains no remaining bits, bms_first_member will
4404  * return -1, and after adding FirstLowInvalidHeapAttributeNumber (which
4405  * is negative!) we'll get an attribute number that can't possibly be
4406  * real, and thus won't match any actual attribute number.
4407  */
4408  next_hot_attnum = bms_first_member(hot_attrs);
4409  next_hot_attnum += FirstLowInvalidHeapAttributeNumber;
4410  next_key_attnum = bms_first_member(key_attrs);
4411  next_key_attnum += FirstLowInvalidHeapAttributeNumber;
4412  next_id_attnum = bms_first_member(id_attrs);
4413  next_id_attnum += FirstLowInvalidHeapAttributeNumber;
4414 
4415  for (;;)
4416  {
4417  bool changed;
4418  int check_now;
4419 
4420  /*
4421  * Since the HOT attributes are a superset of the key attributes and
4422  * the key attributes are a superset of the id attributes, this logic
4423  * is guaranteed to identify the next column that needs to be checked.
4424  */
4425  if (hot_result && next_hot_attnum > FirstLowInvalidHeapAttributeNumber)
4426  check_now = next_hot_attnum;
4427  else if (key_result && next_key_attnum > FirstLowInvalidHeapAttributeNumber)
4428  check_now = next_key_attnum;
4429  else if (id_result && next_id_attnum > FirstLowInvalidHeapAttributeNumber)
4430  check_now = next_id_attnum;
4431  else
4432  break;
4433 
4434  /* See whether it changed. */
4435  changed = !heap_tuple_attr_equals(RelationGetDescr(relation),
4436  check_now, oldtup, newtup);
4437  if (changed)
4438  {
4439  if (check_now == next_hot_attnum)
4440  hot_result = false;
4441  if (check_now == next_key_attnum)
4442  key_result = false;
4443  if (check_now == next_id_attnum)
4444  id_result = false;
4445 
4446  /* if all are false now, we can stop checking */
4447  if (!hot_result && !key_result && !id_result)
4448  break;
4449  }
4450 
4451  /*
4452  * Advance the next attribute numbers for the sets that contain the
4453  * attribute we just checked. As we work our way through the columns,
4454  * the next_attnum values will rise; but when each set becomes empty,
4455  * bms_first_member() will return -1 and the attribute number will end
4456  * up with a value less than FirstLowInvalidHeapAttributeNumber.
4457  */
4458  if (hot_result && check_now == next_hot_attnum)
4459  {
4460  next_hot_attnum = bms_first_member(hot_attrs);
4461  next_hot_attnum += FirstLowInvalidHeapAttributeNumber;
4462  }
4463  if (key_result && check_now == next_key_attnum)
4464  {
4465  next_key_attnum = bms_first_member(key_attrs);
4466  next_key_attnum += FirstLowInvalidHeapAttributeNumber;
4467  }
4468  if (id_result && check_now == next_id_attnum)
4469  {
4470  next_id_attnum = bms_first_member(id_attrs);
4471  next_id_attnum += FirstLowInvalidHeapAttributeNumber;
4472  }
4473  }
4474 
4475  *satisfies_hot = hot_result;
4476  *satisfies_key = key_result;
4477  *satisfies_id = id_result;
4478 }
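
/*
 * Illustrative sketch (editor's addition, not part of heapam.c): the
 * attribute numbers stored in these bitmapsets are offset by
 * FirstLowInvalidHeapAttributeNumber so that system columns (negative
 * attnums) can be represented.  A minimal walk over one such set, assuming
 * it was built the same way RelationGetIndexAttrBitmap() builds them:
 */
static void
example_walk_attr_bitmap(Bitmapset *attrs)
{
	int			offset;

	/* bms_first_member() consumes the set as it iterates, as above */
	while ((offset = bms_first_member(attrs)) >= 0)
	{
		int			attnum = offset + FirstLowInvalidHeapAttributeNumber;

		elog(DEBUG1, "indexed attribute %d", attnum);
	}
}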
4479 
4480 /*
4481  * simple_heap_update - replace a tuple
4482  *
4483  * This routine may be used to update a tuple when concurrent updates of
4484  * the target tuple are not expected (for example, because we have a lock
4485  * on the relation associated with the tuple). Any failure is reported
4486  * via ereport().
4487  */
4488 void
4489  simple_heap_update(Relation relation, ItemPointer otid, HeapTuple tup)
4490  {
4491  HTSU_Result result;
4492  HeapUpdateFailureData hufd;
4493  LockTupleMode lockmode;
4494 
4495  result = heap_update(relation, otid, tup,
4496  GetCurrentCommandId(true), InvalidSnapshot,
4497  true /* wait for commit */ ,
4498  &hufd, &lockmode);
4499  switch (result)
4500  {
4501  case HeapTupleSelfUpdated:
4502  /* Tuple was already updated in current command? */
4503  elog(ERROR, "tuple already updated by self");
4504  break;
4505 
4506  case HeapTupleMayBeUpdated:
4507  /* done successfully */
4508  break;
4509 
4510  case HeapTupleUpdated:
4511  elog(ERROR, "tuple concurrently updated");
4512  break;
4513 
4514  default:
4515  elog(ERROR, "unrecognized heap_update status: %u", result);
4516  break;
4517  }
4518 }
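
/*
 * Illustrative sketch (editor's addition, not part of heapam.c): the usual
 * shape of a call to simple_heap_update().  The caller is assumed to hold a
 * lock on the relation strong enough to rule out concurrent updates of the
 * same tuple; index maintenance (not shown) remains the caller's job.
 */
static void
example_simple_update(Relation rel, HeapTuple newtup)
{
	/* newtup->t_self must point at the existing version of the row */
	simple_heap_update(rel, &newtup->t_self, newtup);

	/* on return, newtup->t_self has been set to the new tuple's location */
}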
4519 
4520 
4521 /*
4522  * Return the MultiXactStatus corresponding to the given tuple lock mode.
4523  */
4524 static MultiXactStatus
4525  get_mxact_status_for_lock(LockTupleMode mode, bool is_update)
4526  {
4527  int retval;
4528 
4529  if (is_update)
4530  retval = tupleLockExtraInfo[mode].updstatus;
4531  else
4532  retval = tupleLockExtraInfo[mode].lockstatus;
4533 
4534  if (retval == -1)
4535  elog(ERROR, "invalid lock tuple mode %d/%s", mode,
4536  is_update ? "true" : "false");
4537 
4538  return (MultiXactStatus) retval;
4539 }
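
/*
 * Illustrative sketch (editor's addition, not part of heapam.c): the
 * lock-mode-to-status mapping that the tupleLockExtraInfo table referenced
 * above is believed to encode, spelled out as a switch.  The helper name is
 * hypothetical; only the two exclusive modes have an update status, which is
 * why get_mxact_status_for_lock() reports the other entries as errors.
 */
static MultiXactStatus
example_mxact_status_for_lock(LockTupleMode mode, bool is_update)
{
	if (is_update &&
		mode != LockTupleNoKeyExclusive && mode != LockTupleExclusive)
		elog(ERROR, "lock mode %d has no update status", mode);

	switch (mode)
	{
		case LockTupleKeyShare:
			return MultiXactStatusForKeyShare;
		case LockTupleShare:
			return MultiXactStatusForShare;
		case LockTupleNoKeyExclusive:
			return is_update ? MultiXactStatusNoKeyUpdate : MultiXactStatusForNoKeyUpdate;
		case LockTupleExclusive:
		default:
			return is_update ? MultiXactStatusUpdate : MultiXactStatusForUpdate;
	}
}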
4540 
4541 /*
4542  * heap_lock_tuple - lock a tuple in shared or exclusive mode
4543  *
4544  * Note that this acquires a buffer pin, which the caller must release.
4545  *
4546  * Input parameters:
4547  * relation: relation containing tuple (caller must hold suitable lock)
4548  * tuple->t_self: TID of tuple to lock (rest of struct need not be valid)
4549  * cid: current command ID (used for visibility test, and stored into
4550  * tuple's cmax if lock is successful)
4551  * mode: indicates if shared or exclusive tuple lock is desired
4552  * wait_policy: what to do if tuple lock is not available
4553  * follow_updates: if true, follow the update chain to also lock descendant
4554  * tuples.
4555  *
4556  * Output parameters:
4557  * *tuple: all fields filled in
4558  * *buffer: set to buffer holding tuple (pinned but not locked at exit)
4559  * *hufd: filled in failure cases (see below)
4560  *
4561  * Function result may be:
4562  * HeapTupleMayBeUpdated: lock was successfully acquired
4563  * HeapTupleInvisible: lock failed because tuple was never visible to us
4564  * HeapTupleSelfUpdated: lock failed because tuple updated by self
4565  * HeapTupleUpdated: lock failed because tuple updated by other xact
4566  * HeapTupleWouldBlock: lock couldn't be acquired and wait_policy is skip
4567  *
4568  * In the failure cases other than HeapTupleInvisible, the routine fills
4569  * *hufd with the tuple's t_ctid, t_xmax (resolving a possible MultiXact,
4570  * if necessary), and t_cmax (the last only for HeapTupleSelfUpdated,
4571  * since we cannot obtain cmax from a combocid generated by another
4572  * transaction).
4573  * See comments for struct HeapUpdateFailureData for additional info.
4574  *
4575  * See README.tuplock for a thorough explanation of this mechanism.
4576  */
4579  CommandId cid, LockTupleMode mode, LockWaitPolicy wait_policy,
4580  bool follow_updates,
4581  Buffer *buffer, HeapUpdateFailureData *hufd)
4582 {
4583  HTSU_Result result;
4584  ItemPointer tid = &(tuple->t_self);
4585  ItemId lp;
4586  Page page;
4587  Buffer vmbuffer = InvalidBuffer;
4588  BlockNumber block;
4589  TransactionId xid,
4590  xmax;
4591  uint16 old_infomask,
4592  new_infomask,
4593  new_infomask2;
4594  bool first_time = true;
4595  bool have_tuple_lock = false;
4596  bool cleared_all_frozen = false;
4597 
4598  *buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid));
4599  block = ItemPointerGetBlockNumber(tid);
4600 
4601  /*
4602  * Before locking the buffer, pin the visibility map page if it appears to
4603  * be necessary. Since we haven't got the lock yet, someone else might be
4604  * in the middle of changing this, so we'll need to recheck after we have
4605  * the lock.
4606  */
4607  if (PageIsAllVisible(BufferGetPage(*buffer)))
4608  visibilitymap_pin(relation, block, &vmbuffer);
4609 
4610  LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4611 
4612  page = BufferGetPage(*buffer);
4613  lp = PageGetItemId(page, ItemPointerGetOffsetNumber(tid));
4614  Assert(ItemIdIsNormal(lp));
4615 
4616  tuple->t_data = (HeapTupleHeader) PageGetItem(page, lp);
4617  tuple->t_len = ItemIdGetLength(lp);
4618  tuple->t_tableOid = RelationGetRelid(relation);
4619 
4620 l3:
4621  result = HeapTupleSatisfiesUpdate(tuple, cid, *buffer);
4622 
4623  if (result == HeapTupleInvisible)
4624  {
4625  /*
4626  * This is possible, but only when locking a tuple for ON CONFLICT
4627  * UPDATE. We return this value here rather than throwing an error in
4628  * order to give that case the opportunity to throw a more specific
4629  * error.
4630  */
4631  result = HeapTupleInvisible;
4632  goto out_locked;
4633  }
4634  else if (result == HeapTupleBeingUpdated || result == HeapTupleUpdated)
4635  {
4636  TransactionId xwait;
4637  uint16 infomask;
4638  uint16 infomask2;
4639  bool require_sleep;
4640  ItemPointerData t_ctid;
4641 
4642  /* must copy state data before unlocking buffer */
4643  xwait = HeapTupleHeaderGetRawXmax(tuple->t_data);
4644  infomask = tuple->t_data->t_infomask;
4645  infomask2 = tuple->t_data->t_infomask2;
4646  ItemPointerCopy(&tuple->t_data->t_ctid, &t_ctid);
4647 
4648  LockBuffer(*buffer, BUFFER_LOCK_UNLOCK);
4649 
4650  /*
4651  * If any subtransaction of the current top transaction already holds
4652  * a lock as strong as or stronger than what we're requesting, we
4653  * effectively hold the desired lock already. We *must* succeed
4654  * without trying to take the tuple lock, else we will deadlock
4655  * against anyone wanting to acquire a stronger lock.
4656  *
4657  * Note we only do this the first time we loop on the HTSU result;
4658  * there is no point in testing in subsequent passes, because
4659  * evidently our own transaction cannot have acquired a new lock after
4660  * the first time we checked.
4661  */
4662  if (first_time)
4663  {
4664  first_time = false;
4665 
4666  if (infomask & HEAP_XMAX_IS_MULTI)
4667  {
4668  int i;
4669  int nmembers;
4670  MultiXactMember *members;
4671 
4672  /*
4673  * We don't need to allow old multixacts here; if that had
4674  * been the case, HeapTupleSatisfiesUpdate would have returned
4675  * MayBeUpdated and we wouldn't be here.
4676  */
4677  nmembers =
4678  GetMultiXactIdMembers(xwait, &members, false,
4679  HEAP_XMAX_IS_LOCKED_ONLY(infomask));
4680 
4681  for (i = 0; i < nmembers; i++)
4682  {
4683  /* only consider members of our own transaction */
4684  if (!TransactionIdIsCurrentTransactionId(members[i].xid))
4685  continue;
4686 
4687  if (TUPLOCK_from_mxstatus(members[i].status) >= mode)
4688  {
4689  pfree(members);
4690  result = HeapTupleMayBeUpdated;
4691  goto out_unlocked;
4692  }
4693  }
4694 
4695  if (members)
4696  pfree(members);
4697  }
4698  else if (TransactionIdIsCurrentTransactionId(xwait))
4699  {
4700  switch (mode)
4701  {
4702  case LockTupleKeyShare:
4703  Assert(HEAP_XMAX_IS_KEYSHR_LOCKED(infomask) ||
4704  HEAP_XMAX_IS_SHR_LOCKED(infomask) ||
4705  HEAP_XMAX_IS_EXCL_LOCKED(infomask));
4706  result = HeapTupleMayBeUpdated;
4707  goto out_unlocked;
4708  case LockTupleShare:
4709  if (HEAP_XMAX_IS_SHR_LOCKED(infomask) ||
4710  HEAP_XMAX_IS_EXCL_LOCKED(infomask))
4711  {
4712  result = HeapTupleMayBeUpdated;
4713  goto out_unlocked;
4714  }
4715  break;
4716  case LockTupleNoKeyExclusive:
4717  if (HEAP_XMAX_IS_EXCL_LOCKED(infomask))
4718  {
4719  result = HeapTupleMayBeUpdated;
4720  goto out_unlocked;
4721  }
4722  break;
4723  case LockTupleExclusive:
4724  if (HEAP_XMAX_IS_EXCL_LOCKED(infomask) &&
4725  infomask2 & HEAP_KEYS_UPDATED)
4726  {
4727  result = HeapTupleMayBeUpdated;
4728  goto out_unlocked;
4729  }
4730  break;
4731  }
4732  }
4733  }
4734 
4735  /*
4736  * Initially assume that we will have to wait for the locking
4737  * transaction(s) to finish. We check various cases below in which
4738  * this can be turned off.
4739  */
4740  require_sleep = true;
4741  if (mode == LockTupleKeyShare)
4742  {
4743  /*
4744  * If we're requesting KeyShare, and there's no update present, we
4745  * don't need to wait. Even if there is an update, we can still
4746  * continue if the key hasn't been modified.
4747  *
4748  * However, if there are updates, we need to walk the update chain
4749  * to mark future versions of the row as locked, too. That way,
4750  * if somebody deletes that future version, we're protected
4751  * against the key going away. This locking of future versions
4752  * could block momentarily, if a concurrent transaction is
4753  * deleting a key; or it could return a value to the effect that
4754  * the transaction deleting the key has already committed. So we
4755  * do this before re-locking the buffer; otherwise this would be
4756  * prone to deadlocks.
4757  *
4758  * Note that the TID we're locking was grabbed before we unlocked
4759  * the buffer. For it to change while we're not looking, the
4760  * other properties we're testing for below after re-locking the
4761  * buffer would also change, in which case we would restart this
4762  * loop above.
4763  */
4764  if (!(infomask2 & HEAP_KEYS_UPDATED))
4765  {
4766  bool updated;
4767 
4768  updated = !HEAP_XMAX_IS_LOCKED_ONLY(infomask);
4769 
4770  /*
4771  * If there are updates, follow the update chain; bail out if
4772  * that cannot be done.
4773  */
4774  if (follow_updates && updated)
4775  {
4776  HTSU_Result res;
4777 
4778  res = heap_lock_updated_tuple(relation, tuple, &t_ctid,
4779  GetCurrentTransactionId(),
4780  mode);
4781  if (res != HeapTupleMayBeUpdated)
4782  {
4783  result = res;
4784  /* recovery code expects to have buffer lock held */
4785  LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4786  goto failed;
4787  }
4788  }
4789 
4790  LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4791 
4792  /*
4793  * Make sure it's still an appropriate lock, else start over.
4794  * Also, if it wasn't updated before we released the lock, but
4795  * is updated now, we start over too; the reason is that we
4796  * now need to follow the update chain to lock the new
4797  * versions.
4798  */
4799  if (!HeapTupleHeaderIsOnlyLocked(tuple->t_data) &&
4800  ((tuple->t_data->t_infomask2 & HEAP_KEYS_UPDATED) ||
4801  !updated))
4802  goto l3;
4803 
4804  /* Things look okay, so we can skip sleeping */
4805  require_sleep = false;
4806 
4807  /*
4808  * Note we allow Xmax to change here; other updaters/lockers
4809  * could have modified it before we grabbed the buffer lock.
4810  * However, this is not a problem, because with the recheck we
4811  * just did we ensure that they still don't conflict with the
4812  * lock we want.
4813  */
4814  }
4815  }
4816  else if (mode == LockTupleShare)
4817  {
4818  /*
4819  * If we're requesting Share, we can similarly avoid sleeping if
4820  * there's no update and no exclusive lock present.
4821  */
4822  if (HEAP_XMAX_IS_LOCKED_ONLY(infomask) &&
4823  !HEAP_XMAX_IS_EXCL_LOCKED(infomask))
4824  {
4825  LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4826 
4827  /*
4828  * Make sure it's still an appropriate lock, else start over.
4829  * See above about allowing xmax to change.
4830  */
4831  if (!HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_data->t_infomask) ||
4832  HEAP_XMAX_IS_EXCL_LOCKED(tuple->t_data->t_infomask))
4833  goto l3;
4834  require_sleep = false;
4835  }
4836  }
4837  else if (mode == LockTupleNoKeyExclusive)
4838  {
4839  /*
4840  * If we're requesting NoKeyExclusive, we might also be able to
4841  * avoid sleeping; just ensure that there is no conflicting lock
4842  * already acquired.
4843  */
4844  if (infomask & HEAP_XMAX_IS_MULTI)
4845  {
4846  if (!DoesMultiXactIdConflict((MultiXactId) xwait, infomask,
4847  mode))
4848  {
4849  /*
4850  * No conflict, but if the xmax changed under us in the
4851  * meantime, start over.
4852  */
4853  LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4854  if (xmax_infomask_changed(tuple->t_data->t_infomask, infomask) ||
4855  !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple->t_data),
4856  xwait))
4857  goto l3;
4858 
4859  /* otherwise, we're good */
4860  require_sleep = false;
4861  }
4862  }
4863  else if (HEAP_XMAX_IS_KEYSHR_LOCKED(infomask))
4864  {
4865  LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4866 
4867  /* if the xmax changed in the meantime, start over */
4868  if (xmax_infomask_changed(tuple->t_data->t_infomask, infomask) ||
4869  !TransactionIdEquals(
4870  HeapTupleHeaderGetRawXmax(tuple->t_data),
4871  xwait))
4872  goto l3;
4873  /* otherwise, we're good */
4874  require_sleep = false;
4875  }
4876  }
4877 
4878  /*
4879  * As a check independent from those above, we can also avoid sleeping
4880  * if the current transaction is the sole locker of the tuple. Note
4881  * that the strength of the lock already held is irrelevant; this is
4882  * not about recording the lock in Xmax (which will be done regardless
4883  * of this optimization, below). Also, note that the cases where we
4884  * hold a lock stronger than we are requesting are already handled
4885  * above by not doing anything.
4886  *
4887  * Note we only deal with the non-multixact case here; MultiXactIdWait
4888  * is well equipped to deal with this situation on its own.
4889  */
4890  if (require_sleep && !(infomask & HEAP_XMAX_IS_MULTI) &&
4891  TransactionIdIsCurrentTransactionId(xwait))
4892  {
4893  /* ... but if the xmax changed in the meantime, start over */
4894  LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4895  if (xmax_infomask_changed(tuple->t_data->t_infomask, infomask) ||
4896  !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple->t_data),
4897  xwait))
4898  goto l3;
4900  require_sleep = false;
4901  }
4902 
4903  /*
4904  * Time to sleep on the other transaction/multixact, if necessary.
4905  *
4906  * If the other transaction is an update that's already committed,
4907  * then sleeping cannot possibly do any good: if we're required to
4908  * sleep, get out to raise an error instead.
4909  *
4910  * By here, we either have already acquired the buffer exclusive lock,
4911  * or we must wait for the locking transaction or multixact; so below
4912  * we ensure that we grab buffer lock after the sleep.
4913  */
4914  if (require_sleep && result == HeapTupleUpdated)
4915  {
4916  LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4917  goto failed;
4918  }
4919  else if (require_sleep)
4920  {
4921  /*
4922  * Acquire tuple lock to establish our priority for the tuple, or
4923  * die trying. LockTuple will release us when we are next-in-line
4924  * for the tuple. We must do this even if we are share-locking.
4925  *
4926  * If we are forced to "start over" below, we keep the tuple lock;
4927  * this arranges that we stay at the head of the line while
4928  * rechecking tuple state.
4929  */
4930  if (!heap_acquire_tuplock(relation, tid, mode, wait_policy,
4931  &have_tuple_lock))
4932  {
4933  /*
4934  * This can only happen if wait_policy is Skip and the lock
4935  * couldn't be obtained.
4936  */
4937  result = HeapTupleWouldBlock;
4938  /* recovery code expects to have buffer lock held */
4939  LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4940  goto failed;
4941  }
4942 
4943  if (infomask & HEAP_XMAX_IS_MULTI)
4944  {
4945  MultiXactStatus status = get_mxact_status_for_lock(mode, false);
4946 
4947  /* We only ever lock tuples, never update them */
4948  if (status >= MultiXactStatusNoKeyUpdate)
4949  elog(ERROR, "invalid lock mode in heap_lock_tuple");
4950 
4951  /* wait for multixact to end, or die trying */
4952  switch (wait_policy)
4953  {
4954  case LockWaitBlock:
4955  MultiXactIdWait((MultiXactId) xwait, status, infomask,
4956  relation, &tuple->t_self, XLTW_Lock, NULL);
4957  break;
4958  case LockWaitSkip:
4959  if (!ConditionalMultiXactIdWait((MultiXactId) xwait,
4960  status, infomask, relation,
4961  NULL))
4962  {
4963  result = HeapTupleWouldBlock;
4964  /* recovery code expects to have buffer lock held */
4965  LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4966  goto failed;
4967  }
4968  break;
4969  case LockWaitError:
4970  if (!ConditionalMultiXactIdWait((MultiXactId) xwait,
4971  status, infomask, relation,
4972  NULL))
4973  ereport(ERROR,
4974  (errcode(ERRCODE_LOCK_NOT_AVAILABLE),
4975  errmsg("could not obtain lock on row in relation \"%s\"",
4976  RelationGetRelationName(relation))));
4977 
4978  break;
4979  }
4980 
4981  /*
4982  * Of course, the multixact might not be done here: if we're
4983  * requesting a light lock mode, other transactions with light
4984  * locks could still be alive, as well as locks owned by our
4985  * own xact or other subxacts of this backend. We need to
4986  * preserve the surviving MultiXact members. Note that it
4987  * isn't absolutely necessary in the latter case, but doing so
4988  * is simpler.
4989  */
4990  }
4991  else
4992  {
4993  /* wait for regular transaction to end, or die trying */
4994  switch (wait_policy)
4995  {
4996  case LockWaitBlock:
4997  XactLockTableWait(xwait, relation, &tuple->t_self,
4998  XLTW_Lock);
4999  break;
5000  case LockWaitSkip:
5001  if (!ConditionalXactLockTableWait(xwait))
5002  {
5003  result = HeapTupleWouldBlock;
5004  /* recovery code expects to have buffer lock held */
5005  LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
5006  goto failed;
5007  }
5008  break;
5009  case LockWaitError:
5010  if (!ConditionalXactLockTableWait(xwait))
5011  ereport(ERROR,
5012  (errcode(ERRCODE_LOCK_NOT_AVAILABLE),
5013  errmsg("could not obtain lock on row in relation \"%s\"",
5014  RelationGetRelationName(relation))));
5015  break;
5016  }
5017  }
5018 
5019  /* if there are updates, follow the update chain */
5020  if (follow_updates && !HEAP_XMAX_IS_LOCKED_ONLY(infomask))
5021  {
5022  HTSU_Result res;
5023 
5024  res = heap_lock_updated_tuple(relation, tuple, &t_ctid,
5025  GetCurrentTransactionId(),
5026  mode);
5027  if (res != HeapTupleMayBeUpdated)
5028  {
5029  result = res;
5030  /* recovery code expects to have buffer lock held */
5031  LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
5032  goto failed;
5033  }
5034  }
5035 
5036  LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
5037 
5038  /*
5039  * xwait is done, but if xwait had just locked the tuple then some
5040  * other xact could update this tuple before we get to this point.
5041  * Check for xmax change, and start over if so.
5042  */
5043  if (xmax_infomask_changed(tuple->t_data->t_infomask, infomask) ||
5044  !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple->t_data),
5045  xwait))
5046  goto l3;
5047 
5048  if (!(infomask & HEAP_XMAX_IS_MULTI))
5049  {
5050  /*
5051  * Otherwise check if it committed or aborted. Note we cannot
5052  * be here if the tuple was only locked by somebody who didn't
5053  * conflict with us; that would have been handled above. So
5054  * that transaction must necessarily be gone by now. But
5055  * don't check for this in the multixact case, because some
5056  * locker transactions might still be running.
5057  */
5058  UpdateXmaxHintBits(tuple->t_data, *buffer, xwait);
5059  }
5060  }
5061 
5062  /* By here, we're certain that we hold buffer exclusive lock again */
5063 
5064  /*
5065  * We may lock if previous xmax aborted, or if it committed but only
5066  * locked the tuple without updating it; or if we didn't have to wait
5067  * at all for whatever reason.
5068  */
5069  if (!require_sleep ||
5070  (tuple->t_data->t_infomask & HEAP_XMAX_INVALID) ||
5071  HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_data->t_infomask) ||
5072  HeapTupleHeaderIsOnlyLocked(tuple->t_data))
5073  result = HeapTupleMayBeUpdated;
5074  else
5075  result = HeapTupleUpdated;
5076  }
5077 
5078 failed:
5079  if (result != HeapTupleMayBeUpdated)
5080  {
5081  Assert(result == HeapTupleSelfUpdated || result == HeapTupleUpdated ||
5082  result == HeapTupleWouldBlock);
5083  Assert(!(tuple->t_data->t_infomask & HEAP_XMAX_INVALID));
5084  hufd->ctid = tuple->t_data->t_ctid;
5085  hufd->xmax = HeapTupleHeaderGetUpdateXid(tuple->t_data);
5086  if (result == HeapTupleSelfUpdated)
5087  hufd->cmax = HeapTupleHeaderGetCmax(tuple->t_data);
5088  else
5089  hufd->cmax = InvalidCommandId;
5090  goto out_locked;
5091  }
5092 
5093  /*
5094  * If we didn't pin the visibility map page and the page has become all
5095  * visible while we were busy locking the buffer, or during some
5096  * subsequent window during which we had it unlocked, we'll have to unlock
5097  * and re-lock, to avoid holding the buffer lock across I/O. That's a bit
5098  * unfortunate, especially since we'll now have to recheck whether the
5099  * tuple has been locked or updated under us, but hopefully it won't
5100  * happen very often.
5101  */
5102  if (vmbuffer == InvalidBuffer && PageIsAllVisible(page))
5103  {
5104  LockBuffer(*buffer, BUFFER_LOCK_UNLOCK);
5105  visibilitymap_pin(relation, block, &vmbuffer);
5106  LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
5107  goto l3;
5108  }
5109 
5110  xmax = HeapTupleHeaderGetRawXmax(tuple->t_data);
5111  old_infomask = tuple->t_data->t_infomask;
5112 
5113  /*
5114  * If this is the first possibly-multixact-able operation in the current
5115  * transaction, set my per-backend OldestMemberMXactId setting. We can be
5116  * certain that the transaction will never become a member of any older
5117  * MultiXactIds than that. (We have to do this even if we end up just
5118  * using our own TransactionId below, since some other backend could
5119  * incorporate our XID into a MultiXact immediately afterwards.)
5120  */
5121  MultiXactIdSetOldestMember();
5122 
5123  /*
5124  * Compute the new xmax and infomask to store into the tuple. Note we do
5125  * not modify the tuple just yet, because that would leave it in the wrong
5126  * state if multixact.c elogs.
5127  */
5128  compute_new_xmax_infomask(xmax, old_infomask, tuple->t_data->t_infomask2,
5129  GetCurrentTransactionId(), mode, false,
5130  &xid, &new_infomask, &new_infomask2);
5131 
5132  START_CRIT_SECTION();
5133 
5134  /*
5135  * Store transaction information of xact locking the tuple.
5136  *
5137  * Note: Cmax is meaningless in this context, so don't set it; this avoids
5138  * possibly generating a useless combo CID. Moreover, if we're locking a
5139  * previously updated tuple, it's important to preserve the Cmax.
5140  *
5141  * Also reset the HOT UPDATE bit, but only if there's no update; otherwise
5142  * we would break the HOT chain.
5143  */
5144  tuple->t_data->t_infomask &= ~HEAP_XMAX_BITS;
5145  tuple->t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
5146  tuple->t_data->t_infomask |= new_infomask;
5147  tuple->t_data->t_infomask2 |= new_infomask2;
5148  if (HEAP_XMAX_IS_LOCKED_ONLY(new_infomask))
5149  HeapTupleHeaderClearHotUpdated(tuple->t_data);
5150  HeapTupleHeaderSetXmax(tuple->t_data, xid);
5151 
5152  /*
5153  * Make sure there is no forward chain link in t_ctid. Note that in the
5154  * cases where the tuple has been updated, we must not overwrite t_ctid,
5155  * because it was set by the updater. Moreover, if the tuple has been
5156  * updated, we need to follow the update chain to lock the new versions of
5157  * the tuple as well.
5158  */
5159  if (HEAP_XMAX_IS_LOCKED_ONLY(new_infomask))
5160  tuple->t_data->t_ctid = *tid;
5161 
5162  /* Clear only the all-frozen bit on visibility map if needed */
5163  if (PageIsAllVisible(page) &&
5164  visibilitymap_clear(relation, block, vmbuffer,
5165  VISIBILITYMAP_ALL_FROZEN))
5166  cleared_all_frozen = true;
5167 
5168 
5169  MarkBufferDirty(*buffer);
5170 
5171  /*
5172  * XLOG stuff. You might think that we don't need an XLOG record because
5173  * there is no state change worth restoring after a crash. You would be
5174  * wrong however: we have just written either a TransactionId or a
5175  * MultiXactId that may never have been seen on disk before, and we need
5176  * to make sure that there are XLOG entries covering those ID numbers.
5177  * Else the same IDs might be re-used after a crash, which would be
5178  * disastrous if this page made it to disk before the crash. Essentially
5179  * we have to enforce the WAL log-before-data rule even in this case.
5180  * (Also, in a PITR log-shipping or 2PC environment, we have to have XLOG
5181  * entries for everything anyway.)
5182  */
5183  if (RelationNeedsWAL(relation))
5184  {
5185  xl_heap_lock xlrec;
5186  XLogRecPtr recptr;
5187 
5188  XLogBeginInsert();
5189  XLogRegisterBuffer(0, *buffer, REGBUF_STANDARD);
5190 
5191  xlrec.offnum = ItemPointerGetOffsetNumber(&tuple->t_self);
5192  xlrec.locking_xid = xid;
5193  xlrec.infobits_set = compute_infobits(new_infomask,
5194  tuple->t_data->t_infomask2);
5195  xlrec.flags = cleared_all_frozen ? XLH_LOCK_ALL_FROZEN_CLEARED : 0;
5196  XLogRegisterData((char *) &xlrec, SizeOfHeapLock);
5197 
5198  /* we don't decode row locks atm, so no need to log the origin */
5199 
5200  recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_LOCK);
5201 
5202  PageSetLSN(page, recptr);
5203  }
5204 
5205  END_CRIT_SECTION();
5206 
5207  result = HeapTupleMayBeUpdated;
5208 
5209 out_locked:
5210  LockBuffer(*buffer, BUFFER_LOCK_UNLOCK);
5211 
5212 out_unlocked:
5213  if (BufferIsValid(vmbuffer))
5214  ReleaseBuffer(vmbuffer);
5215 
5216  /*
5217  * Don't update the visibility map here. Locking a tuple doesn't change
5218  * visibility info.
5219  */
5220 
5221  /*
5222  * Now that we have successfully marked the tuple as locked, we can
5223  * release the lmgr tuple lock, if we had it.
5224  */
5225  if (have_tuple_lock)
5226  UnlockTupleTuplock(relation, tid, mode);
5227 
5228  return result;
5229 }
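/*
 * Illustrative usage sketch, not a verbatim caller from the tree; the
 * relation, tuple and buffer variable names here are hypothetical. A caller
 * that wants to lock a row it has already fetched does roughly this:
 *
 *     HeapUpdateFailureData hufd;
 *     Buffer                lockbuf;
 *     HTSU_Result           res;
 *
 *     res = heap_lock_tuple(rel, &mytuple, GetCurrentCommandId(true),
 *                           LockTupleExclusive, LockWaitBlock,
 *                           true,            (follow the update chain)
 *                           &lockbuf, &hufd);
 *     if (res == HeapTupleMayBeUpdated)
 *         ReleaseBuffer(lockbuf);            (row lock taken; drop the pin)
 *     else
 *         (res and hufd describe why the lock could not be taken)
 *
 * With LockWaitSkip the function can return HeapTupleWouldBlock instead of
 * sleeping; with LockWaitError it raises an error if the row lock cannot be
 * obtained immediately.
 */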
5230 
5231 /*
5232  * Acquire heavyweight lock on the given tuple, in preparation for acquiring
5233  * its normal, Xmax-based tuple lock.
5234  *
5235  * have_tuple_lock is an input and output parameter: on input, it indicates
5236  * whether the lock has previously been acquired (and this function does
5237  * nothing in that case). If this function returns success, have_tuple_lock
5238  * has been flipped to true.
5239  *
5240  * Returns false if it was unable to obtain the lock; this can only happen if
5241  * wait_policy is Skip.
5242  */
5243 static bool
5244 heap_acquire_tuplock(Relation relation, ItemPointer tid, LockTupleMode mode,
5245  LockWaitPolicy wait_policy, bool *have_tuple_lock)
5246 {
5247  if (*have_tuple_lock)
5248  return true;
5249 
5250  switch (wait_policy)
5251  {
5252  case LockWaitBlock:
5253  LockTupleTuplock(relation, tid, mode);
5254  break;
5255 
5256  case LockWaitSkip:
5257  if (!ConditionalLockTupleTuplock(relation, tid, mode))
5258  return false;
5259  break;
5260 
5261  case LockWaitError:
5262  if (!ConditionalLockTupleTuplock(relation, tid, mode))
5263  ereport(ERROR,
5264  (errcode(ERRCODE_LOCK_NOT_AVAILABLE),
5265  errmsg("could not obtain lock on row in relation \"%s\"",
5266  RelationGetRelationName(relation))));
5267  break;
5268  }
5269  *have_tuple_lock = true;
5270 
5271  return true;
5272 }
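/*
 * For context: these wait policies are what the SQL-level row-locking
 * clauses map to. LockWaitBlock is the default blocking behavior of
 * FOR UPDATE/FOR SHARE, LockWaitError corresponds to NOWAIT, and
 * LockWaitSkip corresponds to SKIP LOCKED.
 */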
5273 
5274 /*
5275  * Given an original set of Xmax and infomask, and a transaction (identified by
5276  * add_to_xmax) acquiring a new lock of some mode, compute the new Xmax and
5277  * corresponding infomasks to use on the tuple.
5278  *
5279  * Note that this might have side effects such as creating a new MultiXactId.
5280  *
5281  * Most callers will have called HeapTupleSatisfiesUpdate before this function;
5282  * that will have set the HEAP_XMAX_INVALID bit if the xmax was a MultiXactId
5283  * but it was no longer running. There is a race condition: the
5284  * MultiXactId may have finished since then, but that uncommon case is handled
5285  * either here or within MultiXactIdExpand.
5286  *
5287  * There is a similar race condition possible when the old xmax was a regular
5288  * TransactionId. We test TransactionIdIsInProgress again just to narrow the
5289  * window, but it's still possible to end up creating an unnecessary
5290  * MultiXactId. Fortunately this is harmless.
5291  */
5292 static void
5293 compute_new_xmax_infomask(TransactionId xmax, uint16 old_infomask,
5294  uint16 old_infomask2, TransactionId add_to_xmax,
5295  LockTupleMode mode, bool is_update,
5296  TransactionId *result_xmax, uint16 *result_infomask,
5297  uint16 *result_infomask2)
5298 {
5299  TransactionId new_xmax;
5300  uint16 new_infomask,
5301  new_infomask2;
5302 
5303  Assert(TransactionIdIsCurrentTransactionId(add_to_xmax));
5304 
5305 l5:
5306  new_infomask = 0;
5307  new_infomask2 = 0;
5308  if (old_infomask & HEAP_XMAX_INVALID)
5309  {
5310  /*
5311  * No previous locker; we just insert our own TransactionId.
5312  *
5313  * Note that it's critical that this case be the first one checked,
5314  * because there are several blocks below that come back to this one
5315  * to implement certain optimizations; old_infomask might contain
5316  * other dirty bits in those cases, but we don't really care.
5317  */
5318  if (is_update)
5319  {
5320  new_xmax = add_to_xmax;
5321  if (mode == LockTupleExclusive)
5322  new_infomask2 |= HEAP_KEYS_UPDATED;
5323  }
5324  else
5325  {
5326  new_infomask |= HEAP_XMAX_LOCK_ONLY;
5327  switch (mode)
5328  {
5329  case LockTupleKeyShare:
5330  new_xmax = add_to_xmax;
5331  new_infomask |= HEAP_XMAX_KEYSHR_LOCK;
5332  break;
5333  case LockTupleShare:
5334  new_xmax = add_to_xmax;
5335  new_infomask |= HEAP_XMAX_SHR_LOCK;
5336  break;
5337  case LockTupleNoKeyExclusive:
5338  new_xmax = add_to_xmax;
5339  new_infomask |= HEAP_XMAX_EXCL_LOCK;
5340  break;
5341  case LockTupleExclusive:
5342  new_xmax = add_to_xmax;
5343  new_infomask |= HEAP_XMAX_EXCL_LOCK;
5344  new_infomask2 |= HEAP_KEYS_UPDATED;
5345  break;
5346  default:
5347  new_xmax = InvalidTransactionId; /* silence compiler */
5348  elog(ERROR, "invalid lock mode");
5349  }
5350  }
5351  }
5352  else if (old_infomask & HEAP_XMAX_IS_MULTI)
5353  {
5354  MultiXactStatus new_status;
5355 
5356  /*
5357  * Currently we don't allow XMAX_COMMITTED to be set for multis, so
5358  * cross-check.
5359  */
5360  Assert(!(old_infomask & HEAP_XMAX_COMMITTED));
5361 
5362  /*
5363  * A multixact together with LOCK_ONLY set but neither lock bit set
5364  * (i.e. a pg_upgraded share locked tuple) cannot possibly be running
5365  * anymore. This check is critical for databases upgraded by
5366  * pg_upgrade; both MultiXactIdIsRunning and MultiXactIdExpand assume
5367  * that such multis are never passed.
5368  */
5369  if (HEAP_LOCKED_UPGRADED(old_infomask))
5370  {
5371  old_infomask &= ~HEAP_XMAX_IS_MULTI;
5372  old_infomask |= HEAP_XMAX_INVALID;
5373  goto l5;
5374  }
5375 
5376  /*
5377  * If the XMAX is already a MultiXactId, then we need to expand it to
5378  * include add_to_xmax; but if all the members were lockers and are
5379  * all gone, we can do away with the IS_MULTI bit and just set
5380  * add_to_xmax as the only locker/updater. If all lockers are gone
5381  * and we have an updater that aborted, we can also do without a
5382  * multi.
5383  *
5384  * The cost of doing GetMultiXactIdMembers would be paid by
5385  * MultiXactIdExpand if we weren't to do this, so this check is not
5386  * incurring extra work anyhow.
5387  */
5388  if (!MultiXactIdIsRunning(xmax, HEAP_XMAX_IS_LOCKED_ONLY(old_infomask)))
5389  {
5390  if (HEAP_XMAX_IS_LOCKED_ONLY(old_infomask) ||
5391  !TransactionIdDidCommit(MultiXactIdGetUpdateXid(xmax,
5392  old_infomask)))
5393  {
5394  /*
5395  * Reset these bits and restart; otherwise fall through to
5396  * create a new multi below.
5397  */
5398  old_infomask &= ~HEAP_XMAX_IS_MULTI;
5399  old_infomask |= HEAP_XMAX_INVALID;
5400  goto l5;
5401  }
5402  }
5403 
5404  new_status = get_mxact_status_for_lock(mode, is_update);
5405 
5406  new_xmax = MultiXactIdExpand((MultiXactId) xmax, add_to_xmax,
5407  new_status);
5408  GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2);
5409  }
5410  else if (old_infomask & HEAP_XMAX_COMMITTED)
5411  {
5412  /*
5413  * It's a committed update, so we need to preserve it as the updater of
5414  * the tuple.
5415  */
5416  MultiXactStatus status;
5417  MultiXactStatus new_status;
5418 
5419  if (old_infomask2 & HEAP_KEYS_UPDATED)
5420  status = MultiXactStatusUpdate;
5421  else
5422  status = MultiXactStatusNoKeyUpdate;
5423 
5424  new_status = get_mxact_status_for_lock(mode, is_update);
5425 
5426  /*
5427  * since it's not running, it's obviously impossible for the old
5428  * updater to be identical to the current one, so we need not check
5429  * for that case as we do in the block above.
5430  */
5431  new_xmax = MultiXactIdCreate(xmax, status, add_to_xmax, new_status);
5432  GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2);
5433  }
5434  else if (TransactionIdIsInProgress(xmax))
5435  {
5436  /*
5437  * If the XMAX is a valid, in-progress TransactionId, then we need to
5438  * create a new MultiXactId that includes both the old locker or
5439  * updater and our own TransactionId.
5440  */
5441  MultiXactStatus new_status;
5442  MultiXactStatus old_status;
5443  LockTupleMode old_mode;
5444 
5445  if (HEAP_XMAX_IS_LOCKED_ONLY(old_infomask))
5446  {
5447  if (HEAP_XMAX_IS_KEYSHR_LOCKED(old_infomask))
5448  old_status = MultiXactStatusForKeyShare;
5449  else if (HEAP_XMAX_IS_SHR_LOCKED(old_infomask))
5450  old_status = MultiXactStatusForShare;
5451  else if (HEAP_XMAX_IS_EXCL_LOCKED(old_infomask))
5452  {
5453  if (old_infomask2 & HEAP_KEYS_UPDATED)
5454  old_status = MultiXactStatusForUpdate;
5455  else
5456  old_status = MultiXactStatusForNoKeyUpdate;
5457  }
5458  else
5459  {
5460  /*
5461  * LOCK_ONLY can be present alone only when a page has been
5462  * upgraded by pg_upgrade. But in that case,
5463  * TransactionIdIsInProgress() should have returned false. We
5464  * assume it's no longer locked in this case.
5465  */
5466  elog(WARNING, "LOCK_ONLY found for Xid in progress %u", xmax);
5467  old_infomask |= HEAP_XMAX_INVALID;
5468  old_infomask &= ~HEAP_XMAX_LOCK_ONLY;
5469  goto l5;
5470  }
5471  }
5472  else
5473  {
5474  /* it's an update, but which kind? */
5475  if (old_infomask2 & HEAP_KEYS_UPDATED)
5476  old_status = MultiXactStatusUpdate;
5477  else
5478  old_status = MultiXactStatusNoKeyUpdate;
5479  }
5480 
5481  old_mode = TUPLOCK_from_mxstatus(old_status);
5482 
5483  /*
5484  * If the lock to be acquired is for the same TransactionId as the
5485  * existing lock, there's an optimization possible: consider only the
5486  * strongest of both locks as the only one present, and restart.
5487  */
5488  if (xmax == add_to_xmax)
5489  {
5490  /*
5491  * Note that it's not possible for the original tuple to be
5492  * updated: we wouldn't be here because the tuple would have been
5493  * invisible and we wouldn't try to update it. As a subtlety,
5494  * this code can also run when traversing an update chain to lock
5495  * future versions of a tuple. But we wouldn't be here either,
5496  * because the add_to_xmax would be different from the original
5497  * updater.
5498  */
5499  Assert(HEAP_XMAX_IS_LOCKED_ONLY(old_infomask));
5500 
5501  /* acquire the strongest of both */
5502  if (mode < old_mode)
5503  mode = old_mode;
5504  /* mustn't touch is_update */
5505 
5506  old_infomask |= HEAP_XMAX_INVALID;
5507  goto l5;
5508  }
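/*
 * Example of the restart above: if this transaction already holds only a
 * key-share lock (LockTupleKeyShare) on the tuple and now requests
 * LockTupleExclusive, we go back to l5 and store a single-Xid exclusive
 * lock, rather than building a MultiXactId whose only member would be our
 * own transaction.
 */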
5509 
5510  /* otherwise, just fall back to creating a new multixact */
5511  new_status = get_mxact_status_for_lock(mode, is_update);
5512  new_xmax = MultiXactIdCreate(xmax, old_status,
5513  add_to_xmax, new_status);
5514  GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2);
5515  }
5516  else if (!HEAP_XMAX_IS_LOCKED_ONLY(old_infomask) &&
5517  TransactionIdDidCommit(xmax))
5518  {
5519  /*
5520  * It's a committed update, so we must preserve it as the updater of
5521  * the tuple.
5522  */
5523  MultiXactStatus status;
5524  MultiXactStatus new_status;
5525 
5526  if (old_infomask2 & HEAP_KEYS_UPDATED)
5527  status = MultiXactStatusUpdate;
5528  else
5529  status = MultiXactStatusNoKeyUpdate;
5530 
5531  new_status = get_mxact_status_for_lock(mode, is_update);
5532 
5533  /*
5534  * since it's not running, it's obviously impossible for the old
5535  * updater to be identical to the current one, so we need not check
5536  * for that case as we do in the block above.
5537  */
5538  new_xmax = MultiXactIdCreate(xmax, status, add_to_xmax, new_status);
5539  GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2);
5540  }
5541  else
5542  {
5543  /*
5544  * Can get here iff the locking/updating transaction was running when
5545  * the infomask was extracted from the tuple, but finished before
5546  * TransactionIdIsInProgress got to run. Deal with it as if there was
5547  * no locker at all in the first place.
5548  */
5549  old_infomask |= HEAP_XMAX_INVALID;
5550  goto l5;
5551  }
5552 
5553  *result_infomask = new_infomask;
5554  *result_infomask2 = new_infomask2;
5555  *result_xmax = new_xmax;
5556 }
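/*
 * Descriptive summary of the case analysis above:
 *   xmax invalid                  -> store add_to_xmax as a plain Xid
 *   xmax is a MultiXactId         -> expand it, or drop it if all members are gone
 *   xmax is a committed updater   -> new MultiXactId preserving that updater
 *   xmax is an in-progress Xid    -> new MultiXactId, or just strengthen our own lock
 *   xmax finished some other way  -> treat it as if there were no locker at all
 */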
5557 
5558 /*
5559  * Subroutine for heap_lock_updated_tuple_rec.
5560  *
5561  * Given a hypothetical multixact status held by the transaction identified
5562  * with the given xid, does the current transaction need to wait, fail, or can
5563  * it continue if it wanted to acquire a lock of the given mode? "needwait"
5564  * is set to true if waiting is necessary; if it can continue, then
5565  * HeapTupleMayBeUpdated is returned. In case of a conflict, a different
5566  * HeapTupleSatisfiesUpdate return code is returned.
5567  *
5568  * The held status is said to be hypothetical because it might correspond to a
5569  * lock held by a single Xid, i.e. not a real MultiXactId; we express it this
5570  * way for simplicity of API.
5571  */
5572 static HTSU_Result
5573 test_lockmode_for_conflict(MultiXactStatus status, TransactionId xid,
5574  LockTupleMode mode, bool *needwait)
5575 {
5576  MultiXactStatus wantedstatus;
5577 
5578  *needwait = false;
5579  wantedstatus = get_mxact_status_for_lock(mode, false);
5580 
5581  /*
5582  * Note: we *must* check TransactionIdIsInProgress before
5583  * TransactionIdDidAbort/Commit; see comment at top of tqual.c for an
5584  * explanation.
5585  */
5586  if (TransactionIdIsCurrentTransactionId(xid))
5587  {
5588  /*
5589  * Updated by our own transaction? Just return failure. This
5590  * shouldn't normally happen.
5591  */
5592  return HeapTupleSelfUpdated;
5593  }
5594  else if (TransactionIdIsInProgress(xid))
5595  {
5596  /*
5597  * If the locking transaction is running, what we do depends on
5598  * whether the lock modes conflict: if they do, then we must wait for
5599  * it to finish; otherwise we can fall through to lock this tuple
5600  * version without waiting.
5601  */
5602  if (DoLockModesConflict(LOCKMODE_from_mxstatus(status),
5603  LOCKMODE_from_mxstatus(wantedstatus)))
5604  {
5605  *needwait = true;
5606  }
5607 
5608  /*
5609  * If we set needwait above, then this value doesn't matter;
5610  * otherwise, this value signals to caller that it's okay to proceed.
5611  */
5612  return HeapTupleMayBeUpdated;
5613  }
5614  else if (TransactionIdDidAbort(xid))
5615  return HeapTupleMayBeUpdated;
5616  else if (TransactionIdDidCommit(xid))
5617  {
5618  /*
5619  * The other transaction committed. If it was only a locker, then the
5620  * lock is completely gone now and we can return success; but if it
5621  * was an update, then what we do depends on whether the two lock
5622  * modes conflict. If they conflict, then we must report error to
5623  * caller. But if they don't, we can fall through to allow the current
5624  * transaction to lock the tuple.
5625  *
5626  * Note: the reason we worry about ISUPDATE here is because as soon as
5627  * a transaction ends, all its locks are gone and meaningless, and
5628  * thus we can ignore them; whereas its updates persist. In the
5629  * TransactionIdIsInProgress case, above, we don't need to check
5630  * because we know the lock is still "alive" and thus a conflict always
5631  * needs to be checked.
5632  */
5633  if (!ISUPDATE_from_mxstatus(status))
5634  return HeapTupleMayBeUpdated;
5635 
5636  if (DoLockModesConflict(LOCKMODE_from_mxstatus(status),
5637  LOCKMODE_from_mxstatus(wantedstatus)))
5638  /* bummer */
5639  return HeapTupleUpdated;
5640 
5641  return HeapTupleMayBeUpdated;
5642  }
5643 
5644  /* Not in progress, not aborted, not committed -- must have crashed */
5645  return HeapTupleMayBeUpdated;
5646 }
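/*
 * Descriptive summary of the outcomes above:
 *   xid is our own transaction               -> HeapTupleSelfUpdated
 *   xid in progress and the modes conflict   -> *needwait set to true
 *   xid committed a conflicting update       -> HeapTupleUpdated
 *   anything else (no conflict, aborted,
 *   crashed, or a finished locker)           -> HeapTupleMayBeUpdated
 */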
5647 
5648 
5649 /*
5650  * Recursive part of heap_lock_updated_tuple
5651  *
5652  * Fetch the tuple pointed to by tid in rel, and mark it as locked by the given
5653  * xid with the given mode; if this tuple is updated, recurse to lock the new
5654  * version as well.
5655  */
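/*
 * Note: despite the "recursive part" wording, the update chain is walked
 * iteratively; the for (;;) loop below re-targets mytup at each t_ctid
 * link, so long update chains do not consume extra stack space.
 */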
5656 static HTSU_Result
5657 heap_lock_updated_tuple_rec(Relation rel, ItemPointer tid, TransactionId xid,
5658  LockTupleMode mode)
5659 {
5660  HTSU_Result result;
5661  ItemPointerData tupid;
5662  HeapTupleData mytup;
5663  Buffer buf;
5664  uint16 new_infomask,
5665  new_infomask2,
5666  old_infomask,
5667  old_infomask2;
5668  TransactionId xmax,
5669  new_xmax;
5670  TransactionId priorXmax = InvalidTransactionId;
5671  bool cleared_all_frozen = false;
5672  Buffer vmbuffer = InvalidBuffer;
5673  BlockNumber block;
5674 
5675  ItemPointerCopy(tid, &tupid);
5676 
5677  for (;;)
5678  {
5679  new_infomask = 0;
5680  new_xmax = InvalidTransactionId;
5681  block = ItemPointerGetBlockNumber(&tupid);
5682  ItemPointerCopy(&tupid, &(mytup.t_self));
5683 
5684  if (!heap_fetch(rel, SnapshotAny, &mytup, &buf, false, NULL))
5685  {
5686  /*
5687  * if we fail to find the updated version of the tuple, it's
5688  * because it was vacuumed/pruned away after its creator
5689  * transaction aborted. So behave as if we got to the end of the
5690  * chain, and there's no further tuple to lock: return success to
5691  * caller.
5692  */
5693  return HeapTupleMayBeUpdated;
5694  }
5695 
5696 l4:
5697  CHECK_FOR_INTERRUPTS();
5698 
5699  /*
5700  * Before locking the buffer, pin the visibility map page if it
5701  * appears to be necessary. Since we haven't got the lock yet,
5702  * someone else might be in the middle of changing this, so we'll need
5703  * to recheck after we have the lock.
5704  */
5705  if (PageIsAllVisible(BufferGetPage(buf)))
5706  visibilitymap_pin(rel, block, &vmbuffer);
5707  else
5708  vmbuffer = InvalidBuffer;
5709 
5710  LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
5711 
5712  /*
5713  * If we didn't pin the visibility map page and the page has become
5714  * all visible while we were busy locking the buffer, we'll have to
5715  * unlock and re-lock, to avoid holding the buffer lock across I/O.
5716  * That's a bit unfortunate, but hopefully shouldn't happen often.
5717  */
5718  if (vmbuffer == InvalidBuffer && PageIsAllVisible(BufferGetPage(buf)))
5719  {
5720  LockBuffer(buf, BUFFER_LOCK_UNLOCK);
5721  visibilitymap_pin(rel, block, &vmbuffer);
5722  LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
5723  }
5724 
5725  /*
5726  * Check the tuple XMIN against prior XMAX, if any. If we reached the
5727  * end of the chain, we're done, so return success.
5728  */
5729  if (TransactionIdIsValid(priorXmax) &&
5730  !TransactionIdEquals(HeapTupleHeaderGetXmin(mytup.t_data),
5731  priorXmax))
5732  {
5733  result = HeapTupleMayBeUpdated;
5734  goto out_locked;
5735  }
5736 
5737  /*
5738  * Also check Xmin: if this tuple was created by an aborted
5739  * (sub)transaction, then we already locked the last live one in the
5740  * chain, thus we're done, so return success.
5741  */
5742  if (TransactionIdDidAbort(HeapTupleHeaderGetXmin(mytup.t_data)))
5743  {
5744  UnlockReleaseBuffer(buf);
5745  return HeapTupleMayBeUpdated;
5746  }
5747 
5748  old_infomask = mytup.t_data->t_infomask;
5749  old_infomask2 = mytup.t_data->t_infomask2;
5750  xmax = HeapTupleHeaderGetRawXmax(mytup.t_data);
5751 
5752  /*
5753  * If this tuple version has been updated or locked by some concurrent
5754  * transaction(s), what we do depends on whether our lock mode
5755  * conflicts with what those other transactions hold, and also on the
5756  * status of them.
5757  */
5758  if (!(old_infomask & HEAP_XMAX_INVALID))
5759  {
5760  TransactionId rawxmax;
5761  bool needwait;
5762 
5763  rawxmax = HeapTupleHeaderGetRawXmax(mytup.t_data);
5764  if (old_infomask & HEAP_XMAX_IS_MULTI)
5765  {
5766  int nmembers;
5767  int i;
5768  MultiXactMember *members;
5769 
5770  /*
5771  * We don't need a test for pg_upgrade'd tuples: this is only
5772  * applied to tuples after the first in an update chain. Said
5773  * first tuple in the chain may well be locked-in-9.2-and-
5774  * pg_upgraded, but that one was already locked by our caller,
5775  * not us; and any subsequent ones cannot be because our
5776  * caller must necessarily have obtained a snapshot later than
5777  * the pg_upgrade itself.
5778  */
5779  Assert(!HEAP_LOCKED_UPGRADED(mytup.t_data->t_infomask));
5780 
5781  nmembers = GetMultiXactIdMembers(rawxmax, &members, false,
5782  HEAP_XMAX_IS_LOCKED_ONLY(old_infomask));
5783  for (i = 0; i < nmembers; i++)
5784  {
5785  result = test_lockmode_for_conflict(members[i].status,
5786  members[i].xid,
5787  mode, &needwait);
5788 
5789  if (needwait)
5790  {
5791  LockBuffer(buf, BUFFER_LOCK_UNLOCK);
5792  XactLockTableWait(members[i].xid, rel,
5793  &mytup.t_self,
5794  XLTW_LockUpdated);
5795  pfree(members);
5796  goto l4;
5797  }
5798  if (result != HeapTupleMayBeUpdated)
5799  {
5800  pfree(members);
5801  goto out_locked;
5802  }
5803  }
5804  if (members)
5805  pfree(members);
5806  }
5807  else
5808  {
5809  MultiXactStatus status;
5810 
5811  /*
5812  * For a non-multi Xmax, we first need to compute the
5813  * corresponding MultiXactStatus by using the infomask bits.
5814  */
5815  if (HEAP_XMAX_IS_LOCKED_ONLY(old_infomask))
5816  {
5817  if (HEAP_XMAX_IS_KEYSHR_LOCKED(old_infomask))
5818  status = MultiXactStatusForKeyShare;
5819  else if (HEAP_XMAX_IS_SHR_LOCKED(old_infomask))
5820  status = MultiXactStatusForShare;
5821  else if (HEAP_XMAX_IS_EXCL_LOCKED(old_infomask))
5822  {
5823  if (old_infomask2 & HEAP_KEYS_UPDATED)
5824  status = MultiXactStatusForUpdate;
5825  else
5826  status = MultiXactStatusForNoKeyUpdate;
5827  }
5828  else
5829  {
5830  /*
5831  * LOCK_ONLY present alone (a pg_upgraded tuple marked
5832  * as share-locked in the old cluster) shouldn't be
5833  * seen in the middle of an update chain.
5834  */
5835  elog(ERROR, "invalid lock status in tuple");
5836  }
5837  }
5838  else
5839  {
5840  /* it's an update, but which kind? */
5841  if (old_infomask2 & HEAP_KEYS_UPDATED)
5842  status = MultiXactStatusUpdate;
5843  else
5844  status = MultiXactStatusNoKeyUpdate;
5845  }
5846 
5847  result = test_lockmode_for_conflict(status, rawxmax, mode,
5848  &needwait);
5849  if (needwait)
5850  {
5851  LockBuffer(buf, BUFFER_LOCK_UNLOCK);
5852  XactLockTableWait(rawxmax, rel, &mytup.t_self,
5853  XLTW_LockUpdated);
5854  goto l4;
5855  }
5856  if (result != HeapTupleMayBeUpdated)
5857  {
5858  goto out_locked;
5859  }
5860  }
5861  }
5862 
5863  /* compute the new Xmax and infomask values for the tuple ... */
5864  compute_new_xmax_infomask(xmax, old_infomask, mytup.t_data->t_infomask2,
5865