heapam.c
1 /*-------------------------------------------------------------------------
2  *
3  * heapam.c
4  * heap access method code
5  *
6  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  *
10  * IDENTIFICATION
11  * src/backend/access/heap/heapam.c
12  *
13  *
14  * INTERFACE ROUTINES
15  * relation_open - open any relation by relation OID
16  * relation_openrv - open any relation specified by a RangeVar
17  * relation_close - close any relation
18  * heap_open - open a heap relation by relation OID
19  * heap_openrv - open a heap relation specified by a RangeVar
20  * heap_close - (now just a macro for relation_close)
21  * heap_beginscan - begin relation scan
22  * heap_rescan - restart a relation scan
23  * heap_endscan - end relation scan
24  * heap_getnext - retrieve next tuple in scan
25  * heap_fetch - retrieve tuple with given tid
26  * heap_insert - insert tuple into a relation
27  * heap_multi_insert - insert multiple tuples into a relation
28  * heap_delete - delete a tuple from a relation
29  * heap_update - replace a tuple in a relation with another tuple
30  * heap_sync - sync heap, for when no WAL has been written
31  *
32  * NOTES
33  * This file contains the heap_ routines which implement
34  * the POSTGRES heap access method used for all POSTGRES
35  * relations.
36  *
37  *-------------------------------------------------------------------------
38  */
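For orientation, here is an editor's illustration (not part of heapam.c) of how the interface routines listed above are typically combined by a caller that is already inside a transaction with an active snapshot; example_count_tuples and tableoid are hypothetical names:

/* Editor's illustration only: a minimal open/scan/close sketch. */
static void
example_count_tuples(Oid tableoid)          /* hypothetical helper */
{
    Relation     rel;
    HeapScanDesc scan;
    HeapTuple    tuple;
    long         ntuples = 0;

    rel = heap_open(tableoid, AccessShareLock);
    scan = heap_beginscan(rel, GetActiveSnapshot(), 0, NULL);
    while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
        ntuples++;                          /* tuple storage is owned by the scan */
    heap_endscan(scan);
    heap_close(rel, AccessShareLock);

    elog(DEBUG1, "relation %u has %ld visible tuples", tableoid, ntuples);
}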
39 #include "postgres.h"
40 
41 #include "access/bufmask.h"
42 #include "access/heapam.h"
43 #include "access/heapam_xlog.h"
44 #include "access/hio.h"
45 #include "access/multixact.h"
46 #include "access/parallel.h"
47 #include "access/relscan.h"
48 #include "access/sysattr.h"
49 #include "access/transam.h"
50 #include "access/tuptoaster.h"
51 #include "access/valid.h"
52 #include "access/visibilitymap.h"
53 #include "access/xact.h"
54 #include "access/xlog.h"
55 #include "access/xloginsert.h"
56 #include "access/xlogutils.h"
57 #include "catalog/catalog.h"
58 #include "catalog/namespace.h"
59 #include "miscadmin.h"
60 #include "pgstat.h"
61 #include "port/atomics.h"
62 #include "storage/bufmgr.h"
63 #include "storage/freespace.h"
64 #include "storage/lmgr.h"
65 #include "storage/predicate.h"
66 #include "storage/procarray.h"
67 #include "storage/smgr.h"
68 #include "storage/spin.h"
69 #include "storage/standby.h"
70 #include "utils/datum.h"
71 #include "utils/inval.h"
72 #include "utils/lsyscache.h"
73 #include "utils/relcache.h"
74 #include "utils/snapmgr.h"
75 #include "utils/syscache.h"
76 #include "utils/tqual.h"
77 
78 
79 /* GUC variable */
 80 bool synchronize_seqscans = true;
 81 
82 
 83 static HeapScanDesc heap_beginscan_internal(Relation relation,
 84  Snapshot snapshot,
85  int nkeys, ScanKey key,
86  ParallelHeapScanDesc parallel_scan,
87  bool allow_strat,
88  bool allow_sync,
89  bool allow_pagemode,
90  bool is_bitmapscan,
91  bool is_samplescan,
 92  bool temp_snap);
 93 static void heap_parallelscan_startblock_init(HeapScanDesc scan);
 94 static BlockNumber heap_parallelscan_nextpage(HeapScanDesc scan);
95 static HeapTuple heap_prepare_insert(Relation relation, HeapTuple tup,
96  TransactionId xid, CommandId cid, int options);
97 static XLogRecPtr log_heap_update(Relation reln, Buffer oldbuf,
98  Buffer newbuf, HeapTuple oldtup,
99  HeapTuple newtup, HeapTuple old_key_tup,
100  bool all_visible_cleared, bool new_all_visible_cleared);
 101 static Bitmapset *HeapDetermineModifiedColumns(Relation relation,
 102  Bitmapset *interesting_cols,
103  HeapTuple oldtup, HeapTuple newtup);
104 static bool heap_acquire_tuplock(Relation relation, ItemPointer tid,
105  LockTupleMode mode, LockWaitPolicy wait_policy,
106  bool *have_tuple_lock);
107 static void compute_new_xmax_infomask(TransactionId xmax, uint16 old_infomask,
108  uint16 old_infomask2, TransactionId add_to_xmax,
109  LockTupleMode mode, bool is_update,
110  TransactionId *result_xmax, uint16 *result_infomask,
111  uint16 *result_infomask2);
 112 static HTSU_Result heap_lock_updated_tuple(Relation rel, HeapTuple tuple,
 113  ItemPointer ctid, TransactionId xid,
114  LockTupleMode mode);
115 static void GetMultiXactIdHintBits(MultiXactId multi, uint16 *new_infomask,
116  uint16 *new_infomask2);
 117 static TransactionId MultiXactIdGetUpdateXid(TransactionId xmax,
 118  uint16 t_infomask);
119 static bool DoesMultiXactIdConflict(MultiXactId multi, uint16 infomask,
120  LockTupleMode lockmode);
121 static void MultiXactIdWait(MultiXactId multi, MultiXactStatus status, uint16 infomask,
122  Relation rel, ItemPointer ctid, XLTW_Oper oper,
123  int *remaining);
 124 static bool ConditionalMultiXactIdWait(MultiXactId multi, MultiXactStatus status,
 125  uint16 infomask, Relation rel, int *remaining);
126 static XLogRecPtr log_heap_new_cid(Relation relation, HeapTuple tup);
127 static HeapTuple ExtractReplicaIdentity(Relation rel, HeapTuple tup, bool key_modified,
128  bool *copy);
129 
130 
131 /*
132  * Each tuple lock mode has a corresponding heavyweight lock, and one or two
133  * corresponding MultiXactStatuses (one to merely lock tuples, another one to
134  * update them). This table (and the macros below) helps us determine the
135  * heavyweight lock mode and MultiXactStatus values to use for any particular
136  * tuple lock strength.
137  *
138  * Don't look at lockstatus/updstatus directly! Use get_mxact_status_for_lock
139  * instead.
140  */
 141 static const struct
 142 {
 143  LOCKMODE hwlock;
 144  int lockstatus;
 145  int updstatus;
 146 }
 147 
 148  tupleLockExtraInfo[MaxLockTupleMode + 1] =
 149 {
 150  { /* LockTupleKeyShare */
 151  AccessShareLock,
 152  MultiXactStatusForKeyShare,
 153  -1 /* KeyShare does not allow updating tuples */
 154  },
 155  { /* LockTupleShare */
 156  RowShareLock,
 157  MultiXactStatusForShare,
 158  -1 /* Share does not allow updating tuples */
 159  },
 160  { /* LockTupleNoKeyExclusive */
 161  ExclusiveLock,
 162  MultiXactStatusForNoKeyUpdate,
 163  MultiXactStatusNoKeyUpdate
 164  },
 165  { /* LockTupleExclusive */
 166  AccessExclusiveLock,
 167  MultiXactStatusForUpdate,
 168  MultiXactStatusUpdate
 169  }
 170 };
171 
172 /* Get the LOCKMODE for a given MultiXactStatus */
173 #define LOCKMODE_from_mxstatus(status) \
174  (tupleLockExtraInfo[TUPLOCK_from_mxstatus((status))].hwlock)
175 
176 /*
177  * Acquire heavyweight locks on tuples, using a LockTupleMode strength value.
178  * This is more readable than having every caller translate it to lock.h's
179  * LOCKMODE.
180  */
181 #define LockTupleTuplock(rel, tup, mode) \
182  LockTuple((rel), (tup), tupleLockExtraInfo[mode].hwlock)
183 #define UnlockTupleTuplock(rel, tup, mode) \
184  UnlockTuple((rel), (tup), tupleLockExtraInfo[mode].hwlock)
185 #define ConditionalLockTupleTuplock(rel, tup, mode) \
186  ConditionalLockTuple((rel), (tup), tupleLockExtraInfo[mode].hwlock)
187 
188 /*
 189  * This table maps each MultiXactStatus value to the corresponding tuple
 190  * lock strength.
191  */
 192 static const int MultiXactStatusLock[MaxMultiXactStatus + 1] =
 193 {
194  LockTupleKeyShare, /* ForKeyShare */
195  LockTupleShare, /* ForShare */
196  LockTupleNoKeyExclusive, /* ForNoKeyUpdate */
197  LockTupleExclusive, /* ForUpdate */
198  LockTupleNoKeyExclusive, /* NoKeyUpdate */
199  LockTupleExclusive /* Update */
200 };
201 
202 /* Get the LockTupleMode for a given MultiXactStatus */
203 #define TUPLOCK_from_mxstatus(status) \
204  (MultiXactStatusLock[(status)])
205 
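As a concrete reading of the two tables (an editor's note, not part of heapam.c): a MultiXact member whose status is ForShare maps back to LockTupleShare, whose heavyweight lock is RowShareLock, while a ForKeyShare member maps to LockTupleKeyShare:

/* Editor's illustration of how the mapping macros compose. */
Assert(TUPLOCK_from_mxstatus(MultiXactStatusForKeyShare) == LockTupleKeyShare);
Assert(TUPLOCK_from_mxstatus(MultiXactStatusForShare) == LockTupleShare);
Assert(LOCKMODE_from_mxstatus(MultiXactStatusForShare) == RowShareLock);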
206 /* ----------------------------------------------------------------
207  * heap support routines
208  * ----------------------------------------------------------------
209  */
210 
211 /* ----------------
212  * initscan - scan code common to heap_beginscan and heap_rescan
213  * ----------------
214  */
215 static void
216 initscan(HeapScanDesc scan, ScanKey key, bool keep_startblock)
217 {
218  bool allow_strat;
219  bool allow_sync;
220 
221  /*
222  * Determine the number of blocks we have to scan.
223  *
224  * It is sufficient to do this once at scan start, since any tuples added
225  * while the scan is in progress will be invisible to my snapshot anyway.
226  * (That is not true when using a non-MVCC snapshot. However, we couldn't
227  * guarantee to return tuples added after scan start anyway, since they
228  * might go into pages we already scanned. To guarantee consistent
229  * results for a non-MVCC snapshot, the caller must hold some higher-level
230  * lock that ensures the interesting tuple(s) won't change.)
231  */
232  if (scan->rs_parallel != NULL)
233  scan->rs_nblocks = scan->rs_parallel->phs_nblocks;
234  else
 235  scan->rs_nblocks = RelationGetNumberOfBlocks(scan->rs_rd);
 236 
237  /*
238  * If the table is large relative to NBuffers, use a bulk-read access
239  * strategy and enable synchronized scanning (see syncscan.c). Although
240  * the thresholds for these features could be different, we make them the
241  * same so that there are only two behaviors to tune rather than four.
242  * (However, some callers need to be able to disable one or both of these
243  * behaviors, independently of the size of the table; also there is a GUC
244  * variable that can disable synchronized scanning.)
245  *
246  * Note that heap_parallelscan_initialize has a very similar test; if you
247  * change this, consider changing that one, too.
248  */
249  if (!RelationUsesLocalBuffers(scan->rs_rd) &&
250  scan->rs_nblocks > NBuffers / 4)
251  {
252  allow_strat = scan->rs_allow_strat;
253  allow_sync = scan->rs_allow_sync;
254  }
255  else
256  allow_strat = allow_sync = false;
257 
258  if (allow_strat)
259  {
260  /* During a rescan, keep the previous strategy object. */
261  if (scan->rs_strategy == NULL)
 262  scan->rs_strategy = GetAccessStrategy(BAS_BULKREAD);
 263  }
264  else
265  {
266  if (scan->rs_strategy != NULL)
 267  FreeAccessStrategy(scan->rs_strategy);
 268  scan->rs_strategy = NULL;
269  }
270 
271  if (scan->rs_parallel != NULL)
272  {
273  /* For parallel scan, believe whatever ParallelHeapScanDesc says. */
274  scan->rs_syncscan = scan->rs_parallel->phs_syncscan;
275  }
276  else if (keep_startblock)
277  {
278  /*
279  * When rescanning, we want to keep the previous startblock setting,
280  * so that rewinding a cursor doesn't generate surprising results.
281  * Reset the active syncscan setting, though.
282  */
283  scan->rs_syncscan = (allow_sync && synchronize_seqscans);
284  }
285  else if (allow_sync && synchronize_seqscans)
286  {
287  scan->rs_syncscan = true;
288  scan->rs_startblock = ss_get_location(scan->rs_rd, scan->rs_nblocks);
289  }
290  else
291  {
292  scan->rs_syncscan = false;
293  scan->rs_startblock = 0;
294  }
295 
 296  scan->rs_numblocks = InvalidBlockNumber;
 297  scan->rs_inited = false;
 298  scan->rs_ctup.t_data = NULL;
 299  ItemPointerSetInvalid(&scan->rs_ctup.t_self);
 300  scan->rs_cbuf = InvalidBuffer;
 301  scan->rs_cblock = InvalidBlockNumber;
 302 
303  /* page-at-a-time fields are always invalid when not rs_inited */
304 
305  /*
306  * copy the scan key, if appropriate
307  */
308  if (key != NULL)
309  memcpy(scan->rs_key, key, scan->rs_nkeys * sizeof(ScanKeyData));
310 
311  /*
312  * Currently, we don't have a stats counter for bitmap heap scans (but the
313  * underlying bitmap index scans will be counted) or sample scans (we only
314  * update stats for tuple fetches there)
315  */
316  if (!scan->rs_bitmapscan && !scan->rs_samplescan)
 317  pgstat_count_heap_scan(scan->rs_rd);
 318 }
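An editor's note to make the NBuffers / 4 heuristic above concrete: with the default 8 kB block size and shared_buffers = 128 MB, NBuffers is 16384, so bulk-read strategy and synchronized scans are only considered for relations of more than 4096 blocks (about 32 MB). A caller-side restatement of the test, where rel is an assumed already-open relation:

/* Editor's illustration of initscan's size test (not part of heapam.c). */
BlockNumber nblocks = RelationGetNumberOfBlocks(rel);
bool        use_bulkread_and_syncscan =
    !RelationUsesLocalBuffers(rel) && nblocks > (BlockNumber) (NBuffers / 4);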
319 
320 /*
321  * heap_setscanlimits - restrict range of a heapscan
322  *
323  * startBlk is the page to start at
324  * numBlks is number of pages to scan (InvalidBlockNumber means "all")
325  */
326 void
 327 heap_setscanlimits(HeapScanDesc scan, BlockNumber startBlk, BlockNumber numBlks)
 328 {
329  Assert(!scan->rs_inited); /* else too late to change */
330  Assert(!scan->rs_syncscan); /* else rs_startblock is significant */
331 
332  /* Check startBlk is valid (but allow case of zero blocks...) */
333  Assert(startBlk == 0 || startBlk < scan->rs_nblocks);
334 
335  scan->rs_startblock = startBlk;
336  scan->rs_numblocks = numBlks;
337 }
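heap_setscanlimits() must be called before the first page is read and only on a scan that is not using synchronized scanning, so callers normally start the scan through heap_beginscan_strat() with allow_sync disabled. An editor's sketch, with rel, snapshot, start_block, and nblocks_to_scan assumed to be supplied by the caller:

/* Editor's illustration: scan only a contiguous block range. */
HeapScanDesc scan = heap_beginscan_strat(rel, snapshot,
                                         0, NULL,   /* no scan keys */
                                         true,      /* allow_strat */
                                         false);    /* allow_sync off: rs_startblock stays 0 */

heap_setscanlimits(scan, start_block, nblocks_to_scan);
/* heap_getnext() will now return only tuples from that block range */
heap_endscan(scan);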
338 
339 /*
340  * heapgetpage - subroutine for heapgettup()
341  *
342  * This routine reads and pins the specified page of the relation.
343  * In page-at-a-time mode it performs additional work, namely determining
344  * which tuples on the page are visible.
345  */
346 void
 347 heapgetpage(HeapScanDesc scan, BlockNumber page)
 348 {
349  Buffer buffer;
350  Snapshot snapshot;
351  Page dp;
352  int lines;
353  int ntup;
354  OffsetNumber lineoff;
355  ItemId lpp;
356  bool all_visible;
357 
358  Assert(page < scan->rs_nblocks);
359 
360  /* release previous scan buffer, if any */
361  if (BufferIsValid(scan->rs_cbuf))
362  {
363  ReleaseBuffer(scan->rs_cbuf);
364  scan->rs_cbuf = InvalidBuffer;
365  }
366 
367  /*
368  * Be sure to check for interrupts at least once per page. Checks at
369  * higher code levels won't be able to stop a seqscan that encounters many
370  * pages' worth of consecutive dead tuples.
371  */
 372  CHECK_FOR_INTERRUPTS();
 373 
374  /* read page using selected strategy */
375  scan->rs_cbuf = ReadBufferExtended(scan->rs_rd, MAIN_FORKNUM, page,
376  RBM_NORMAL, scan->rs_strategy);
377  scan->rs_cblock = page;
378 
379  if (!scan->rs_pageatatime)
380  return;
381 
382  buffer = scan->rs_cbuf;
383  snapshot = scan->rs_snapshot;
384 
385  /*
386  * Prune and repair fragmentation for the whole page, if possible.
387  */
388  heap_page_prune_opt(scan->rs_rd, buffer);
389 
390  /*
391  * We must hold share lock on the buffer content while examining tuple
392  * visibility. Afterwards, however, the tuples we have found to be
393  * visible are guaranteed good as long as we hold the buffer pin.
394  */
395  LockBuffer(buffer, BUFFER_LOCK_SHARE);
396 
397  dp = BufferGetPage(buffer);
398  TestForOldSnapshot(snapshot, scan->rs_rd, dp);
399  lines = PageGetMaxOffsetNumber(dp);
400  ntup = 0;
401 
402  /*
403  * If the all-visible flag indicates that all tuples on the page are
404  * visible to everyone, we can skip the per-tuple visibility tests.
405  *
406  * Note: In hot standby, a tuple that's already visible to all
407  * transactions in the master might still be invisible to a read-only
408  * transaction in the standby. We partly handle this problem by tracking
409  * the minimum xmin of visible tuples as the cut-off XID while marking a
410  * page all-visible on master and WAL log that along with the visibility
411  * map SET operation. In hot standby, we wait for (or abort) all
 412  * transactions that potentially cannot see one or more tuples on the
413  * page. That's how index-only scans work fine in hot standby. A crucial
414  * difference between index-only scans and heap scans is that the
 415  * index-only scan completely relies on the visibility map, whereas a heap
416  * scan looks at the page-level PD_ALL_VISIBLE flag. We are not sure if
417  * the page-level flag can be trusted in the same way, because it might
418  * get propagated somehow without being explicitly WAL-logged, e.g. via a
419  * full page write. Until we can prove that beyond doubt, let's check each
420  * tuple for visibility the hard way.
421  */
422  all_visible = PageIsAllVisible(dp) && !snapshot->takenDuringRecovery;
423 
424  for (lineoff = FirstOffsetNumber, lpp = PageGetItemId(dp, lineoff);
425  lineoff <= lines;
426  lineoff++, lpp++)
427  {
428  if (ItemIdIsNormal(lpp))
429  {
430  HeapTupleData loctup;
431  bool valid;
432 
433  loctup.t_tableOid = RelationGetRelid(scan->rs_rd);
434  loctup.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
435  loctup.t_len = ItemIdGetLength(lpp);
436  ItemPointerSet(&(loctup.t_self), page, lineoff);
437 
438  if (all_visible)
439  valid = true;
440  else
441  valid = HeapTupleSatisfiesVisibility(&loctup, snapshot, buffer);
442 
443  CheckForSerializableConflictOut(valid, scan->rs_rd, &loctup,
444  buffer, snapshot);
445 
446  if (valid)
447  scan->rs_vistuples[ntup++] = lineoff;
448  }
449  }
450 
 451  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
 452 
453  Assert(ntup <= MaxHeapTuplesPerPage);
454  scan->rs_ntuples = ntup;
455 }
456 
457 /* ----------------
458  * heapgettup - fetch next heap tuple
459  *
460  * Initialize the scan if not already done; then advance to the next
461  * tuple as indicated by "dir"; return the next tuple in scan->rs_ctup,
462  * or set scan->rs_ctup.t_data = NULL if no more tuples.
463  *
464  * dir == NoMovementScanDirection means "re-fetch the tuple indicated
465  * by scan->rs_ctup".
466  *
467  * Note: the reason nkeys/key are passed separately, even though they are
468  * kept in the scan descriptor, is that the caller may not want us to check
469  * the scankeys.
470  *
471  * Note: when we fall off the end of the scan in either direction, we
472  * reset rs_inited. This means that a further request with the same
473  * scan direction will restart the scan, which is a bit odd, but a
474  * request with the opposite scan direction will start a fresh scan
475  * in the proper direction. The latter is required behavior for cursors,
476  * while the former case is generally undefined behavior in Postgres
477  * so we don't care too much.
478  * ----------------
479  */
480 static void
 481 heapgettup(HeapScanDesc scan,
 482  ScanDirection dir,
483  int nkeys,
484  ScanKey key)
485 {
486  HeapTuple tuple = &(scan->rs_ctup);
487  Snapshot snapshot = scan->rs_snapshot;
488  bool backward = ScanDirectionIsBackward(dir);
489  BlockNumber page;
490  bool finished;
491  Page dp;
492  int lines;
493  OffsetNumber lineoff;
494  int linesleft;
495  ItemId lpp;
496 
497  /*
498  * calculate next starting lineoff, given scan direction
499  */
500  if (ScanDirectionIsForward(dir))
501  {
502  if (!scan->rs_inited)
503  {
504  /*
505  * return null immediately if relation is empty
506  */
507  if (scan->rs_nblocks == 0 || scan->rs_numblocks == 0)
508  {
509  Assert(!BufferIsValid(scan->rs_cbuf));
510  tuple->t_data = NULL;
511  return;
512  }
513  if (scan->rs_parallel != NULL)
514  {
 515  heap_parallelscan_startblock_init(scan);
 516 
517  page = heap_parallelscan_nextpage(scan);
518 
519  /* Other processes might have already finished the scan. */
520  if (page == InvalidBlockNumber)
521  {
522  Assert(!BufferIsValid(scan->rs_cbuf));
523  tuple->t_data = NULL;
524  return;
525  }
526  }
527  else
528  page = scan->rs_startblock; /* first page */
529  heapgetpage(scan, page);
530  lineoff = FirstOffsetNumber; /* first offnum */
531  scan->rs_inited = true;
532  }
533  else
534  {
535  /* continue from previously returned page/tuple */
536  page = scan->rs_cblock; /* current page */
537  lineoff = /* next offnum */
 538  OffsetNumberNext(ItemPointerGetOffsetNumber(&(tuple->t_self)));
 539  }
540 
 541  LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);
 542 
543  dp = BufferGetPage(scan->rs_cbuf);
544  TestForOldSnapshot(snapshot, scan->rs_rd, dp);
545  lines = PageGetMaxOffsetNumber(dp);
546  /* page and lineoff now reference the physically next tid */
547 
548  linesleft = lines - lineoff + 1;
549  }
550  else if (backward)
551  {
552  /* backward parallel scan not supported */
553  Assert(scan->rs_parallel == NULL);
554 
555  if (!scan->rs_inited)
556  {
557  /*
558  * return null immediately if relation is empty
559  */
560  if (scan->rs_nblocks == 0 || scan->rs_numblocks == 0)
561  {
562  Assert(!BufferIsValid(scan->rs_cbuf));
563  tuple->t_data = NULL;
564  return;
565  }
566 
567  /*
568  * Disable reporting to syncscan logic in a backwards scan; it's
569  * not very likely anyone else is doing the same thing at the same
570  * time, and much more likely that we'll just bollix things for
571  * forward scanners.
572  */
573  scan->rs_syncscan = false;
574  /* start from last page of the scan */
575  if (scan->rs_startblock > 0)
576  page = scan->rs_startblock - 1;
577  else
578  page = scan->rs_nblocks - 1;
579  heapgetpage(scan, page);
580  }
581  else
582  {
583  /* continue from previously returned page/tuple */
584  page = scan->rs_cblock; /* current page */
585  }
586 
 587  LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);
 588 
589  dp = BufferGetPage(scan->rs_cbuf);
590  TestForOldSnapshot(snapshot, scan->rs_rd, dp);
591  lines = PageGetMaxOffsetNumber(dp);
592 
593  if (!scan->rs_inited)
594  {
595  lineoff = lines; /* final offnum */
596  scan->rs_inited = true;
597  }
598  else
599  {
600  lineoff = /* previous offnum */
 601  OffsetNumberPrev(ItemPointerGetOffsetNumber(&(tuple->t_self)));
 602  }
603  /* page and lineoff now reference the physically previous tid */
604 
605  linesleft = lineoff;
606  }
607  else
608  {
609  /*
610  * ``no movement'' scan direction: refetch prior tuple
611  */
612  if (!scan->rs_inited)
613  {
614  Assert(!BufferIsValid(scan->rs_cbuf));
615  tuple->t_data = NULL;
616  return;
617  }
618 
619  page = ItemPointerGetBlockNumber(&(tuple->t_self));
620  if (page != scan->rs_cblock)
621  heapgetpage(scan, page);
622 
623  /* Since the tuple was previously fetched, needn't lock page here */
624  dp = BufferGetPage(scan->rs_cbuf);
625  TestForOldSnapshot(snapshot, scan->rs_rd, dp);
626  lineoff = ItemPointerGetOffsetNumber(&(tuple->t_self));
627  lpp = PageGetItemId(dp, lineoff);
628  Assert(ItemIdIsNormal(lpp));
629 
630  tuple->t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
631  tuple->t_len = ItemIdGetLength(lpp);
632 
633  return;
634  }
635 
636  /*
637  * advance the scan until we find a qualifying tuple or run out of stuff
638  * to scan
639  */
640  lpp = PageGetItemId(dp, lineoff);
641  for (;;)
642  {
643  while (linesleft > 0)
644  {
645  if (ItemIdIsNormal(lpp))
646  {
647  bool valid;
648 
649  tuple->t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
650  tuple->t_len = ItemIdGetLength(lpp);
651  ItemPointerSet(&(tuple->t_self), page, lineoff);
652 
653  /*
654  * if current tuple qualifies, return it.
655  */
656  valid = HeapTupleSatisfiesVisibility(tuple,
657  snapshot,
658  scan->rs_cbuf);
659 
660  CheckForSerializableConflictOut(valid, scan->rs_rd, tuple,
661  scan->rs_cbuf, snapshot);
662 
663  if (valid && key != NULL)
664  HeapKeyTest(tuple, RelationGetDescr(scan->rs_rd),
665  nkeys, key, valid);
666 
667  if (valid)
668  {
 669  LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK);
 670  return;
671  }
672  }
673 
674  /*
675  * otherwise move to the next item on the page
676  */
677  --linesleft;
678  if (backward)
679  {
680  --lpp; /* move back in this page's ItemId array */
681  --lineoff;
682  }
683  else
684  {
685  ++lpp; /* move forward in this page's ItemId array */
686  ++lineoff;
687  }
688  }
689 
690  /*
691  * if we get here, it means we've exhausted the items on this page and
692  * it's time to move to the next.
693  */
 694  LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK);
 695 
696  /*
697  * advance to next/prior page and detect end of scan
698  */
699  if (backward)
700  {
701  finished = (page == scan->rs_startblock) ||
702  (scan->rs_numblocks != InvalidBlockNumber ? --scan->rs_numblocks == 0 : false);
703  if (page == 0)
704  page = scan->rs_nblocks;
705  page--;
706  }
707  else if (scan->rs_parallel != NULL)
708  {
709  page = heap_parallelscan_nextpage(scan);
710  finished = (page == InvalidBlockNumber);
711  }
712  else
713  {
714  page++;
715  if (page >= scan->rs_nblocks)
716  page = 0;
717  finished = (page == scan->rs_startblock) ||
718  (scan->rs_numblocks != InvalidBlockNumber ? --scan->rs_numblocks == 0 : false);
719 
720  /*
721  * Report our new scan position for synchronization purposes. We
722  * don't do that when moving backwards, however. That would just
723  * mess up any other forward-moving scanners.
724  *
725  * Note: we do this before checking for end of scan so that the
726  * final state of the position hint is back at the start of the
727  * rel. That's not strictly necessary, but otherwise when you run
728  * the same query multiple times the starting position would shift
729  * a little bit backwards on every invocation, which is confusing.
730  * We don't guarantee any specific ordering in general, though.
731  */
732  if (scan->rs_syncscan)
733  ss_report_location(scan->rs_rd, page);
734  }
735 
736  /*
737  * return NULL if we've exhausted all the pages
738  */
739  if (finished)
740  {
741  if (BufferIsValid(scan->rs_cbuf))
742  ReleaseBuffer(scan->rs_cbuf);
743  scan->rs_cbuf = InvalidBuffer;
 744  scan->rs_cblock = InvalidBlockNumber;
 745  tuple->t_data = NULL;
746  scan->rs_inited = false;
747  return;
748  }
749 
750  heapgetpage(scan, page);
751 
 752  LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);
 753 
754  dp = BufferGetPage(scan->rs_cbuf);
755  TestForOldSnapshot(snapshot, scan->rs_rd, dp);
756  lines = PageGetMaxOffsetNumber((Page) dp);
757  linesleft = lines;
758  if (backward)
759  {
760  lineoff = lines;
761  lpp = PageGetItemId(dp, lines);
762  }
763  else
764  {
765  lineoff = FirstOffsetNumber;
766  lpp = PageGetItemId(dp, FirstOffsetNumber);
767  }
768  }
769 }
770 
771 /* ----------------
772  * heapgettup_pagemode - fetch next heap tuple in page-at-a-time mode
773  *
774  * Same API as heapgettup, but used in page-at-a-time mode
775  *
776  * The internal logic is much the same as heapgettup's too, but there are some
777  * differences: we do not take the buffer content lock (that only needs to
778  * happen inside heapgetpage), and we iterate through just the tuples listed
779  * in rs_vistuples[] rather than all tuples on the page. Notice that
780  * lineindex is 0-based, where the corresponding loop variable lineoff in
781  * heapgettup is 1-based.
782  * ----------------
783  */
784 static void
 785 heapgettup_pagemode(HeapScanDesc scan,
 786  ScanDirection dir,
787  int nkeys,
788  ScanKey key)
789 {
790  HeapTuple tuple = &(scan->rs_ctup);
791  bool backward = ScanDirectionIsBackward(dir);
792  BlockNumber page;
793  bool finished;
794  Page dp;
795  int lines;
796  int lineindex;
797  OffsetNumber lineoff;
798  int linesleft;
799  ItemId lpp;
800 
801  /*
802  * calculate next starting lineindex, given scan direction
803  */
804  if (ScanDirectionIsForward(dir))
805  {
806  if (!scan->rs_inited)
807  {
808  /*
809  * return null immediately if relation is empty
810  */
811  if (scan->rs_nblocks == 0 || scan->rs_numblocks == 0)
812  {
813  Assert(!BufferIsValid(scan->rs_cbuf));
814  tuple->t_data = NULL;
815  return;
816  }
817  if (scan->rs_parallel != NULL)
818  {
820 
821  page = heap_parallelscan_nextpage(scan);
822 
823  /* Other processes might have already finished the scan. */
824  if (page == InvalidBlockNumber)
825  {
826  Assert(!BufferIsValid(scan->rs_cbuf));
827  tuple->t_data = NULL;
828  return;
829  }
830  }
831  else
832  page = scan->rs_startblock; /* first page */
833  heapgetpage(scan, page);
834  lineindex = 0;
835  scan->rs_inited = true;
836  }
837  else
838  {
839  /* continue from previously returned page/tuple */
840  page = scan->rs_cblock; /* current page */
841  lineindex = scan->rs_cindex + 1;
842  }
843 
844  dp = BufferGetPage(scan->rs_cbuf);
845  TestForOldSnapshot(scan->rs_snapshot, scan->rs_rd, dp);
846  lines = scan->rs_ntuples;
847  /* page and lineindex now reference the next visible tid */
848 
849  linesleft = lines - lineindex;
850  }
851  else if (backward)
852  {
853  /* backward parallel scan not supported */
854  Assert(scan->rs_parallel == NULL);
855 
856  if (!scan->rs_inited)
857  {
858  /*
859  * return null immediately if relation is empty
860  */
861  if (scan->rs_nblocks == 0 || scan->rs_numblocks == 0)
862  {
863  Assert(!BufferIsValid(scan->rs_cbuf));
864  tuple->t_data = NULL;
865  return;
866  }
867 
868  /*
869  * Disable reporting to syncscan logic in a backwards scan; it's
870  * not very likely anyone else is doing the same thing at the same
871  * time, and much more likely that we'll just bollix things for
872  * forward scanners.
873  */
874  scan->rs_syncscan = false;
875  /* start from last page of the scan */
876  if (scan->rs_startblock > 0)
877  page = scan->rs_startblock - 1;
878  else
879  page = scan->rs_nblocks - 1;
880  heapgetpage(scan, page);
881  }
882  else
883  {
884  /* continue from previously returned page/tuple */
885  page = scan->rs_cblock; /* current page */
886  }
887 
888  dp = BufferGetPage(scan->rs_cbuf);
889  TestForOldSnapshot(scan->rs_snapshot, scan->rs_rd, dp);
890  lines = scan->rs_ntuples;
891 
892  if (!scan->rs_inited)
893  {
894  lineindex = lines - 1;
895  scan->rs_inited = true;
896  }
897  else
898  {
899  lineindex = scan->rs_cindex - 1;
900  }
901  /* page and lineindex now reference the previous visible tid */
902 
903  linesleft = lineindex + 1;
904  }
905  else
906  {
907  /*
908  * ``no movement'' scan direction: refetch prior tuple
909  */
910  if (!scan->rs_inited)
911  {
912  Assert(!BufferIsValid(scan->rs_cbuf));
913  tuple->t_data = NULL;
914  return;
915  }
916 
917  page = ItemPointerGetBlockNumber(&(tuple->t_self));
918  if (page != scan->rs_cblock)
919  heapgetpage(scan, page);
920 
921  /* Since the tuple was previously fetched, needn't lock page here */
922  dp = BufferGetPage(scan->rs_cbuf);
923  TestForOldSnapshot(scan->rs_snapshot, scan->rs_rd, dp);
924  lineoff = ItemPointerGetOffsetNumber(&(tuple->t_self));
925  lpp = PageGetItemId(dp, lineoff);
926  Assert(ItemIdIsNormal(lpp));
927 
928  tuple->t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
929  tuple->t_len = ItemIdGetLength(lpp);
930 
931  /* check that rs_cindex is in sync */
932  Assert(scan->rs_cindex < scan->rs_ntuples);
933  Assert(lineoff == scan->rs_vistuples[scan->rs_cindex]);
934 
935  return;
936  }
937 
938  /*
939  * advance the scan until we find a qualifying tuple or run out of stuff
940  * to scan
941  */
942  for (;;)
943  {
944  while (linesleft > 0)
945  {
946  lineoff = scan->rs_vistuples[lineindex];
947  lpp = PageGetItemId(dp, lineoff);
948  Assert(ItemIdIsNormal(lpp));
949 
950  tuple->t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
951  tuple->t_len = ItemIdGetLength(lpp);
952  ItemPointerSet(&(tuple->t_self), page, lineoff);
953 
954  /*
955  * if current tuple qualifies, return it.
956  */
957  if (key != NULL)
958  {
959  bool valid;
960 
961  HeapKeyTest(tuple, RelationGetDescr(scan->rs_rd),
962  nkeys, key, valid);
963  if (valid)
964  {
965  scan->rs_cindex = lineindex;
966  return;
967  }
968  }
969  else
970  {
971  scan->rs_cindex = lineindex;
972  return;
973  }
974 
975  /*
976  * otherwise move to the next item on the page
977  */
978  --linesleft;
979  if (backward)
980  --lineindex;
981  else
982  ++lineindex;
983  }
984 
985  /*
986  * if we get here, it means we've exhausted the items on this page and
987  * it's time to move to the next.
988  */
989  if (backward)
990  {
991  finished = (page == scan->rs_startblock) ||
992  (scan->rs_numblocks != InvalidBlockNumber ? --scan->rs_numblocks == 0 : false);
993  if (page == 0)
994  page = scan->rs_nblocks;
995  page--;
996  }
997  else if (scan->rs_parallel != NULL)
998  {
999  page = heap_parallelscan_nextpage(scan);
1000  finished = (page == InvalidBlockNumber);
1001  }
1002  else
1003  {
1004  page++;
1005  if (page >= scan->rs_nblocks)
1006  page = 0;
1007  finished = (page == scan->rs_startblock) ||
1008  (scan->rs_numblocks != InvalidBlockNumber ? --scan->rs_numblocks == 0 : false);
1009 
1010  /*
1011  * Report our new scan position for synchronization purposes. We
1012  * don't do that when moving backwards, however. That would just
1013  * mess up any other forward-moving scanners.
1014  *
1015  * Note: we do this before checking for end of scan so that the
1016  * final state of the position hint is back at the start of the
1017  * rel. That's not strictly necessary, but otherwise when you run
1018  * the same query multiple times the starting position would shift
1019  * a little bit backwards on every invocation, which is confusing.
1020  * We don't guarantee any specific ordering in general, though.
1021  */
1022  if (scan->rs_syncscan)
1023  ss_report_location(scan->rs_rd, page);
1024  }
1025 
1026  /*
1027  * return NULL if we've exhausted all the pages
1028  */
1029  if (finished)
1030  {
1031  if (BufferIsValid(scan->rs_cbuf))
1032  ReleaseBuffer(scan->rs_cbuf);
1033  scan->rs_cbuf = InvalidBuffer;
1034  scan->rs_cblock = InvalidBlockNumber;
1035  tuple->t_data = NULL;
1036  scan->rs_inited = false;
1037  return;
1038  }
1039 
1040  heapgetpage(scan, page);
1041 
1042  dp = BufferGetPage(scan->rs_cbuf);
1043  TestForOldSnapshot(scan->rs_snapshot, scan->rs_rd, dp);
1044  lines = scan->rs_ntuples;
1045  linesleft = lines;
1046  if (backward)
1047  lineindex = lines - 1;
1048  else
1049  lineindex = 0;
1050  }
1051 }
1052 
1053 
1054 #if defined(DISABLE_COMPLEX_MACRO)
1055 /*
1056  * This is formatted so oddly so that the correspondence to the macro
1057  * definition in access/htup_details.h is maintained.
1058  */
1059 Datum
1060 fastgetattr(HeapTuple tup, int attnum, TupleDesc tupleDesc,
1061  bool *isnull)
1062 {
1063  return (
1064  (attnum) > 0 ?
1065  (
1066  (*(isnull) = false),
1067  HeapTupleNoNulls(tup) ?
1068  (
1069  TupleDescAttr((tupleDesc), (attnum) - 1)->attcacheoff >= 0 ?
1070  (
1071  fetchatt(TupleDescAttr((tupleDesc), (attnum) - 1),
1072  (char *) (tup)->t_data + (tup)->t_data->t_hoff +
1073  TupleDescAttr((tupleDesc), (attnum) - 1)->attcacheoff)
1074  )
1075  :
1076  nocachegetattr((tup), (attnum), (tupleDesc))
1077  )
1078  :
1079  (
1080  att_isnull((attnum) - 1, (tup)->t_data->t_bits) ?
1081  (
1082  (*(isnull) = true),
1083  (Datum) NULL
1084  )
1085  :
1086  (
1087  nocachegetattr((tup), (attnum), (tupleDesc))
1088  )
1089  )
1090  )
1091  :
1092  (
1093  (Datum) NULL
1094  )
1095  );
1096 }
1097 #endif /* defined(DISABLE_COMPLEX_MACRO) */
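An editor's illustration of the caller-side counterpart: individual attributes of a tuple returned by the scan routines are normally extracted with the heap_getattr() macro (which expands to fastgetattr() for ordinary user attributes); tuple, attno, and tupdesc are assumed to be supplied by the caller:

/* Editor's illustration: decode one attribute of a fetched tuple. */
bool  isnull;
Datum value = heap_getattr(tuple, attno, tupdesc, &isnull);

if (!isnull)
    elog(DEBUG1, "attribute %d = %d", attno, DatumGetInt32(value)); /* assumes an int4 column */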
1098 
1099 
1100 /* ----------------------------------------------------------------
1101  * heap access method interface
1102  * ----------------------------------------------------------------
1103  */
1104 
1105 /* ----------------
1106  * relation_open - open any relation by relation OID
1107  *
1108  * If lockmode is not "NoLock", the specified kind of lock is
1109  * obtained on the relation. (Generally, NoLock should only be
1110  * used if the caller knows it has some appropriate lock on the
1111  * relation already.)
1112  *
1113  * An error is raised if the relation does not exist.
1114  *
1115  * NB: a "relation" is anything with a pg_class entry. The caller is
1116  * expected to check whether the relkind is something it can handle.
1117  * ----------------
1118  */
1119 Relation
1120 relation_open(Oid relationId, LOCKMODE lockmode)
1121 {
1122  Relation r;
1123 
1124  Assert(lockmode >= NoLock && lockmode < MAX_LOCKMODES);
1125 
1126  /* Get the lock before trying to open the relcache entry */
1127  if (lockmode != NoLock)
1128  LockRelationOid(relationId, lockmode);
1129 
1130  /* The relcache does all the real work... */
1131  r = RelationIdGetRelation(relationId);
1132 
1133  if (!RelationIsValid(r))
1134  elog(ERROR, "could not open relation with OID %u", relationId);
1135 
1136  /* Make note that we've accessed a temporary relation */
1137  if (RelationUsesLocalBuffers(r))
1139 
1140  pgstat_initstats(r);
1141 
1142  return r;
1143 }
1144 
1145 /* ----------------
1146  * try_relation_open - open any relation by relation OID
1147  *
1148  * Same as relation_open, except return NULL instead of failing
1149  * if the relation does not exist.
1150  * ----------------
1151  */
1152 Relation
1153 try_relation_open(Oid relationId, LOCKMODE lockmode)
1154 {
1155  Relation r;
1156 
1157  Assert(lockmode >= NoLock && lockmode < MAX_LOCKMODES);
1158 
1159  /* Get the lock first */
1160  if (lockmode != NoLock)
1161  LockRelationOid(relationId, lockmode);
1162 
1163  /*
1164  * Now that we have the lock, probe to see if the relation really exists
1165  * or not.
1166  */
1167  if (!SearchSysCacheExists1(RELOID, ObjectIdGetDatum(relationId)))
1168  {
1169  /* Release useless lock */
1170  if (lockmode != NoLock)
1171  UnlockRelationOid(relationId, lockmode);
1172 
1173  return NULL;
1174  }
1175 
1176  /* Should be safe to do a relcache load */
1177  r = RelationIdGetRelation(relationId);
1178 
1179  if (!RelationIsValid(r))
1180  elog(ERROR, "could not open relation with OID %u", relationId);
1181 
1182  /* Make note that we've accessed a temporary relation */
1183  if (RelationUsesLocalBuffers(r))
1185 
1186  pgstat_initstats(r);
1187 
1188  return r;
1189 }
1190 
1191 /* ----------------
1192  * relation_openrv - open any relation specified by a RangeVar
1193  *
1194  * Same as relation_open, but the relation is specified by a RangeVar.
1195  * ----------------
1196  */
1197 Relation
1198 relation_openrv(const RangeVar *relation, LOCKMODE lockmode)
1199 {
1200  Oid relOid;
1201 
1202  /*
1203  * Check for shared-cache-inval messages before trying to open the
1204  * relation. This is needed even if we already hold a lock on the
1205  * relation, because GRANT/REVOKE are executed without taking any lock on
1206  * the target relation, and we want to be sure we see current ACL
1207  * information. We can skip this if asked for NoLock, on the assumption
1208  * that such a call is not the first one in the current command, and so we
1209  * should be reasonably up-to-date already. (XXX this all could stand to
1210  * be redesigned, but for the moment we'll keep doing this like it's been
1211  * done historically.)
1212  */
1213  if (lockmode != NoLock)
 1214  AcceptInvalidationMessages();
 1215 
1216  /* Look up and lock the appropriate relation using namespace search */
1217  relOid = RangeVarGetRelid(relation, lockmode, false);
1218 
1219  /* Let relation_open do the rest */
1220  return relation_open(relOid, NoLock);
1221 }
1222 
1223 /* ----------------
1224  * relation_openrv_extended - open any relation specified by a RangeVar
1225  *
1226  * Same as relation_openrv, but with an additional missing_ok argument
1227  * allowing a NULL return rather than an error if the relation is not
1228  * found. (Note that some other causes, such as permissions problems,
1229  * will still result in an ereport.)
1230  * ----------------
1231  */
1232 Relation
1233 relation_openrv_extended(const RangeVar *relation, LOCKMODE lockmode,
1234  bool missing_ok)
1235 {
1236  Oid relOid;
1237 
1238  /*
1239  * Check for shared-cache-inval messages before trying to open the
1240  * relation. See comments in relation_openrv().
1241  */
1242  if (lockmode != NoLock)
 1243  AcceptInvalidationMessages();
 1244 
1245  /* Look up and lock the appropriate relation using namespace search */
1246  relOid = RangeVarGetRelid(relation, lockmode, missing_ok);
1247 
1248  /* Return NULL on not-found */
1249  if (!OidIsValid(relOid))
1250  return NULL;
1251 
1252  /* Let relation_open do the rest */
1253  return relation_open(relOid, NoLock);
1254 }
1255 
1256 /* ----------------
1257  * relation_close - close any relation
1258  *
1259  * If lockmode is not "NoLock", we then release the specified lock.
1260  *
1261  * Note that it is often sensible to hold a lock beyond relation_close;
1262  * in that case, the lock is released automatically at xact end.
1263  * ----------------
1264  */
1265 void
1266 relation_close(Relation relation, LOCKMODE lockmode)
1267 {
1268  LockRelId relid = relation->rd_lockInfo.lockRelId;
1269 
1270  Assert(lockmode >= NoLock && lockmode < MAX_LOCKMODES);
1271 
1272  /* The relcache does the real work... */
1273  RelationClose(relation);
1274 
1275  if (lockmode != NoLock)
1276  UnlockRelationId(&relid, lockmode);
1277 }
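An editor's sketch of the usual pairing, reflecting the comment above: the relcache entry can be closed early while the lock is retained until transaction end by passing NoLock; relid is an assumed pg_class OID:

/* Editor's illustration: keep the lock, drop the relcache reference. */
Relation rel = relation_open(relid, RowExclusiveLock);
/* ... work with rel ... */
relation_close(rel, NoLock);    /* RowExclusiveLock is held until commit/abort */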
1278 
1279 
1280 /* ----------------
1281  * heap_open - open a heap relation by relation OID
1282  *
1283  * This is essentially relation_open plus check that the relation
1284  * is not an index nor a composite type. (The caller should also
1285  * check that it's not a view or foreign table before assuming it has
1286  * storage.)
1287  * ----------------
1288  */
1289 Relation
1290 heap_open(Oid relationId, LOCKMODE lockmode)
1291 {
1292  Relation r;
1293 
1294  r = relation_open(relationId, lockmode);
1295 
1296  if (r->rd_rel->relkind == RELKIND_INDEX)
1297  ereport(ERROR,
1298  (errcode(ERRCODE_WRONG_OBJECT_TYPE),
1299  errmsg("\"%s\" is an index",
 1300  RelationGetRelationName(r))));
 1301  else if (r->rd_rel->relkind == RELKIND_COMPOSITE_TYPE)
1302  ereport(ERROR,
1303  (errcode(ERRCODE_WRONG_OBJECT_TYPE),
1304  errmsg("\"%s\" is a composite type",
 1305  RelationGetRelationName(r))));
 1306 
1307  return r;
1308 }
1309 
1310 /* ----------------
1311  * heap_openrv - open a heap relation specified
1312  * by a RangeVar node
1313  *
1314  * As above, but relation is specified by a RangeVar.
1315  * ----------------
1316  */
1317 Relation
1318 heap_openrv(const RangeVar *relation, LOCKMODE lockmode)
1319 {
1320  Relation r;
1321 
1322  r = relation_openrv(relation, lockmode);
1323 
1324  if (r->rd_rel->relkind == RELKIND_INDEX)
1325  ereport(ERROR,
1326  (errcode(ERRCODE_WRONG_OBJECT_TYPE),
1327  errmsg("\"%s\" is an index",
 1328  RelationGetRelationName(r))));
 1329  else if (r->rd_rel->relkind == RELKIND_COMPOSITE_TYPE)
1330  ereport(ERROR,
1331  (errcode(ERRCODE_WRONG_OBJECT_TYPE),
1332  errmsg("\"%s\" is a composite type",
 1333  RelationGetRelationName(r))));
 1334 
1335  return r;
1336 }
1337 
1338 /* ----------------
1339  * heap_openrv_extended - open a heap relation specified
1340  * by a RangeVar node
1341  *
1342  * As above, but optionally return NULL instead of failing for
1343  * relation-not-found.
1344  * ----------------
1345  */
1346 Relation
1347 heap_openrv_extended(const RangeVar *relation, LOCKMODE lockmode,
1348  bool missing_ok)
1349 {
1350  Relation r;
1351 
1352  r = relation_openrv_extended(relation, lockmode, missing_ok);
1353 
1354  if (r)
1355  {
1356  if (r->rd_rel->relkind == RELKIND_INDEX)
1357  ereport(ERROR,
1358  (errcode(ERRCODE_WRONG_OBJECT_TYPE),
1359  errmsg("\"%s\" is an index",
 1360  RelationGetRelationName(r))));
 1361  else if (r->rd_rel->relkind == RELKIND_COMPOSITE_TYPE)
1362  ereport(ERROR,
1363  (errcode(ERRCODE_WRONG_OBJECT_TYPE),
1364  errmsg("\"%s\" is a composite type",
 1365  RelationGetRelationName(r))));
 1366  }
1367 
1368  return r;
1369 }
1370 
1371 
1372 /* ----------------
1373  * heap_beginscan - begin relation scan
1374  *
1375  * heap_beginscan is the "standard" case.
1376  *
1377  * heap_beginscan_catalog differs in setting up its own temporary snapshot.
1378  *
1379  * heap_beginscan_strat offers an extended API that lets the caller control
1380  * whether a nondefault buffer access strategy can be used, and whether
1381  * syncscan can be chosen (possibly resulting in the scan not starting from
1382  * block zero). Both of these default to TRUE with plain heap_beginscan.
1383  *
1384  * heap_beginscan_bm is an alternative entry point for setting up a
1385  * HeapScanDesc for a bitmap heap scan. Although that scan technology is
1386  * really quite unlike a standard seqscan, there is just enough commonality
1387  * to make it worth using the same data structure.
1388  *
1389  * heap_beginscan_sampling is an alternative entry point for setting up a
1390  * HeapScanDesc for a TABLESAMPLE scan. As with bitmap scans, it's worth
1391  * using the same data structure although the behavior is rather different.
1392  * In addition to the options offered by heap_beginscan_strat, this call
1393  * also allows control of whether page-mode visibility checking is used.
1394  * ----------------
1395  */
 1396 HeapScanDesc
 1397 heap_beginscan(Relation relation, Snapshot snapshot,
1398  int nkeys, ScanKey key)
1399 {
1400  return heap_beginscan_internal(relation, snapshot, nkeys, key, NULL,
1401  true, true, true, false, false, false);
1402 }
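An editor's illustration of the nkeys/key arguments: heapgettup() applies HeapKeyTest() to each visible tuple, so a caller can push a simple equality filter into the scan. The sketch assumes rel is already open, its attribute 1 is an int4 column, and the usual headers (access/stratnum.h, utils/fmgroids.h) are included:

/* Editor's illustration: a sequential scan filtered by one ScanKey. */
ScanKeyData  skey;
HeapScanDesc scan;
HeapTuple    tuple;

ScanKeyInit(&skey,
            1,                      /* attribute number to test */
            BTEqualStrategyNumber,  /* strategy number (informational for heap scans) */
            F_INT4EQ,               /* comparison function */
            Int32GetDatum(42));     /* constant to compare against */

scan = heap_beginscan(rel, GetActiveSnapshot(), 1, &skey);
while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
{
    /* only tuples whose first attribute equals 42 reach this point */
}
heap_endscan(scan);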
1403 
 1404 HeapScanDesc
 1405 heap_beginscan_catalog(Relation relation, int nkeys, ScanKey key)
1406 {
1407  Oid relid = RelationGetRelid(relation);
1408  Snapshot snapshot = RegisterSnapshot(GetCatalogSnapshot(relid));
1409 
1410  return heap_beginscan_internal(relation, snapshot, nkeys, key, NULL,
1411  true, true, true, false, false, true);
1412 }
1413 
 1414 HeapScanDesc
 1415 heap_beginscan_strat(Relation relation, Snapshot snapshot,
 1416  int nkeys, ScanKey key,
1417  bool allow_strat, bool allow_sync)
1418 {
1419  return heap_beginscan_internal(relation, snapshot, nkeys, key, NULL,
1420  allow_strat, allow_sync, true,
1421  false, false, false);
1422 }
1423 
 1424 HeapScanDesc
 1425 heap_beginscan_bm(Relation relation, Snapshot snapshot,
 1426  int nkeys, ScanKey key)
1427 {
1428  return heap_beginscan_internal(relation, snapshot, nkeys, key, NULL,
1429  false, false, true, true, false, false);
1430 }
1431 
 1432 HeapScanDesc
 1433 heap_beginscan_sampling(Relation relation, Snapshot snapshot,
 1434  int nkeys, ScanKey key,
1435  bool allow_strat, bool allow_sync, bool allow_pagemode)
1436 {
1437  return heap_beginscan_internal(relation, snapshot, nkeys, key, NULL,
1438  allow_strat, allow_sync, allow_pagemode,
1439  false, true, false);
1440 }
1441 
1442 static HeapScanDesc
 1443 heap_beginscan_internal(Relation relation, Snapshot snapshot,
 1444  int nkeys, ScanKey key,
1445  ParallelHeapScanDesc parallel_scan,
1446  bool allow_strat,
1447  bool allow_sync,
1448  bool allow_pagemode,
1449  bool is_bitmapscan,
1450  bool is_samplescan,
1451  bool temp_snap)
1452 {
1453  HeapScanDesc scan;
1454 
1455  /*
1456  * increment relation ref count while scanning relation
1457  *
1458  * This is just to make really sure the relcache entry won't go away while
1459  * the scan has a pointer to it. Caller should be holding the rel open
1460  * anyway, so this is redundant in all normal scenarios...
1461  */
 1462  RelationIncrementReferenceCount(relation);
 1463 
1464  /*
1465  * allocate and initialize scan descriptor
1466  */
1467  scan = (HeapScanDesc) palloc(sizeof(HeapScanDescData));
1468 
1469  scan->rs_rd = relation;
1470  scan->rs_snapshot = snapshot;
1471  scan->rs_nkeys = nkeys;
1472  scan->rs_bitmapscan = is_bitmapscan;
1473  scan->rs_samplescan = is_samplescan;
1474  scan->rs_strategy = NULL; /* set in initscan */
1475  scan->rs_allow_strat = allow_strat;
1476  scan->rs_allow_sync = allow_sync;
1477  scan->rs_temp_snap = temp_snap;
1478  scan->rs_parallel = parallel_scan;
1479 
1480  /*
1481  * we can use page-at-a-time mode if it's an MVCC-safe snapshot
1482  */
1483  scan->rs_pageatatime = allow_pagemode && IsMVCCSnapshot(snapshot);
1484 
1485  /*
1486  * For a seqscan in a serializable transaction, acquire a predicate lock
1487  * on the entire relation. This is required not only to lock all the
1488  * matching tuples, but also to conflict with new insertions into the
1489  * table. In an indexscan, we take page locks on the index pages covering
1490  * the range specified in the scan qual, but in a heap scan there is
1491  * nothing more fine-grained to lock. A bitmap scan is a different story,
1492  * there we have already scanned the index and locked the index pages
1493  * covering the predicate. But in that case we still have to lock any
1494  * matching heap tuples.
1495  */
1496  if (!is_bitmapscan)
1497  PredicateLockRelation(relation, snapshot);
1498 
1499  /* we only need to set this up once */
1500  scan->rs_ctup.t_tableOid = RelationGetRelid(relation);
1501 
1502  /*
1503  * we do this here instead of in initscan() because heap_rescan also calls
1504  * initscan() and we don't want to allocate memory again
1505  */
1506  if (nkeys > 0)
1507  scan->rs_key = (ScanKey) palloc(sizeof(ScanKeyData) * nkeys);
1508  else
1509  scan->rs_key = NULL;
1510 
1511  initscan(scan, key, false);
1512 
1513  return scan;
1514 }
1515 
1516 /* ----------------
1517  * heap_rescan - restart a relation scan
1518  * ----------------
1519  */
1520 void
 1521 heap_rescan(HeapScanDesc scan,
 1522  ScanKey key)
1523 {
1524  /*
1525  * unpin scan buffers
1526  */
1527  if (BufferIsValid(scan->rs_cbuf))
1528  ReleaseBuffer(scan->rs_cbuf);
1529 
1530  /*
1531  * reinitialize scan descriptor
1532  */
1533  initscan(scan, key, true);
1534 }
1535 
1536 /* ----------------
1537  * heap_rescan_set_params - restart a relation scan after changing params
1538  *
1539  * This call allows changing the buffer strategy, syncscan, and pagemode
1540  * options before starting a fresh scan. Note that although the actual use
1541  * of syncscan might change (effectively, enabling or disabling reporting),
1542  * the previously selected startblock will be kept.
1543  * ----------------
1544  */
1545 void
 1546 heap_rescan_set_params(HeapScanDesc scan, ScanKey key,
 1547  bool allow_strat, bool allow_sync, bool allow_pagemode)
1548 {
1549  /* adjust parameters */
1550  scan->rs_allow_strat = allow_strat;
1551  scan->rs_allow_sync = allow_sync;
1552  scan->rs_pageatatime = allow_pagemode && IsMVCCSnapshot(scan->rs_snapshot);
1553  /* ... and rescan */
1554  heap_rescan(scan, key);
1555 }
1556 
1557 /* ----------------
1558  * heap_endscan - end relation scan
1559  *
1560  * See how to integrate with index scans.
 1561  * Check handling of reldesc caching.
1562  * ----------------
1563  */
1564 void
 1565 heap_endscan(HeapScanDesc scan)
 1566 {
1567  /* Note: no locking manipulations needed */
1568 
1569  /*
1570  * unpin scan buffers
1571  */
1572  if (BufferIsValid(scan->rs_cbuf))
1573  ReleaseBuffer(scan->rs_cbuf);
1574 
1575  /*
1576  * decrement relation reference count and free scan descriptor storage
1577  */
 1578  RelationDecrementReferenceCount(scan->rs_rd);
 1579 
1580  if (scan->rs_key)
1581  pfree(scan->rs_key);
1582 
1583  if (scan->rs_strategy != NULL)
 1584  FreeAccessStrategy(scan->rs_strategy);
 1585 
1586  if (scan->rs_temp_snap)
 1587  UnregisterSnapshot(scan->rs_snapshot);
 1588 
1589  pfree(scan);
1590 }
1591 
1592 /* ----------------
1593  * heap_parallelscan_estimate - estimate storage for ParallelHeapScanDesc
1594  *
1595  * Sadly, this doesn't reduce to a constant, because the size required
1596  * to serialize the snapshot can vary.
1597  * ----------------
1598  */
1599 Size
 1600 heap_parallelscan_estimate(Snapshot snapshot)
 1601 {
1602  return add_size(offsetof(ParallelHeapScanDescData, phs_snapshot_data),
1603  EstimateSnapshotSpace(snapshot));
1604 }
1605 
1606 /* ----------------
1607  * heap_parallelscan_initialize - initialize ParallelHeapScanDesc
1608  *
1609  * Must allow as many bytes of shared memory as returned by
1610  * heap_parallelscan_estimate. Call this just once in the leader
1611  * process; then, individual workers attach via heap_beginscan_parallel.
1612  * ----------------
1613  */
1614 void
 1615 heap_parallelscan_initialize(ParallelHeapScanDesc target, Relation relation,
 1616  Snapshot snapshot)
1617 {
1618  target->phs_relid = RelationGetRelid(relation);
1619  target->phs_nblocks = RelationGetNumberOfBlocks(relation);
1620  /* compare phs_syncscan initialization to similar logic in initscan */
1621  target->phs_syncscan = synchronize_seqscans &&
1622  !RelationUsesLocalBuffers(relation) &&
1623  target->phs_nblocks > NBuffers / 4;
1624  SpinLockInit(&target->phs_mutex);
 1625  target->phs_startblock = InvalidBlockNumber;
 1626  pg_atomic_init_u64(&target->phs_nallocated, 0);
1627  SerializeSnapshot(snapshot, target->phs_snapshot_data);
1628 }
1629 
1630 /* ----------------
1631  * heap_parallelscan_reinitialize - reset a parallel scan
1632  *
1633  * Call this in the leader process. Caller is responsible for
1634  * making sure that all workers have finished the scan beforehand.
1635  * ----------------
1636  */
1637 void
 1638 heap_parallelscan_reinitialize(ParallelHeapScanDesc parallel_scan)
 1639 {
1640  pg_atomic_write_u64(&parallel_scan->phs_nallocated, 0);
1641 }
1642 
1643 /* ----------------
1644  * heap_beginscan_parallel - join a parallel scan
1645  *
1646  * Caller must hold a suitable lock on the correct relation.
1647  * ----------------
1648  */
 1649 HeapScanDesc
 1650 heap_beginscan_parallel(Relation relation, ParallelHeapScanDesc parallel_scan)
 1651 {
1652  Snapshot snapshot;
1653 
1654  Assert(RelationGetRelid(relation) == parallel_scan->phs_relid);
1655  snapshot = RestoreSnapshot(parallel_scan->phs_snapshot_data);
1656  RegisterSnapshot(snapshot);
1657 
1658  return heap_beginscan_internal(relation, snapshot, 0, NULL, parallel_scan,
1659  true, true, true, false, false, true);
1660 }
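An editor's sketch of the call ordering for a parallel scan; in the backend proper (e.g. nodeSeqscan.c) the ParallelHeapScanDesc lives in a dynamic shared memory segment reached via shm_toc rather than the palloc shown here, and rel/snapshot are assumed to be provided by the caller:

/* Editor's illustration of the parallel-scan setup sequence. */
Size                 sz = heap_parallelscan_estimate(snapshot);
ParallelHeapScanDesc pscan = (ParallelHeapScanDesc) palloc(sz);   /* really DSM space */

heap_parallelscan_initialize(pscan, rel, snapshot);     /* leader, exactly once */

/* each participant (leader and workers) then attaches to the shared state: */
HeapScanDesc scan = heap_beginscan_parallel(rel, pscan);

/* before a rescan, once all participants are known to have finished: */
heap_parallelscan_reinitialize(pscan);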
1661 
1662 /* ----------------
1663  * heap_parallelscan_startblock_init - find and set the scan's startblock
1664  *
1665  * Determine where the parallel seq scan should start. This function may
1666  * be called many times, once by each parallel worker. We must be careful
1667  * only to set the startblock once.
1668  * ----------------
1669  */
1670 static void
 1671 heap_parallelscan_startblock_init(HeapScanDesc scan)
 1672 {
1673  BlockNumber sync_startpage = InvalidBlockNumber;
1674  ParallelHeapScanDesc parallel_scan;
1675 
1676  Assert(scan->rs_parallel);
1677  parallel_scan = scan->rs_parallel;
1678 
1679 retry:
1680  /* Grab the spinlock. */
1681  SpinLockAcquire(&parallel_scan->phs_mutex);
1682 
1683  /*
1684  * If the scan's startblock has not yet been initialized, we must do so
1685  * now. If this is not a synchronized scan, we just start at block 0, but
1686  * if it is a synchronized scan, we must get the starting position from
1687  * the synchronized scan machinery. We can't hold the spinlock while
1688  * doing that, though, so release the spinlock, get the information we
1689  * need, and retry. If nobody else has initialized the scan in the
1690  * meantime, we'll fill in the value we fetched on the second time
1691  * through.
1692  */
1693  if (parallel_scan->phs_startblock == InvalidBlockNumber)
1694  {
1695  if (!parallel_scan->phs_syncscan)
1696  parallel_scan->phs_startblock = 0;
1697  else if (sync_startpage != InvalidBlockNumber)
1698  parallel_scan->phs_startblock = sync_startpage;
1699  else
1700  {
1701  SpinLockRelease(&parallel_scan->phs_mutex);
1702  sync_startpage = ss_get_location(scan->rs_rd, scan->rs_nblocks);
1703  goto retry;
1704  }
1705  }
1706  SpinLockRelease(&parallel_scan->phs_mutex);
1707 }
1708 
1709 /* ----------------
1710  * heap_parallelscan_nextpage - get the next page to scan
1711  *
1712  * Get the next page to scan. Even if there are no pages left to scan,
1713  * another backend could have grabbed a page to scan and not yet finished
1714  * looking at it, so it doesn't follow that the scan is done when the
1715  * first backend gets an InvalidBlockNumber return.
1716  * ----------------
1717  */
1718 static BlockNumber
 1719 heap_parallelscan_nextpage(HeapScanDesc scan)
 1720 {
1721  BlockNumber page;
1722  ParallelHeapScanDesc parallel_scan;
1723  uint64 nallocated;
1724 
1725  Assert(scan->rs_parallel);
1726  parallel_scan = scan->rs_parallel;
1727 
1728  /*
1729  * phs_nallocated tracks how many pages have been allocated to workers
1730  * already. When phs_nallocated >= rs_nblocks, all blocks have been
1731  * allocated.
1732  *
1733  * Because we use an atomic fetch-and-add to fetch the current value, the
1734  * phs_nallocated counter will exceed rs_nblocks, because workers will
1735  * still increment the value, when they try to allocate the next block but
1736  * all blocks have been allocated already. The counter must be 64 bits
1737  * wide because of that, to avoid wrapping around when rs_nblocks is close
1738  * to 2^32.
1739  *
1740  * The actual page to return is calculated by adding the counter to the
1741  * starting block number, modulo nblocks.
1742  */
1743  nallocated = pg_atomic_fetch_add_u64(&parallel_scan->phs_nallocated, 1);
1744  if (nallocated >= scan->rs_nblocks)
1745  page = InvalidBlockNumber; /* all blocks have been allocated */
1746  else
1747  page = (nallocated + parallel_scan->phs_startblock) % scan->rs_nblocks;
1748 
1749  /*
1750  * Report scan location. Normally, we report the current page number.
1751  * When we reach the end of the scan, though, we report the starting page,
1752  * not the ending page, just so the starting positions for later scans
 1753  * don't slew backwards. We only report the position at the end of the
1754  * scan once, though: subsequent callers will report nothing.
1755  */
1756  if (scan->rs_syncscan)
1757  {
1758  if (page != InvalidBlockNumber)
1759  ss_report_location(scan->rs_rd, page);
1760  else if (nallocated == scan->rs_nblocks)
1761  ss_report_location(scan->rs_rd, parallel_scan->phs_startblock);
1762  }
1763 
1764  return page;
1765 }
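A worked example of the wrap-around arithmetic above (editor's note): with rs_nblocks = 1000 and phs_startblock = 900, successive fetch-and-add results 0, 1, 2, ... yield pages 900, 901, ..., 999, 0, 1, ..., 899, and any result of 1000 or more yields InvalidBlockNumber. For a single fetched counter value n, the rule reduces to:

/* Editor's restatement of the allocation rule for one counter value n. */
page = (n < scan->rs_nblocks)
    ? (BlockNumber) ((n + parallel_scan->phs_startblock) % scan->rs_nblocks)
    : InvalidBlockNumber;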
1766 
1767 /* ----------------
1768  * heap_update_snapshot
1769  *
1770  * Update snapshot info in heap scan descriptor.
1771  * ----------------
1772  */
1773 void
 1774 heap_update_snapshot(HeapScanDesc scan, Snapshot snapshot)
 1775 {
1776  Assert(IsMVCCSnapshot(snapshot));
1777 
1778  RegisterSnapshot(snapshot);
1779  scan->rs_snapshot = snapshot;
1780  scan->rs_temp_snap = true;
1781 }
1782 
1783 /* ----------------
1784  * heap_getnext - retrieve next tuple in scan
1785  *
1786  * Fix to work with index relations.
1787  * We don't return the buffer anymore, but you can get it from the
1788  * returned HeapTuple.
1789  * ----------------
1790  */
1791 
1792 #ifdef HEAPDEBUGALL
1793 #define HEAPDEBUG_1 \
1794  elog(DEBUG2, "heap_getnext([%s,nkeys=%d],dir=%d) called", \
1795  RelationGetRelationName(scan->rs_rd), scan->rs_nkeys, (int) direction)
1796 #define HEAPDEBUG_2 \
1797  elog(DEBUG2, "heap_getnext returning EOS")
1798 #define HEAPDEBUG_3 \
1799  elog(DEBUG2, "heap_getnext returning tuple")
1800 #else
1801 #define HEAPDEBUG_1
1802 #define HEAPDEBUG_2
1803 #define HEAPDEBUG_3
1804 #endif /* !defined(HEAPDEBUGALL) */
1805 
1806 
1807 HeapTuple
 1808 heap_getnext(HeapScanDesc scan, ScanDirection direction)
 1809 {
1810  /* Note: no locking manipulations needed */
1811 
1812  HEAPDEBUG_1; /* heap_getnext( info ) */
1813 
1814  if (scan->rs_pageatatime)
1815  heapgettup_pagemode(scan, direction,
1816  scan->rs_nkeys, scan->rs_key);
1817  else
1818  heapgettup(scan, direction, scan->rs_nkeys, scan->rs_key);
1819 
1820  if (scan->rs_ctup.t_data == NULL)
1821  {
1822  HEAPDEBUG_2; /* heap_getnext returning EOS */
1823  return NULL;
1824  }
1825 
1826  /*
1827  * if we get here it means we have a new current scan tuple, so point to
1828  * the proper return buffer and return the tuple.
1829  */
1830  HEAPDEBUG_3; /* heap_getnext returning tuple */
1831 
 1832  pgstat_count_heap_getnext(scan->rs_rd);
 1833 
1834  return &(scan->rs_ctup);
1835 }
1836 
1837 /*
1838  * heap_fetch - retrieve tuple with given tid
1839  *
1840  * On entry, tuple->t_self is the TID to fetch. We pin the buffer holding
1841  * the tuple, fill in the remaining fields of *tuple, and check the tuple
1842  * against the specified snapshot.
1843  *
1844  * If successful (tuple found and passes snapshot time qual), then *userbuf
1845  * is set to the buffer holding the tuple and TRUE is returned. The caller
1846  * must unpin the buffer when done with the tuple.
1847  *
1848  * If the tuple is not found (ie, item number references a deleted slot),
1849  * then tuple->t_data is set to NULL and FALSE is returned.
1850  *
1851  * If the tuple is found but fails the time qual check, then FALSE is returned
1852  * but tuple->t_data is left pointing to the tuple.
1853  *
1854  * keep_buf determines what is done with the buffer in the FALSE-result cases.
1855  * When the caller specifies keep_buf = true, we retain the pin on the buffer
1856  * and return it in *userbuf (so the caller must eventually unpin it); when
1857  * keep_buf = false, the pin is released and *userbuf is set to InvalidBuffer.
1858  *
1859  * stats_relation is the relation to charge the heap_fetch operation against
1860  * for statistical purposes. (This could be the heap rel itself, an
1861  * associated index, or NULL to not count the fetch at all.)
1862  *
1863  * heap_fetch does not follow HOT chains: only the exact TID requested will
1864  * be fetched.
1865  *
1866  * It is somewhat inconsistent that we ereport() on invalid block number but
1867  * return false on invalid item number. There are a couple of reasons though.
1868  * One is that the caller can relatively easily check the block number for
1869  * validity, but cannot check the item number without reading the page
 1870  * itself. Another is that when we are following a t_ctid link, we can be
1871  * reasonably confident that the page number is valid (since VACUUM shouldn't
1872  * truncate off the destination page without having killed the referencing
1873  * tuple first), but the item number might well not be good.
1874  */
1875 bool
1876 heap_fetch(Relation relation,
1877  Snapshot snapshot,
1878  HeapTuple tuple,
1879  Buffer *userbuf,
1880  bool keep_buf,
1881  Relation stats_relation)
1882 {
1883  ItemPointer tid = &(tuple->t_self);
1884  ItemId lp;
1885  Buffer buffer;
1886  Page page;
1887  OffsetNumber offnum;
1888  bool valid;
1889 
1890  /*
1891  * Fetch and pin the appropriate page of the relation.
1892  */
1893  buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid));
1894 
1895  /*
1896  * Need share lock on buffer to examine tuple commit status.
1897  */
1898  LockBuffer(buffer, BUFFER_LOCK_SHARE);
1899  page = BufferGetPage(buffer);
1900  TestForOldSnapshot(snapshot, relation, page);
1901 
1902  /*
1903  * We'd better check for an out-of-range offnum, in case VACUUM has run
1904  * since the TID was obtained.
1905  */
1906  offnum = ItemPointerGetOffsetNumber(tid);
1907  if (offnum < FirstOffsetNumber || offnum > PageGetMaxOffsetNumber(page))
1908  {
1909  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
1910  if (keep_buf)
1911  *userbuf = buffer;
1912  else
1913  {
1914  ReleaseBuffer(buffer);
1915  *userbuf = InvalidBuffer;
1916  }
1917  tuple->t_data = NULL;
1918  return false;
1919  }
1920 
1921  /*
1922  * get the item line pointer corresponding to the requested tid
1923  */
1924  lp = PageGetItemId(page, offnum);
1925 
1926  /*
1927  * Must check for deleted tuple.
1928  */
1929  if (!ItemIdIsNormal(lp))
1930  {
1931  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
1932  if (keep_buf)
1933  *userbuf = buffer;
1934  else
1935  {
1936  ReleaseBuffer(buffer);
1937  *userbuf = InvalidBuffer;
1938  }
1939  tuple->t_data = NULL;
1940  return false;
1941  }
1942 
1943  /*
1944  * fill in *tuple fields
1945  */
1946  tuple->t_data = (HeapTupleHeader) PageGetItem(page, lp);
1947  tuple->t_len = ItemIdGetLength(lp);
1948  tuple->t_tableOid = RelationGetRelid(relation);
1949 
1950  /*
1951  * check time qualification of tuple, then release lock
1952  */
1953  valid = HeapTupleSatisfiesVisibility(tuple, snapshot, buffer);
1954 
1955  if (valid)
1956  PredicateLockTuple(relation, tuple, snapshot);
1957 
1958  CheckForSerializableConflictOut(valid, relation, tuple, buffer, snapshot);
1959 
1960  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
1961 
1962  if (valid)
1963  {
1964  /*
1965  * All checks passed, so return the tuple as valid. Caller is now
1966  * responsible for releasing the buffer.
1967  */
1968  *userbuf = buffer;
1969 
1970  /* Count the successful fetch against appropriate rel, if any */
1971  if (stats_relation != NULL)
1972  pgstat_count_heap_fetch(stats_relation);
1973 
1974  return true;
1975  }
1976 
1977  /* Tuple failed time qual, but maybe caller wants to see it anyway. */
1978  if (keep_buf)
1979  *userbuf = buffer;
1980  else
1981  {
1982  ReleaseBuffer(buffer);
1983  *userbuf = InvalidBuffer;
1984  }
1985 
1986  return false;
1987 }
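As a hedged usage sketch (not from the source): heap_fetch() suits callers that already know an exact TID and are prepared to manage the returned pin themselves. The helper name below is hypothetical.

/* Illustrative sketch, not part of heapam.c */
static bool
example_fetch_tid(Relation rel, Snapshot snapshot, ItemPointerData tid)
{
	HeapTupleData tuple;
	Buffer		buf;

	tuple.t_self = tid;
	if (!heap_fetch(rel, snapshot, &tuple, &buf, false, NULL))
		return false;			/* unused line pointer, or failed the snapshot */

	/* ... inspect tuple.t_data / tuple.t_len while the pin is held ... */

	ReleaseBuffer(buf);			/* caller must unpin when done */
	return true;
}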
1988 
1989 /*
1990  * heap_hot_search_buffer - search HOT chain for tuple satisfying snapshot
1991  *
1992  * On entry, *tid is the TID of a tuple (either a simple tuple, or the root
1993  * of a HOT chain), and buffer is the buffer holding this tuple. We search
1994  * for the first chain member satisfying the given snapshot. If one is
1995  * found, we update *tid to reference that tuple's offset number, and
1996  * return TRUE. If no match, return FALSE without modifying *tid.
1997  *
1998  * heapTuple is a caller-supplied buffer. When a match is found, we return
1999  * the tuple here, in addition to updating *tid. If no match is found, the
2000  * contents of this buffer on return are undefined.
2001  *
2002  * If all_dead is not NULL, we check non-visible tuples to see if they are
2003  * globally dead; *all_dead is set TRUE if all members of the HOT chain
2004  * are vacuumable, FALSE if not.
2005  *
2006  * Unlike heap_fetch, the caller must already have pin and (at least) share
2007  * lock on the buffer; it is still pinned/locked at exit. Also unlike
2008  * heap_fetch, we do not report any pgstats count; caller may do so if wanted.
2009  */
2010 bool
2011 heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer,
2012  Snapshot snapshot, HeapTuple heapTuple,
2013  bool *all_dead, bool first_call)
2014 {
2015  Page dp = (Page) BufferGetPage(buffer);
2016  TransactionId prev_xmax = InvalidTransactionId;
2017  OffsetNumber offnum;
2018  bool at_chain_start;
2019  bool valid;
2020  bool skip;
2021 
2022  /* If this is not the first call, previous call returned a (live!) tuple */
2023  if (all_dead)
2024  *all_dead = first_call;
2025 
2027 
2029  offnum = ItemPointerGetOffsetNumber(tid);
2030  at_chain_start = first_call;
2031  skip = !first_call;
2032 
2033  heapTuple->t_self = *tid;
2034 
2035  /* Scan through possible multiple members of HOT-chain */
2036  for (;;)
2037  {
2038  ItemId lp;
2039 
2040  /* check for bogus TID */
2041  if (offnum < FirstOffsetNumber || offnum > PageGetMaxOffsetNumber(dp))
2042  break;
2043 
2044  lp = PageGetItemId(dp, offnum);
2045 
2046  /* check for unused, dead, or redirected items */
2047  if (!ItemIdIsNormal(lp))
2048  {
2049  /* We should only see a redirect at start of chain */
2050  if (ItemIdIsRedirected(lp) && at_chain_start)
2051  {
2052  /* Follow the redirect */
2053  offnum = ItemIdGetRedirect(lp);
2054  at_chain_start = false;
2055  continue;
2056  }
2057  /* else must be end of chain */
2058  break;
2059  }
2060 
2061  heapTuple->t_data = (HeapTupleHeader) PageGetItem(dp, lp);
2062  heapTuple->t_len = ItemIdGetLength(lp);
2063  heapTuple->t_tableOid = RelationGetRelid(relation);
2064  ItemPointerSetOffsetNumber(&heapTuple->t_self, offnum);
2065 
2066  /*
2067  * Shouldn't see a HEAP_ONLY tuple at chain start.
2068  */
2069  if (at_chain_start && HeapTupleIsHeapOnly(heapTuple))
2070  break;
2071 
2072  /*
2073  * The xmin should match the previous xmax value, else chain is
2074  * broken.
2075  */
2076  if (TransactionIdIsValid(prev_xmax) &&
2077  !TransactionIdEquals(prev_xmax,
2078  HeapTupleHeaderGetXmin(heapTuple->t_data)))
2079  break;
2080 
2081  /*
2082  * When first_call is true (and thus, skip is initially false) we'll
2083  * return the first tuple we find. But on later passes, heapTuple
2084  * will initially be pointing to the tuple we returned last time.
2085  * Returning it again would be incorrect (and would loop forever), so
2086  * we skip it and return the next match we find.
2087  */
2088  if (!skip)
2089  {
2090  /*
2091  * For the benefit of logical decoding, have t_self point at the
2092  * element of the HOT chain we're currently investigating instead
2093  * of the root tuple of the HOT chain. This is important because
2094  * the *Satisfies routine for historical mvcc snapshots needs the
2095  * correct tid to decide about the visibility in some cases.
2096  */
2097  ItemPointerSet(&(heapTuple->t_self), BufferGetBlockNumber(buffer), offnum);
2098 
2099  /* If it's visible per the snapshot, we must return it */
2100  valid = HeapTupleSatisfiesVisibility(heapTuple, snapshot, buffer);
2101  CheckForSerializableConflictOut(valid, relation, heapTuple,
2102  buffer, snapshot);
2103  /* reset to original, non-redirected, tid */
2104  heapTuple->t_self = *tid;
2105 
2106  if (valid)
2107  {
2108  ItemPointerSetOffsetNumber(tid, offnum);
2109  PredicateLockTuple(relation, heapTuple, snapshot);
2110  if (all_dead)
2111  *all_dead = false;
2112  return true;
2113  }
2114  }
2115  skip = false;
2116 
2117  /*
2118  * If we can't see it, maybe no one else can either. At caller
2119  * request, check whether all chain members are dead to all
2120  * transactions.
2121  *
2122  * Note: if you change the criterion here for what is "dead", fix the
2123  * planner's get_actual_variable_range() function to match.
2124  */
2125  if (all_dead && *all_dead &&
2127  *all_dead = false;
2128 
2129  /*
2130  * Check to see if HOT chain continues past this tuple; if so fetch
2131  * the next offnum and loop around.
2132  */
2133  if (HeapTupleIsHotUpdated(heapTuple))
2134  {
2137  offnum = ItemPointerGetOffsetNumber(&heapTuple->t_data->t_ctid);
2138  at_chain_start = false;
2139  prev_xmax = HeapTupleHeaderGetUpdateXid(heapTuple->t_data);
2140  }
2141  else
2142  break; /* end of chain */
2143  }
2144 
2145  return false;
2146 }
2147 
2148 /*
2149  * heap_hot_search - search HOT chain for tuple satisfying snapshot
2150  *
2151  * This has the same API as heap_hot_search_buffer, except that the caller
2152  * does not provide the buffer containing the page, rather we access it
2153  * locally.
2154  */
2155 bool
2156 heap_hot_search(ItemPointer tid, Relation relation, Snapshot snapshot,
2157  bool *all_dead)
2158 {
2159  bool result;
2160  Buffer buffer;
2161  HeapTupleData heapTuple;
2162 
2163  buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid));
2164  LockBuffer(buffer, BUFFER_LOCK_SHARE);
2165  result = heap_hot_search_buffer(tid, relation, buffer, snapshot,
2166  &heapTuple, all_dead, true);
2167  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
2168  ReleaseBuffer(buffer);
2169  return result;
2170 }
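A sketch of the kind of call an index access method might make through heap_hot_search(), using *all_dead to decide whether its entry could be killed; the helper name is invented and this is not code from the file.

/* Illustrative sketch, not part of heapam.c */
static bool
example_probe_root_tid(Relation heapRel, ItemPointer tid, Snapshot snapshot)
{
	bool		all_dead = false;
	bool		found;

	found = heap_hot_search(tid, heapRel, snapshot, &all_dead);

	if (!found && all_dead)
	{
		/*
		 * Every member of the HOT chain is dead to all transactions, so an
		 * index AM could mark its entry as killed here.
		 */
	}
	return found;
}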
2171 
2172 /*
2173  * heap_get_latest_tid - get the latest tid of a specified tuple
2174  *
2175  * Actually, this gets the latest version that is visible according to
2176  * the passed snapshot. You can pass SnapshotDirty to get the very latest,
2177  * possibly uncommitted version.
2178  *
2179  * *tid is both an input and an output parameter: it is updated to
2180  * show the latest version of the row. Note that it will not be changed
2181  * if no version of the row passes the snapshot test.
2182  */
2183 void
2184 heap_get_latest_tid(Relation relation,
2185  Snapshot snapshot,
2186  ItemPointer tid)
2187 {
2188  BlockNumber blk;
2189  ItemPointerData ctid;
2190  TransactionId priorXmax;
2191 
2192  /* this is to avoid Assert failures on bad input */
2193  if (!ItemPointerIsValid(tid))
2194  return;
2195 
2196  /*
2197  * Since this can be called with user-supplied TID, don't trust the input
2198  * too much. (RelationGetNumberOfBlocks is an expensive check, so we
2199  * don't check t_ctid links again this way. Note that it would not do to
2200  * call it just once and save the result, either.)
2201  */
2202  blk = ItemPointerGetBlockNumber(tid);
2203  if (blk >= RelationGetNumberOfBlocks(relation))
2204  elog(ERROR, "block number %u is out of range for relation \"%s\"",
2205  blk, RelationGetRelationName(relation));
2206 
2207  /*
2208  * Loop to chase down t_ctid links. At top of loop, ctid is the tuple we
2209  * need to examine, and *tid is the TID we will return if ctid turns out
2210  * to be bogus.
2211  *
2212  * Note that we will loop until we reach the end of the t_ctid chain.
2213  * Depending on the snapshot passed, there might be at most one visible
2214  * version of the row, but we don't try to optimize for that.
2215  */
2216  ctid = *tid;
2217  priorXmax = InvalidTransactionId; /* cannot check first XMIN */
2218  for (;;)
2219  {
2220  Buffer buffer;
2221  Page page;
2222  OffsetNumber offnum;
2223  ItemId lp;
2224  HeapTupleData tp;
2225  bool valid;
2226 
2227  /*
2228  * Read, pin, and lock the page.
2229  */
2230  buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(&ctid));
2231  LockBuffer(buffer, BUFFER_LOCK_SHARE);
2232  page = BufferGetPage(buffer);
2233  TestForOldSnapshot(snapshot, relation, page);
2234 
2235  /*
2236  * Check for bogus item number. This is not treated as an error
2237  * condition because it can happen while following a t_ctid link. We
2238  * just assume that the prior tid is OK and return it unchanged.
2239  */
2240  offnum = ItemPointerGetOffsetNumber(&ctid);
2241  if (offnum < FirstOffsetNumber || offnum > PageGetMaxOffsetNumber(page))
2242  {
2243  UnlockReleaseBuffer(buffer);
2244  break;
2245  }
2246  lp = PageGetItemId(page, offnum);
2247  if (!ItemIdIsNormal(lp))
2248  {
2249  UnlockReleaseBuffer(buffer);
2250  break;
2251  }
2252 
2253  /* OK to access the tuple */
2254  tp.t_self = ctid;
2255  tp.t_data = (HeapTupleHeader) PageGetItem(page, lp);
2256  tp.t_len = ItemIdGetLength(lp);
2257  tp.t_tableOid = RelationGetRelid(relation);
2258 
2259  /*
2260  * After following a t_ctid link, we might arrive at an unrelated
2261  * tuple. Check for XMIN match.
2262  */
2263  if (TransactionIdIsValid(priorXmax) &&
2265  {
2266  UnlockReleaseBuffer(buffer);
2267  break;
2268  }
2269 
2270  /*
2271  * Check time qualification of tuple; if visible, set it as the new
2272  * result candidate.
2273  */
2274  valid = HeapTupleSatisfiesVisibility(&tp, snapshot, buffer);
2275  CheckForSerializableConflictOut(valid, relation, &tp, buffer, snapshot);
2276  if (valid)
2277  *tid = ctid;
2278 
2279  /*
2280  * If there's a valid t_ctid link, follow it, else we're done.
2281  */
2282  if ((tp.t_data->t_infomask & HEAP_XMAX_INVALID) ||
2285  {
2286  UnlockReleaseBuffer(buffer);
2287  break;
2288  }
2289 
2290  ctid = tp.t_data->t_ctid;
2291  priorXmax = HeapTupleHeaderGetUpdateXid(tp.t_data);
2292  UnlockReleaseBuffer(buffer);
2293  } /* end of loop */
2294 }
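A minimal sketch (not part of the file) of chasing a row to its newest version with a dirty snapshot, as the comment above suggests; the relation and starting TID are assumed to be supplied by the caller.

/* Illustrative sketch, not part of heapam.c */
static void
example_latest_tid(Relation rel, ItemPointer tid)
{
	SnapshotData SnapshotDirty;

	InitDirtySnapshot(SnapshotDirty);
	/* *tid is updated in place only if a newer visible version is found */
	heap_get_latest_tid(rel, &SnapshotDirty, tid);
}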
2295 
2296 
2297 /*
2298  * UpdateXmaxHintBits - update tuple hint bits after xmax transaction ends
2299  *
2300  * This is called after we have waited for the XMAX transaction to terminate.
2301  * If the transaction aborted, we guarantee the XMAX_INVALID hint bit will
2302  * be set on exit. If the transaction committed, we set the XMAX_COMMITTED
2303  * hint bit if possible --- but beware that that may not yet be possible,
2304  * if the transaction committed asynchronously.
2305  *
2306  * Note that if the transaction was a locker only, we set HEAP_XMAX_INVALID
2307  * even if it commits.
2308  *
2309  * Hence callers should look only at XMAX_INVALID.
2310  *
2311  * Note this is not allowed for tuples whose xmax is a multixact.
2312  */
2313 static void
2314 UpdateXmaxHintBits(HeapTupleHeader tuple, Buffer buffer, TransactionId xid)
2315 {
2317  Assert(!(tuple->t_infomask & HEAP_XMAX_IS_MULTI));
2318 
2319  if (!(tuple->t_infomask & (HEAP_XMAX_COMMITTED | HEAP_XMAX_INVALID)))
2320  {
2321  if (!HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask) &&
2324  xid);
2325  else
2326  HeapTupleSetHintBits(tuple, buffer, HEAP_XMAX_INVALID,
2328  }
2329 }
2330 
2331 
2332 /*
2333  * GetBulkInsertState - prepare status object for a bulk insert
2334  */
2335 BulkInsertState
2336 GetBulkInsertState(void)
2337 {
2338  BulkInsertState bistate;
2339 
2340  bistate = (BulkInsertState) palloc(sizeof(BulkInsertStateData));
2341  bistate->strategy = GetAccessStrategy(BAS_BULKWRITE);
2342  bistate->current_buf = InvalidBuffer;
2343  return bistate;
2344 }
2345 
2346 /*
2347  * FreeBulkInsertState - clean up after finishing a bulk insert
2348  */
2349 void
2350 FreeBulkInsertState(BulkInsertState bistate)
2351 {
2352  if (bistate->current_buf != InvalidBuffer)
2353  ReleaseBuffer(bistate->current_buf);
2354  FreeAccessStrategy(bistate->strategy);
2355  pfree(bistate);
2356 }
2357 
2358 /*
2359  * ReleaseBulkInsertStatePin - release a buffer currently held in bistate
2360  */
2361 void
2362 ReleaseBulkInsertStatePin(BulkInsertState bistate)
2363 {
2364  if (bistate->current_buf != InvalidBuffer)
2365  ReleaseBuffer(bistate->current_buf);
2366  bistate->current_buf = InvalidBuffer;
2367 }
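A hedged sketch of the bulk-insert pattern these routines support (not taken from the source): reuse one BulkInsertState across many heap_insert() calls, as COPY-style loaders do. The helper name and parameters are illustrative.

/* Illustrative sketch, not part of heapam.c */
static void
example_bulk_load(Relation rel, HeapTuple *tuples, int ntuples)
{
	BulkInsertState bistate = GetBulkInsertState();
	CommandId	cid = GetCurrentCommandId(true);
	int			i;

	for (i = 0; i < ntuples; i++)
		heap_insert(rel, tuples[i], cid, HEAP_INSERT_SKIP_FSM, bistate);

	FreeBulkInsertState(bistate);	/* drops the strategy and any held pin */
}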
2368 
2369 
2370 /*
2371  * heap_insert - insert tuple into a heap
2372  *
2373  * The new tuple is stamped with current transaction ID and the specified
2374  * command ID.
2375  *
2376  * If the HEAP_INSERT_SKIP_WAL option is specified, the new tuple is not
2377  * logged in WAL, even for a non-temp relation. Safe usage of this behavior
2378  * requires that we arrange that all new tuples go into new pages not
2379  * containing any tuples from other transactions, and that the relation gets
2380  * fsync'd before commit. (See also heap_sync() comments)
2381  *
2382  * The HEAP_INSERT_SKIP_FSM option is passed directly to
2383  * RelationGetBufferForTuple, which see for more info.
2384  *
2385  * HEAP_INSERT_FROZEN should only be specified for inserts into
2386  * relfilenodes created during the current subtransaction and when
2387  * there are no prior snapshots or pre-existing portals open.
2388  * This causes rows to be frozen, which is an MVCC violation and
2389  * requires explicit options chosen by the user.
2390  *
2391  * HEAP_INSERT_IS_SPECULATIVE is used on so-called "speculative insertions",
2392  * which can be backed out afterwards without aborting the whole transaction.
2393  * Other sessions can wait for the speculative insertion to be confirmed,
2394  * turning it into a regular tuple, or aborted, as if it never existed.
2395  * Speculatively inserted tuples behave as "value locks" of short duration,
2396  * used to implement INSERT .. ON CONFLICT.
2397  *
2398  * Note that most of these options will be applied when inserting into the
2399  * heap's TOAST table, too, if the tuple requires any out-of-line data. Only
2400  * HEAP_INSERT_IS_SPECULATIVE is explicitly ignored, as the toast data does
2401  * not partake in speculative insertion.
2402  *
2403  * The BulkInsertState object (if any; bistate can be NULL for default
2404  * behavior) is also just passed through to RelationGetBufferForTuple.
2405  *
2406  * The return value is the OID assigned to the tuple (either here or by the
2407  * caller), or InvalidOid if no OID. The header fields of *tup are updated
2408  * to match the stored tuple; in particular tup->t_self receives the actual
2409  * TID where the tuple was stored. But note that any toasting of fields
2410  * within the tuple data is NOT reflected into *tup.
2411  */
2412 Oid
2413 heap_insert(Relation relation, HeapTuple tup, CommandId cid,
2414  int options, BulkInsertState bistate)
2415 {
2416  TransactionId xid = GetCurrentTransactionId();
2417  HeapTuple heaptup;
2418  Buffer buffer;
2419  Buffer vmbuffer = InvalidBuffer;
2420  bool all_visible_cleared = false;
2421 
2422  /*
2423  * Fill in tuple header fields, assign an OID, and toast the tuple if
2424  * necessary.
2425  *
2426  * Note: below this point, heaptup is the data we actually intend to store
2427  * into the relation; tup is the caller's original untoasted data.
2428  */
2429  heaptup = heap_prepare_insert(relation, tup, xid, cid, options);
2430 
2431  /*
2432  * Find buffer to insert this tuple into. If the page is all visible,
2433  * this will also pin the requisite visibility map page.
2434  */
2435  buffer = RelationGetBufferForTuple(relation, heaptup->t_len,
2436  InvalidBuffer, options, bistate,
2437  &vmbuffer, NULL);
2438 
2439  /*
2440  * We're about to do the actual insert -- but check for conflict first, to
2441  * avoid possibly having to roll back work we've just done.
2442  *
2443  * This is safe without a recheck as long as there is no possibility of
2444  * another process scanning the page between this check and the insert
2445  * being visible to the scan (i.e., an exclusive buffer content lock is
2446  * continuously held from this point until the tuple insert is visible).
2447  *
2448  * For a heap insert, we only need to check for table-level SSI locks. Our
2449  * new tuple can't possibly conflict with existing tuple locks, and heap
2450  * page locks are only consolidated versions of tuple locks; they do not
2451  * lock "gaps" as index page locks do. So we don't need to specify a
2452  * buffer when making the call, which makes for a faster check.
2453  */
2455 
2456  /* NO EREPORT(ERROR) from here till changes are logged */
2458 
2459  RelationPutHeapTuple(relation, buffer, heaptup,
2460  (options & HEAP_INSERT_SPECULATIVE) != 0);
2461 
2462  if (PageIsAllVisible(BufferGetPage(buffer)))
2463  {
2464  all_visible_cleared = true;
2466  visibilitymap_clear(relation,
2467  ItemPointerGetBlockNumber(&(heaptup->t_self)),
2468  vmbuffer, VISIBILITYMAP_VALID_BITS);
2469  }
2470 
2471  /*
2472  * XXX Should we set PageSetPrunable on this page ?
2473  *
2474  * The inserting transaction may eventually abort thus making this tuple
2475  * DEAD and hence available for pruning. Though we don't want to optimize
2476  * for aborts, if no other tuple in this page is UPDATEd/DELETEd, the
2477  * aborted tuple will never be pruned until next vacuum is triggered.
2478  *
2479  * If you do add PageSetPrunable here, add it in heap_xlog_insert too.
2480  */
2481 
2482  MarkBufferDirty(buffer);
2483 
2484  /* XLOG stuff */
2485  if (!(options & HEAP_INSERT_SKIP_WAL) && RelationNeedsWAL(relation))
2486  {
2487  xl_heap_insert xlrec;
2488  xl_heap_header xlhdr;
2489  XLogRecPtr recptr;
2490  Page page = BufferGetPage(buffer);
2491  uint8 info = XLOG_HEAP_INSERT;
2492  int bufflags = 0;
2493 
2494  /*
2495  * If this is a catalog, we need to transmit combocids to properly
2496  * decode, so log that as well.
2497  */
2499  log_heap_new_cid(relation, heaptup);
2500 
2501  /*
2502  * If this is the single and first tuple on page, we can reinit the
2503  * page instead of restoring the whole thing. Set flag, and hide
2504  * buffer references from XLogInsert.
2505  */
2506  if (ItemPointerGetOffsetNumber(&(heaptup->t_self)) == FirstOffsetNumber &&
2508  {
2509  info |= XLOG_HEAP_INIT_PAGE;
2510  bufflags |= REGBUF_WILL_INIT;
2511  }
2512 
2513  xlrec.offnum = ItemPointerGetOffsetNumber(&heaptup->t_self);
2514  xlrec.flags = 0;
2515  if (all_visible_cleared)
2517  if (options & HEAP_INSERT_SPECULATIVE)
2520 
2521  /*
2522  * For logical decoding, we need the tuple even if we're doing a full
2523  * page write, so make sure it's included even if we take a full-page
2524  * image. (XXX We could alternatively store a pointer into the FPW).
2525  */
2526  if (RelationIsLogicallyLogged(relation))
2527  {
2529  bufflags |= REGBUF_KEEP_DATA;
2530  }
2531 
2532  XLogBeginInsert();
2533  XLogRegisterData((char *) &xlrec, SizeOfHeapInsert);
2534 
2535  xlhdr.t_infomask2 = heaptup->t_data->t_infomask2;
2536  xlhdr.t_infomask = heaptup->t_data->t_infomask;
2537  xlhdr.t_hoff = heaptup->t_data->t_hoff;
2538 
2539  /*
2540  * note we mark xlhdr as belonging to buffer; if XLogInsert decides to
2541  * write the whole page to the xlog, we don't need to store
2542  * xl_heap_header in the xlog.
2543  */
2544  XLogRegisterBuffer(0, buffer, REGBUF_STANDARD | bufflags);
2545  XLogRegisterBufData(0, (char *) &xlhdr, SizeOfHeapHeader);
2546  /* PG73FORMAT: write bitmap [+ padding] [+ oid] + data */
2548  (char *) heaptup->t_data + SizeofHeapTupleHeader,
2549  heaptup->t_len - SizeofHeapTupleHeader);
2550 
2551  /* filtering by origin on a row level is much more efficient */
2553 
2554  recptr = XLogInsert(RM_HEAP_ID, info);
2555 
2556  PageSetLSN(page, recptr);
2557  }
2558 
2559  END_CRIT_SECTION();
2560 
2561  UnlockReleaseBuffer(buffer);
2562  if (vmbuffer != InvalidBuffer)
2563  ReleaseBuffer(vmbuffer);
2564 
2565  /*
2566  * If tuple is cachable, mark it for invalidation from the caches in case
2567  * we abort. Note it is OK to do this after releasing the buffer, because
2568  * the heaptup data structure is all in local memory, not in the shared
2569  * buffer.
2570  */
2571  CacheInvalidateHeapTuple(relation, heaptup, NULL);
2572 
2573  /* Note: speculative insertions are counted too, even if aborted later */
2574  pgstat_count_heap_insert(relation, 1);
2575 
2576  /*
2577  * If heaptup is a private copy, release it. Don't forget to copy t_self
2578  * back to the caller's image, too.
2579  */
2580  if (heaptup != tup)
2581  {
2582  tup->t_self = heaptup->t_self;
2583  heap_freetuple(heaptup);
2584  }
2585 
2586  return HeapTupleGetOid(tup);
2587 }
2588 
2589 /*
2590  * Subroutine for heap_insert(). Prepares a tuple for insertion. This sets the
2591  * tuple header fields, assigns an OID, and toasts the tuple if necessary.
2592  * Returns a toasted version of the tuple if it was toasted, or the original
2593  * tuple if not. Note that in any case, the header fields are also set in
2594  * the original tuple.
2595  */
2596 static HeapTuple
2597 heap_prepare_insert(Relation relation, HeapTuple tup, TransactionId xid,
2598  CommandId cid, int options)
2599 {
2600  /*
2601  * For now, parallel operations are required to be strictly read-only.
2602  * Unlike heap_update() and heap_delete(), an insert should never create a
2603  * combo CID, so it might be possible to relax this restriction, but not
2604  * without more thought and testing.
2605  */
2606  if (IsInParallelMode())
2607  ereport(ERROR,
2608  (errcode(ERRCODE_INVALID_TRANSACTION_STATE),
2609  errmsg("cannot insert tuples during a parallel operation")));
2610 
2611  if (relation->rd_rel->relhasoids)
2612  {
2613 #ifdef NOT_USED
2614  /* this is redundant with an Assert in HeapTupleSetOid */
2616 #endif
2617 
2618  /*
2619  * If the object id of this tuple has already been assigned, trust the
2620  * caller. There are a couple of ways this can happen. At initial db
2621  * creation, the backend program sets oids for tuples. When we define
2622  * an index, we set the oid. Finally, in the future, we may allow
2623  * users to set their own object ids in order to support a persistent
2624  * object store (objects need to contain pointers to one another).
2625  */
2626  if (!OidIsValid(HeapTupleGetOid(tup)))
2627  HeapTupleSetOid(tup, GetNewOid(relation));
2628  }
2629  else
2630  {
2631  /* check that there is no space for an OID */
2632  Assert(!(tup->t_data->t_infomask & HEAP_HASOID));
2633  }
2634 
2635  tup->t_data->t_infomask &= ~(HEAP_XACT_MASK);
2636  tup->t_data->t_infomask2 &= ~(HEAP2_XACT_MASK);
2638  HeapTupleHeaderSetXmin(tup->t_data, xid);
2639  if (options & HEAP_INSERT_FROZEN)
2641 
2642  HeapTupleHeaderSetCmin(tup->t_data, cid);
2643  HeapTupleHeaderSetXmax(tup->t_data, 0); /* for cleanliness */
2644  tup->t_tableOid = RelationGetRelid(relation);
2645 
2646  /*
2647  * If the new tuple is too big for storage or contains already toasted
2648  * out-of-line attributes from some other relation, invoke the toaster.
2649  */
2650  if (relation->rd_rel->relkind != RELKIND_RELATION &&
2651  relation->rd_rel->relkind != RELKIND_MATVIEW)
2652  {
2653  /* toast table entries should never be recursively toasted */
2655  return tup;
2656  }
2657  else if (HeapTupleHasExternal(tup) || tup->t_len > TOAST_TUPLE_THRESHOLD)
2658  return toast_insert_or_update(relation, tup, NULL, options);
2659  else
2660  return tup;
2661 }
2662 
2663 /*
2664  * heap_multi_insert - insert multiple tuples into a heap
2665  *
2666  * This is like heap_insert(), but inserts multiple tuples in one operation.
2667  * That's faster than calling heap_insert() in a loop, because when multiple
2668  * tuples can be inserted on a single page, we can write just a single WAL
2669  * record covering all of them, and only need to lock/unlock the page once.
2670  *
2671  * Note: this leaks memory into the current memory context. You can create a
2672  * temporary context before calling this, if that's a problem.
2673  */
2674 void
2675 heap_multi_insert(Relation relation, HeapTuple *tuples, int ntuples,
2676  CommandId cid, int options, BulkInsertState bistate)
2677 {
2678  TransactionId xid = GetCurrentTransactionId();
2679  HeapTuple *heaptuples;
2680  int i;
2681  int ndone;
2682  char *scratch = NULL;
2683  Page page;
2684  bool needwal;
2685  Size saveFreeSpace;
2686  bool need_tuple_data = RelationIsLogicallyLogged(relation);
2687  bool need_cids = RelationIsAccessibleInLogicalDecoding(relation);
2688 
2689  needwal = !(options & HEAP_INSERT_SKIP_WAL) && RelationNeedsWAL(relation);
2690  saveFreeSpace = RelationGetTargetPageFreeSpace(relation,
2691  HEAP_DEFAULT_FILLFACTOR);
2692 
2693  /* Toast and set header data in all the tuples */
2694  heaptuples = palloc(ntuples * sizeof(HeapTuple));
2695  for (i = 0; i < ntuples; i++)
2696  heaptuples[i] = heap_prepare_insert(relation, tuples[i],
2697  xid, cid, options);
2698 
2699  /*
2700  * Allocate some memory to use for constructing the WAL record. Using
2701  * palloc() within a critical section is not safe, so we allocate this
2702  * beforehand.
2703  */
2704  if (needwal)
2705  scratch = palloc(BLCKSZ);
2706 
2707  /*
2708  * We're about to do the actual inserts -- but check for conflict first,
2709  * to minimize the possibility of having to roll back work we've just
2710  * done.
2711  *
2712  * A check here does not definitively prevent a serialization anomaly;
2713  * that check MUST be done at least past the point of acquiring an
2714  * exclusive buffer content lock on every buffer that will be affected,
2715  * and MAY be done after all inserts are reflected in the buffers and
2716  * those locks are released; otherwise there is a race condition. Since
2717  * multiple buffers can be locked and unlocked in the loop below, and it
2718  * would not be feasible to identify and lock all of those buffers before
2719  * the loop, we must do a final check at the end.
2720  *
2721  * The check here could be omitted with no loss of correctness; it is
2722  * present strictly as an optimization.
2723  *
2724  * For heap inserts, we only need to check for table-level SSI locks. Our
2725  * new tuples can't possibly conflict with existing tuple locks, and heap
2726  * page locks are only consolidated versions of tuple locks; they do not
2727  * lock "gaps" as index page locks do. So we don't need to specify a
2728  * buffer when making the call, which makes for a faster check.
2729  */
2731 
2732  ndone = 0;
2733  while (ndone < ntuples)
2734  {
2735  Buffer buffer;
2736  Buffer vmbuffer = InvalidBuffer;
2737  bool all_visible_cleared = false;
2738  int nthispage;
2739 
2741 
2742  /*
2743  * Find buffer where at least the next tuple will fit. If the page is
2744  * all-visible, this will also pin the requisite visibility map page.
2745  */
2746  buffer = RelationGetBufferForTuple(relation, heaptuples[ndone]->t_len,
2747  InvalidBuffer, options, bistate,
2748  &vmbuffer, NULL);
2749  page = BufferGetPage(buffer);
2750 
2751  /* NO EREPORT(ERROR) from here till changes are logged */
2753 
2754  /*
2755  * RelationGetBufferForTuple has ensured that the first tuple fits.
2756  * Put that on the page, and then as many other tuples as fit.
2757  */
2758  RelationPutHeapTuple(relation, buffer, heaptuples[ndone], false);
2759  for (nthispage = 1; ndone + nthispage < ntuples; nthispage++)
2760  {
2761  HeapTuple heaptup = heaptuples[ndone + nthispage];
2762 
2763  if (PageGetHeapFreeSpace(page) < MAXALIGN(heaptup->t_len) + saveFreeSpace)
2764  break;
2765 
2766  RelationPutHeapTuple(relation, buffer, heaptup, false);
2767 
2768  /*
2769  * We don't use heap_multi_insert for catalog tuples yet, but
2770  * better be prepared...
2771  */
2772  if (needwal && need_cids)
2773  log_heap_new_cid(relation, heaptup);
2774  }
2775 
2776  if (PageIsAllVisible(page))
2777  {
2778  all_visible_cleared = true;
2779  PageClearAllVisible(page);
2780  visibilitymap_clear(relation,
2781  BufferGetBlockNumber(buffer),
2782  vmbuffer, VISIBILITYMAP_VALID_BITS);
2783  }
2784 
2785  /*
2786  * XXX Should we set PageSetPrunable on this page ? See heap_insert()
2787  */
2788 
2789  MarkBufferDirty(buffer);
2790 
2791  /* XLOG stuff */
2792  if (needwal)
2793  {
2794  XLogRecPtr recptr;
2795  xl_heap_multi_insert *xlrec;
2797  char *tupledata;
2798  int totaldatalen;
2799  char *scratchptr = scratch;
2800  bool init;
2801  int bufflags = 0;
2802 
2803  /*
2804  * If the page was previously empty, we can reinit the page
2805  * instead of restoring the whole thing.
2806  */
2807  init = (ItemPointerGetOffsetNumber(&(heaptuples[ndone]->t_self)) == FirstOffsetNumber &&
2808  PageGetMaxOffsetNumber(page) == FirstOffsetNumber + nthispage - 1);
2809 
2810  /* allocate xl_heap_multi_insert struct from the scratch area */
2811  xlrec = (xl_heap_multi_insert *) scratchptr;
2812  scratchptr += SizeOfHeapMultiInsert;
2813 
2814  /*
2815  * Allocate offsets array. Unless we're reinitializing the page,
2816  * in that case the tuples are stored in order starting at
2817  * FirstOffsetNumber and we don't need to store the offsets
2818  * explicitly.
2819  */
2820  if (!init)
2821  scratchptr += nthispage * sizeof(OffsetNumber);
2822 
2823  /* the rest of the scratch space is used for tuple data */
2824  tupledata = scratchptr;
2825 
2826  xlrec->flags = all_visible_cleared ? XLH_INSERT_ALL_VISIBLE_CLEARED : 0;
2827  xlrec->ntuples = nthispage;
2828 
2829  /*
2830  * Write out an xl_multi_insert_tuple and the tuple data itself
2831  * for each tuple.
2832  */
2833  for (i = 0; i < nthispage; i++)
2834  {
2835  HeapTuple heaptup = heaptuples[ndone + i];
2836  xl_multi_insert_tuple *tuphdr;
2837  int datalen;
2838 
2839  if (!init)
2840  xlrec->offsets[i] = ItemPointerGetOffsetNumber(&heaptup->t_self);
2841  /* xl_multi_insert_tuple needs two-byte alignment. */
2842  tuphdr = (xl_multi_insert_tuple *) SHORTALIGN(scratchptr);
2843  scratchptr = ((char *) tuphdr) + SizeOfMultiInsertTuple;
2844 
2845  tuphdr->t_infomask2 = heaptup->t_data->t_infomask2;
2846  tuphdr->t_infomask = heaptup->t_data->t_infomask;
2847  tuphdr->t_hoff = heaptup->t_data->t_hoff;
2848 
2849  /* write bitmap [+ padding] [+ oid] + data */
2850  datalen = heaptup->t_len - SizeofHeapTupleHeader;
2851  memcpy(scratchptr,
2852  (char *) heaptup->t_data + SizeofHeapTupleHeader,
2853  datalen);
2854  tuphdr->datalen = datalen;
2855  scratchptr += datalen;
2856  }
2857  totaldatalen = scratchptr - tupledata;
2858  Assert((scratchptr - scratch) < BLCKSZ);
2859 
2860  if (need_tuple_data)
2862 
2863  /*
2864  * Signal that this is the last xl_heap_multi_insert record
2865  * emitted by this call to heap_multi_insert(). Needed for logical
2866  * decoding so it knows when to cleanup temporary data.
2867  */
2868  if (ndone + nthispage == ntuples)
2869  xlrec->flags |= XLH_INSERT_LAST_IN_MULTI;
2870 
2871  if (init)
2872  {
2873  info |= XLOG_HEAP_INIT_PAGE;
2874  bufflags |= REGBUF_WILL_INIT;
2875  }
2876 
2877  /*
2878  * If we're doing logical decoding, include the new tuple data
2879  * even if we take a full-page image of the page.
2880  */
2881  if (need_tuple_data)
2882  bufflags |= REGBUF_KEEP_DATA;
2883 
2884  XLogBeginInsert();
2885  XLogRegisterData((char *) xlrec, tupledata - scratch);
2886  XLogRegisterBuffer(0, buffer, REGBUF_STANDARD | bufflags);
2887 
2888  XLogRegisterBufData(0, tupledata, totaldatalen);
2889 
2890  /* filtering by origin on a row level is much more efficient */
2892 
2893  recptr = XLogInsert(RM_HEAP2_ID, info);
2894 
2895  PageSetLSN(page, recptr);
2896  }
2897 
2898  END_CRIT_SECTION();
2899 
2900  UnlockReleaseBuffer(buffer);
2901  if (vmbuffer != InvalidBuffer)
2902  ReleaseBuffer(vmbuffer);
2903 
2904  ndone += nthispage;
2905  }
2906 
2907  /*
2908  * We're done with the actual inserts. Check for conflicts again, to
2909  * ensure that all rw-conflicts in to these inserts are detected. Without
2910  * this final check, a sequential scan of the heap may have locked the
2911  * table after the "before" check, missing one opportunity to detect the
2912  * conflict, and then scanned the table before the new tuples were there,
2913  * missing the other chance to detect the conflict.
2914  *
2915  * For heap inserts, we only need to check for table-level SSI locks. Our
2916  * new tuples can't possibly conflict with existing tuple locks, and heap
2917  * page locks are only consolidated versions of tuple locks; they do not
2918  * lock "gaps" as index page locks do. So we don't need to specify a
2919  * buffer when making the call.
2920  */
2922 
2923  /*
2924  * If tuples are cachable, mark them for invalidation from the caches in
2925  * case we abort. Note it is OK to do this after releasing the buffer,
2926  * because the heaptuples data structure is all in local memory, not in
2927  * the shared buffer.
2928  */
2929  if (IsCatalogRelation(relation))
2930  {
2931  for (i = 0; i < ntuples; i++)
2932  CacheInvalidateHeapTuple(relation, heaptuples[i], NULL);
2933  }
2934 
2935  /*
2936  * Copy t_self fields back to the caller's original tuples. This does
2937  * nothing for untoasted tuples (tuples[i] == heaptuples[i]), but it's
2938  * probably faster to always copy than check.
2939  */
2940  for (i = 0; i < ntuples; i++)
2941  tuples[i]->t_self = heaptuples[i]->t_self;
2942 
2943  pgstat_count_heap_insert(relation, ntuples);
2944 }
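Given the note above about leaking into the current memory context, a caller might wrap heap_multi_insert() in a temporary context. This is a hedged sketch, not code from the file; it additionally assumes utils/memutils.h is included.

/* Illustrative sketch, not part of heapam.c; requires utils/memutils.h */
static void
example_multi_insert(Relation rel, HeapTuple *tuples, int ntuples)
{
	MemoryContext batchcxt = AllocSetContextCreate(CurrentMemoryContext,
												   "example batch insert",
												   ALLOCSET_DEFAULT_SIZES);
	MemoryContext oldcxt = MemoryContextSwitchTo(batchcxt);

	heap_multi_insert(rel, tuples, ntuples,
					  GetCurrentCommandId(true), 0, NULL);

	MemoryContextSwitchTo(oldcxt);
	MemoryContextDelete(batchcxt);	/* reclaims anything the call leaked */
}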
2945 
2946 /*
2947  * simple_heap_insert - insert a tuple
2948  *
2949  * Currently, this routine differs from heap_insert only in supplying
2950  * a default command ID and not allowing access to the speedup options.
2951  *
2952  * This should be used rather than using heap_insert directly in most places
2953  * where we are modifying system catalogs.
2954  */
2955 Oid
2956 simple_heap_insert(Relation relation, HeapTuple tup)
2957 {
2958  return heap_insert(relation, tup, GetCurrentCommandId(true), 0, NULL);
2959 }
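A short usage sketch (illustrative only): form a tuple and store it with simple_heap_insert(). Index maintenance is deliberately omitted here and would be required for a real catalog change.

/* Illustrative sketch, not part of heapam.c */
static Oid
example_insert_row(Relation rel, Datum *values, bool *nulls)
{
	HeapTuple	tup = heap_form_tuple(RelationGetDescr(rel), values, nulls);
	Oid			oid = simple_heap_insert(rel, tup);

	CommandCounterIncrement();	/* make the new row visible to later commands */
	heap_freetuple(tup);
	return oid;
}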
2960 
2961 /*
2962  * Given infomask/infomask2, compute the bits that must be saved in the
2963  * "infobits" field of xl_heap_delete, xl_heap_update, xl_heap_lock,
2964  * xl_heap_lock_updated WAL records.
2965  *
2966  * See fix_infomask_from_infobits.
2967  */
2968 static uint8
2969 compute_infobits(uint16 infomask, uint16 infomask2)
2970 {
2971  return
2972  ((infomask & HEAP_XMAX_IS_MULTI) != 0 ? XLHL_XMAX_IS_MULTI : 0) |
2973  ((infomask & HEAP_XMAX_LOCK_ONLY) != 0 ? XLHL_XMAX_LOCK_ONLY : 0) |
2974  ((infomask & HEAP_XMAX_EXCL_LOCK) != 0 ? XLHL_XMAX_EXCL_LOCK : 0) |
2975  /* note we ignore HEAP_XMAX_SHR_LOCK here */
2976  ((infomask & HEAP_XMAX_KEYSHR_LOCK) != 0 ? XLHL_XMAX_KEYSHR_LOCK : 0) |
2977  ((infomask2 & HEAP_KEYS_UPDATED) != 0 ?
2978  XLHL_KEYS_UPDATED : 0);
2979 }
2980 
2981 /*
2982  * Given two versions of the same t_infomask for a tuple, compare them and
2983  * return whether the relevant status for a tuple Xmax has changed. This is
2984  * used after a buffer lock has been released and reacquired: we want to ensure
2985  * that the tuple state continues to be the same as it was when we previously
2986  * examined it.
2987  *
2988  * Note the Xmax field itself must be compared separately.
2989  */
2990 static inline bool
2991 xmax_infomask_changed(uint16 new_infomask, uint16 old_infomask)
2992 {
2993  const uint16 interesting =
2995 
2996  if ((new_infomask & interesting) != (old_infomask & interesting))
2997  return true;
2998 
2999  return false;
3000 }
3001 
3002 /*
3003  * heap_delete - delete a tuple
3004  *
3005  * NB: do not call this directly unless you are prepared to deal with
3006  * concurrent-update conditions. Use simple_heap_delete instead.
3007  *
3008  * relation - table to be modified (caller must hold suitable lock)
3009  * tid - TID of tuple to be deleted
3010  * cid - delete command ID (used for visibility test, and stored into
3011  * cmax if successful)
3012  * crosscheck - if not InvalidSnapshot, also check tuple against this
3013  * wait - true if we should wait for any conflicting update to commit/abort
3014  * hufd - output parameter, filled in failure cases (see below)
3015  *
3016  * Normal, successful return value is HeapTupleMayBeUpdated, which
3017  * actually means we did delete it. Failure return codes are
3018  * HeapTupleSelfUpdated, HeapTupleUpdated, or HeapTupleBeingUpdated
3019  * (the last only possible if wait == false).
3020  *
3021  * In the failure cases, the routine fills *hufd with the tuple's t_ctid,
3022  * t_xmax (resolving a possible MultiXact, if necessary), and t_cmax
3023  * (the last only for HeapTupleSelfUpdated, since we
3024  * cannot obtain cmax from a combocid generated by another transaction).
3025  * See comments for struct HeapUpdateFailureData for additional info.
3026  */
3027 HTSU_Result
3028 heap_delete(Relation relation, ItemPointer tid,
3029  CommandId cid, Snapshot crosscheck, bool wait,
3030  HeapUpdateFailureData *hufd)
3031 {
3032  HTSU_Result result;
3034  ItemId lp;
3035  HeapTupleData tp;
3036  Page page;
3037  BlockNumber block;
3038  Buffer buffer;
3039  Buffer vmbuffer = InvalidBuffer;
3040  TransactionId new_xmax;
3041  uint16 new_infomask,
3042  new_infomask2;
3043  bool have_tuple_lock = false;
3044  bool iscombo;
3045  bool all_visible_cleared = false;
3046  HeapTuple old_key_tuple = NULL; /* replica identity of the tuple */
3047  bool old_key_copied = false;
3048 
3049  Assert(ItemPointerIsValid(tid));
3050 
3051  /*
3052  * Forbid this during a parallel operation, lest it allocate a combocid.
3053  * Other workers might need that combocid for visibility checks, and we
3054  * have no provision for broadcasting it to them.
3055  */
3056  if (IsInParallelMode())
3057  ereport(ERROR,
3058  (errcode(ERRCODE_INVALID_TRANSACTION_STATE),
3059  errmsg("cannot delete tuples during a parallel operation")));
3060 
3061  block = ItemPointerGetBlockNumber(tid);
3062  buffer = ReadBuffer(relation, block);
3063  page = BufferGetPage(buffer);
3064 
3065  /*
3066  * Before locking the buffer, pin the visibility map page if it appears to
3067  * be necessary. Since we haven't got the lock yet, someone else might be
3068  * in the middle of changing this, so we'll need to recheck after we have
3069  * the lock.
3070  */
3071  if (PageIsAllVisible(page))
3072  visibilitymap_pin(relation, block, &vmbuffer);
3073 
3075 
3076  /*
3077  * If we didn't pin the visibility map page and the page has become all
3078  * visible while we were busy locking the buffer, we'll have to unlock and
3079  * re-lock, to avoid holding the buffer lock across an I/O. That's a bit
3080  * unfortunate, but hopefully shouldn't happen often.
3081  */
3082  if (vmbuffer == InvalidBuffer && PageIsAllVisible(page))
3083  {
3084  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3085  visibilitymap_pin(relation, block, &vmbuffer);
3087  }
3088 
3089  lp = PageGetItemId(page, ItemPointerGetOffsetNumber(tid));
3090  Assert(ItemIdIsNormal(lp));
3091 
3092  tp.t_tableOid = RelationGetRelid(relation);
3093  tp.t_data = (HeapTupleHeader) PageGetItem(page, lp);
3094  tp.t_len = ItemIdGetLength(lp);
3095  tp.t_self = *tid;
3096 
3097 l1:
3098  result = HeapTupleSatisfiesUpdate(&tp, cid, buffer);
3099 
3100  if (result == HeapTupleInvisible)
3101  {
3102  UnlockReleaseBuffer(buffer);
3103  ereport(ERROR,
3104  (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
3105  errmsg("attempted to delete invisible tuple")));
3106  }
3107  else if (result == HeapTupleBeingUpdated && wait)
3108  {
3109  TransactionId xwait;
3110  uint16 infomask;
3111 
3112  /* must copy state data before unlocking buffer */
3113  xwait = HeapTupleHeaderGetRawXmax(tp.t_data);
3114  infomask = tp.t_data->t_infomask;
3115 
3116  /*
3117  * Sleep until concurrent transaction ends -- except when there's a
3118  * single locker and it's our own transaction. Note we don't care
3119  * which lock mode the locker has, because we need the strongest one.
3120  *
3121  * Before sleeping, we need to acquire tuple lock to establish our
3122  * priority for the tuple (see heap_lock_tuple). LockTuple will
3123  * release us when we are next-in-line for the tuple.
3124  *
3125  * If we are forced to "start over" below, we keep the tuple lock;
3126  * this arranges that we stay at the head of the line while rechecking
3127  * tuple state.
3128  */
3129  if (infomask & HEAP_XMAX_IS_MULTI)
3130  {
3131  /* wait for multixact */
3132  if (DoesMultiXactIdConflict((MultiXactId) xwait, infomask,
3134  {
3135  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3136 
3137  /* acquire tuple lock, if necessary */
3139  LockWaitBlock, &have_tuple_lock);
3140 
3141  /* wait for multixact */
3143  relation, &(tp.t_self), XLTW_Delete,
3144  NULL);
3146 
3147  /*
3148  * If xwait had just locked the tuple then some other xact
3149  * could update this tuple before we get to this point. Check
3150  * for xmax change, and start over if so.
3151  */
3152  if (xmax_infomask_changed(tp.t_data->t_infomask, infomask) ||
3154  xwait))
3155  goto l1;
3156  }
3157 
3158  /*
3159  * You might think the multixact is necessarily done here, but not
3160  * so: it could have surviving members, namely our own xact or
3161  * other subxacts of this backend. It is legal for us to delete
3162  * the tuple in either case, however (the latter case is
3163  * essentially a situation of upgrading our former shared lock to
3164  * exclusive). We don't bother changing the on-disk hint bits
3165  * since we are about to overwrite the xmax altogether.
3166  */
3167  }
3168  else if (!TransactionIdIsCurrentTransactionId(xwait))
3169  {
3170  /*
3171  * Wait for regular transaction to end; but first, acquire tuple
3172  * lock.
3173  */
3174  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3176  LockWaitBlock, &have_tuple_lock);
3177  XactLockTableWait(xwait, relation, &(tp.t_self), XLTW_Delete);
3179 
3180  /*
3181  * xwait is done, but if xwait had just locked the tuple then some
3182  * other xact could update this tuple before we get to this point.
3183  * Check for xmax change, and start over if so.
3184  */
3185  if (xmax_infomask_changed(tp.t_data->t_infomask, infomask) ||
3187  xwait))
3188  goto l1;
3189 
3190  /* Otherwise check if it committed or aborted */
3191  UpdateXmaxHintBits(tp.t_data, buffer, xwait);
3192  }
3193 
3194  /*
3195  * We may overwrite if previous xmax aborted, or if it committed but
3196  * only locked the tuple without updating it.
3197  */
3198  if ((tp.t_data->t_infomask & HEAP_XMAX_INVALID) ||
3201  result = HeapTupleMayBeUpdated;
3202  else
3203  result = HeapTupleUpdated;
3204  }
3205 
3206  if (crosscheck != InvalidSnapshot && result == HeapTupleMayBeUpdated)
3207  {
3208  /* Perform additional check for transaction-snapshot mode RI updates */
3209  if (!HeapTupleSatisfiesVisibility(&tp, crosscheck, buffer))
3210  result = HeapTupleUpdated;
3211  }
3212 
3213  if (result != HeapTupleMayBeUpdated)
3214  {
3215  Assert(result == HeapTupleSelfUpdated ||
3216  result == HeapTupleUpdated ||
3217  result == HeapTupleBeingUpdated);
3219  hufd->ctid = tp.t_data->t_ctid;
3221  if (result == HeapTupleSelfUpdated)
3222  hufd->cmax = HeapTupleHeaderGetCmax(tp.t_data);
3223  else
3224  hufd->cmax = InvalidCommandId;
3225  UnlockReleaseBuffer(buffer);
3226  if (have_tuple_lock)
3227  UnlockTupleTuplock(relation, &(tp.t_self), LockTupleExclusive);
3228  if (vmbuffer != InvalidBuffer)
3229  ReleaseBuffer(vmbuffer);
3230  return result;
3231  }
3232 
3233  /*
3234  * We're about to do the actual delete -- check for conflict first, to
3235  * avoid possibly having to roll back work we've just done.
3236  *
3237  * This is safe without a recheck as long as there is no possibility of
3238  * another process scanning the page between this check and the delete
3239  * being visible to the scan (i.e., an exclusive buffer content lock is
3240  * continuously held from this point until the tuple delete is visible).
3241  */
3242  CheckForSerializableConflictIn(relation, &tp, buffer);
3243 
3244  /* replace cid with a combo cid if necessary */
3245  HeapTupleHeaderAdjustCmax(tp.t_data, &cid, &iscombo);
3246 
3247  /*
3248  * Compute replica identity tuple before entering the critical section so
3249  * we don't PANIC upon a memory allocation failure.
3250  */
3251  old_key_tuple = ExtractReplicaIdentity(relation, &tp, true, &old_key_copied);
3252 
3253  /*
3254  * If this is the first possibly-multixact-able operation in the current
3255  * transaction, set my per-backend OldestMemberMXactId setting. We can be
3256  * certain that the transaction will never become a member of any older
3257  * MultiXactIds than that. (We have to do this even if we end up just
3258  * using our own TransactionId below, since some other backend could
3259  * incorporate our XID into a MultiXact immediately afterwards.)
3260  */
3262 
3265  xid, LockTupleExclusive, true,
3266  &new_xmax, &new_infomask, &new_infomask2);
3267 
3269 
3270  /*
3271  * If this transaction commits, the tuple will become DEAD sooner or
3272  * later. Set flag that this page is a candidate for pruning once our xid
3273  * falls below the OldestXmin horizon. If the transaction finally aborts,
3274  * the subsequent page pruning will be a no-op and the hint will be
3275  * cleared.
3276  */
3277  PageSetPrunable(page, xid);
3278 
3279  if (PageIsAllVisible(page))
3280  {
3281  all_visible_cleared = true;
3282  PageClearAllVisible(page);
3283  visibilitymap_clear(relation, BufferGetBlockNumber(buffer),
3284  vmbuffer, VISIBILITYMAP_VALID_BITS);
3285  }
3286 
3287  /* store transaction information of xact deleting the tuple */
3290  tp.t_data->t_infomask |= new_infomask;
3291  tp.t_data->t_infomask2 |= new_infomask2;
3293  HeapTupleHeaderSetXmax(tp.t_data, new_xmax);
3294  HeapTupleHeaderSetCmax(tp.t_data, cid, iscombo);
3295  /* Make sure there is no forward chain link in t_ctid */
3296  tp.t_data->t_ctid = tp.t_self;
3297 
3298  MarkBufferDirty(buffer);
3299 
3300  /*
3301  * XLOG stuff
3302  *
3303  * NB: heap_abort_speculative() uses the same xlog record and replay
3304  * routines.
3305  */
3306  if (RelationNeedsWAL(relation))
3307  {
3308  xl_heap_delete xlrec;
3309  XLogRecPtr recptr;
3310 
3311  /* For logical decode we need combocids to properly decode the catalog */
3313  log_heap_new_cid(relation, &tp);
3314 
3315  xlrec.flags = all_visible_cleared ? XLH_DELETE_ALL_VISIBLE_CLEARED : 0;
3317  tp.t_data->t_infomask2);
3319  xlrec.xmax = new_xmax;
3320 
3321  if (old_key_tuple != NULL)
3322  {
3323  if (relation->rd_rel->relreplident == REPLICA_IDENTITY_FULL)
3325  else
3327  }
3328 
3329  XLogBeginInsert();
3330  XLogRegisterData((char *) &xlrec, SizeOfHeapDelete);
3331 
3332  XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
3333 
3334  /*
3335  * Log replica identity of the deleted tuple if there is one
3336  */
3337  if (old_key_tuple != NULL)
3338  {
3339  xl_heap_header xlhdr;
3340 
3341  xlhdr.t_infomask2 = old_key_tuple->t_data->t_infomask2;
3342  xlhdr.t_infomask = old_key_tuple->t_data->t_infomask;
3343  xlhdr.t_hoff = old_key_tuple->t_data->t_hoff;
3344 
3345  XLogRegisterData((char *) &xlhdr, SizeOfHeapHeader);
3346  XLogRegisterData((char *) old_key_tuple->t_data
3348  old_key_tuple->t_len
3350  }
3351 
3352  /* filtering by origin on a row level is much more efficient */
3354 
3355  recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_DELETE);
3356 
3357  PageSetLSN(page, recptr);
3358  }
3359 
3360  END_CRIT_SECTION();
3361 
3362  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3363 
3364  if (vmbuffer != InvalidBuffer)
3365  ReleaseBuffer(vmbuffer);
3366 
3367  /*
3368  * If the tuple has toasted out-of-line attributes, we need to delete
3369  * those items too. We have to do this before releasing the buffer
3370  * because we need to look at the contents of the tuple, but it's OK to
3371  * release the content lock on the buffer first.
3372  */
3373  if (relation->rd_rel->relkind != RELKIND_RELATION &&
3374  relation->rd_rel->relkind != RELKIND_MATVIEW)
3375  {
3376  /* toast table entries should never be recursively toasted */
3378  }
3379  else if (HeapTupleHasExternal(&tp))
3380  toast_delete(relation, &tp, false);
3381 
3382  /*
3383  * Mark tuple for invalidation from system caches at next command
3384  * boundary. We have to do this before releasing the buffer because we
3385  * need to look at the contents of the tuple.
3386  */
3387  CacheInvalidateHeapTuple(relation, &tp, NULL);
3388 
3389  /* Now we can release the buffer */
3390  ReleaseBuffer(buffer);
3391 
3392  /*
3393  * Release the lmgr tuple lock, if we had it.
3394  */
3395  if (have_tuple_lock)
3396  UnlockTupleTuplock(relation, &(tp.t_self), LockTupleExclusive);
3397 
3398  pgstat_count_heap_delete(relation);
3399 
3400  if (old_key_tuple != NULL && old_key_copied)
3401  heap_freetuple(old_key_tuple);
3402 
3403  return HeapTupleMayBeUpdated;
3404 }
3405 
3406 /*
3407  * simple_heap_delete - delete a tuple
3408  *
3409  * This routine may be used to delete a tuple when concurrent updates of
3410  * the target tuple are not expected (for example, because we have a lock
3411  * on the relation associated with the tuple). Any failure is reported
3412  * via ereport().
3413  */
3414 void
3415 simple_heap_delete(Relation relation, ItemPointer tid)
3416 {
3417  HTSU_Result result;
3418  HeapUpdateFailureData hufd;
3419 
3420  result = heap_delete(relation, tid,
3422  true /* wait for commit */ ,
3423  &hufd);
3424  switch (result)
3425  {
3426  case HeapTupleSelfUpdated:
3427  /* Tuple was already updated in current command? */
3428  elog(ERROR, "tuple already updated by self");
3429  break;
3430 
3431  case HeapTupleMayBeUpdated:
3432  /* done successfully */
3433  break;
3434 
3435  case HeapTupleUpdated:
3436  elog(ERROR, "tuple concurrently updated");
3437  break;
3438 
3439  default:
3440  elog(ERROR, "unrecognized heap_delete status: %u", result);
3441  break;
3442  }
3443 }
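A hedged sketch (not from the source) of the scan-and-delete pattern simple_heap_delete() is meant for, where the caller's lock rules out concurrent updates; the helper name is made up.

/* Illustrative sketch, not part of heapam.c */
static void
example_delete_visible_rows(Relation rel, Snapshot snapshot)
{
	HeapScanDesc scan = heap_beginscan(rel, snapshot, 0, NULL);
	HeapTuple	tuple;

	while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
		simple_heap_delete(rel, &tuple->t_self);	/* ereports on conflict */

	heap_endscan(scan);
}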
3444 
3445 /*
3446  * heap_update - replace a tuple
3447  *
3448  * NB: do not call this directly unless you are prepared to deal with
3449  * concurrent-update conditions. Use simple_heap_update instead.
3450  *
3451  * relation - table to be modified (caller must hold suitable lock)
3452  * otid - TID of old tuple to be replaced
3453  * newtup - newly constructed tuple data to store
3454  * cid - update command ID (used for visibility test, and stored into
3455  * cmax/cmin if successful)
3456  * crosscheck - if not InvalidSnapshot, also check old tuple against this
3457  * wait - true if we should wait for any conflicting update to commit/abort
3458  * hufd - output parameter, filled in failure cases (see below)
3459  * lockmode - output parameter, filled with lock mode acquired on tuple
3460  *
3461  * Normal, successful return value is HeapTupleMayBeUpdated, which
3462  * actually means we *did* update it. Failure return codes are
3463  * HeapTupleSelfUpdated, HeapTupleUpdated, or HeapTupleBeingUpdated
3464  * (the last only possible if wait == false).
3465  *
3466  * On success, the header fields of *newtup are updated to match the new
3467  * stored tuple; in particular, newtup->t_self is set to the TID where the
3468  * new tuple was inserted, and its HEAP_ONLY_TUPLE flag is set iff a HOT
3469  * update was done. However, any TOAST changes in the new tuple's
3470  * data are not reflected into *newtup.
3471  *
3472  * In the failure cases, the routine fills *hufd with the tuple's t_ctid,
3473  * t_xmax (resolving a possible MultiXact, if necessary), and t_cmax
3474  * (the last only for HeapTupleSelfUpdated, since we
3475  * cannot obtain cmax from a combocid generated by another transaction).
3476  * See comments for struct HeapUpdateFailureData for additional info.
3477  */
3478 HTSU_Result
3479 heap_update(Relation relation, ItemPointer otid, HeapTuple newtup,
3480  CommandId cid, Snapshot crosscheck, bool wait,
3481  HeapUpdateFailureData *hufd, LockTupleMode *lockmode)
3482 {
3483  HTSU_Result result;
3485  Bitmapset *hot_attrs;
3486  Bitmapset *key_attrs;
3487  Bitmapset *id_attrs;
3488  Bitmapset *interesting_attrs;
3489  Bitmapset *modified_attrs;
3490  ItemId lp;
3491  HeapTupleData oldtup;
3492  HeapTuple heaptup;
3493  HeapTuple old_key_tuple = NULL;
3494  bool old_key_copied = false;
3495  Page page;
3496  BlockNumber block;
3497  MultiXactStatus mxact_status;
3498  Buffer buffer,
3499  newbuf,
3500  vmbuffer = InvalidBuffer,
3501  vmbuffer_new = InvalidBuffer;
3502  bool need_toast;
3503  Size newtupsize,
3504  pagefree;
3505  bool have_tuple_lock = false;
3506  bool iscombo;
3507  bool use_hot_update = false;
3508  bool hot_attrs_checked = false;
3509  bool key_intact;
3510  bool all_visible_cleared = false;
3511  bool all_visible_cleared_new = false;
3512  bool checked_lockers;
3513  bool locker_remains;
3514  TransactionId xmax_new_tuple,
3515  xmax_old_tuple;
3516  uint16 infomask_old_tuple,
3517  infomask2_old_tuple,
3518  infomask_new_tuple,
3519  infomask2_new_tuple;
3520 
3521  Assert(ItemPointerIsValid(otid));
3522 
3523  /*
3524  * Forbid this during a parallel operation, lest it allocate a combocid.
3525  * Other workers might need that combocid for visibility checks, and we
3526  * have no provision for broadcasting it to them.
3527  */
3528  if (IsInParallelMode())
3529  ereport(ERROR,
3530  (errcode(ERRCODE_INVALID_TRANSACTION_STATE),
3531  errmsg("cannot update tuples during a parallel operation")));
3532 
3533  /*
3534  * Fetch the list of attributes to be checked for various operations.
3535  *
3536  * For HOT considerations, this is wasted effort if we fail to update or
3537  * have to put the new tuple on a different page. But we must compute the
3538  * list before obtaining buffer lock --- in the worst case, if we are
3539  * doing an update on one of the relevant system catalogs, we could
3540  * deadlock if we try to fetch the list later. In any case, the relcache
3541  * caches the data so this is usually pretty cheap.
3542  *
3543  * We also need columns used by the replica identity and columns that are
3544  * considered the "key" of rows in the table.
3545  *
3546  * Note that we get copies of each bitmap, so we need not worry about
3547  * relcache flush happening midway through.
3548  */
3549  hot_attrs = RelationGetIndexAttrBitmap(relation, INDEX_ATTR_BITMAP_ALL);
3550  key_attrs = RelationGetIndexAttrBitmap(relation, INDEX_ATTR_BITMAP_KEY);
3551  id_attrs = RelationGetIndexAttrBitmap(relation,
3553 
3554 
3555  block = ItemPointerGetBlockNumber(otid);
3556  buffer = ReadBuffer(relation, block);
3557  page = BufferGetPage(buffer);
3558 
3559  interesting_attrs = NULL;
3560 
3561  /*
3562  * If the page is already full, there is hardly any chance of doing a HOT
3563  * update on this page. It might be wasteful effort to look for index
3564  * column updates only to later reject HOT updates for lack of space in
3565  * the same page. So we are conservative and only fetch hot_attrs if the
3566  * page is not already full. Since we are already holding a pin on the
3567  * buffer, there is no chance that the buffer can get cleaned up
3568  * concurrently and even if that was possible, in the worst case we lose a
3569  * chance to do a HOT update.
3570  */
3571  if (!PageIsFull(page))
3572  {
3573  interesting_attrs = bms_add_members(interesting_attrs, hot_attrs);
3574  hot_attrs_checked = true;
3575  }
3576  interesting_attrs = bms_add_members(interesting_attrs, key_attrs);
3577  interesting_attrs = bms_add_members(interesting_attrs, id_attrs);
3578 
3579  /*
3580  * Before locking the buffer, pin the visibility map page if it appears to
3581  * be necessary. Since we haven't got the lock yet, someone else might be
3582  * in the middle of changing this, so we'll need to recheck after we have
3583  * the lock.
3584  */
3585  if (PageIsAllVisible(page))
3586  visibilitymap_pin(relation, block, &vmbuffer);
3587 
3588  LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3589 
3590  lp = PageGetItemId(page, ItemPointerGetOffsetNumber(otid));
3591  Assert(ItemIdIsNormal(lp));
3592 
3593  /*
3594  * Fill in enough data in oldtup for HeapDetermineModifiedColumns to work
3595  * properly.
3596  */
3597  oldtup.t_tableOid = RelationGetRelid(relation);
3598  oldtup.t_data = (HeapTupleHeader) PageGetItem(page, lp);
3599  oldtup.t_len = ItemIdGetLength(lp);
3600  oldtup.t_self = *otid;
3601 
3602  /* the new tuple is ready, except for this: */
3603  newtup->t_tableOid = RelationGetRelid(relation);
3604 
3605  /* Fill in OID for newtup */
3606  if (relation->rd_rel->relhasoids)
3607  {
3608 #ifdef NOT_USED
3609  /* this is redundant with an Assert in HeapTupleSetOid */
3610  Assert(newtup->t_data->t_infomask & HEAP_HASOID);
3611 #endif
3612  HeapTupleSetOid(newtup, HeapTupleGetOid(&oldtup));
3613  }
3614  else
3615  {
3616  /* check there is no space for an OID */
3617  Assert(!(newtup->t_data->t_infomask & HEAP_HASOID));
3618  }
3619 
3620  /* Determine columns modified by the update. */
3621  modified_attrs = HeapDetermineModifiedColumns(relation, interesting_attrs,
3622  &oldtup, newtup);
3623 
3624  /*
3625  * If we're not updating any "key" column, we can grab a weaker lock type.
3626  * This allows for more concurrency when we are running simultaneously
3627  * with foreign key checks.
3628  *
3629  * Note that if a column gets detoasted while executing the update, but
3630  * the value ends up being the same, this test will fail and we will use
3631  * the stronger lock. This is acceptable; the important case to optimize
3632  * is updates that don't manipulate key columns, not those that
3633  * serendipitously arrive at the same key values.
3634  */
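 /*
  * (Illustrative note, not in the original source: a foreign-key check's
  * SELECT ... FOR KEY SHARE acquires LockTupleKeyShare, which conflicts
  * with LockTupleExclusive but not with LockTupleNoKeyExclusive, so
  * updates that leave key columns alone can run concurrently with such
  * checks.)
  */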
3635  if (!bms_overlap(modified_attrs, key_attrs))
3636  {
3637  *lockmode = LockTupleNoKeyExclusive;
3638  mxact_status = MultiXactStatusNoKeyUpdate;
3639  key_intact = true;
3640 
3641  /*
3642  * If this is the first possibly-multixact-able operation in the
3643  * current transaction, set my per-backend OldestMemberMXactId
3644  * setting. We can be certain that the transaction will never become a
3645  * member of any older MultiXactIds than that. (We have to do this
3646  * even if we end up just using our own TransactionId below, since
3647  * some other backend could incorporate our XID into a MultiXact
3648  * immediately afterwards.)
3649  */
3650  MultiXactIdSetOldestMember();
3651  }
3652  else
3653  {
3654  *lockmode = LockTupleExclusive;
3655  mxact_status = MultiXactStatusUpdate;
3656  key_intact = false;
3657  }
3658 
3659  /*
3660  * Note: beyond this point, use oldtup not otid to refer to old tuple.
3661  * otid may very well point at newtup->t_self, which we will overwrite
3662  * with the new tuple's location, so there's great risk of confusion if we
3663  * use otid anymore.
3664  */
3665 
3666 l2:
3667  checked_lockers = false;
3668  locker_remains = false;
3669  result = HeapTupleSatisfiesUpdate(&oldtup, cid, buffer);
3670 
3671  /* see below about the "no wait" case */
3672  Assert(result != HeapTupleBeingUpdated || wait);
3673 
3674  if (result == HeapTupleInvisible)
3675  {
3676  UnlockReleaseBuffer(buffer);
3677  ereport(ERROR,
3678  (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
3679  errmsg("attempted to update invisible tuple")));
3680  }
3681  else if (result == HeapTupleBeingUpdated && wait)
3682  {
3683  TransactionId xwait;
3684  uint16 infomask;
3685  bool can_continue = false;
3686 
3687  /*
3688  * XXX note that we don't consider the "no wait" case here. This
3689  * isn't a problem currently because no caller uses that case, but it
3690  * should be fixed if such a caller is introduced. It wasn't a
3691  * problem previously because this code would always wait, but now
3692  * that some tuple locks do not conflict with one of the lock modes we
3693  * use, it is possible that this case is interesting to handle
3694  * specially.
3695  *
3696  * This may cause failures with third-party code that calls
3697  * heap_update directly.
3698  */
3699 
3700  /* must copy state data before unlocking buffer */
3701  xwait = HeapTupleHeaderGetRawXmax(oldtup.t_data);
3702  infomask = oldtup.t_data->t_infomask;
3703 
3704  /*
3705  * Now we have to do something about the existing locker. If it's a
3706  * multi, sleep on it; we might be awakened before it is completely
3707  * gone (or even not sleep at all in some cases); we need to preserve
3708  * it as locker, unless it is gone completely.
3709  *
3710  * If it's not a multi, we need to check for sleeping conditions
3711  * before actually going to sleep. If the update doesn't conflict
3712  * with the locks, we just continue without sleeping (but making sure
3713  * it is preserved).
3714  *
3715  * Before sleeping, we need to acquire tuple lock to establish our
3716  * priority for the tuple (see heap_lock_tuple). LockTuple will
3717  * release us when we are next-in-line for the tuple. Note we must
3718  * not acquire the tuple lock until we're sure we're going to sleep;
3719  * otherwise we're open for race conditions with other transactions
3720  * holding the tuple lock which sleep on us.
3721  *
3722  * If we are forced to "start over" below, we keep the tuple lock;
3723  * this arranges that we stay at the head of the line while rechecking
3724  * tuple state.
3725  */
3726  if (infomask & HEAP_XMAX_IS_MULTI)
3727  {
3728  TransactionId update_xact;
3729  int remain;
3730 
3731  if (DoesMultiXactIdConflict((MultiXactId) xwait, infomask,
3732  *lockmode))
3733  {
3734  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3735 
3736  /* acquire tuple lock, if necessary */
3737  heap_acquire_tuplock(relation, &(oldtup.t_self), *lockmode,
3738  LockWaitBlock, &have_tuple_lock);
3739 
3740  /* wait for multixact */
3741  MultiXactIdWait((MultiXactId) xwait, mxact_status, infomask,
3742  relation, &oldtup.t_self, XLTW_Update,
3743  &remain);
3744  checked_lockers = true;
3745  locker_remains = remain != 0;
3746  LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3747 
3748  /*
3749  * If xwait had just locked the tuple then some other xact
3750  * could update this tuple before we get to this point. Check
3751  * for xmax change, and start over if so.
3752  */
3753  if (xmax_infomask_changed(oldtup.t_data->t_infomask,
3754  infomask) ||
3755  !TransactionIdEquals(HeapTupleHeaderGetRawXmax(oldtup.t_data),
3756  xwait))
3757  goto l2;
3758  }
3759 
3760  /*
3761  * Note that the multixact may not be done by now. It could have
3762  * surviving members; our own xact or other subxacts of this
3763  * backend, and also any other concurrent transaction that locked
3764  * the tuple with KeyShare if we only got TupleLockUpdate. If
3765  * this is the case, we have to be careful to mark the updated
3766  * tuple with the surviving members in Xmax.
3767  *
3768  * Note that there could have been another update in the
3769  * MultiXact. In that case, we need to check whether it committed
3770  * or aborted. If it aborted we are safe to update it again;
3771  * otherwise there is an update conflict, and we have to return
3772  * HeapTupleUpdated below.
3773  *
3774  * In the LockTupleExclusive case, we still need to preserve the
3775  * surviving members: those would include the tuple locks we had
3776  * before this one, which are important to keep in case this
3777  * subxact aborts.
3778  */
3779  if (!HEAP_XMAX_IS_LOCKED_ONLY(oldtup.t_data->t_infomask))
3780  update_xact = HeapTupleGetUpdateXid(oldtup.t_data);
3781  else
3782  update_xact = InvalidTransactionId;
3783 
3784  /*
3785  * There was no UPDATE in the MultiXact; or it aborted. No
3786  * TransactionIdIsInProgress() call needed here, since we called
3787  * MultiXactIdWait() above.
3788  */
3789  if (!TransactionIdIsValid(update_xact) ||
3790  TransactionIdDidAbort(update_xact))
3791  can_continue = true;
3792  }
3793  else if (TransactionIdIsCurrentTransactionId(xwait))
3794  {
3795  /*
3796  * The only locker is ourselves; we can avoid grabbing the tuple
3797  * lock here, but must preserve our locking information.
3798  */
3799  checked_lockers = true;
3800  locker_remains = true;
3801  can_continue = true;
3802  }
3803  else if (HEAP_XMAX_IS_KEYSHR_LOCKED(infomask) && key_intact)
3804  {
3805  /*
3806  * If it's just a key-share locker, and we're not changing the key
3807  * columns, we don't need to wait for it to end; but we need to
3808  * preserve it as locker.
3809  */
3810  checked_lockers = true;
3811  locker_remains = true;
3812  can_continue = true;
3813  }
3814  else
3815  {
3816  /*
3817  * Wait for regular transaction to end; but first, acquire tuple
3818  * lock.
3819  */
3820  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3821  heap_acquire_tuplock(relation, &(oldtup.t_self), *lockmode,
3822  LockWaitBlock, &have_tuple_lock);
3823  XactLockTableWait(xwait, relation, &oldtup.t_self,
3824  XLTW_Update);
3825  checked_lockers = true;
3826  LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3827 
3828  /*
3829  * xwait is done, but if xwait had just locked the tuple then some
3830  * other xact could update this tuple before we get to this point.
3831  * Check for xmax change, and start over if so.
3832  */
3833  if (xmax_infomask_changed(oldtup.t_data->t_infomask, infomask) ||
3834  !TransactionIdEquals(xwait,
3835  HeapTupleHeaderGetRawXmax(oldtup.t_data)))
3836  goto l2;
3837 
3838  /* Otherwise check if it committed or aborted */
3839  UpdateXmaxHintBits(oldtup.t_data, buffer, xwait);
3840  if (oldtup.t_data->t_infomask & HEAP_XMAX_INVALID)
3841  can_continue = true;
3842  }
3843 
3844  result = can_continue ? HeapTupleMayBeUpdated : HeapTupleUpdated;
3845  }
3846 
3847  if (crosscheck != InvalidSnapshot && result == HeapTupleMayBeUpdated)
3848  {
3849  /* Perform additional check for transaction-snapshot mode RI updates */
3850  if (!HeapTupleSatisfiesVisibility(&oldtup, crosscheck, buffer))
3851  result = HeapTupleUpdated;
3852  }
3853 
3854  if (result != HeapTupleMayBeUpdated)
3855  {
3856  Assert(result == HeapTupleSelfUpdated ||
3857  result == HeapTupleUpdated ||
3858  result == HeapTupleBeingUpdated);
3859  Assert(!(oldtup.t_data->t_infomask & HEAP_XMAX_INVALID));
3860  hufd->ctid = oldtup.t_data->t_ctid;
3861  hufd->xmax = HeapTupleHeaderGetUpdateXid(oldtup.t_data);
3862  if (result == HeapTupleSelfUpdated)
3863  hufd->cmax = HeapTupleHeaderGetCmax(oldtup.t_data);
3864  else
3865  hufd->cmax = InvalidCommandId;
3866  UnlockReleaseBuffer(buffer);
3867  if (have_tuple_lock)
3868  UnlockTupleTuplock(relation, &(oldtup.t_self), *lockmode);
3869  if (vmbuffer != InvalidBuffer)
3870  ReleaseBuffer(vmbuffer);
3871  bms_free(hot_attrs);
3872  bms_free(key_attrs);
3873  bms_free(id_attrs);
3874  bms_free(modified_attrs);
3875  bms_free(interesting_attrs);
3876  return result;
3877  }
3878 
3879  /*
3880  * If we didn't pin the visibility map page and the page has become all
3881  * visible while we were busy locking the buffer, or during some
3882  * subsequent window during which we had it unlocked, we'll have to unlock
3883  * and re-lock, to avoid holding the buffer lock across an I/O. That's a
3884  * bit unfortunate, especially since we'll now have to recheck whether the
3885  * tuple has been locked or updated under us, but hopefully it won't
3886  * happen very often.
3887  */
3888  if (vmbuffer == InvalidBuffer && PageIsAllVisible(page))
3889  {
3890  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3891  visibilitymap_pin(relation, block, &vmbuffer);
3892  LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3893  goto l2;
3894  }
3895 
3896  /* Fill in transaction status data */
3897 
3898  /*
3899  * If the tuple we're updating is locked, we need to preserve the locking
3900  * info in the old tuple's Xmax. Prepare a new Xmax value for this.
3901  */
3902  compute_new_xmax_infomask(HeapTupleHeaderGetRawXmax(oldtup.t_data),
3903  oldtup.t_data->t_infomask,
3904  oldtup.t_data->t_infomask2,
3905  xid, *lockmode, true,
3906  &xmax_old_tuple, &infomask_old_tuple,
3907  &infomask2_old_tuple);
3908 
3909  /*
3910  * And also prepare an Xmax value for the new copy of the tuple. If there
3911  * was no xmax previously, or there was one but all lockers are now gone,
3912  * then use InvalidXid; otherwise, get the xmax from the old tuple. (In
3913  * rare cases that might also be InvalidXid and yet not have the
3914  * HEAP_XMAX_INVALID bit set; that's fine.)
3915  */
3916  if ((oldtup.t_data->t_infomask & HEAP_XMAX_INVALID) ||
3918  (checked_lockers && !locker_remains))
3919  xmax_new_tuple = InvalidTransactionId;
3920  else
3921  xmax_new_tuple = HeapTupleHeaderGetRawXmax(oldtup.t_data);
3922 
3923  if (!TransactionIdIsValid(xmax_new_tuple))
3924  {
3925  infomask_new_tuple = HEAP_XMAX_INVALID;
3926  infomask2_new_tuple = 0;
3927  }
3928  else
3929  {
3930  /*
3931  * If we found a valid Xmax for the new tuple, then the infomask bits
3932  * to use on the new tuple depend on what was there on the old one.
3933  * Note that since we're doing an update, the only possibility is that
3934  * the lockers had FOR KEY SHARE lock.
3935  */
3936  if (oldtup.t_data->t_infomask & HEAP_XMAX_IS_MULTI)
3937  {
3938  GetMultiXactIdHintBits(xmax_new_tuple, &infomask_new_tuple,
3939  &infomask2_new_tuple);
3940  }
3941  else
3942  {
3943  infomask_new_tuple = HEAP_XMAX_KEYSHR_LOCK | HEAP_XMAX_LOCK_ONLY;
3944  infomask2_new_tuple = 0;
3945  }
3946  }
3947 
3948  /*
3949  * Prepare the new tuple with the appropriate initial values of Xmin and
3950  * Xmax, as well as initial infomask bits as computed above.
3951  */
3952  newtup->t_data->t_infomask &= ~(HEAP_XACT_MASK);
3953  newtup->t_data->t_infomask2 &= ~(HEAP2_XACT_MASK);
3954  HeapTupleHeaderSetXmin(newtup->t_data, xid);
3955  HeapTupleHeaderSetCmin(newtup->t_data, cid);
3956  newtup->t_data->t_infomask |= HEAP_UPDATED | infomask_new_tuple;
3957  newtup->t_data->t_infomask2 |= infomask2_new_tuple;
3958  HeapTupleHeaderSetXmax(newtup->t_data, xmax_new_tuple);
3959 
3960  /*
3961  * Replace cid with a combo cid if necessary. Note that we already put
3962  * the plain cid into the new tuple.
3963  */
3964  HeapTupleHeaderAdjustCmax(oldtup.t_data, &cid, &iscombo);
3965 
3966  /*
3967  * If the toaster needs to be activated, OR if the new tuple will not fit
3968  * on the same page as the old, then we need to release the content lock
3969  * (but not the pin!) on the old tuple's buffer while we are off doing
3970  * TOAST and/or table-file-extension work. We must mark the old tuple to
3971  * show that it's locked, else other processes may try to update it
3972  * themselves.
3973  *
3974  * We need to invoke the toaster if there are already any out-of-line
3975  * toasted values present, or if the new tuple is over-threshold.
3976  */
3977  if (relation->rd_rel->relkind != RELKIND_RELATION &&
3978  relation->rd_rel->relkind != RELKIND_MATVIEW)
3979  {
3980  /* toast table entries should never be recursively toasted */
3981  Assert(!HeapTupleHasExternal(&oldtup));
3982  Assert(!HeapTupleHasExternal(newtup));
3983  need_toast = false;
3984  }
3985  else
3986  need_toast = (HeapTupleHasExternal(&oldtup) ||
3987  HeapTupleHasExternal(newtup) ||
3988  newtup->t_len > TOAST_TUPLE_THRESHOLD);
3989 
3990  pagefree = PageGetHeapFreeSpace(page);
3991 
3992  newtupsize = MAXALIGN(newtup->t_len);
3993 
3994  if (need_toast || newtupsize > pagefree)
3995  {
3996  TransactionId xmax_lock_old_tuple;
3997  uint16 infomask_lock_old_tuple,
3998  infomask2_lock_old_tuple;
3999  bool cleared_all_frozen = false;
4000 
4001  /*
4002  * To prevent concurrent sessions from updating the tuple, we have to
4003  * temporarily mark it locked, while we release the page-level lock.
4004  *
4005  * To satisfy the rule that any xid potentially appearing in a buffer
4006  * written out to disk must first be WAL-logged, we unfortunately have to WAL log this
4007  * temporary modification. We can reuse xl_heap_lock for this
4008  * purpose. If we crash/error before following through with the
4009  * actual update, xmax will be of an aborted transaction, allowing
4010  * other sessions to proceed.
4011  */
4012 
4013  /*
4014  * Compute xmax / infomask appropriate for locking the tuple. This has
4015  * to be done separately from the combo that's going to be used for
4016  * updating, because the potentially created multixact would otherwise
4017  * be wrong.
4018  */
4019  compute_new_xmax_infomask(HeapTupleHeaderGetRawXmax(oldtup.t_data),
4020  oldtup.t_data->t_infomask,
4021  oldtup.t_data->t_infomask2,
4022  xid, *lockmode, false,
4023  &xmax_lock_old_tuple, &infomask_lock_old_tuple,
4024  &infomask2_lock_old_tuple);
4025 
4026  Assert(HEAP_XMAX_IS_LOCKED_ONLY(infomask_lock_old_tuple));
4027 
4028  START_CRIT_SECTION();
4029 
4030  /* Clear obsolete visibility flags ... */
4031  oldtup.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
4032  oldtup.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
4033  HeapTupleClearHotUpdated(&oldtup);
4034  /* ... and store info about transaction updating this tuple */
4035  Assert(TransactionIdIsValid(xmax_lock_old_tuple));
4036  HeapTupleHeaderSetXmax(oldtup.t_data, xmax_lock_old_tuple);
4037  oldtup.t_data->t_infomask |= infomask_lock_old_tuple;
4038  oldtup.t_data->t_infomask2 |= infomask2_lock_old_tuple;
4039  HeapTupleHeaderSetCmax(oldtup.t_data, cid, iscombo);
4040 
4041  /* temporarily make it look not-updated, but locked */
4042  oldtup.t_data->t_ctid = oldtup.t_self;
4043 
4044  /*
4045  * Clear all-frozen bit on visibility map if needed. We could
4046  * immediately reset ALL_VISIBLE, but given that the WAL logging
4047  * overhead would be unchanged, that doesn't seem necessarily
4048  * worthwhile.
4049  */
4050  if (PageIsAllVisible(BufferGetPage(buffer)) &&
4051  visibilitymap_clear(relation, block, vmbuffer,
4052  VISIBILITYMAP_ALL_FROZEN))
4053  cleared_all_frozen = true;
4054 
4055  MarkBufferDirty(buffer);
4056 
4057  if (RelationNeedsWAL(relation))
4058  {
4059  xl_heap_lock xlrec;
4060  XLogRecPtr recptr;
4061 
4062  XLogBeginInsert();
4063  XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
4064 
4065  xlrec.offnum = ItemPointerGetOffsetNumber(&oldtup.t_self);
4066  xlrec.locking_xid = xmax_lock_old_tuple;
4067  xlrec.infobits_set = compute_infobits(oldtup.t_data->t_infomask,
4068  oldtup.t_data->t_infomask2);
4069  xlrec.flags =
4070  cleared_all_frozen ? XLH_LOCK_ALL_FROZEN_CLEARED : 0;
4071  XLogRegisterData((char *) &xlrec, SizeOfHeapLock);
4072  recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_LOCK);
4073  PageSetLSN(page, recptr);
4074  }
4075 
4076  END_CRIT_SECTION();
4077 
4078  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
4079 
4080  /*
4081  * Let the toaster do its thing, if needed.
4082  *
4083  * Note: below this point, heaptup is the data we actually intend to
4084  * store into the relation; newtup is the caller's original untoasted
4085  * data.
4086  */
4087  if (need_toast)
4088  {
4089  /* Note we always use WAL and FSM during updates */
4090  heaptup = toast_insert_or_update(relation, newtup, &oldtup, 0);
4091  newtupsize = MAXALIGN(heaptup->t_len);
4092  }
4093  else
4094  heaptup = newtup;
4095 
4096  /*
4097  * Now, do we need a new page for the tuple, or not? This is a bit
4098  * tricky since someone else could have added tuples to the page while
4099  * we weren't looking. We have to recheck the available space after
4100  * reacquiring the buffer lock. But don't bother to do that if the
4101  * former amount of free space is still not enough; it's unlikely
4102  * there's more free now than before.
4103  *
4104  * What's more, if we need to get a new page, we will need to acquire
4105  * buffer locks on both old and new pages. To avoid deadlock against
4106  * some other backend trying to get the same two locks in the other
4107  * order, we must be consistent about the order we get the locks in.
4108  * We use the rule "lock the lower-numbered page of the relation
4109  * first". To implement this, we must do RelationGetBufferForTuple
4110  * while not holding the lock on the old page, and we must rely on it
4111  * to get the locks on both pages in the correct order.
4112  */
4113  if (newtupsize > pagefree)
4114  {
4115  /* Assume there's no chance to put heaptup on same page. */
4116  newbuf = RelationGetBufferForTuple(relation, heaptup->t_len,
4117  buffer, 0, NULL,
4118  &vmbuffer_new, &vmbuffer);
4119  }
4120  else
4121  {
4122  /* Re-acquire the lock on the old tuple's page. */
4123  LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
4124  /* Re-check using the up-to-date free space */
4125  pagefree = PageGetHeapFreeSpace(page);
4126  if (newtupsize > pagefree)
4127  {
4128  /*
4129  * Rats, it doesn't fit anymore. We must now unlock and
4130  * relock to avoid deadlock. Fortunately, this path should
4131  * seldom be taken.
4132  */
4133  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
4134  newbuf = RelationGetBufferForTuple(relation, heaptup->t_len,
4135  buffer, 0, NULL,
4136  &vmbuffer_new, &vmbuffer);
4137  }
4138  else
4139  {
4140  /* OK, it fits here, so we're done. */
4141  newbuf = buffer;
4142  }
4143  }
4144  }
4145  else
4146  {
4147  /* No TOAST work needed, and it'll fit on same page */
4148  newbuf = buffer;
4149  heaptup = newtup;
4150  }
4151 
4152  /*
4153  * We're about to do the actual update -- check for conflict first, to
4154  * avoid possibly having to roll back work we've just done.
4155  *
4156  * This is safe without a recheck as long as there is no possibility of
4157  * another process scanning the pages between this check and the update
4158  * being visible to the scan (i.e., exclusive buffer content lock(s) are
4159  * continuously held from this point until the tuple update is visible).
4160  *
4161  * For the new tuple the only check needed is at the relation level, but
4162  * since both tuples are in the same relation and the check for oldtup
4163  * will include checking the relation level, there is no benefit to a
4164  * separate check for the new tuple.
4165  */
4166  CheckForSerializableConflictIn(relation, &oldtup, buffer);
4167 
4168  /*
4169  * At this point newbuf and buffer are both pinned and locked, and newbuf
4170  * has enough space for the new tuple. If they are the same buffer, only
4171  * one pin is held.
4172  */
4173 
4174  if (newbuf == buffer)
4175  {
4176  /*
4177  * Since the new tuple is going into the same page, we might be able
4178  * to do a HOT update. Check if any of the index columns have been
4179  * changed. If the page was already full, we may have skipped checking
4180  * for index columns. If so, HOT update is possible.
4181  * for index columns, and in that case we cannot do a HOT update.
4182  if (hot_attrs_checked && !bms_overlap(modified_attrs, hot_attrs))
4183  use_hot_update = true;
4184  }
4185  else
4186  {
4187  /* Set a hint that the old page could use prune/defrag */
4188  PageSetFull(page);
4189  }
4190 
4191  /*
4192  * Compute replica identity tuple before entering the critical section so
4193  * we don't PANIC upon a memory allocation failure.
4194  * ExtractReplicaIdentity() will return NULL if nothing needs to be
4195  * logged.
4196  */
4197  old_key_tuple = ExtractReplicaIdentity(relation, &oldtup,
4198  bms_overlap(modified_attrs, id_attrs),
4199  &old_key_copied);
4200 
4201  /* NO EREPORT(ERROR) from here till changes are logged */
4202  START_CRIT_SECTION();
4203 
4204  /*
4205  * If this transaction commits, the old tuple will become DEAD sooner or
4206  * later. Set flag that this page is a candidate for pruning once our xid
4207  * falls below the OldestXmin horizon. If the transaction finally aborts,
4208  * the subsequent page pruning will be a no-op and the hint will be
4209  * cleared.
4210  *
4211  * XXX Should we set hint on newbuf as well? If the transaction aborts,
4212  * there would be a prunable tuple in the newbuf; but for now we choose
4213  * not to optimize for aborts. Note that heap_xlog_update must be kept in
4214  * sync if this decision changes.
4215  */
4216  PageSetPrunable(page, xid);
4217 
4218  if (use_hot_update)
4219  {
4220  /* Mark the old tuple as HOT-updated */
4221  HeapTupleSetHotUpdated(&oldtup);
4222  /* And mark the new tuple as heap-only */
4223  HeapTupleSetHeapOnly(heaptup);
4224  /* Mark the caller's copy too, in case different from heaptup */
4225  HeapTupleSetHeapOnly(newtup);
4226  }
4227  else
4228  {
4229  /* Make sure tuples are correctly marked as not-HOT */
4230  HeapTupleClearHotUpdated(&oldtup);
4231  HeapTupleClearHeapOnly(heaptup);
4232  HeapTupleClearHeapOnly(newtup);
4233  }
4234 
4235  RelationPutHeapTuple(relation, newbuf, heaptup, false); /* insert new tuple */
4236 
4237 
4238  /* Clear obsolete visibility flags, possibly set by ourselves above... */
4239  oldtup.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
4240  oldtup.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
4241  /* ... and store info about transaction updating this tuple */
4242  Assert(TransactionIdIsValid(xmax_old_tuple));
4243  HeapTupleHeaderSetXmax(oldtup.t_data, xmax_old_tuple);
4244  oldtup.t_data->t_infomask |= infomask_old_tuple;
4245  oldtup.t_data->t_infomask2 |= infomask2_old_tuple;
4246  HeapTupleHeaderSetCmax(oldtup.t_data, cid, iscombo);
4247 
4248  /* record address of new tuple in t_ctid of old one */
4249  oldtup.t_data->t_ctid = heaptup->t_self;
4250 
4251  /* clear PD_ALL_VISIBLE flags, reset all visibilitymap bits */
4252  if (PageIsAllVisible(BufferGetPage(buffer)))
4253  {
4254  all_visible_cleared = true;
4255  PageClearAllVisible(BufferGetPage(buffer));
4256  visibilitymap_clear(relation, BufferGetBlockNumber(buffer),
4257  vmbuffer, VISIBILITYMAP_VALID_BITS);
4258  }
4259  if (newbuf != buffer && PageIsAllVisible(BufferGetPage(newbuf)))
4260  {
4261  all_visible_cleared_new = true;
4262  PageClearAllVisible(BufferGetPage(newbuf));
4263  visibilitymap_clear(relation, BufferGetBlockNumber(newbuf),
4264  vmbuffer_new, VISIBILITYMAP_VALID_BITS);
4265  }
4266 
4267  if (newbuf != buffer)
4268  MarkBufferDirty(newbuf);
4269  MarkBufferDirty(buffer);
4270 
4271  /* XLOG stuff */
4272  if (RelationNeedsWAL(relation))
4273  {
4274  XLogRecPtr recptr;
4275 
4276  /*
4277  * For logical decoding we need combocids to properly decode the
4278  * catalog.
4279  */
4280  if (RelationIsAccessibleInLogicalDecoding(relation))
4281  {
4282  log_heap_new_cid(relation, &oldtup);
4283  log_heap_new_cid(relation, heaptup);
4284  }
4285 
4286  recptr = log_heap_update(relation, buffer,
4287  newbuf, &oldtup, heaptup,
4288  old_key_tuple,
4289  all_visible_cleared,
4290  all_visible_cleared_new);
4291  if (newbuf != buffer)
4292  {
4293  PageSetLSN(BufferGetPage(newbuf), recptr);
4294  }
4295  PageSetLSN(BufferGetPage(buffer), recptr);
4296  }
4297 
4298  END_CRIT_SECTION();
4299 
4300  if (newbuf != buffer)
4301  LockBuffer(newbuf, BUFFER_LOCK_UNLOCK);
4302  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
4303 
4304  /*
4305  * Mark old tuple for invalidation from system caches at next command
4306  * boundary, and mark the new tuple for invalidation in case we abort. We
4307  * have to do this before releasing the buffer because oldtup is in the
4308  * buffer. (heaptup is all in local memory, but it's necessary to process
4309  * both tuple versions in one call to inval.c so we can avoid redundant
4310  * sinval messages.)
4311  */
4312  CacheInvalidateHeapTuple(relation, &oldtup, heaptup);
4313 
4314  /* Now we can release the buffer(s) */
4315  if (newbuf != buffer)
4316  ReleaseBuffer(newbuf);
4317  ReleaseBuffer(buffer);
4318  if (BufferIsValid(vmbuffer_new))
4319  ReleaseBuffer(vmbuffer_new);
4320  if (BufferIsValid(vmbuffer))
4321  ReleaseBuffer(vmbuffer);
4322 
4323  /*
4324  * Release the lmgr tuple lock, if we had it.
4325  */
4326  if (have_tuple_lock)
4327  UnlockTupleTuplock(relation, &(oldtup.t_self), *lockmode);
4328 
4329  pgstat_count_heap_update(relation, use_hot_update);
4330 
4331  /*
4332  * If heaptup is a private copy, release it. Don't forget to copy t_self
4333  * back to the caller's image, too.
4334  */
4335  if (heaptup != newtup)
4336  {
4337  newtup->t_self = heaptup->t_self;
4338  heap_freetuple(heaptup);
4339  }
4340 
4341  if (old_key_tuple != NULL && old_key_copied)
4342  heap_freetuple(old_key_tuple);
4343 
4344  bms_free(hot_attrs);
4345  bms_free(key_attrs);
4346  bms_free(id_attrs);
4347  bms_free(modified_attrs);
4348  bms_free(interesting_attrs);
4349 
4350  return HeapTupleMayBeUpdated;
4351 }
4352 
4353 /*
4354  * Check if the specified attribute's value is same in both given tuples.
4355  * Subroutine for HeapDetermineModifiedColumns.
4356  */
4357 static bool
4358 heap_tuple_attr_equals(TupleDesc tupdesc, int attrnum,
4359  HeapTuple tup1, HeapTuple tup2)
4360 {
4361  Datum value1,
4362  value2;
4363  bool isnull1,
4364  isnull2;
4365  Form_pg_attribute att;
4366 
4367  /*
4368  * If it's a whole-tuple reference, say "not equal". It's not really
4369  * worth supporting this case, since it could only succeed after a no-op
4370  * update, which is hardly a case worth optimizing for.
4371  */
4372  if (attrnum == 0)
4373  return false;
4374 
4375  /*
4376  * Likewise, automatically say "not equal" for any system attribute other
4377  * than OID and tableOID; we cannot expect these to be consistent in a HOT
4378  * chain, or even to be set correctly yet in the new tuple.
4379  */
4380  if (attrnum < 0)
4381  {
4382  if (attrnum != ObjectIdAttributeNumber &&
4383  attrnum != TableOidAttributeNumber)
4384  return false;
4385  }
4386 
4387  /*
4388  * Extract the corresponding values. XXX this is pretty inefficient if
4389  * there are many indexed columns. Should HeapDetermineModifiedColumns do
4390  * a single heap_deform_tuple call on each tuple, instead? But that
4391  * doesn't work for system columns ...
4392  */
4393  value1 = heap_getattr(tup1, attrnum, tupdesc, &isnull1);
4394  value2 = heap_getattr(tup2, attrnum, tupdesc, &isnull2);
4395 
4396  /*
4397  * If one value is NULL and the other is not, then they are certainly not
4398  * equal
4399  */
4400  if (isnull1 != isnull2)
4401  return false;
4402 
4403  /*
4404  * If both are NULL, they can be considered equal.
4405  */
4406  if (isnull1)
4407  return true;
4408 
4409  /*
4410  * We do simple binary comparison of the two datums. This may be overly
4411  * strict because there can be multiple binary representations for the
4412  * same logical value. But we should be OK as long as there are no false
4413  * positives. Using a type-specific equality operator is messy because
4414  * there could be multiple notions of equality in different operator
4415  * classes; furthermore, we cannot safely invoke user-defined functions
4416  * while holding exclusive buffer lock.
4417  */
4418  if (attrnum <= 0)
4419  {
4420  /* The only allowed system columns are OIDs, so do this */
4421  return (DatumGetObjectId(value1) == DatumGetObjectId(value2));
4422  }
4423  else
4424  {
4425  Assert(attrnum <= tupdesc->natts);
4426  att = TupleDescAttr(tupdesc, attrnum - 1);
4427  return datumIsEqual(value1, value2, att->attbyval, att->attlen);
4428  }
4428  }
4429 }
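The binary comparison used above can be exercised directly; the following is a
hedged, self-contained sketch (hypothetical code, not part of heapam.c) that
assumes utils/builtins.h for CStringGetTextDatum. Two freshly built text datums
with the same bytes compare equal, while a toasted or differently compressed
copy of the same logical value would not, which is exactly the conservative
false negative the comment above accepts.

#include "postgres.h"
#include "utils/builtins.h"
#include "utils/datum.h"

/* Hypothetical helper, for illustration only. */
static bool
example_binary_equal(const char *a, const char *b)
{
    Datum       d1 = CStringGetTextDatum(a);
    Datum       d2 = CStringGetTextDatum(b);

    /* text is pass-by-reference (typByVal = false), varlena length (-1) */
    return datumIsEqual(d1, d2, false, -1);
}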
4430 
4431 /*
4432  * Check which columns are being updated.
4433  *
4434  * Given an updated tuple, determine (and return into the output bitmapset),
4435  * from those listed as interesting, the set of columns that changed.
4436  *
4437  * The input bitmapset is destructively modified; that is OK since this is
4438  * invoked at most once in heap_update.
4439  */
4440 static Bitmapset *
4441 HeapDetermineModifiedColumns(Relation relation, Bitmapset *interesting_cols,
4442  HeapTuple oldtup, HeapTuple newtup)
4443 {
4444  int attnum;
4445  Bitmapset *modified = NULL;
4446 
4447  while ((attnum = bms_first_member(interesting_cols)) >= 0)
4448  {
4449  attnum += FirstLowInvalidHeapAttributeNumber;
4450 
4451  if (!heap_tuple_attr_equals(RelationGetDescr(relation),
4452  attnum, oldtup, newtup))
4453  modified = bms_add_member(modified,
4454  attnum - FirstLowInvalidHeapAttributeNumber);
4455  }
4456 
4457  return modified;
4458 }
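A hedged sketch of how a caller reads the resulting bitmapset (hypothetical
code, not part of the original file): bitmapset members must be non-negative,
while system attributes have negative attribute numbers, so members are stored
offset by -FirstLowInvalidHeapAttributeNumber, mirroring the bms_add_member
call in the loop above.

#include "postgres.h"
#include "access/attnum.h"
#include "access/sysattr.h"
#include "nodes/bitmapset.h"

/* Hypothetical helper, for illustration only. */
static bool
example_column_was_modified(Bitmapset *modified_attrs, AttrNumber attnum)
{
    /* undo the same offset that HeapDetermineModifiedColumns applied */
    return bms_is_member(attnum - FirstLowInvalidHeapAttributeNumber,
                         modified_attrs);
}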
4459 
4460 /*
4461  * simple_heap_update - replace a tuple
4462  *
4463  * This routine may be used to update a tuple when concurrent updates of
4464  * the target tuple are not expected (for example, because we have a lock
4465  * on the relation associated with the tuple). Any failure is reported
4466  * via ereport().
4467  */
4468 void
4470 {
4471  HTSU_Result result;
4472  HeapUpdateFailureData hufd;
4473  LockTupleMode lockmode;
4474 
4475  result = heap_update(relation, otid, tup,
4476  GetCurrentCommandId(true), InvalidSnapshot,
4477  true /* wait for commit */ ,
4478  &hufd, &lockmode);
4479  switch (result)
4480  {
4481  case HeapTupleSelfUpdated:
4482  /* Tuple was already updated in current command? */
4483  elog(ERROR, "tuple already updated by self");
4484  break;
4485 
4486  case HeapTupleMayBeUpdated:
4487  /* done successfully */
4488  break;
4489 
4490  case HeapTupleUpdated:
4491  elog(ERROR, "tuple concurrently updated");
4492  break;
4493 
4494  default:
4495  elog(ERROR, "unrecognized heap_update status: %u", result);
4496  break;
4497  }
4498 }
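simple_heap_update() above simply errors out on any concurrent update. As a
hedged illustration (hypothetical code, not part of heapam.c), a caller that
wants to inspect the failure data itself might look like the sketch below;
real executor callers additionally re-check the row with EvalPlanQual before
deciding what to do, which is omitted here.

/* Hypothetical caller, for illustration only. */
static void
example_update_with_conflict_info(Relation rel, ItemPointer otid, HeapTuple tup)
{
    HTSU_Result result;
    HeapUpdateFailureData hufd;
    LockTupleMode lockmode;

    result = heap_update(rel, otid, tup,
                         GetCurrentCommandId(true), InvalidSnapshot,
                         true /* wait */ , &hufd, &lockmode);

    if (result == HeapTupleUpdated)
    {
        /*
         * hufd.ctid points at the newest tuple version and hufd.xmax at the
         * updating transaction; a real caller would follow the ctid chain
         * (for example via EvalPlanQual) or report a serialization failure.
         */
        elog(ERROR, "tuple concurrently updated");
    }
    else if (result != HeapTupleMayBeUpdated)
        elog(ERROR, "unexpected heap_update result: %u", result);
}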
4499 
4500 
4501 /*
4502  * Return the MultiXactStatus corresponding to the given tuple lock mode.
4503  */
4504 static MultiXactStatus
4505 get_mxact_status_for_lock(LockTupleMode mode, bool is_update)
4506 {
4507  int retval;
4508 
4509  if (is_update)
4510  retval = tupleLockExtraInfo[mode].updstatus;
4511  else
4512  retval = tupleLockExtraInfo[mode].lockstatus;
4513 
4514  if (retval == -1)
4515  elog(ERROR, "invalid lock tuple mode %d/%s", mode,
4516  is_update ? "true" : "false");
4517 
4518  return (MultiXactStatus) retval;
4519 }
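The lookup above reads the static tupleLockExtraInfo table defined earlier in
this file (outside this excerpt). Hedged as a summary rather than a verbatim
copy of that table, the mapping behaves roughly like the sketch below; note
that the real function ereports for an update request in the two share modes,
which the sketch does not reproduce.

/* Hedged summary of tupleLockExtraInfo, for illustration only. */
static MultiXactStatus
example_mxact_status(LockTupleMode mode, bool is_update)
{
    switch (mode)
    {
        case LockTupleKeyShare:
            return MultiXactStatusForKeyShare;  /* lock only; no update form */
        case LockTupleShare:
            return MultiXactStatusForShare;     /* lock only; no update form */
        case LockTupleNoKeyExclusive:
            return is_update ? MultiXactStatusNoKeyUpdate
                : MultiXactStatusForNoKeyUpdate;
        case LockTupleExclusive:
            return is_update ? MultiXactStatusUpdate
                : MultiXactStatusForUpdate;
    }
    return MultiXactStatusForKeyShare;  /* keep compiler quiet */
}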
4520 
4521 /*
4522  * heap_lock_tuple - lock a tuple in shared or exclusive mode
4523  *
4524  * Note that this acquires a buffer pin, which the caller must release.
4525  *
4526  * Input parameters:
4527  * relation: relation containing tuple (caller must hold suitable lock)
4528  * tuple->t_self: TID of tuple to lock (rest of struct need not be valid)
4529  * cid: current command ID (used for visibility test, and stored into
4530  * tuple's cmax if lock is successful)
4531  * mode: indicates if shared or exclusive tuple lock is desired
4532  * wait_policy: what to do if tuple lock is not available
4533  * follow_updates: if true, follow the update chain to also lock descendant
4534  * tuples.
4535  *
4536  * Output parameters:
4537  * *tuple: all fields filled in
4538  * *buffer: set to buffer holding tuple (pinned but not locked at exit)
4539  * *hufd: filled in failure cases (see below)
4540  *
4541  * Function result may be:
4542  * HeapTupleMayBeUpdated: lock was successfully acquired
4543  * HeapTupleInvisible: lock failed because tuple was never visible to us
4544  * HeapTupleSelfUpdated: lock failed because tuple updated by self
4545  * HeapTupleUpdated: lock failed because tuple updated by other xact
4546  * HeapTupleWouldBlock: lock couldn't be acquired and wait_policy is skip
4547  *
4548  * In the failure cases other than HeapTupleInvisible, the routine fills
4549  * *hufd with the tuple's t_ctid, t_xmax (resolving a possible MultiXact,
4550  * if necessary), and t_cmax (the last only for HeapTupleSelfUpdated,
4551  * since we cannot obtain cmax from a combocid generated by another
4552  * transaction).
4553  * See comments for struct HeapUpdateFailureData for additional info.
4554  *
4555  * See README.tuplock for a thorough explanation of this mechanism.
4556  */
4557 HTSU_Result
4558 heap_lock_tuple(Relation relation, HeapTuple tuple,
4559  CommandId cid, LockTupleMode mode, LockWaitPolicy wait_policy,
4560  bool follow_updates,
4561  Buffer *buffer, HeapUpdateFailureData *hufd)
4562 {
4563  HTSU_Result result;
4564  ItemPointer tid = &(tuple->t_self);
4565  ItemId lp;
4566  Page page;
4567  Buffer vmbuffer = InvalidBuffer;
4568  BlockNumber block;
4569  TransactionId xid,
4570  xmax;
4571  uint16 old_infomask,
4572  new_infomask,
4573  new_infomask2;
4574  bool first_time = true;
4575  bool have_tuple_lock = false;
4576  bool cleared_all_frozen = false;
4577 
4578  *buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid));
4579  block = ItemPointerGetBlockNumber(tid);
4580 
4581  /*
4582  * Before locking the buffer, pin the visibility map page if it appears to
4583  * be necessary. Since we haven't got the lock yet, someone else might be
4584  * in the middle of changing this, so we'll need to recheck after we have
4585  * the lock.
4586  */
4587  if (PageIsAllVisible(BufferGetPage(*buffer)))
4588  visibilitymap_pin(relation, block, &vmbuffer);
4589 
4590  LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4591 
4592  page = BufferGetPage(*buffer);
4593  lp = PageGetItemId(page, ItemPointerGetOffsetNumber(tid));
4594  Assert(ItemIdIsNormal(lp));
4595 
4596  tuple->t_data = (HeapTupleHeader) PageGetItem(page, lp);
4597  tuple->t_len = ItemIdGetLength(lp);
4598  tuple->t_tableOid = RelationGetRelid(relation);
4599 
4600 l3:
4601  result = HeapTupleSatisfiesUpdate(tuple, cid, *buffer);
4602 
4603  if (result == HeapTupleInvisible)
4604  {
4605  /*
4606  * This is possible, but only when locking a tuple for ON CONFLICT
4607  * UPDATE. We return this value here rather than throwing an error in
4608  * order to give that case the opportunity to throw a more specific
4609  * error.
4610  */
4611  result = HeapTupleInvisible;
4612  goto out_locked;
4613  }
4614  else if (result == HeapTupleBeingUpdated || result == HeapTupleUpdated)
4615  {
4616  TransactionId xwait;
4617  uint16 infomask;
4618  uint16 infomask2;
4619  bool require_sleep;
4620  ItemPointerData t_ctid;
4621 
4622  /* must copy state data before unlocking buffer */
4623  xwait = HeapTupleHeaderGetRawXmax(tuple->t_data);
4624  infomask = tuple->t_data->t_infomask;
4625  infomask2 = tuple->t_data->t_infomask2;
4626  ItemPointerCopy(&tuple->t_data->t_ctid, &t_ctid);
4627 
4628  LockBuffer(*buffer, BUFFER_LOCK_UNLOCK);
4629 
4630  /*
4631  * If any subtransaction of the current top transaction already holds
4632  * a lock as strong as or stronger than what we're requesting, we
4633  * effectively hold the desired lock already. We *must* succeed
4634  * without trying to take the tuple lock, else we will deadlock
4635  * against anyone wanting to acquire a stronger lock.
4636  *
4637  * Note we only do this the first time we loop on the HTSU result;
4638  * there is no point in testing in subsequent passes, because
4639  * evidently our own transaction cannot have acquired a new lock after
4640  * the first time we checked.
4641  */
4642  if (first_time)
4643  {
4644  first_time = false;
4645 
4646  if (infomask & HEAP_XMAX_IS_MULTI)
4647  {
4648  int i;
4649  int nmembers;
4650  MultiXactMember *members;
4651 
4652  /*
4653  * We don't need to allow old multixacts here; if that had
4654  * been the case, HeapTupleSatisfiesUpdate would have returned
4655  * MayBeUpdated and we wouldn't be here.
4656  */
4657  nmembers =
4658  GetMultiXactIdMembers(xwait, &members, false,
4659  HEAP_XMAX_IS_LOCKED_ONLY(infomask));
4660 
4661  for (i = 0; i < nmembers; i++)
4662  {
4663  /* only consider members of our own transaction */
4664  if (!TransactionIdIsCurrentTransactionId(members[i].xid))
4665  continue;
4666 
4667  if (TUPLOCK_from_mxstatus(members[i].status) >= mode)
4668  {
4669  pfree(members);
4670  result = HeapTupleMayBeUpdated;
4671  goto out_unlocked;
4672  }
4673  }
4674 
4675  if (members)
4676  pfree(members);
4677  }
4678  else if (TransactionIdIsCurrentTransactionId(xwait))
4679  {
4680  switch (mode)
4681  {
4682  case LockTupleKeyShare:
4683  Assert(HEAP_XMAX_IS_KEYSHR_LOCKED(infomask) ||
4684  HEAP_XMAX_IS_SHR_LOCKED(infomask) ||
4685  HEAP_XMAX_IS_EXCL_LOCKED(infomask));
4686  result = HeapTupleMayBeUpdated;
4687  goto out_unlocked;
4688  case LockTupleShare:
4689  if (HEAP_XMAX_IS_SHR_LOCKED(infomask) ||
4690  HEAP_XMAX_IS_EXCL_LOCKED(infomask))
4691  {
4692  result = HeapTupleMayBeUpdated;
4693  goto out_unlocked;
4694  }
4695  break;
4696  case LockTupleNoKeyExclusive:
4697  if (HEAP_XMAX_IS_EXCL_LOCKED(infomask))
4698  {
4699  result = HeapTupleMayBeUpdated;
4700  goto out_unlocked;
4701  }
4702  break;
4703  case LockTupleExclusive:
4704  if (HEAP_XMAX_IS_EXCL_LOCKED(infomask) &&
4705  infomask2 & HEAP_KEYS_UPDATED)
4706  {
4707  result = HeapTupleMayBeUpdated;
4708  goto out_unlocked;
4709  }
4710  break;
4711  }
4712  }
4713  }
4714 
4715  /*
4716  * Initially assume that we will have to wait for the locking
4717  * transaction(s) to finish. We check various cases below in which
4718  * this can be turned off.
4719  */
4720  require_sleep = true;
4721  if (mode == LockTupleKeyShare)
4722  {
4723  /*
4724  * If we're requesting KeyShare, and there's no update present, we
4725  * don't need to wait. Even if there is an update, we can still
4726  * continue if the key hasn't been modified.
4727  *
4728  * However, if there are updates, we need to walk the update chain
4729  * to mark future versions of the row as locked, too. That way,
4730  * if somebody deletes that future version, we're protected
4731  * against the key going away. This locking of future versions
4732  * could block momentarily, if a concurrent transaction is
4733  * deleting a key; or it could return a value to the effect that
4734  * the transaction deleting the key has already committed. So we
4735  * do this before re-locking the buffer; otherwise this would be
4736  * prone to deadlocks.
4737  *
4738  * Note that the TID we're locking was grabbed before we unlocked
4739  * the buffer. For it to change while we're not looking, the
4740  * other properties we're testing for below after re-locking the
4741  * buffer would also change, in which case we would restart this
4742  * loop above.
4743  */
4744  if (!(infomask2 & HEAP_KEYS_UPDATED))
4745  {
4746  bool updated;
4747 
4748  updated = !HEAP_XMAX_IS_LOCKED_ONLY(infomask);
4749 
4750  /*
4751  * If there are updates, follow the update chain; bail out if
4752  * that cannot be done.
4753  */
4754  if (follow_updates && updated)
4755  {
4756  HTSU_Result res;
4757 
4758  res = heap_lock_updated_tuple(relation, tuple, &t_ctid,
4759  GetCurrentTransactionId(),
4760  mode);
4761  if (res != HeapTupleMayBeUpdated)
4762  {
4763  result = res;
4764  /* recovery code expects to have buffer lock held */
4765  LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4766  goto failed;
4767  }
4768  }
4769 
4770  LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4771 
4772  /*
4773  * Make sure it's still an appropriate lock, else start over.
4774  * Also, if it wasn't updated before we released the lock, but
4775  * is updated now, we start over too; the reason is that we
4776  * now need to follow the update chain to lock the new
4777  * versions.
4778  */
4779  if (!HeapTupleHeaderIsOnlyLocked(tuple->t_data) &&
4780  ((tuple->t_data->t_infomask2 & HEAP_KEYS_UPDATED) ||
4781  !updated))
4782  goto l3;
4783 
4784  /* Things look okay, so we can skip sleeping */
4785  require_sleep = false;
4786 
4787  /*
4788  * Note we allow Xmax to change here; other updaters/lockers
4789  * could have modified it before we grabbed the buffer lock.
4790  * However, this is not a problem, because with the recheck we
4791  * just did we ensure that they still don't conflict with the
4792  * lock we want.
4793  */
4794  }
4795  }
4796  else if (mode == LockTupleShare)
4797  {
4798  /*
4799  * If we're requesting Share, we can similarly avoid sleeping if
4800  * there's no update and no exclusive lock present.
4801  */
4802  if (HEAP_XMAX_IS_LOCKED_ONLY(infomask) &&
4803  !HEAP_XMAX_IS_EXCL_LOCKED(infomask))
4804  {
4805  LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4806 
4807  /*
4808  * Make sure it's still an appropriate lock, else start over.
4809  * See above about allowing xmax to change.
4810  */
4811  if (!HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_data->t_infomask) ||
4812  HEAP_XMAX_IS_EXCL_LOCKED(tuple->t_data->t_infomask))
4813  goto l3;
4814  require_sleep = false;
4815  }
4816  }
4817  else if (mode == LockTupleNoKeyExclusive)
4818  {
4819  /*
4820  * If we're requesting NoKeyExclusive, we might also be able to
4821  * avoid sleeping; just ensure that there is no conflicting lock
4822  * already acquired.
4823  */
4824  if (infomask & HEAP_XMAX_IS_MULTI)
4825  {
4826  if (!DoesMultiXactIdConflict((MultiXactId) xwait, infomask,
4827  mode))
4828  {
4829  /*
4830  * No conflict, but if the xmax changed under us in the
4831  * meantime, start over.
4832  */
4833  LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4834  if (xmax_infomask_changed(tuple->t_data->t_infomask, infomask) ||
4835  !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple->t_data),
4836  xwait))
4837  goto l3;
4838 
4839  /* otherwise, we're good */
4840  require_sleep = false;
4841  }
4842  }
4843  else if (HEAP_XMAX_IS_KEYSHR_LOCKED(infomask))
4844  {
4845  LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4846 
4847  /* if the xmax changed in the meantime, start over */
4848  if (xmax_infomask_changed(tuple->t_data->t_infomask, infomask) ||
4849  !TransactionIdEquals(
4850  HeapTupleHeaderGetRawXmax(tuple->t_data),
4851  xwait))
4852  goto l3;
4853  /* otherwise, we're good */
4854  require_sleep = false;
4855  }
4856  }
4857 
4858  /*
4859  * As a check independent from those above, we can also avoid sleeping
4860  * if the current transaction is the sole locker of the tuple. Note
4861  * that the strength of the lock already held is irrelevant; this is
4862  * not about recording the lock in Xmax (which will be done regardless
4863  * of this optimization, below). Also, note that the cases where we
4864  * hold a lock stronger than we are requesting are already handled
4865  * above by not doing anything.
4866  *
4867  * Note we only deal with the non-multixact case here; MultiXactIdWait
4868  * is well equipped to deal with this situation on its own.
4869  */
4870  if (require_sleep && !(infomask & HEAP_XMAX_IS_MULTI) &&
4871  TransactionIdIsCurrentTransactionId(xwait))
4872  {
4873  /* ... but if the xmax changed in the meantime, start over */
4874  LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4875  if (xmax_infomask_changed(tuple->t_data->t_infomask, infomask) ||
4876  !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple->t_data),
4877  xwait))
4878  goto l3;
4880  require_sleep = false;
4881  }
4882 
4883  /*
4884  * Time to sleep on the other transaction/multixact, if necessary.
4885  *
4886  * If the other transaction is an update that's already committed,
4887  * then sleeping cannot possibly do any good: if we're required to
4888  * sleep, get out to raise an error instead.
4889  *
4890  * By here, we either have already acquired the buffer exclusive lock,
4891  * or we must wait for the locking transaction or multixact; so below
4892  * we ensure that we grab buffer lock after the sleep.
4893  */
4894  if (require_sleep && result == HeapTupleUpdated)
4895  {
4896  LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4897  goto failed;
4898  }
4899  else if (require_sleep)
4900  {
4901  /*
4902  * Acquire tuple lock to establish our priority for the tuple, or
4903  * die trying. LockTuple will release us when we are next-in-line
4904  * for the tuple. We must do this even if we are share-locking.
4905  *
4906  * If we are forced to "start over" below, we keep the tuple lock;
4907  * this arranges that we stay at the head of the line while
4908  * rechecking tuple state.
4909  */
4910  if (!heap_acquire_tuplock(relation, tid, mode, wait_policy,
4911  &have_tuple_lock))
4912  {
4913  /*
4914  * This can only happen if wait_policy is Skip and the lock
4915  * couldn't be obtained.
4916  */
4917  result = HeapTupleWouldBlock;
4918  /* recovery code expects to have buffer lock held */
4919  LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4920  goto failed;
4921  }
4922 
4923  if (infomask & HEAP_XMAX_IS_MULTI)
4924  {
4925  MultiXactStatus status = get_mxact_status_for_lock(mode, false);
4926 
4927  /* We only ever lock tuples, never update them */
4928  if (status >= MultiXactStatusNoKeyUpdate)
4929  elog(ERROR, "invalid lock mode in heap_lock_tuple");
4930 
4931  /* wait for multixact to end, or die trying */
4932  switch (wait_policy)
4933  {
4934  case LockWaitBlock:
4935  MultiXactIdWait((MultiXactId) xwait, status, infomask,
4936  relation, &tuple->t_self, XLTW_Lock, NULL);
4937  break;
4938  case LockWaitSkip:
4939  if (!ConditionalMultiXactIdWait((MultiXactId) xwait,
4940  status, infomask, relation,
4941  NULL))
4942  {
4943  result = HeapTupleWouldBlock;
4944  /* recovery code expects to have buffer lock held */
4945  LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4946  goto failed;
4947  }
4948  break;
4949  case LockWaitError:
4950  if (!ConditionalMultiXactIdWait((MultiXactId) xwait,
4951  status, infomask, relation,
4952  NULL))
4953  ereport(ERROR,
4954  (errcode(ERRCODE_LOCK_NOT_AVAILABLE),
4955  errmsg("could not obtain lock on row in relation \"%s\"",
4956  RelationGetRelationName(relation))));
4957 
4958  break;
4959  }
4960 
4961  /*
4962  * Of course, the multixact might not be done here: if we're
4963  * requesting a light lock mode, other transactions with light
4964  * locks could still be alive, as well as locks owned by our
4965  * own xact or other subxacts of this backend. We need to
4966  * preserve the surviving MultiXact members. Note that it
4967  * isn't absolutely necessary in the latter case, but doing so
4968  * is simpler.
4969  */
4970  }
4971  else
4972  {
4973  /* wait for regular transaction to end, or die trying */
4974  switch (wait_policy)
4975  {
4976  case LockWaitBlock:
4977  XactLockTableWait(xwait, relation, &tuple->t_self,
4978  XLTW_Lock);
4979  break;
4980  case LockWaitSkip:
4981  if (!ConditionalXactLockTableWait(xwait))
4982  {
4983  result = HeapTupleWouldBlock;
4984  /* recovery code expects to have buffer lock held */
4985  LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4986  goto failed;
4987  }
4988  break;
4989  case LockWaitError:
4990  if (!ConditionalXactLockTableWait(xwait))
4991  ereport(ERROR,
4992  (errcode(ERRCODE_LOCK_NOT_AVAILABLE),
4993  errmsg("could not obtain lock on row in relation \"%s\"",
4994  RelationGetRelationName(relation))));
4995  break;
4996  }
4997  }
4998 
4999  /* if there are updates, follow the update chain */
5000  if (follow_updates && !HEAP_XMAX_IS_LOCKED_ONLY(infomask))
5001  {
5002  HTSU_Result res;
5003 
5004  res = heap_lock_updated_tuple(relation, tuple, &t_ctid,
5005  GetCurrentTransactionId(),
5006  mode);
5007  if (res != HeapTupleMayBeUpdated)
5008  {
5009  result = res;
5010  /* recovery code expects to have buffer lock held */
5011  LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
5012  goto failed;
5013  }
5014  }
5015 
5016  LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
5017 
5018  /*
5019  * xwait is done, but if xwait had just locked the tuple then some
5020  * other xact could update this tuple before we get to this point.
5021  * Check for xmax change, and start over if so.
5022  */
5023  if (xmax_infomask_changed(tuple->t_data->t_infomask, infomask) ||
5024  !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple->t_data),
5025  xwait))
5026  goto l3;
5027 
5028  if (!(infomask & HEAP_XMAX_IS_MULTI))
5029  {
5030  /*
5031  * Otherwise check if it committed or aborted. Note we cannot
5032  * be here if the tuple was only locked by somebody who didn't
5033  * conflict with us; that would have been handled above. So
5034  * that transaction must necessarily be gone by now. But
5035  * don't check for this in the multixact case, because some
5036  * locker transactions might still be running.
5037  */
5038  UpdateXmaxHintBits(tuple->t_data, *buffer, xwait);
5039  }
5040  }
5041 
5042  /* By here, we're certain that we hold buffer exclusive lock again */
5043 
5044  /*
5045  * We may lock if previous xmax aborted, or if it committed but only
5046  * locked the tuple without updating it; or if we didn't have to wait
5047  * at all for whatever reason.
5048  */
5049  if (!require_sleep ||
5050  (tuple->t_data->t_infomask & HEAP_XMAX_INVALID) ||
5051  HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_data->t_infomask) ||
5052  HeapTupleHeaderIsOnlyLocked(tuple->t_data))
5053  result = HeapTupleMayBeUpdated;
5054  else
5055  result = HeapTupleUpdated;
5056  }
5057 
5058 failed:
5059  if (result != HeapTupleMayBeUpdated)
5060  {
5061  Assert(result == HeapTupleSelfUpdated || result == HeapTupleUpdated ||
5062  result == HeapTupleWouldBlock);
5063  Assert(!(tuple->t_data->t_infomask & HEAP_XMAX_INVALID));
5064  hufd->ctid = tuple->t_data->t_ctid;
5065  hufd->xmax = HeapTupleHeaderGetUpdateXid(tuple->t_data);
5066  if (result == HeapTupleSelfUpdated)
5067  hufd->cmax = HeapTupleHeaderGetCmax(tuple->t_data);
5068  else
5069  hufd->cmax = InvalidCommandId;
5070  goto out_locked;
5071  }
5072 
5073  /*
5074  * If we didn't pin the visibility map page and the page has become all
5075  * visible while we were busy locking the buffer, or during some
5076  * subsequent window during which we had it unlocked, we'll have to unlock
5077  * and re-lock, to avoid holding the buffer lock across I/O. That's a bit
5078  * unfortunate, especially since we'll now have to recheck whether the
5079  * tuple has been locked or updated under us, but hopefully it won't
5080  * happen very often.
5081  */
5082  if (vmbuffer == InvalidBuffer && PageIsAllVisible(page))
5083  {
5084  LockBuffer(*buffer, BUFFER_LOCK_UNLOCK);
5085  visibilitymap_pin(relation, block, &vmbuffer);
5086  LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
5087  goto l3;
5088  }
5089 
5090  xmax = HeapTupleHeaderGetRawXmax(tuple->t_data);
5091  old_infomask = tuple->t_data->t_infomask;
5092 
5093  /*
5094  * If this is the first possibly-multixact-able operation in the current
5095  * transaction, set my per-backend OldestMemberMXactId setting. We can be
5096  * certain that the transaction will never become a member of any older
5097  * MultiXactIds than that. (We have to do this even if we end up just
5098  * using our own TransactionId below, since some other backend could
5099  * incorporate our XID into a MultiXact immediately afterwards.)
5100  */
5101  MultiXactIdSetOldestMember();
5102 
5103  /*
5104  * Compute the new xmax and infomask to store into the tuple. Note we do
5105  * not modify the tuple just yet, because that would leave it in the wrong
5106  * state if multixact.c elogs.
5107  */
5108  compute_new_xmax_infomask(xmax, old_infomask, tuple->t_data->t_infomask2,
5109  GetCurrentTransactionId(), mode, false,
5110  &xid, &new_infomask, &new_infomask2);
5111 
5112  START_CRIT_SECTION();
5113 
5114  /*
5115  * Store transaction information of xact locking the tuple.
5116  *
5117  * Note: Cmax is meaningless in this context, so don't set it; this avoids
5118  * possibly generating a useless combo CID. Moreover, if we're locking a
5119  * previously updated tuple, it's important to preserve the Cmax.
5120  *
5121  * Also reset the HOT UPDATE bit, but only if there's no update; otherwise
5122  * we would break the HOT chain.
5123  */
5124  tuple->t_data->t_infomask &= ~HEAP_XMAX_BITS;
5125  tuple->t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
5126  tuple->t_data->t_infomask |= new_infomask;
5127  tuple->t_data->t_infomask2 |= new_infomask2;
5128  if (HEAP_XMAX_IS_LOCKED_ONLY(new_infomask))
5129  HeapTupleHeaderClearHotUpdated(tuple->t_data);
5130  HeapTupleHeaderSetXmax(tuple->t_data, xid);
5131 
5132  /*
5133  * Make sure there is no forward chain link in t_ctid. Note that in the
5134  * cases where the tuple has been updated, we must not overwrite t_ctid,
5135  * because it was set by the updater. Moreover, if the tuple has been
5136  * updated, we need to follow the update chain to lock the new versions of
5137  * the tuple as well.
5138  */
5139  if (HEAP_XMAX_IS_LOCKED_ONLY(new_infomask))
5140  tuple->t_data->t_ctid = *tid;
5141 
5142  /* Clear only the all-frozen bit on visibility map if needed */
5143  if (PageIsAllVisible(page) &&
5144  visibilitymap_clear(relation, block, vmbuffer,
5145  VISIBILITYMAP_ALL_FROZEN))
5146  cleared_all_frozen = true;
5147 
5148 
5149  MarkBufferDirty(*buffer);
5150 
5151  /*
5152  * XLOG stuff. You might think that we don't need an XLOG record because
5153  * there is no state change worth restoring after a crash. You would be
5154  * wrong however: we have just written either a TransactionId or a
5155  * MultiXactId that may never have been seen on disk before, and we need
5156  * to make sure that there are XLOG entries covering those ID numbers.
5157  * Else the same IDs might be re-used after a crash, which would be
5158  * disastrous if this page made it to disk before the crash. Essentially
5159  * we have to enforce the WAL log-before-data rule even in this case.
5160  * (Also, in a PITR log-shipping or 2PC environment, we have to have XLOG
5161  * entries for everything anyway.)
5162  */
5163  if (RelationNeedsWAL(relation))
5164  {
5165  xl_heap_lock xlrec;
5166  XLogRecPtr recptr;
5167 
5168  XLogBeginInsert();
5169  XLogRegisterBuffer(0, *buffer, REGBUF_STANDARD);
5170 
5171  xlrec.offnum = ItemPointerGetOffsetNumber(&tuple->t_self);
5172  xlrec.locking_xid = xid;
5173  xlrec.infobits_set = compute_infobits(new_infomask,
5174  tuple->t_data->t_infomask2);
5175  xlrec.flags = cleared_all_frozen ? XLH_LOCK_ALL_FROZEN_CLEARED : 0;
5176  XLogRegisterData((char *) &xlrec, SizeOfHeapLock);
5177 
5178  /* we don't decode row locks atm, so no need to log the origin */
5179 
5180  recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_LOCK);
5181 
5182  PageSetLSN(page, recptr);
5183  }
5184 
5185  END_CRIT_SECTION();
5186 
5187  result = HeapTupleMayBeUpdated;
5188 
5189 out_locked:
5190  LockBuffer(*buffer, BUFFER_LOCK_UNLOCK);
5191 
5192 out_unlocked:
5193  if (BufferIsValid(vmbuffer))
5194  ReleaseBuffer(vmbuffer);
5195 
5196  /*
5197  * Don't update the visibility map here. Locking a tuple doesn't change
5198  * visibility info.
5199  */
5200 
5201  /*
5202  * Now that we have successfully marked the tuple as locked, we can
5203  * release the lmgr tuple lock, if we had it.
5204  */
5205  if (have_tuple_lock)
5206  UnlockTupleTuplock(relation, tid, mode);
5207 
5208  return result;
5209 }
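A hedged caller sketch (hypothetical code, not part of heapam.c) showing the
typical pattern around heap_lock_tuple(): only t_self needs to be set on
entry, and the pinned buffer returned through *buffer must be released by the
caller regardless of the result.

/* Hypothetical caller, for illustration only. */
static HTSU_Result
example_lock_row(Relation rel, ItemPointer tid)
{
    HeapTupleData tuple;
    Buffer      buffer;
    HeapUpdateFailureData hufd;
    HTSU_Result res;

    tuple.t_self = *tid;        /* rest of the struct is filled in on return */
    res = heap_lock_tuple(rel, &tuple,
                          GetCurrentCommandId(true),
                          LockTupleExclusive, LockWaitBlock,
                          true /* follow_updates */ ,
                          &buffer, &hufd);

    /* the tuple's buffer comes back pinned (not locked); always release it */
    ReleaseBuffer(buffer);

    return res;
}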
5210 
5211 /*
5212  * Acquire heavyweight lock on the given tuple, in preparation for acquiring
5213  * its normal, Xmax-based tuple lock.
5214  *
5215  * have_tuple_lock is an input and output parameter: on input, it indicates
5216  * whether the lock has previously been acquired (and this function does
5217  * nothing in that case). If this function returns success, have_tuple_lock
5218  * has been flipped to true.
5219  *
5220  * Returns false if it was unable to obtain the lock; this can only happen if
5221  * wait_policy is Skip.
5222  */
5223 static bool
5224 heap_acquire_tuplock(Relation relation, ItemPointer tid, LockTupleMode mode,
5225  LockWaitPolicy wait_policy, bool *have_tuple_lock)
5226 {
5227  if (*have_tuple_lock)
5228  return true;
5229 
5230  switch (wait_policy)
5231  {
5232  case LockWaitBlock:
5233  LockTupleTuplock(relation, tid, mode);
5234  break;
5235 
5236  case LockWaitSkip:
5237  if (!ConditionalLockTupleTuplock(relation, tid, mode))
5238  return false;
5239  break;
5240 
5241  case LockWaitError:
5242  if (!ConditionalLockTupleTuplock(relation, tid, mode))
5243  ereport(ERROR,
5244  (errcode(ERRCODE_LOCK_NOT_AVAILABLE),
5245  errmsg("could not obtain lock on row in relation \"%s\"",
5246  RelationGetRelationName(relation))));
5247  break;
5248  }
5249  *have_tuple_lock = true;
5250 
5251  return true;
5252 }
5253 
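/*
 * Editorial illustration (not part of heapam.c): the have_tuple_lock in/out
 * protocol in a hypothetical caller.  The flag starts out false and is
 * flipped by heap_acquire_tuplock() itself, so the cleanup code can release
 * the heavyweight lock exactly once -- the same pattern the out_unlocked
 * exit of heap_lock_tuple() above relies on.
 */
static bool
sketch_with_tuplock(Relation relation, ItemPointer tid)
{
	bool		have_tuple_lock = false;

	if (!heap_acquire_tuplock(relation, tid, LockTupleExclusive,
							  LockWaitSkip, &have_tuple_lock))
		return false;	/* LockWaitSkip: lock not available, give up */

	/* ... inspect or modify the tuple's xmax while holding the tuplock ... */

	if (have_tuple_lock)
		UnlockTupleTuplock(relation, tid, LockTupleExclusive);

	return true;
}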
5254 /*
5255  * Given an original set of Xmax and infomask, and a transaction (identified by
5256  * add_to_xmax) acquiring a new lock of some mode, compute the new Xmax and
5257  * corresponding infomasks to use on the tuple.
5258  *
5259  * Note that this might have side effects such as creating a new MultiXactId.
5260  *
5261  * Most callers will have called HeapTupleSatisfiesUpdate before this function;
5262  * that will have set the HEAP_XMAX_INVALID bit if the xmax was a MultiXactId
5263  * but it was not running anymore. There is a race condition, which is that the
5264  * MultiXactId may have finished since then, but that uncommon case is handled
5265  * either here, or within MultiXactIdExpand.
5266  *
5267  * There is a similar race condition possible when the old xmax was a regular
5268  * TransactionId. We test TransactionIdIsInProgress again just to narrow the
5269  * window, but it's still possible to end up creating an unnecessary
5270  * MultiXactId. Fortunately this is harmless.
5271  */
5272 static void
5273 compute_new_xmax_infomask(TransactionId xmax, uint16 old_infomask,
5274  uint16 old_infomask2, TransactionId add_to_xmax,
5275  LockTupleMode mode, bool is_update,
5276  TransactionId *result_xmax, uint16 *result_infomask,
5277  uint16 *result_infomask2)
5278 {
5279  TransactionId new_xmax;
5280  uint16 new_infomask,
5281  new_infomask2;
5282 
5283  Assert(TransactionIdIsCurrentTransactionId(add_to_xmax));
5284 
5285 l5:
5286  new_infomask = 0;
5287  new_infomask2 = 0;
5288  if (old_infomask & HEAP_XMAX_INVALID)
5289  {
5290  /*
5291  * No previous locker; we just insert our own TransactionId.
5292  *
5293  * Note that it's critical that this case be the first one checked,
5294  * because there are several blocks below that come back to this one
5295  * to implement certain optimizations; old_infomask might contain
5296  * other dirty bits in those cases, but we don't really care.
5297  */
5298  if (is_update)
5299  {
5300  new_xmax = add_to_xmax;
5301  if (mode == LockTupleExclusive)
5302  new_infomask2 |= HEAP_KEYS_UPDATED;
5303  }
5304  else
5305  {
5306  new_infomask |= HEAP_XMAX_LOCK_ONLY;
5307  switch (mode)
5308  {
5309  case LockTupleKeyShare:
5310  new_xmax = add_to_xmax;
5311  new_infomask |= HEAP_XMAX_KEYSHR_LOCK;
5312  break;
5313  case LockTupleShare:
5314  new_xmax = add_to_xmax;
5315  new_infomask |= HEAP_XMAX_SHR_LOCK;
5316  break;
5317  case LockTupleNoKeyExclusive:
5318  new_xmax = add_to_xmax;
5319  new_infomask |= HEAP_XMAX_EXCL_LOCK;
5320  break;
5321  case LockTupleExclusive:
5322  new_xmax = add_to_xmax;
5323  new_infomask |= HEAP_XMAX_EXCL_LOCK;
5324  new_infomask2 |= HEAP_KEYS_UPDATED;
5325  break;
5326  default:
5327  new_xmax = InvalidTransactionId; /* silence compiler */
5328  elog(ERROR, "invalid lock mode");
5329  }
5330  }
5331  }
5332  else if (old_infomask & HEAP_XMAX_IS_MULTI)
5333  {
5334  MultiXactStatus new_status;
5335 
5336  /*
5337  * Currently we don't allow XMAX_COMMITTED to be set for multis, so
5338  * cross-check.
5339  */
5340  Assert(!(old_infomask & HEAP_XMAX_COMMITTED));
5341 
5342  /*
5343  * A multixact together with LOCK_ONLY set but neither lock bit set
5344  * (i.e. a pg_upgraded share locked tuple) cannot possibly be running
5345  * anymore. This check is critical for databases upgraded by
5346  * pg_upgrade; both MultiXactIdIsRunning and MultiXactIdExpand assume
5347  * that such multis are never passed.
5348  */
5349  if (HEAP_LOCKED_UPGRADED(old_infomask))
5350  {
5351  old_infomask &= ~HEAP_XMAX_IS_MULTI;
5352  old_infomask |= HEAP_XMAX_INVALID;
5353  goto l5;
5354  }
5355 
5356  /*
5357  * If the XMAX is already a MultiXactId, then we need to expand it to
5358  * include add_to_xmax; but if all the members were lockers and are
5359  * all gone, we can do away with the IS_MULTI bit and just set
5360  * add_to_xmax as the only locker/updater. If all lockers are gone
5361  * and we have an updater that aborted, we can also do without a
5362  * multi.
5363  *
5364  * The cost of doing GetMultiXactIdMembers would be paid by
5365  * MultiXactIdExpand if we weren't to do this, so this check is not
5366  * incurring extra work anyhow.
5367  */
5368  if (!MultiXactIdIsRunning(xmax, HEAP_XMAX_IS_LOCKED_ONLY(old_infomask)))
5369  {
5370  if (HEAP_XMAX_IS_LOCKED_ONLY(old_infomask) ||
5371  !TransactionIdDidCommit(MultiXactIdGetUpdateXid(xmax,
5372  old_infomask)))
5373  {
5374  /*
5375  * Reset these bits and restart; otherwise fall through to
5376  * create a new multi below.
5377  */
5378  old_infomask &= ~HEAP_XMAX_IS_MULTI;
5379  old_infomask |= HEAP_XMAX_INVALID;
5380  goto l5;
5381  }
5382  }
5383 
5384  new_status = get_mxact_status_for_lock(mode, is_update);
5385 
5386  new_xmax = MultiXactIdExpand((MultiXactId) xmax, add_to_xmax,
5387  new_status);
5388  GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2);
5389  }
5390  else if (old_infomask & HEAP_XMAX_COMMITTED)
5391  {
5392  /*
5393  * It's a committed update, so we need to preserve him as updater of
5394  * the tuple.
5395  */
5396  MultiXactStatus status;
5397  MultiXactStatus new_status;
5398 
5399  if (old_infomask2 & HEAP_KEYS_UPDATED)
5400  status = MultiXactStatusUpdate;
5401  else
5402  status = MultiXactStatusNoKeyUpdate;
5403 
5404  new_status = get_mxact_status_for_lock(mode, is_update);
5405 
5406  /*
5407  * since it's not running, it's obviously impossible for the old
5408  * updater to be identical to the current one, so we need not check
5409  * for that case as we do in the block above.
5410  */
5411  new_xmax = MultiXactIdCreate(xmax, status, add_to_xmax, new_status);
5412  GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2);
5413  }
5414  else if (TransactionIdIsInProgress(xmax))
5415  {
5416  /*
5417  * If the XMAX is a valid, in-progress TransactionId, then we need to
5418  * create a new MultiXactId that includes both the old locker or
5419  * updater and our own TransactionId.
5420  */
5421  MultiXactStatus new_status;
5422  MultiXactStatus old_status;
5423  LockTupleMode old_mode;
5424 
5425  if (HEAP_XMAX_IS_LOCKED_ONLY(old_infomask))
5426  {
5427  if (HEAP_XMAX_IS_KEYSHR_LOCKED(old_infomask))
5428  old_status = MultiXactStatusForKeyShare;
5429  else if (HEAP_XMAX_IS_SHR_LOCKED(old_infomask))
5430  old_status = MultiXactStatusForShare;
5431  else if (HEAP_XMAX_IS_EXCL_LOCKED(old_infomask))
5432  {
5433  if (old_infomask2 & HEAP_KEYS_UPDATED)
5434  old_status = MultiXactStatusForUpdate;
5435  else
5436  old_status = MultiXactStatusForNoKeyUpdate;
5437  }
5438  else
5439  {
5440  /*
5441  * LOCK_ONLY can be present alone only when a page has been
5442  * upgraded by pg_upgrade. But in that case,
5443  * TransactionIdIsInProgress() should have returned false. We
5444  * assume it's no longer locked in this case.
5445  */
5446  elog(WARNING, "LOCK_ONLY found for Xid in progress %u", xmax);
5447  old_infomask |= HEAP_XMAX_INVALID;
5448  old_infomask &= ~HEAP_XMAX_LOCK_ONLY;
5449  goto l5;
5450  }
5451  }
5452  else
5453  {
5454  /* it's an update, but which kind? */
5455  if (old_infomask2 & HEAP_KEYS_UPDATED)
5456  old_status = MultiXactStatusUpdate;
5457  else
5458  old_status = MultiXactStatusNoKeyUpdate;
5459  }
5460 
5461  old_mode = TUPLOCK_from_mxstatus(old_status);
5462 
5463  /*
5464  * If the lock to be acquired is for the same TransactionId as the
5465  * existing lock, there's an optimization possible: consider only the
5466  * strongest of both locks as the only one present, and restart.
5467  */
5468  if (xmax == add_to_xmax)
5469  {
5470  /*
5471  * Note that it's not possible for the original tuple to be
5472  * updated: we wouldn't be here because the tuple would have been
5473  * invisible and we wouldn't try to update it. As a subtlety,
5474  * this code can also run when traversing an update chain to lock
5475  * future versions of a tuple. But we wouldn't be here either,
5476  * because the add_to_xmax would be different from the original
5477  * updater.
5478  */
5479  Assert(HEAP_XMAX_IS_LOCKED_ONLY(old_infomask));
5480 
5481  /* acquire the strongest of both */
5482  if (mode < old_mode)
5483  mode = old_mode;
5484  /* mustn't touch is_update */
5485 
5486  old_infomask |= HEAP_XMAX_INVALID;
5487  goto l5;
5488  }
5489 
5490  /* otherwise, just fall back to creating a new multixact */
5491  new_status = get_mxact_status_for_lock(mode, is_update);
5492  new_xmax = MultiXactIdCreate(xmax, old_status,
5493  add_to_xmax, new_status);
5494  GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2);
5495  }
5496  else if (!HEAP_XMAX_IS_LOCKED_ONLY(old_infomask) &&
5497  TransactionIdDidCommit(xmax))
5498  {
5499  /*
5500  * It's a committed update, so we need to preserve him as updater of the
5501  * tuple.
5502  */
5503  MultiXactStatus status;
5504  MultiXactStatus new_status;
5505 
5506  if (old_infomask2 & HEAP_KEYS_UPDATED)
5507  status = MultiXactStatusUpdate;
5508  else
5509  status = MultiXactStatusNoKeyUpdate;
5510 
5511  new_status = get_mxact_status_for_lock(mode, is_update);
5512 
5513  /*
5514  * since it's not running, it's obviously impossible for the old
5515  * updater to be identical to the current one, so we need not check
5516  * for that case as we do in the block above.
5517  */
5518  new_xmax = MultiXactIdCreate(xmax, status, add_to_xmax, new_status);
5519  GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2);
5520  }
5521  else
5522  {
5523  /*
5524  * Can get here iff the locking/updating transaction was running when
5525  * the infomask was extracted from the tuple, but finished before
5526  * TransactionIdIsInProgress got to run. Deal with it as if there was
5527  * no locker at all in the first place.
5528  */
5529  old_infomask |= HEAP_XMAX_INVALID;
5530  goto l5;
5531  }
5532 
5533  *result_infomask = new_infomask;
5534  *result_infomask2 = new_infomask2;
5535  *result_xmax = new_xmax;
5536 }
5537 
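/*
 * Editorial illustration (not part of heapam.c): a hypothetical helper that
 * recovers the tuple lock mode encoded by compute_new_xmax_infomask() above
 * for a lock-only xmax, using the same infomask macros the function consults
 * when deciding which bits to set.
 */
static LockTupleMode
lockmode_from_lockonly_infomask(uint16 infomask, uint16 infomask2)
{
	Assert(HEAP_XMAX_IS_LOCKED_ONLY(infomask));

	if (HEAP_XMAX_IS_KEYSHR_LOCKED(infomask))
		return LockTupleKeyShare;
	if (HEAP_XMAX_IS_SHR_LOCKED(infomask))
		return LockTupleShare;

	/*
	 * Both exclusive variants use HEAP_XMAX_EXCL_LOCK; HEAP_KEYS_UPDATED
	 * (kept in infomask2) tells them apart.
	 */
	return (infomask2 & HEAP_KEYS_UPDATED) ?
		LockTupleExclusive : LockTupleNoKeyExclusive;
}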
5538 /*
5539  * Subroutine for heap_lock_updated_tuple_rec.
5540  *
5541  * Given a hypothetical multixact status held by the transaction identified
5542  * with the given xid, does the current transaction need to wait, fail, or can
5543  * it continue if it wanted to acquire a lock of the given mode? "needwait"
5544  * is set to true if waiting is necessary; if it can continue, then
5545  * HeapTupleMayBeUpdated is returned. If the lock is already held by the
5546  * current transaction, return HeapTupleSelfUpdated. In case of a conflict
5547  * with another transaction, a different HeapTupleSatisfiesUpdate return code
5548  * is returned.
5549  *
5550  * The held status is said to be hypothetical because it might correspond to a
5551  * lock held by a single Xid, i.e. not a real MultiXactId; we express it this
5552  * way for simplicity of API.
5553  */
5554 static HTSU_Result
5555 test_lockmode_for_conflict(MultiXactStatus status, TransactionId xid,
5556  LockTupleMode mode, bool *needwait)
5557 {
5558  MultiXactStatus wantedstatus;
5559 
5560  *needwait = false;
5561  wantedstatus = get_mxact_status_for_lock(mode, false);
5562 
5563  /*
5564  * Note: we *must* check TransactionIdIsInProgress before
5565  * TransactionIdDidAbort/Commit; see comment at top of tqual.c for an
5566  * explanation.
5567  */
5568  if (TransactionIdIsCurrentTransactionId(xid))
5569  {
5570  /*
5571  * The tuple has already been locked by our own transaction. This is
5572  * very rare but can happen if multiple transactions are trying to
5573  * lock an ancient version of the same tuple.
5574  */
5575  return HeapTupleSelfUpdated;
5576  }
5577  else if (TransactionIdIsInProgress(xid))
5578  {
5579  /*
5580  * If the locking transaction is running, what we do depends on
5581  * whether the lock modes conflict: if they do, then we must wait for
5582  * it to finish; otherwise we can fall through to lock this tuple
5583  * version without waiting.
5584  */
5585  if (DoLockModesConflict(LOCKMODE_from_mxstatus(status),
5586  LOCKMODE_from_mxstatus(wantedstatus)))
5587  {
5588  *needwait = true;
5589  }
5590 
5591  /*
5592  * If we set needwait above, then this value doesn't matter;
5593  * otherwise, this value signals to caller that it's okay to proceed.
5594  */
5595  return HeapTupleMayBeUpdated;
5596  }
5597  else if (TransactionIdDidAbort(xid))
5598  return HeapTupleMayBeUpdated;
5599  else if (TransactionIdDidCommit(xid))
5600  {
5601  /*
5602  * The other transaction committed. If it was only a locker, then the
5603  * lock is completely gone now and we can return success; but if it
5604  * was an update, then what we do depends on whether the two lock
5605  * modes conflict. If they conflict, then we must report error to
5606  * caller. But if they don't, we can fall through to allow the current
5607  * transaction to lock the tuple.
5608  *
5609  * Note: the reason we worry about ISUPDATE here is because as soon as
5610  * a transaction ends, all its locks are gone and meaningless, and
5611  * thus we can ignore them; whereas its updates persist. In the
5612  * TransactionIdIsInProgress case, above, we don't need to check
5613  * because we know the lock is still "alive" and thus a conflict always
5614  * needs to be checked.
5615  */
5616  if (!ISUPDATE_from_mxstatus(status))
5617  return HeapTupleMayBeUpdated;
5618 
5619  if (DoLockModesConflict(LOCKMODE_from_mxstatus(status),
5620  LOCKMODE_from_mxstatus(wantedstatus)))
5621  /* bummer */
5622  return HeapTupleUpdated;
5623 
5624  return HeapTupleMayBeUpdated;
5625  }
5626 
5627  /* Not in progress, not aborted, not committed -- must have crashed */
5628  return HeapTupleMayBeUpdated;
5629 }
5630 
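/*
 * Editorial illustration (not part of heapam.c): the heart of
 * test_lockmode_for_conflict() above is mapping both the held and the
 * wanted MultiXactStatus to heavyweight lock modes and asking the lock
 * manager whether those modes conflict (e.g. two ForKeyShare holders never
 * conflict with each other, while ForKeyShare conflicts with a key-changing
 * Update).  The helper name below is hypothetical.
 */
static bool
mxstatus_conflicts_with(MultiXactStatus held, MultiXactStatus wanted)
{
	return DoLockModesConflict(LOCKMODE_from_mxstatus(held),
							   LOCKMODE_from_mxstatus(wanted));
}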
5631 
5632 /*
5633  * Recursive part of heap_lock_updated_tuple
5634  *
5635  * Fetch the tuple pointed to by tid in rel, and mark it as locked by the given
5636  * xid with the given mode; if this tuple is updated, recurse to lock the new
5637  * version as well.
5638  */
5639 static HTSU_Result
5640 heap_lock_updated_tuple_rec(Relation rel, ItemPointer tid, TransactionId xid,
5641  LockTupleMode mode)
5642 {
5643  HTSU_Result result;
5644  ItemPointerData tupid;
5645  HeapTupleData mytup;
5646  Buffer buf;
5647  uint16 new_infomask,
5648  new_infomask2,
5649  old_infomask,
5650  old_infomask2;
5651  TransactionId xmax,
5652  new_xmax;
5653  TransactionId priorXmax = InvalidTransactionId;
5654  bool cleared_all_frozen = false;
5655  Buffer vmbuffer = InvalidBuffer;
5656  BlockNumber block;
5657 
5658  ItemPointerCopy(tid, &tupid);
5659 
5660  for (;;)
5661  {
5662  new_infomask = 0;
5663  new_xmax = InvalidTransactionId;
5664  block = ItemPointerGetBlockNumber(&tupid);
5665  ItemPointerCopy(&tupid, &(mytup.t_self));
5666 
5667  if (!heap_fetch(rel, SnapshotAny, &mytup, &buf, false, NULL))
5668  {
5669  /*
5670  * if we fail to find the updated version of the tuple, it's
5671  * because it was vacuumed/pruned away after its creator
5672  * transaction aborted. So behave as if we got to the end of the
5673  * chain, and there's no further tuple to lock: return success to
5674  * caller.
5675  */
5676  return HeapTupleMayBeUpdated;
5677  }
5678 
5679 l4:
5680  CHECK_FOR_INTERRUPTS();
5681 
5682  /*
5683  * Before locking the buffer, pin the visibility map page if it
5684  * appears to be necessary. Since we haven't got the lock yet,
5685  * someone else might be in the middle of changing this, so we'll need
5686  * to recheck after we have the lock.
5687  */
5688  if (PageIsAllVisible(BufferGetPage(buf)))
5689  visibilitymap_pin(rel, block, &vmbuffer);
5690  else
5691  vmbuffer = InvalidBuffer;
5692 
5693  LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
5694 
5695  /*
5696  * If we didn't pin the visibility map page and the page has become
5697  * all visible while we were busy locking the buffer, we'll have to
5698  * unlock and re-lock, to avoid holding the buffer lock across I/O.
5699  * That's a bit unfortunate, but hopefully shouldn't happen often.
5700  */
5701  if (vmbuffer == InvalidBuffer && PageIsAllVisible(BufferGetPage(buf)))
5702  {
5703  LockBuffer(buf, BUFFER_LOCK_UNLOCK);
5704  visibilitymap_pin(rel, block, &vmbuffer);
5705  LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
5706  }
5707 
5708  /*
5709  * Check the tuple XMIN against prior XMAX, if any. If we reached the
5710  * end of the chain, we're done, so return success.
5711  */
5712  if (TransactionIdIsValid(priorXmax) &&
5713  !TransactionIdEquals(HeapTupleHeaderGetXmin(mytup.t_data),
5714  priorXmax))
5715  {
5716  result = HeapTupleMayBeUpdated;
5717  goto out_locked;
5718  }
5719 
5720  /*
5721  * Also check Xmin: if this tuple was created by an aborted
5722  * (sub)transaction, then we already locked the last live one in the
5723  * chain, thus we're done, so return success.
5724  */
5725  if (TransactionIdDidAbort(HeapTupleHeaderGetXmin(mytup.t_data)))
5726  {
5727  UnlockReleaseBuffer(buf);
5728  return HeapTupleMayBeUpdated;
5729  }
5730 
5731  old_infomask = mytup.t_data->t_infomask;
5732  old_infomask2 = mytup.t_data->t_infomask2;
5733  xmax = HeapTupleHeaderGetRawXmax(mytup.t_data);
5734 
5735  /*
5736  * If this tuple version has been updated or locked by some concurrent
5737  * transaction(s), what we do depends on whether our lock mode
5738  * conflicts with what those other transactions hold, and also on the
5739  * status of them.
5740  */
5741  if (!(old_infomask & HEAP_XMAX_INVALID))
5742  {
5743  TransactionId rawxmax;
5744  bool needwait;
5745 
5746  rawxmax = HeapTupleHeaderGetRawXmax(mytup.t_data);
5747  if (old_infomask & HEAP_XMAX_IS_MULTI)
5748  {
5749  int nmembers;
5750  int i;
5751  MultiXactMember *members;
5752 
5753  /*
5754  * We don't need a test for pg_upgrade'd tuples: this is only
5755  * applied to tuples after the first in an update chain. Said
5756  * first tuple in the chain may well be locked-in-9.2-and-
5757  * pg_upgraded, but that one was already locked by our caller,
5758  * not us; and any subsequent ones cannot be because our
5759  * caller must necessarily have obtained a snapshot later than
5760  * the pg_upgrade itself.
5761  */
5762  Assert(!HEAP_LOCKED_UPGRADED(mytup.t_data->t_infomask));
5763 
5764  nmembers = GetMultiXactIdMembers(rawxmax, &members, false,
5765  HEAP_XMAX_IS_LOCKED_ONLY(old_infomask));
5766  for (i = 0; i < nmembers; i++)
5767  {
5768  result = test_lockmode_for_conflict(members[i].status,
5769  members[i].xid,
5770  mode, &needwait);
5771 
5772  /*
5773  * If the tuple was already locked by ourselves in a
5774  * previous iteration of this (say heap_lock_tuple was
5775  * forced to restart the locking loop because of a change
5776  * in xmax), then we hold the lock already on this tuple
5777  * version and we don't need to do anything; and this is
5778  * not an error condition either. We just need to skip
5779  * this tuple and continue locking the next version in the
5780  * update chain.
5781  */
5782  if (result == HeapTupleSelfUpdated)
5783  {
5784  pfree(members);
5785  goto next;
5786  }
5787 
5788  if (needwait)
5789  {
5790  LockBuffer(buf, BUFFER_LOCK_UNLOCK);
5791  XactLockTableWait(members[i].xid, rel,
5792  &mytup.t_self,
5793  XLTW_LockUpdated);
5794  pfree(members);
5795  goto l4;
5796  }
5797  if (result != HeapTupleMayBeUpdated)
5798  {
5799  pfree(members);
5800  goto out_locked;
5801  }
5802  }
5803  if (members)
5804  pfree(members);
5805  }
5806  else
5807  {
5808  MultiXactStatus status;
5809 
5810  /*
5811  * For a non-multi Xmax, we first need to compute the
5812  * corresponding MultiXactStatus by using the infomask bits.
5813  */
5814  if (HEAP_XMAX_IS_LOCKED_ONLY(old_infomask))
5815  {
5816  if (HEAP_XMAX_IS_KEYSHR_LOCKED(old_infomask))
5817  status = MultiXactStatusForKeyShare;
5818  else if (HEAP_XMAX_IS_SHR_LOCKED(old_infomask))
5819  status = MultiXactStatusForShare;
5820  else if (HEAP_XMAX_IS_EXCL_LOCKED(old_infomask))
5821  {
5822  if (old_infomask2 & HEAP_KEYS_UPDATED)
5823  status = MultiXactStatusForUpdate;
5824  else
5825  status = MultiXactStatusForNoKeyUpdate;
5826  }
5827  else
5828  {
5829  /*
5830  * LOCK_ONLY present alone (a pg_upgraded tuple marked
5831  * as share-locked in the old cluster) shouldn't be
5832  * seen in the middle of an update chain.
5833  */
5834  elog(ERROR, "invalid lock status in tuple");
5835  }
5836  }
5837  else
5838  {
5839  /* it's an update, but which kind? */
5840  if (old_infomask2 & HEAP_KEYS_UPDATED)
5841  status = MultiXactStatusUpdate;
5842  else
5843  status = MultiXactStatusNoKeyUpdate;
5844  }
5845 
5846  result = test_lockmode_for_conflict(status, rawxmax, mode,
5847  &needwait);
5848 
5849  /*
5850  * If the tuple was already locked by ourselves in a previous
5851  * iteration of this (say heap_lock_tuple was forced to
5852  * restart the locking loop because of a change in xmax), then
5853  * we hold the lock already on this tuple version and we don't
5854  * need to do anything; and this is not an error condition
5855  * either. We just need to skip this tuple and continue
5856  * locking the next version in the update chain.
5857  */
5858  if (result == HeapTupleSelfUpdated)
5859  goto next;
5860 
5861  if (needwait)
5862  {
5863  LockBuffer(buf, BUFFER_LOCK_UNLOCK);
5864  XactLockTableWait(rawxmax, rel, &mytup.t_self,
5865  XLTW_LockUpdated);
5866  goto l4;
5867  }
5868  if (result != HeapTupleMayBeUpdated)