1/*-------------------------------------------------------------------------
2 *
3 * heapam.c
4 * heap access method code
5 *
6 * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
7 * Portions Copyright (c) 1994, Regents of the University of California
8 *
9 *
10 * IDENTIFICATION
11 * src/backend/access/heap/heapam.c
12 *
13 *
14 * INTERFACE ROUTINES
15 * heap_beginscan - begin relation scan
16 * heap_rescan - restart a relation scan
17 * heap_endscan - end relation scan
18 * heap_getnext - retrieve next tuple in scan
19 * heap_fetch - retrieve tuple with given tid
20 * heap_insert - insert tuple into a relation
21 * heap_multi_insert - insert multiple tuples into a relation
22 * heap_delete - delete a tuple from a relation
23 * heap_update - replace a tuple in a relation with another tuple
24 *
25 * NOTES
26 * This file contains the heap_ routines which implement
27 * the POSTGRES heap access method used for all POSTGRES
28 * relations.
29 *
30 *-------------------------------------------------------------------------
31 */
32#include "postgres.h"
33
34#include "access/heapam.h"
35#include "access/heaptoast.h"
36#include "access/hio.h"
37#include "access/multixact.h"
38#include "access/subtrans.h"
39#include "access/syncscan.h"
40#include "access/valid.h"
41#include "access/visibilitymap.h"
42#include "access/xloginsert.h"
43#include "catalog/pg_database.h"
44#include "catalog/pg_database_d.h"
45#include "commands/vacuum.h"
46#include "pgstat.h"
47#include "port/pg_bitutils.h"
48#include "storage/lmgr.h"
49#include "storage/predicate.h"
50#include "storage/procarray.h"
51#include "utils/datum.h"
52#include "utils/inval.h"
53#include "utils/spccache.h"
54
55
56static HeapTuple heap_prepare_insert(Relation relation, HeapTuple tup,
57 TransactionId xid, CommandId cid, int options);
58static XLogRecPtr log_heap_update(Relation reln, Buffer oldbuf,
59 Buffer newbuf, HeapTuple oldtup,
60 HeapTuple newtup, HeapTuple old_key_tuple,
61 bool all_visible_cleared, bool new_all_visible_cleared);
62#ifdef USE_ASSERT_CHECKING
63static void check_lock_if_inplace_updateable_rel(Relation relation,
64 ItemPointer otid,
65 HeapTuple newtup);
66static void check_inplace_rel_lock(HeapTuple oldtup);
67#endif
68static Bitmapset *HeapDetermineColumnsInfo(Relation relation,
69 Bitmapset *interesting_cols,
70 Bitmapset *external_cols,
71 HeapTuple oldtup, HeapTuple newtup,
72 bool *has_external);
73static bool heap_acquire_tuplock(Relation relation, ItemPointer tid,
74 LockTupleMode mode, LockWaitPolicy wait_policy,
75 bool *have_tuple_lock);
77 BlockNumber block,
78 ScanDirection dir);
80 ScanDirection dir);
81static void compute_new_xmax_infomask(TransactionId xmax, uint16 old_infomask,
82 uint16 old_infomask2, TransactionId add_to_xmax,
83 LockTupleMode mode, bool is_update,
84 TransactionId *result_xmax, uint16 *result_infomask,
85 uint16 *result_infomask2);
87 ItemPointer ctid, TransactionId xid,
89static void GetMultiXactIdHintBits(MultiXactId multi, uint16 *new_infomask,
90 uint16 *new_infomask2);
92 uint16 t_infomask);
93static bool DoesMultiXactIdConflict(MultiXactId multi, uint16 infomask,
94 LockTupleMode lockmode, bool *current_is_member);
95static void MultiXactIdWait(MultiXactId multi, MultiXactStatus status, uint16 infomask,
97 int *remaining);
99 uint16 infomask, Relation rel, int *remaining);
100static void index_delete_sort(TM_IndexDeleteOp *delstate);
101static int bottomup_sort_and_shrink(TM_IndexDeleteOp *delstate);
102static XLogRecPtr log_heap_new_cid(Relation relation, HeapTuple tup);
103static HeapTuple ExtractReplicaIdentity(Relation relation, HeapTuple tp, bool key_required,
104 bool *copy);
105
106
107/*
108 * Each tuple lock mode has a corresponding heavyweight lock, and one or two
109 * corresponding MultiXactStatuses (one to merely lock tuples, another one to
110 * update them). This table (and the macros below) helps us determine the
111 * heavyweight lock mode and MultiXactStatus values to use for any particular
112 * tuple lock strength.
113 *
114 * These interact with InplaceUpdateTupleLock, an alias for ExclusiveLock.
115 *
116 * Don't look at lockstatus/updstatus directly! Use get_mxact_status_for_lock
117 * instead.
118 */
119static const struct
120{
121 LOCKMODE hwlock;
122 int lockstatus;
123 int updstatus;
124}
125
126 tupleLockExtraInfo[MaxLockTupleMode + 1] =
127{
128 { /* LockTupleKeyShare */
129 AccessShareLock,
130 MultiXactStatusForKeyShare,
131 -1 /* KeyShare does not allow updating tuples */
132 },
133 { /* LockTupleShare */
134 RowShareLock,
135 MultiXactStatusForShare,
136 -1 /* Share does not allow updating tuples */
137 },
138 { /* LockTupleNoKeyExclusive */
139 ExclusiveLock,
140 MultiXactStatusForNoKeyUpdate,
141 MultiXactStatusNoKeyUpdate
142 },
143 { /* LockTupleExclusive */
144 AccessExclusiveLock,
145 MultiXactStatusForUpdate,
146 MultiXactStatusUpdate
147 }
148};
149
150/* Get the LOCKMODE for a given MultiXactStatus */
151#define LOCKMODE_from_mxstatus(status) \
152 (tupleLockExtraInfo[TUPLOCK_from_mxstatus((status))].hwlock)
153
154/*
155 * Acquire heavyweight locks on tuples, using a LockTupleMode strength value.
156 * This is more readable than having every caller translate it to lock.h's
157 * LOCKMODE.
158 */
159#define LockTupleTuplock(rel, tup, mode) \
160 LockTuple((rel), (tup), tupleLockExtraInfo[mode].hwlock)
161#define UnlockTupleTuplock(rel, tup, mode) \
162 UnlockTuple((rel), (tup), tupleLockExtraInfo[mode].hwlock)
163#define ConditionalLockTupleTuplock(rel, tup, mode) \
164 ConditionalLockTuple((rel), (tup), tupleLockExtraInfo[mode].hwlock)
165
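
/*
 * Illustrative sketch only (not part of heapam.c): the usual pattern in this
 * file for taking the tuple-level heavyweight lock around a wait, using the
 * macros above.  The function name and the "wait" placeholder are examples,
 * not heapam API.
 */
static void
example_lock_tuple_and_wait(Relation rel, ItemPointer tid)
{
	LockTupleTuplock(rel, tid, LockTupleExclusive);
	/* ... sleep here waiting for the conflicting (multi)xact to end ... */
	UnlockTupleTuplock(rel, tid, LockTupleExclusive);
}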
166#ifdef USE_PREFETCH
167/*
168 * heap_index_delete_tuples and index_delete_prefetch_buffer use this
169 * structure to coordinate prefetching activity
170 */
171typedef struct
172{
173 BlockNumber cur_hblkno;
174 int next_item;
175 int ndeltids;
176 TM_IndexDelete *deltids;
177} IndexDeletePrefetchState;
178#endif
179
180/* heap_index_delete_tuples bottom-up index deletion costing constants */
181#define BOTTOMUP_MAX_NBLOCKS 6
182#define BOTTOMUP_TOLERANCE_NBLOCKS 3
183
184/*
185 * heap_index_delete_tuples uses this when determining which heap blocks it
186 * must visit to help its bottom-up index deletion caller
187 */
188typedef struct IndexDeleteCounts
189{
190 int16 npromisingtids; /* Number of "promising" TIDs in group */
191 int16 ntids; /* Number of TIDs in group */
192 int16 ifirsttid; /* Offset to group's first deltid */
193} IndexDeleteCounts;
194
195/*
196 * This table maps each MultiXactStatus value to the corresponding tuple
197 * lock strength value.
198 */
199static const int MultiXactStatusLock[MaxMultiXactStatus + 1] =
200{
201 LockTupleKeyShare, /* ForKeyShare */
202 LockTupleShare, /* ForShare */
203 LockTupleNoKeyExclusive, /* ForNoKeyUpdate */
204 LockTupleExclusive, /* ForUpdate */
205 LockTupleNoKeyExclusive, /* NoKeyUpdate */
206 LockTupleExclusive /* Update */
207};
208
209/* Get the LockTupleMode for a given MultiXactStatus */
210#define TUPLOCK_from_mxstatus(status) \
211 (MultiXactStatusLock[(status)])
212
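
/*
 * Illustrative sketch only (not part of heapam.c): composing the two lookup
 * tables above.  Given a MultiXactStatus, TUPLOCK_from_mxstatus() yields the
 * LockTupleMode, and indexing tupleLockExtraInfo with that yields the
 * heavyweight lock to take; this helper just spells out what the
 * LOCKMODE_from_mxstatus() macro does.  The function name is an example.
 */
static inline LOCKMODE
example_hwlock_for_mxstatus(MultiXactStatus status)
{
	LockTupleMode mode = (LockTupleMode) TUPLOCK_from_mxstatus(status);

	return tupleLockExtraInfo[mode].hwlock;
}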
213/* ----------------------------------------------------------------
214 * heap support routines
215 * ----------------------------------------------------------------
216 */
217
218/*
219 * Streaming read API callback for parallel sequential scans. Returns the next
220 * block the caller wants from the read stream or InvalidBlockNumber when done.
221 */
222static BlockNumber
224 void *callback_private_data,
225 void *per_buffer_data)
226{
227 HeapScanDesc scan = (HeapScanDesc) callback_private_data;
228
231
232 if (unlikely(!scan->rs_inited))
233 {
234 /* parallel scan */
238
239 /* may return InvalidBlockNumber if there are no more blocks */
243 scan->rs_inited = true;
244 }
245 else
246 {
249 scan->rs_base.rs_parallel);
250 }
251
252 return scan->rs_prefetch_block;
253}
254
255/*
256 * Streaming read API callback for serial sequential and TID range scans.
257 * Returns the next block the caller wants from the read stream or
258 * InvalidBlockNumber when done.
259 */
260static BlockNumber
262 void *callback_private_data,
263 void *per_buffer_data)
264{
265 HeapScanDesc scan = (HeapScanDesc) callback_private_data;
266
267 if (unlikely(!scan->rs_inited))
268 {
270 scan->rs_inited = true;
271 }
272 else
274 scan->rs_prefetch_block,
275 scan->rs_dir);
276
277 return scan->rs_prefetch_block;
278}
279
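
/*
 * Illustrative sketch only (not part of heapam.c): a minimal read stream
 * callback obeying the same contract as the two callbacks above -- hand out
 * the next wanted block number, or InvalidBlockNumber when the stream is
 * exhausted.  The state struct and function names are examples, not heapam
 * or read stream API.
 */
typedef struct ExampleStreamState
{
	BlockNumber next;
	BlockNumber nblocks;
} ExampleStreamState;

static BlockNumber
example_stream_read_next(ReadStream *stream,
						 void *callback_private_data,
						 void *per_buffer_data)
{
	ExampleStreamState *state = (ExampleStreamState *) callback_private_data;

	/* report end-of-stream once all blocks have been handed out */
	if (state->next >= state->nblocks)
		return InvalidBlockNumber;

	return state->next++;
}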
280/* ----------------
281 * initscan - scan code common to heap_beginscan and heap_rescan
282 * ----------------
283 */
284static void
285initscan(HeapScanDesc scan, ScanKey key, bool keep_startblock)
286{
287 ParallelBlockTableScanDesc bpscan = NULL;
288 bool allow_strat;
289 bool allow_sync;
290
291 /*
292 * Determine the number of blocks we have to scan.
293 *
294 * It is sufficient to do this once at scan start, since any tuples added
295 * while the scan is in progress will be invisible to my snapshot anyway.
296 * (That is not true when using a non-MVCC snapshot. However, we couldn't
297 * guarantee to return tuples added after scan start anyway, since they
298 * might go into pages we already scanned. To guarantee consistent
299 * results for a non-MVCC snapshot, the caller must hold some higher-level
300 * lock that ensures the interesting tuple(s) won't change.)
301 */
302 if (scan->rs_base.rs_parallel != NULL)
303 {
305 scan->rs_nblocks = bpscan->phs_nblocks;
306 }
307 else
309
310 /*
311 * If the table is large relative to NBuffers, use a bulk-read access
312 * strategy and enable synchronized scanning (see syncscan.c). Although
313 * the thresholds for these features could be different, we make them the
314 * same so that there are only two behaviors to tune rather than four.
315 * (However, some callers need to be able to disable one or both of these
316 * behaviors, independently of the size of the table; also there is a GUC
317 * variable that can disable synchronized scanning.)
318 *
319 * Note that table_block_parallelscan_initialize has a very similar test;
320 * if you change this, consider changing that one, too.
321 */
323 scan->rs_nblocks > NBuffers / 4)
324 {
325 allow_strat = (scan->rs_base.rs_flags & SO_ALLOW_STRAT) != 0;
326 allow_sync = (scan->rs_base.rs_flags & SO_ALLOW_SYNC) != 0;
327 }
328 else
329 allow_strat = allow_sync = false;
330
331 if (allow_strat)
332 {
333 /* During a rescan, keep the previous strategy object. */
334 if (scan->rs_strategy == NULL)
336 }
337 else
338 {
339 if (scan->rs_strategy != NULL)
341 scan->rs_strategy = NULL;
342 }
343
344 if (scan->rs_base.rs_parallel != NULL)
345 {
346 /* For parallel scan, believe whatever ParallelTableScanDesc says. */
349 else
350 scan->rs_base.rs_flags &= ~SO_ALLOW_SYNC;
351 }
352 else if (keep_startblock)
353 {
354 /*
355 * When rescanning, we want to keep the previous startblock setting,
356 * so that rewinding a cursor doesn't generate surprising results.
357 * Reset the active syncscan setting, though.
358 */
359 if (allow_sync && synchronize_seqscans)
361 else
362 scan->rs_base.rs_flags &= ~SO_ALLOW_SYNC;
363 }
364 else if (allow_sync && synchronize_seqscans)
365 {
368 }
369 else
370 {
371 scan->rs_base.rs_flags &= ~SO_ALLOW_SYNC;
372 scan->rs_startblock = 0;
373 }
374
376 scan->rs_inited = false;
377 scan->rs_ctup.t_data = NULL;
379 scan->rs_cbuf = InvalidBuffer;
381 scan->rs_ntuples = 0;
382 scan->rs_cindex = 0;
383
384 /*
385 * Initialize to ForwardScanDirection because it is most common and
386 * because heap scans go forward before going backward (e.g. CURSORs).
387 */
390
391 /* page-at-a-time fields are always invalid when not rs_inited */
392
393 /*
394 * copy the scan key, if appropriate
395 */
396 if (key != NULL && scan->rs_base.rs_nkeys > 0)
397 memcpy(scan->rs_base.rs_key, key, scan->rs_base.rs_nkeys * sizeof(ScanKeyData));
398
399 /*
400 * Currently, we only have a stats counter for sequential heap scans (but
401 * e.g. for bitmap scans the underlying bitmap index scans will be counted,
402 * and for sample scans we update stats for tuple fetches).
403 */
404 if (scan->rs_base.rs_flags & SO_TYPE_SEQSCAN)
406}
407
408/*
409 * heap_setscanlimits - restrict range of a heapscan
410 *
411 * startBlk is the page to start at
412 * numBlks is number of pages to scan (InvalidBlockNumber means "all")
413 */
414void
416{
417 HeapScanDesc scan = (HeapScanDesc) sscan;
418
419 Assert(!scan->rs_inited); /* else too late to change */
420 /* else rs_startblock is significant */
422
423 /* Check startBlk is valid (but allow case of zero blocks...) */
424 Assert(startBlk == 0 || startBlk < scan->rs_nblocks);
425
426 scan->rs_startblock = startBlk;
427 scan->rs_numblocks = numBlks;
428}
429
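
/*
 * Illustrative sketch only (not part of heapam.c): restricting a
 * not-yet-started heap scan to its first 100 blocks with
 * heap_setscanlimits().  The function name and block count are examples.
 */
static void
example_limit_scan(TableScanDesc sscan)
{
	/* allowed only before the first tuple has been fetched from the scan */
	heap_setscanlimits(sscan, 0, 100);
}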
430/*
431 * Per-tuple loop for heap_prepare_pagescan(). Pulled out so it can be called
432 * multiple times, with constant arguments for all_visible,
433 * check_serializable.
434 */
436static int
438 Page page, Buffer buffer,
439 BlockNumber block, int lines,
440 bool all_visible, bool check_serializable)
441{
442 int ntup = 0;
443 OffsetNumber lineoff;
444
445 for (lineoff = FirstOffsetNumber; lineoff <= lines; lineoff++)
446 {
447 ItemId lpp = PageGetItemId(page, lineoff);
448 HeapTupleData loctup;
449 bool valid;
450
451 if (!ItemIdIsNormal(lpp))
452 continue;
453
454 loctup.t_data = (HeapTupleHeader) PageGetItem(page, lpp);
455 loctup.t_len = ItemIdGetLength(lpp);
457 ItemPointerSet(&(loctup.t_self), block, lineoff);
458
459 if (all_visible)
460 valid = true;
461 else
462 valid = HeapTupleSatisfiesVisibility(&loctup, snapshot, buffer);
463
464 if (check_serializable)
466 &loctup, buffer, snapshot);
467
468 if (valid)
469 {
470 scan->rs_vistuples[ntup] = lineoff;
471 ntup++;
472 }
473 }
474
476
477 return ntup;
478}
479
480/*
481 * heap_prepare_pagescan - Prepare current scan page to be scanned in pagemode
482 *
483 * Preparation currently consists of 1. prune the scan's rs_cbuf page, and 2.
484 * fill the rs_vistuples[] array with the OffsetNumbers of visible tuples.
485 */
486void
488{
489 HeapScanDesc scan = (HeapScanDesc) sscan;
490 Buffer buffer = scan->rs_cbuf;
491 BlockNumber block = scan->rs_cblock;
492 Snapshot snapshot;
493 Page page;
494 int lines;
495 bool all_visible;
496 bool check_serializable;
497
498 Assert(BufferGetBlockNumber(buffer) == block);
499
500 /* ensure we're not accidentally being used when not in pagemode */
502 snapshot = scan->rs_base.rs_snapshot;
503
504 /*
505 * Prune and repair fragmentation for the whole page, if possible.
506 */
507 heap_page_prune_opt(scan->rs_base.rs_rd, buffer);
508
509 /*
510 * We must hold share lock on the buffer content while examining tuple
511 * visibility. Afterwards, however, the tuples we have found to be
512 * visible are guaranteed good as long as we hold the buffer pin.
513 */
515
516 page = BufferGetPage(buffer);
517 lines = PageGetMaxOffsetNumber(page);
518
519 /*
520 * If the all-visible flag indicates that all tuples on the page are
521 * visible to everyone, we can skip the per-tuple visibility tests.
522 *
523 * Note: In hot standby, a tuple that's already visible to all
524 * transactions on the primary might still be invisible to a read-only
525 * transaction in the standby. We partly handle this problem by tracking
526 * the minimum xmin of visible tuples as the cut-off XID while marking a
527 * page all-visible on the primary and WAL log that along with the
528 * visibility map SET operation. In hot standby, we wait for (or abort)
529 * all transactions that might not see one or more tuples on the page.
530 * That's how index-only scans work fine in hot standby. A crucial
531 * difference between index-only scans and heap scans is that the
532 * index-only scan completely relies on the visibility map whereas a heap
533 * scan looks at the page-level PD_ALL_VISIBLE flag. We are not sure if
534 * the page-level flag can be trusted in the same way, because it might
535 * get propagated somehow without being explicitly WAL-logged, e.g. via a
536 * full page write. Until we can prove that beyond doubt, let's check each
537 * tuple for visibility the hard way.
538 */
539 all_visible = PageIsAllVisible(page) && !snapshot->takenDuringRecovery;
540 check_serializable =
542
543 /*
544 * We call page_collect_tuples() with constant arguments, to get the
545 * compiler to constant fold the constant arguments. Separate calls with
546 * constant arguments, rather than variables, are needed on several
547 * compilers to actually perform constant folding.
548 */
549 if (likely(all_visible))
550 {
551 if (likely(!check_serializable))
552 scan->rs_ntuples = page_collect_tuples(scan, snapshot, page, buffer,
553 block, lines, true, false);
554 else
555 scan->rs_ntuples = page_collect_tuples(scan, snapshot, page, buffer,
556 block, lines, true, true);
557 }
558 else
559 {
560 if (likely(!check_serializable))
561 scan->rs_ntuples = page_collect_tuples(scan, snapshot, page, buffer,
562 block, lines, false, false);
563 else
564 scan->rs_ntuples = page_collect_tuples(scan, snapshot, page, buffer,
565 block, lines, false, true);
566 }
567
569}
570
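
/*
 * Illustrative sketch only (not part of heapam.c): the constant-folding
 * dispatch pattern used by heap_prepare_pagescan() above, shown in a generic
 * form.  An inline helper takes boolean "mode" parameters and is called once
 * per constant combination, so the compiler can drop the untaken branches in
 * each specialization.  All names here are examples.
 */
static inline int
example_count_items(const int *items, int nitems, bool skip_negative)
{
	int			count = 0;

	for (int i = 0; i < nitems; i++)
	{
		if (skip_negative && items[i] < 0)
			continue;
		count++;
	}
	return count;
}

static int
example_count_dispatch(const int *items, int nitems, bool skip_negative)
{
	/* separate calls with constant arguments enable constant folding */
	if (skip_negative)
		return example_count_items(items, nitems, true);
	else
		return example_count_items(items, nitems, false);
}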
571/*
572 * heap_fetch_next_buffer - read and pin the next block from MAIN_FORKNUM.
573 *
574 * Read the next block of the scan relation from the read stream and save it
575 * in the scan descriptor. It is already pinned.
576 */
577static inline void
579{
580 Assert(scan->rs_read_stream);
581
582 /* release previous scan buffer, if any */
583 if (BufferIsValid(scan->rs_cbuf))
584 {
585 ReleaseBuffer(scan->rs_cbuf);
586 scan->rs_cbuf = InvalidBuffer;
587 }
588
589 /*
590 * Be sure to check for interrupts at least once per page. Checks at
591 * higher code levels won't be able to stop a seqscan that encounters many
592 * pages' worth of consecutive dead tuples.
593 */
595
596 /*
597 * If the scan direction is changing, reset the prefetch block to the
598 * current block. Otherwise, we will incorrectly prefetch the blocks
599 * between the prefetch block and the current block again before
600 * prefetching blocks in the new, correct scan direction.
601 */
602 if (unlikely(scan->rs_dir != dir))
603 {
604 scan->rs_prefetch_block = scan->rs_cblock;
606 }
607
608 scan->rs_dir = dir;
609
611 if (BufferIsValid(scan->rs_cbuf))
613}
614
615/*
616 * heapgettup_initial_block - return the first BlockNumber to scan
617 *
618 * Returns InvalidBlockNumber when there are no blocks to scan. This can
619 * occur with empty tables and in parallel scans when parallel workers get all
620 * of the pages before we can get a chance to get our first page.
621 */
624{
625 Assert(!scan->rs_inited);
626 Assert(scan->rs_base.rs_parallel == NULL);
627
628 /* When there are no pages to scan, return InvalidBlockNumber */
629 if (scan->rs_nblocks == 0 || scan->rs_numblocks == 0)
630 return InvalidBlockNumber;
631
632 if (ScanDirectionIsForward(dir))
633 {
634 return scan->rs_startblock;
635 }
636 else
637 {
638 /*
639 * Disable reporting to syncscan logic in a backwards scan; it's not
640 * very likely anyone else is doing the same thing at the same time,
641 * and much more likely that we'll just bollix things for forward
642 * scanners.
643 */
644 scan->rs_base.rs_flags &= ~SO_ALLOW_SYNC;
645
646 /*
647 * Start from last page of the scan. Ensure we take into account
648 * rs_numblocks if it's been adjusted by heap_setscanlimits().
649 */
650 if (scan->rs_numblocks != InvalidBlockNumber)
651 return (scan->rs_startblock + scan->rs_numblocks - 1) % scan->rs_nblocks;
652
653 if (scan->rs_startblock > 0)
654 return scan->rs_startblock - 1;
655
656 return scan->rs_nblocks - 1;
657 }
658}
659
660
661/*
662 * heapgettup_start_page - helper function for heapgettup()
663 *
664 * Return the next page to scan based on the scan->rs_cbuf and set *linesleft
665 * to the number of tuples on this page. Also set *lineoff to the first
666 * offset to scan with forward scans getting the first offset and backward
667 * getting the final offset on the page.
668 */
669static Page
671 OffsetNumber *lineoff)
672{
673 Page page;
674
675 Assert(scan->rs_inited);
677
678 /* Caller is responsible for ensuring buffer is locked if needed */
679 page = BufferGetPage(scan->rs_cbuf);
680
681 *linesleft = PageGetMaxOffsetNumber(page) - FirstOffsetNumber + 1;
682
683 if (ScanDirectionIsForward(dir))
684 *lineoff = FirstOffsetNumber;
685 else
686 *lineoff = (OffsetNumber) (*linesleft);
687
688 /* lineoff now references the physically previous or next tid */
689 return page;
690}
691
692
693/*
694 * heapgettup_continue_page - helper function for heapgettup()
695 *
696 * Return the next page to scan based on the scan->rs_cbuf and set *linesleft
697 * to the number of tuples left to scan on this page. Also set *lineoff to
698 * the next offset to scan according to the ScanDirection in 'dir'.
699 */
700static inline Page
702 OffsetNumber *lineoff)
703{
704 Page page;
705
706 Assert(scan->rs_inited);
708
709 /* Caller is responsible for ensuring buffer is locked if needed */
710 page = BufferGetPage(scan->rs_cbuf);
711
712 if (ScanDirectionIsForward(dir))
713 {
714 *lineoff = OffsetNumberNext(scan->rs_coffset);
715 *linesleft = PageGetMaxOffsetNumber(page) - (*lineoff) + 1;
716 }
717 else
718 {
719 /*
720 * The previously returned tuple may have been vacuumed since the
721 * previous scan when we use a non-MVCC snapshot, so we must
722 * re-establish the lineoff <= PageGetMaxOffsetNumber(page) invariant
723 */
724 *lineoff = Min(PageGetMaxOffsetNumber(page), OffsetNumberPrev(scan->rs_coffset));
725 *linesleft = *lineoff;
726 }
727
728 /* lineoff now references the physically previous or next tid */
729 return page;
730}
731
732/*
733 * heapgettup_advance_block - helper for heap_fetch_next_buffer()
734 *
735 * Given the current block number, the scan direction, and various information
736 * contained in the scan descriptor, calculate the BlockNumber to scan next
737 * and return it. If there are no further blocks to scan, return
738 * InvalidBlockNumber to indicate this fact to the caller.
739 *
740 * This should not be called to determine the initial block number -- only for
741 * subsequent blocks.
742 *
743 * This also adjusts rs_numblocks when a limit has been imposed by
744 * heap_setscanlimits().
745 */
746static inline BlockNumber
748{
749 Assert(scan->rs_base.rs_parallel == NULL);
750
752 {
753 block++;
754
755 /* wrap back to the start of the heap */
756 if (block >= scan->rs_nblocks)
757 block = 0;
758
759 /*
760 * Report our new scan position for synchronization purposes. We don't
761 * do that when moving backwards, however. That would just mess up any
762 * other forward-moving scanners.
763 *
764 * Note: we do this before checking for end of scan so that the final
765 * state of the position hint is back at the start of the rel. That's
766 * not strictly necessary, but otherwise when you run the same query
767 * multiple times the starting position would shift a little bit
768 * backwards on every invocation, which is confusing. We don't
769 * guarantee any specific ordering in general, though.
770 */
771 if (scan->rs_base.rs_flags & SO_ALLOW_SYNC)
772 ss_report_location(scan->rs_base.rs_rd, block);
773
774 /* we're done if we're back at where we started */
775 if (block == scan->rs_startblock)
776 return InvalidBlockNumber;
777
778 /* check if the limit imposed by heap_setscanlimits() is met */
779 if (scan->rs_numblocks != InvalidBlockNumber)
780 {
781 if (--scan->rs_numblocks == 0)
782 return InvalidBlockNumber;
783 }
784
785 return block;
786 }
787 else
788 {
789 /* we're done if the last block is the start position */
790 if (block == scan->rs_startblock)
791 return InvalidBlockNumber;
792
793 /* check if the limit imposed by heap_setscanlimits() is met */
794 if (scan->rs_numblocks != InvalidBlockNumber)
795 {
796 if (--scan->rs_numblocks == 0)
797 return InvalidBlockNumber;
798 }
799
800 /* wrap to the end of the heap when the last page was page 0 */
801 if (block == 0)
802 block = scan->rs_nblocks;
803
804 block--;
805
806 return block;
807 }
808}
809
810/* ----------------
811 * heapgettup - fetch next heap tuple
812 *
813 * Initialize the scan if not already done; then advance to the next
814 * tuple as indicated by "dir"; return the next tuple in scan->rs_ctup,
815 * or set scan->rs_ctup.t_data = NULL if no more tuples.
816 *
817 * Note: the reason nkeys/key are passed separately, even though they are
818 * kept in the scan descriptor, is that the caller may not want us to check
819 * the scankeys.
820 *
821 * Note: when we fall off the end of the scan in either direction, we
822 * reset rs_inited. This means that a further request with the same
823 * scan direction will restart the scan, which is a bit odd, but a
824 * request with the opposite scan direction will start a fresh scan
825 * in the proper direction. The latter is required behavior for cursors,
826 * while the former case is generally undefined behavior in Postgres
827 * so we don't care too much.
828 * ----------------
829 */
830static void
832 ScanDirection dir,
833 int nkeys,
834 ScanKey key)
835{
836 HeapTuple tuple = &(scan->rs_ctup);
837 Page page;
838 OffsetNumber lineoff;
839 int linesleft;
840
841 if (likely(scan->rs_inited))
842 {
843 /* continue from previously returned page/tuple */
845 page = heapgettup_continue_page(scan, dir, &linesleft, &lineoff);
846 goto continue_page;
847 }
848
849 /*
850 * advance the scan until we find a qualifying tuple or run out of stuff
851 * to scan
852 */
853 while (true)
854 {
855 heap_fetch_next_buffer(scan, dir);
856
857 /* did we run out of blocks to scan? */
858 if (!BufferIsValid(scan->rs_cbuf))
859 break;
860
862
864 page = heapgettup_start_page(scan, dir, &linesleft, &lineoff);
865continue_page:
866
867 /*
868 * Only continue scanning the page while we have lines left.
869 *
870 * Note that this protects us from accessing line pointers past
871 * PageGetMaxOffsetNumber(); both for forward scans when we resume the
872 * table scan, and for when we start scanning a new page.
873 */
874 for (; linesleft > 0; linesleft--, lineoff += dir)
875 {
876 bool visible;
877 ItemId lpp = PageGetItemId(page, lineoff);
878
879 if (!ItemIdIsNormal(lpp))
880 continue;
881
882 tuple->t_data = (HeapTupleHeader) PageGetItem(page, lpp);
883 tuple->t_len = ItemIdGetLength(lpp);
884 ItemPointerSet(&(tuple->t_self), scan->rs_cblock, lineoff);
885
886 visible = HeapTupleSatisfiesVisibility(tuple,
887 scan->rs_base.rs_snapshot,
888 scan->rs_cbuf);
889
891 tuple, scan->rs_cbuf,
892 scan->rs_base.rs_snapshot);
893
894 /* skip tuples not visible to this snapshot */
895 if (!visible)
896 continue;
897
898 /* skip any tuples that don't match the scan key */
899 if (key != NULL &&
901 nkeys, key))
902 continue;
903
905 scan->rs_coffset = lineoff;
906 return;
907 }
908
909 /*
910 * if we get here, it means we've exhausted the items on this page and
911 * it's time to move to the next.
912 */
914 }
915
916 /* end of scan */
917 if (BufferIsValid(scan->rs_cbuf))
918 ReleaseBuffer(scan->rs_cbuf);
919
920 scan->rs_cbuf = InvalidBuffer;
923 tuple->t_data = NULL;
924 scan->rs_inited = false;
925}
926
927/* ----------------
928 * heapgettup_pagemode - fetch next heap tuple in page-at-a-time mode
929 *
930 * Same API as heapgettup, but used in page-at-a-time mode
931 *
932 * The internal logic is much the same as heapgettup's too, but there are some
933 * differences: we do not take the buffer content lock (that only needs to
934 * happen inside heap_prepare_pagescan), and we iterate through just the
935 * tuples listed in rs_vistuples[] rather than all tuples on the page. Notice
936 * that lineindex is 0-based, where the corresponding loop variable lineoff in
937 * heapgettup is 1-based.
938 * ----------------
939 */
940static void
942 ScanDirection dir,
943 int nkeys,
944 ScanKey key)
945{
946 HeapTuple tuple = &(scan->rs_ctup);
947 Page page;
948 uint32 lineindex;
949 uint32 linesleft;
950
951 if (likely(scan->rs_inited))
952 {
953 /* continue from previously returned page/tuple */
954 page = BufferGetPage(scan->rs_cbuf);
955
956 lineindex = scan->rs_cindex + dir;
957 if (ScanDirectionIsForward(dir))
958 linesleft = scan->rs_ntuples - lineindex;
959 else
960 linesleft = scan->rs_cindex;
961 /* lineindex now references the next or previous visible tid */
962
963 goto continue_page;
964 }
965
966 /*
967 * advance the scan until we find a qualifying tuple or run out of stuff
968 * to scan
969 */
970 while (true)
971 {
972 heap_fetch_next_buffer(scan, dir);
973
974 /* did we run out of blocks to scan? */
975 if (!BufferIsValid(scan->rs_cbuf))
976 break;
977
979
980 /* prune the page and determine visible tuple offsets */
982 page = BufferGetPage(scan->rs_cbuf);
983 linesleft = scan->rs_ntuples;
984 lineindex = ScanDirectionIsForward(dir) ? 0 : linesleft - 1;
985
986 /* lineindex now references the next or previous visible tid */
987continue_page:
988
989 for (; linesleft > 0; linesleft--, lineindex += dir)
990 {
991 ItemId lpp;
992 OffsetNumber lineoff;
993
994 Assert(lineindex <= scan->rs_ntuples);
995 lineoff = scan->rs_vistuples[lineindex];
996 lpp = PageGetItemId(page, lineoff);
998
999 tuple->t_data = (HeapTupleHeader) PageGetItem(page, lpp);
1000 tuple->t_len = ItemIdGetLength(lpp);
1001 ItemPointerSet(&(tuple->t_self), scan->rs_cblock, lineoff);
1002
1003 /* skip any tuples that don't match the scan key */
1004 if (key != NULL &&
1005 !HeapKeyTest(tuple, RelationGetDescr(scan->rs_base.rs_rd),
1006 nkeys, key))
1007 continue;
1008
1009 scan->rs_cindex = lineindex;
1010 return;
1011 }
1012 }
1013
1014 /* end of scan */
1015 if (BufferIsValid(scan->rs_cbuf))
1016 ReleaseBuffer(scan->rs_cbuf);
1017 scan->rs_cbuf = InvalidBuffer;
1020 tuple->t_data = NULL;
1021 scan->rs_inited = false;
1022}
1023
1024
1025/* ----------------------------------------------------------------
1026 * heap access method interface
1027 * ----------------------------------------------------------------
1028 */
1029
1030
1033 int nkeys, ScanKey key,
1034 ParallelTableScanDesc parallel_scan,
1035 uint32 flags)
1036{
1037 HeapScanDesc scan;
1038
1039 /*
1040 * increment relation ref count while scanning relation
1041 *
1042 * This is just to make really sure the relcache entry won't go away while
1043 * the scan has a pointer to it. Caller should be holding the rel open
1044 * anyway, so this is redundant in all normal scenarios...
1045 */
1047
1048 /*
1049 * allocate and initialize scan descriptor
1050 */
1051 scan = (HeapScanDesc) palloc(sizeof(HeapScanDescData));
1052
1053 scan->rs_base.rs_rd = relation;
1054 scan->rs_base.rs_snapshot = snapshot;
1055 scan->rs_base.rs_nkeys = nkeys;
1056 scan->rs_base.rs_flags = flags;
1057 scan->rs_base.rs_parallel = parallel_scan;
1058 scan->rs_strategy = NULL; /* set in initscan */
1059 scan->rs_vmbuffer = InvalidBuffer;
1060 scan->rs_empty_tuples_pending = 0;
1061
1062 /*
1063 * Disable page-at-a-time mode if it's not a MVCC-safe snapshot.
1064 */
1065 if (!(snapshot && IsMVCCSnapshot(snapshot)))
1066 scan->rs_base.rs_flags &= ~SO_ALLOW_PAGEMODE;
1067
1068 /*
1069 * For seqscan and sample scans in a serializable transaction, acquire a
1070 * predicate lock on the entire relation. This is required not only to
1071 * lock all the matching tuples, but also to conflict with new insertions
1072 * into the table. In an indexscan, we take page locks on the index pages
1073 * covering the range specified in the scan qual, but in a heap scan there
1074 * is nothing more fine-grained to lock. A bitmap scan is a different
1075 * story, there we have already scanned the index and locked the index
1076 * pages covering the predicate. But in that case we still have to lock
1077 * any matching heap tuples. For sample scan we could optimize the locking
1078 * to be at least page-level granularity, but we'd need to add per-tuple
1079 * locking for that.
1080 */
1082 {
1083 /*
1084 * Ensure a missing snapshot is noticed reliably, even if the
1085 * isolation mode means predicate locking isn't performed (and
1086 * therefore the snapshot isn't used here).
1087 */
1088 Assert(snapshot);
1089 PredicateLockRelation(relation, snapshot);
1090 }
1091
1092 /* we only need to set this up once */
1093 scan->rs_ctup.t_tableOid = RelationGetRelid(relation);
1094
1095 /*
1096 * Allocate memory to keep track of page allocation for parallel workers
1097 * when doing a parallel scan.
1098 */
1099 if (parallel_scan != NULL)
1101 else
1102 scan->rs_parallelworkerdata = NULL;
1103
1104 /*
1105 * we do this here instead of in initscan() because heap_rescan also calls
1106 * initscan() and we don't want to allocate memory again
1107 */
1108 if (nkeys > 0)
1109 scan->rs_base.rs_key = (ScanKey) palloc(sizeof(ScanKeyData) * nkeys);
1110 else
1111 scan->rs_base.rs_key = NULL;
1112
1113 initscan(scan, key, false);
1114
1115 scan->rs_read_stream = NULL;
1116
1117 /*
1118 * Set up a read stream for sequential scans and TID range scans. This
1119 * should be done after initscan() because initscan() allocates the
1120 * BufferAccessStrategy object passed to the read stream API.
1121 */
1122 if (scan->rs_base.rs_flags & SO_TYPE_SEQSCAN ||
1124 {
1126
1127 if (scan->rs_base.rs_parallel)
1129 else
1131
1133 scan->rs_strategy,
1134 scan->rs_base.rs_rd,
1136 cb,
1137 scan,
1138 0);
1139 }
1140
1141
1142 return (TableScanDesc) scan;
1143}
1144
1145void
1146heap_rescan(TableScanDesc sscan, ScanKey key, bool set_params,
1147 bool allow_strat, bool allow_sync, bool allow_pagemode)
1148{
1149 HeapScanDesc scan = (HeapScanDesc) sscan;
1150
1151 if (set_params)
1152 {
1153 if (allow_strat)
1154 scan->rs_base.rs_flags |= SO_ALLOW_STRAT;
1155 else
1156 scan->rs_base.rs_flags &= ~SO_ALLOW_STRAT;
1157
1158 if (allow_sync)
1159 scan->rs_base.rs_flags |= SO_ALLOW_SYNC;
1160 else
1161 scan->rs_base.rs_flags &= ~SO_ALLOW_SYNC;
1162
1163 if (allow_pagemode && scan->rs_base.rs_snapshot &&
1164 IsMVCCSnapshot(scan->rs_base.rs_snapshot))
1165 scan->rs_base.rs_flags |= SO_ALLOW_PAGEMODE;
1166 else
1167 scan->rs_base.rs_flags &= ~SO_ALLOW_PAGEMODE;
1168 }
1169
1170 /*
1171 * unpin scan buffers
1172 */
1173 if (BufferIsValid(scan->rs_cbuf))
1174 ReleaseBuffer(scan->rs_cbuf);
1175
1176 if (BufferIsValid(scan->rs_vmbuffer))
1177 {
1179 scan->rs_vmbuffer = InvalidBuffer;
1180 }
1181
1182 /*
1183 * Reset rs_empty_tuples_pending, a field only used by bitmap heap scan,
1184 * to avoid incorrectly emitting NULL-filled tuples from a previous scan
1185 * on rescan.
1186 */
1187 scan->rs_empty_tuples_pending = 0;
1188
1189 /*
1190 * The read stream is reset on rescan. This must be done before
1191 * initscan(), as some state referred to by read_stream_reset() is reset
1192 * in initscan().
1193 */
1194 if (scan->rs_read_stream)
1196
1197 /*
1198 * reinitialize scan descriptor
1199 */
1200 initscan(scan, key, true);
1201}
1202
1203void
1205{
1206 HeapScanDesc scan = (HeapScanDesc) sscan;
1207
1208 /* Note: no locking manipulations needed */
1209
1210 /*
1211 * unpin scan buffers
1212 */
1213 if (BufferIsValid(scan->rs_cbuf))
1214 ReleaseBuffer(scan->rs_cbuf);
1215
1216 if (BufferIsValid(scan->rs_vmbuffer))
1218
1219 /*
1220 * Must free the read stream before freeing the BufferAccessStrategy.
1221 */
1222 if (scan->rs_read_stream)
1224
1225 /*
1226 * decrement relation reference count and free scan descriptor storage
1227 */
1229
1230 if (scan->rs_base.rs_key)
1231 pfree(scan->rs_base.rs_key);
1232
1233 if (scan->rs_strategy != NULL)
1235
1236 if (scan->rs_parallelworkerdata != NULL)
1238
1239 if (scan->rs_base.rs_flags & SO_TEMP_SNAPSHOT)
1241
1242 pfree(scan);
1243}
1244
1247{
1248 HeapScanDesc scan = (HeapScanDesc) sscan;
1249
1250 /*
1251 * This is still widely used directly, without going through table AM, so
1252 * add a safety check. It's possible we should, at a later point,
1253 * downgrade this to an assert. The reason for checking the AM routine,
1254 * rather than the AM oid, is that this allows writing regression tests
1255 * that create another AM reusing the heap handler.
1256 */
1258 ereport(ERROR,
1259 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1260 errmsg_internal("only heap AM is supported")));
1261
1262 /*
1263 * We don't expect direct calls to heap_getnext with valid CheckXidAlive
1264 * for catalog or regular tables. See detailed comments in xact.c where
1265 * these variables are declared. Normally we have such a check at tableam
1266 * level API but this is called from many places so we need to ensure it
1267 * here.
1268 */
1270 elog(ERROR, "unexpected heap_getnext call during logical decoding");
1271
1272 /* Note: no locking manipulations needed */
1273
1275 heapgettup_pagemode(scan, direction,
1276 scan->rs_base.rs_nkeys, scan->rs_base.rs_key);
1277 else
1278 heapgettup(scan, direction,
1279 scan->rs_base.rs_nkeys, scan->rs_base.rs_key);
1280
1281 if (scan->rs_ctup.t_data == NULL)
1282 return NULL;
1283
1284 /*
1285 * if we get here it means we have a new current scan tuple, so point to
1286 * the proper return buffer and return the tuple.
1287 */
1288
1290
1291 return &scan->rs_ctup;
1292}
1293
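
/*
 * Illustrative sketch only (not part of heapam.c): a minimal sequential scan
 * over a relation the caller has already opened and locked, using
 * heap_beginscan(), heap_getnext() and heap_endscan() above.  The function
 * name and the assumption that a suitable snapshot is supplied are examples.
 */
static void
example_seqscan(Relation rel, Snapshot snapshot)
{
	TableScanDesc scan;
	HeapTuple	tuple;

	scan = heap_beginscan(rel, snapshot, 0, NULL, NULL,
						  SO_TYPE_SEQSCAN | SO_ALLOW_STRAT |
						  SO_ALLOW_SYNC | SO_ALLOW_PAGEMODE);

	while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
	{
		/* inspect the tuple; it is only valid while the scan buffer is pinned */
	}

	heap_endscan(scan);
}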
1294bool
1296{
1297 HeapScanDesc scan = (HeapScanDesc) sscan;
1298
1299 /* Note: no locking manipulations needed */
1300
1301 if (sscan->rs_flags & SO_ALLOW_PAGEMODE)
1302 heapgettup_pagemode(scan, direction, sscan->rs_nkeys, sscan->rs_key);
1303 else
1304 heapgettup(scan, direction, sscan->rs_nkeys, sscan->rs_key);
1305
1306 if (scan->rs_ctup.t_data == NULL)
1307 {
1308 ExecClearTuple(slot);
1309 return false;
1310 }
1311
1312 /*
1313 * if we get here it means we have a new current scan tuple, so point to
1314 * the proper return buffer and return the tuple.
1315 */
1316
1318
1319 ExecStoreBufferHeapTuple(&scan->rs_ctup, slot,
1320 scan->rs_cbuf);
1321 return true;
1322}
1323
1324void
1326 ItemPointer maxtid)
1327{
1328 HeapScanDesc scan = (HeapScanDesc) sscan;
1329 BlockNumber startBlk;
1330 BlockNumber numBlks;
1331 ItemPointerData highestItem;
1332 ItemPointerData lowestItem;
1333
1334 /*
1335 * For relations without any pages, we can simply leave the TID range
1336 * unset. There will be no tuples to scan, therefore no tuples outside
1337 * the given TID range.
1338 */
1339 if (scan->rs_nblocks == 0)
1340 return;
1341
1342 /*
1343 * Set up some ItemPointers which point to the first and last possible
1344 * tuples in the heap.
1345 */
1346 ItemPointerSet(&highestItem, scan->rs_nblocks - 1, MaxOffsetNumber);
1347 ItemPointerSet(&lowestItem, 0, FirstOffsetNumber);
1348
1349 /*
1350 * If the given maximum TID is below the highest possible TID in the
1351 * relation, then restrict the range to that, otherwise we scan to the end
1352 * of the relation.
1353 */
1354 if (ItemPointerCompare(maxtid, &highestItem) < 0)
1355 ItemPointerCopy(maxtid, &highestItem);
1356
1357 /*
1358 * If the given minimum TID is above the lowest possible TID in the
1359 * relation, then restrict the range to only scan for TIDs above that.
1360 */
1361 if (ItemPointerCompare(mintid, &lowestItem) > 0)
1362 ItemPointerCopy(mintid, &lowestItem);
1363
1364 /*
1365 * Check for an empty range and protect against would-be negative results
1366 * from the numBlks calculation below.
1367 */
1368 if (ItemPointerCompare(&highestItem, &lowestItem) < 0)
1369 {
1370 /* Set an empty range of blocks to scan */
1371 heap_setscanlimits(sscan, 0, 0);
1372 return;
1373 }
1374
1375 /*
1376 * Calculate the first block and the number of blocks we must scan. We
1377 * could be more aggressive here and perform some more validation to try
1378 * and further narrow the scope of blocks to scan by checking if the
1379 * lowestItem has an offset above MaxOffsetNumber. In this case, we could
1380 * advance startBlk by one. Likewise, if highestItem has an offset of 0
1381 * we could scan one fewer block. However, such an optimization does not
1382 * seem worth troubling over, currently.
1383 */
1384 startBlk = ItemPointerGetBlockNumberNoCheck(&lowestItem);
1385
1386 numBlks = ItemPointerGetBlockNumberNoCheck(&highestItem) -
1387 ItemPointerGetBlockNumberNoCheck(&lowestItem) + 1;
1388
1389 /* Set the start block and number of blocks to scan */
1390 heap_setscanlimits(sscan, startBlk, numBlks);
1391
1392 /* Finally, set the TID range in sscan */
1393 ItemPointerCopy(&lowestItem, &sscan->st.tidrange.rs_mintid);
1394 ItemPointerCopy(&highestItem, &sscan->st.tidrange.rs_maxtid);
1395}
1396
1397bool
1399 TupleTableSlot *slot)
1400{
1401 HeapScanDesc scan = (HeapScanDesc) sscan;
1402 ItemPointer mintid = &sscan->st.tidrange.rs_mintid;
1403 ItemPointer maxtid = &sscan->st.tidrange.rs_maxtid;
1404
1405 /* Note: no locking manipulations needed */
1406 for (;;)
1407 {
1408 if (sscan->rs_flags & SO_ALLOW_PAGEMODE)
1409 heapgettup_pagemode(scan, direction, sscan->rs_nkeys, sscan->rs_key);
1410 else
1411 heapgettup(scan, direction, sscan->rs_nkeys, sscan->rs_key);
1412
1413 if (scan->rs_ctup.t_data == NULL)
1414 {
1415 ExecClearTuple(slot);
1416 return false;
1417 }
1418
1419 /*
1420 * heap_set_tidrange will have used heap_setscanlimits to limit the
1421 * range of pages we scan to only ones that can contain the TID range
1422 * we're scanning for. Here we must filter out any tuples from these
1423 * pages that are outside of that range.
1424 */
1425 if (ItemPointerCompare(&scan->rs_ctup.t_self, mintid) < 0)
1426 {
1427 ExecClearTuple(slot);
1428
1429 /*
1430 * When scanning backwards, the TIDs will be in descending order.
1431 * Future tuples in this direction will be lower still, so we can
1432 * just return false to indicate there will be no more tuples.
1433 */
1434 if (ScanDirectionIsBackward(direction))
1435 return false;
1436
1437 continue;
1438 }
1439
1440 /*
1441 * Likewise for the final page, we must filter out TIDs greater than
1442 * maxtid.
1443 */
1444 if (ItemPointerCompare(&scan->rs_ctup.t_self, maxtid) > 0)
1445 {
1446 ExecClearTuple(slot);
1447
1448 /*
1449 * When scanning forward, the TIDs will be in ascending order.
1450 * Future tuples in this direction will be higher still, so we can
1451 * just return false to indicate there will be no more tuples.
1452 */
1453 if (ScanDirectionIsForward(direction))
1454 return false;
1455 continue;
1456 }
1457
1458 break;
1459 }
1460
1461 /*
1462 * if we get here it means we have a new current scan tuple, so point to
1463 * the proper return buffer and return the tuple.
1464 */
1466
1467 ExecStoreBufferHeapTuple(&scan->rs_ctup, slot, scan->rs_cbuf);
1468 return true;
1469}
1470
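
/*
 * Illustrative sketch only (not part of heapam.c): driving a TID range scan
 * with heap_set_tidrange() and heap_getnextslot_tidrange(), assuming the
 * caller has already begun a scan with the SO_TYPE_TIDRANGESCAN flag and has
 * a compatible slot.  The function name is an example.
 */
static void
example_tidrange_scan(TableScanDesc sscan, ItemPointer mintid,
					  ItemPointer maxtid, TupleTableSlot *slot)
{
	heap_set_tidrange(sscan, mintid, maxtid);

	while (heap_getnextslot_tidrange(sscan, ForwardScanDirection, slot))
	{
		/* process the tuple currently stored in the slot */
	}
}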
1471/*
1472 * heap_fetch - retrieve tuple with given tid
1473 *
1474 * On entry, tuple->t_self is the TID to fetch. We pin the buffer holding
1475 * the tuple, fill in the remaining fields of *tuple, and check the tuple
1476 * against the specified snapshot.
1477 *
1478 * If successful (tuple found and passes snapshot time qual), then *userbuf
1479 * is set to the buffer holding the tuple and true is returned. The caller
1480 * must unpin the buffer when done with the tuple.
1481 *
1482 * If the tuple is not found (ie, item number references a deleted slot),
1483 * then tuple->t_data is set to NULL, *userbuf is set to InvalidBuffer,
1484 * and false is returned.
1485 *
1486 * If the tuple is found but fails the time qual check, then the behavior
1487 * depends on the keep_buf parameter. If keep_buf is false, the results
1488 * are the same as for the tuple-not-found case. If keep_buf is true,
1489 * then tuple->t_data and *userbuf are returned as for the success case,
1490 * and again the caller must unpin the buffer; but false is returned.
1491 *
1492 * heap_fetch does not follow HOT chains: only the exact TID requested will
1493 * be fetched.
1494 *
1495 * It is somewhat inconsistent that we ereport() on invalid block number but
1496 * return false on invalid item number. There are a couple of reasons though.
1497 * One is that the caller can relatively easily check the block number for
1498 * validity, but cannot check the item number without reading the page
1499 * itself. Another is that when we are following a t_ctid link, we can be
1500 * reasonably confident that the page number is valid (since VACUUM shouldn't
1501 * truncate off the destination page without having killed the referencing
1502 * tuple first), but the item number might well not be good.
1503 */
1504bool
1506 Snapshot snapshot,
1507 HeapTuple tuple,
1508 Buffer *userbuf,
1509 bool keep_buf)
1510{
1511 ItemPointer tid = &(tuple->t_self);
1512 ItemId lp;
1513 Buffer buffer;
1514 Page page;
1515 OffsetNumber offnum;
1516 bool valid;
1517
1518 /*
1519 * Fetch and pin the appropriate page of the relation.
1520 */
1521 buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid));
1522
1523 /*
1524 * Need share lock on buffer to examine tuple commit status.
1525 */
1527 page = BufferGetPage(buffer);
1528
1529 /*
1530 * We'd better check for out-of-range offnum in case of VACUUM since the
1531 * TID was obtained.
1532 */
1533 offnum = ItemPointerGetOffsetNumber(tid);
1534 if (offnum < FirstOffsetNumber || offnum > PageGetMaxOffsetNumber(page))
1535 {
1537 ReleaseBuffer(buffer);
1538 *userbuf = InvalidBuffer;
1539 tuple->t_data = NULL;
1540 return false;
1541 }
1542
1543 /*
1544 * get the item line pointer corresponding to the requested tid
1545 */
1546 lp = PageGetItemId(page, offnum);
1547
1548 /*
1549 * Must check for deleted tuple.
1550 */
1551 if (!ItemIdIsNormal(lp))
1552 {
1554 ReleaseBuffer(buffer);
1555 *userbuf = InvalidBuffer;
1556 tuple->t_data = NULL;
1557 return false;
1558 }
1559
1560 /*
1561 * fill in *tuple fields
1562 */
1563 tuple->t_data = (HeapTupleHeader) PageGetItem(page, lp);
1564 tuple->t_len = ItemIdGetLength(lp);
1565 tuple->t_tableOid = RelationGetRelid(relation);
1566
1567 /*
1568 * check tuple visibility, then release lock
1569 */
1570 valid = HeapTupleSatisfiesVisibility(tuple, snapshot, buffer);
1571
1572 if (valid)
1573 PredicateLockTID(relation, &(tuple->t_self), snapshot,
1575
1576 HeapCheckForSerializableConflictOut(valid, relation, tuple, buffer, snapshot);
1577
1579
1580 if (valid)
1581 {
1582 /*
1583 * All checks passed, so return the tuple as valid. Caller is now
1584 * responsible for releasing the buffer.
1585 */
1586 *userbuf = buffer;
1587
1588 return true;
1589 }
1590
1591 /* Tuple failed time qual, but maybe caller wants to see it anyway. */
1592 if (keep_buf)
1593 *userbuf = buffer;
1594 else
1595 {
1596 ReleaseBuffer(buffer);
1597 *userbuf = InvalidBuffer;
1598 tuple->t_data = NULL;
1599 }
1600
1601 return false;
1602}
1603
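
/*
 * Illustrative sketch only (not part of heapam.c): fetching a single tuple
 * by TID with heap_fetch() and releasing the buffer afterwards.  The
 * function name is an example.
 */
static bool
example_fetch(Relation rel, Snapshot snapshot, ItemPointer tid)
{
	HeapTupleData tuple;
	Buffer		buffer;

	tuple.t_self = *tid;

	if (heap_fetch(rel, snapshot, &tuple, &buffer, false))
	{
		/* tuple.t_data is valid only while we hold the buffer pin */
		ReleaseBuffer(buffer);
		return true;
	}

	return false;
}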
1604/*
1605 * heap_hot_search_buffer - search HOT chain for tuple satisfying snapshot
1606 *
1607 * On entry, *tid is the TID of a tuple (either a simple tuple, or the root
1608 * of a HOT chain), and buffer is the buffer holding this tuple. We search
1609 * for the first chain member satisfying the given snapshot. If one is
1610 * found, we update *tid to reference that tuple's offset number, and
1611 * return true. If no match, return false without modifying *tid.
1612 *
1613 * heapTuple is a caller-supplied buffer. When a match is found, we return
1614 * the tuple here, in addition to updating *tid. If no match is found, the
1615 * contents of this buffer on return are undefined.
1616 *
1617 * If all_dead is not NULL, we check non-visible tuples to see if they are
1618 * globally dead; *all_dead is set true if all members of the HOT chain
1619 * are vacuumable, false if not.
1620 *
1621 * Unlike heap_fetch, the caller must already have pin and (at least) share
1622 * lock on the buffer; it is still pinned/locked at exit.
1623 */
1624bool
1626 Snapshot snapshot, HeapTuple heapTuple,
1627 bool *all_dead, bool first_call)
1628{
1629 Page page = BufferGetPage(buffer);
1631 BlockNumber blkno;
1632 OffsetNumber offnum;
1633 bool at_chain_start;
1634 bool valid;
1635 bool skip;
1636 GlobalVisState *vistest = NULL;
1637
1638 /* If this is not the first call, previous call returned a (live!) tuple */
1639 if (all_dead)
1640 *all_dead = first_call;
1641
1642 blkno = ItemPointerGetBlockNumber(tid);
1643 offnum = ItemPointerGetOffsetNumber(tid);
1644 at_chain_start = first_call;
1645 skip = !first_call;
1646
1647 /* XXX: we should assert that a snapshot is pushed or registered */
1649 Assert(BufferGetBlockNumber(buffer) == blkno);
1650
1651 /* Scan through possible multiple members of HOT-chain */
1652 for (;;)
1653 {
1654 ItemId lp;
1655
1656 /* check for bogus TID */
1657 if (offnum < FirstOffsetNumber || offnum > PageGetMaxOffsetNumber(page))
1658 break;
1659
1660 lp = PageGetItemId(page, offnum);
1661
1662 /* check for unused, dead, or redirected items */
1663 if (!ItemIdIsNormal(lp))
1664 {
1665 /* We should only see a redirect at start of chain */
1666 if (ItemIdIsRedirected(lp) && at_chain_start)
1667 {
1668 /* Follow the redirect */
1669 offnum = ItemIdGetRedirect(lp);
1670 at_chain_start = false;
1671 continue;
1672 }
1673 /* else must be end of chain */
1674 break;
1675 }
1676
1677 /*
1678 * Update heapTuple to point to the element of the HOT chain we're
1679 * currently investigating. Having t_self set correctly is important
1680 * because the SSI checks and the *Satisfies routine for historical
1681 * MVCC snapshots need the correct tid to decide about the visibility.
1682 */
1683 heapTuple->t_data = (HeapTupleHeader) PageGetItem(page, lp);
1684 heapTuple->t_len = ItemIdGetLength(lp);
1685 heapTuple->t_tableOid = RelationGetRelid(relation);
1686 ItemPointerSet(&heapTuple->t_self, blkno, offnum);
1687
1688 /*
1689 * Shouldn't see a HEAP_ONLY tuple at chain start.
1690 */
1691 if (at_chain_start && HeapTupleIsHeapOnly(heapTuple))
1692 break;
1693
1694 /*
1695 * The xmin should match the previous xmax value, else chain is
1696 * broken.
1697 */
1698 if (TransactionIdIsValid(prev_xmax) &&
1699 !TransactionIdEquals(prev_xmax,
1700 HeapTupleHeaderGetXmin(heapTuple->t_data)))
1701 break;
1702
1703 /*
1704 * When first_call is true (and thus, skip is initially false) we'll
1705 * return the first tuple we find. But on later passes, heapTuple
1706 * will initially be pointing to the tuple we returned last time.
1707 * Returning it again would be incorrect (and would loop forever), so
1708 * we skip it and return the next match we find.
1709 */
1710 if (!skip)
1711 {
1712 /* If it's visible per the snapshot, we must return it */
1713 valid = HeapTupleSatisfiesVisibility(heapTuple, snapshot, buffer);
1714 HeapCheckForSerializableConflictOut(valid, relation, heapTuple,
1715 buffer, snapshot);
1716
1717 if (valid)
1718 {
1719 ItemPointerSetOffsetNumber(tid, offnum);
1720 PredicateLockTID(relation, &heapTuple->t_self, snapshot,
1721 HeapTupleHeaderGetXmin(heapTuple->t_data));
1722 if (all_dead)
1723 *all_dead = false;
1724 return true;
1725 }
1726 }
1727 skip = false;
1728
1729 /*
1730 * If we can't see it, maybe no one else can either. At caller
1731 * request, check whether all chain members are dead to all
1732 * transactions.
1733 *
1734 * Note: if you change the criterion here for what is "dead", fix the
1735 * planner's get_actual_variable_range() function to match.
1736 */
1737 if (all_dead && *all_dead)
1738 {
1739 if (!vistest)
1740 vistest = GlobalVisTestFor(relation);
1741
1742 if (!HeapTupleIsSurelyDead(heapTuple, vistest))
1743 *all_dead = false;
1744 }
1745
1746 /*
1747 * Check to see if HOT chain continues past this tuple; if so fetch
1748 * the next offnum and loop around.
1749 */
1750 if (HeapTupleIsHotUpdated(heapTuple))
1751 {
1753 blkno);
1754 offnum = ItemPointerGetOffsetNumber(&heapTuple->t_data->t_ctid);
1755 at_chain_start = false;
1756 prev_xmax = HeapTupleHeaderGetUpdateXid(heapTuple->t_data);
1757 }
1758 else
1759 break; /* end of chain */
1760 }
1761
1762 return false;
1763}
1764
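
/*
 * Illustrative sketch only (not part of heapam.c): the caller-side protocol
 * for heap_hot_search_buffer() -- the buffer must already be pinned and
 * share-locked, and *tid names the root of the HOT chain.  The function name
 * is an example.
 */
static bool
example_hot_search(Relation rel, ItemPointer tid, Snapshot snapshot,
				   bool *all_dead)
{
	Buffer		buffer;
	HeapTupleData heapTuple;
	bool		found;

	buffer = ReadBuffer(rel, ItemPointerGetBlockNumber(tid));
	LockBuffer(buffer, BUFFER_LOCK_SHARE);

	found = heap_hot_search_buffer(tid, rel, buffer, snapshot, &heapTuple,
								   all_dead, true);

	LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
	ReleaseBuffer(buffer);

	return found;
}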
1765/*
1766 * heap_get_latest_tid - get the latest tid of a specified tuple
1767 *
1768 * Actually, this gets the latest version that is visible according to the
1769 * scan's snapshot. Create a scan using SnapshotDirty to get the very latest,
1770 * possibly uncommitted version.
1771 *
1772 * *tid is both an input and an output parameter: it is updated to
1773 * show the latest version of the row. Note that it will not be changed
1774 * if no version of the row passes the snapshot test.
1775 */
1776void
1778 ItemPointer tid)
1779{
1780 Relation relation = sscan->rs_rd;
1781 Snapshot snapshot = sscan->rs_snapshot;
1782 ItemPointerData ctid;
1783 TransactionId priorXmax;
1784
1785 /*
1786 * table_tuple_get_latest_tid() verified that the passed in tid is valid.
1787 * Assume that t_ctid links are valid however - there shouldn't be invalid
1788 * ones in the table.
1789 */
1791
1792 /*
1793 * Loop to chase down t_ctid links. At top of loop, ctid is the tuple we
1794 * need to examine, and *tid is the TID we will return if ctid turns out
1795 * to be bogus.
1796 *
1797 * Note that we will loop until we reach the end of the t_ctid chain.
1798 * Depending on the snapshot passed, there might be at most one visible
1799 * version of the row, but we don't try to optimize for that.
1800 */
1801 ctid = *tid;
1802 priorXmax = InvalidTransactionId; /* cannot check first XMIN */
1803 for (;;)
1804 {
1805 Buffer buffer;
1806 Page page;
1807 OffsetNumber offnum;
1808 ItemId lp;
1809 HeapTupleData tp;
1810 bool valid;
1811
1812 /*
1813 * Read, pin, and lock the page.
1814 */
1815 buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(&ctid));
1817 page = BufferGetPage(buffer);
1818
1819 /*
1820 * Check for bogus item number. This is not treated as an error
1821 * condition because it can happen while following a t_ctid link. We
1822 * just assume that the prior tid is OK and return it unchanged.
1823 */
1824 offnum = ItemPointerGetOffsetNumber(&ctid);
1825 if (offnum < FirstOffsetNumber || offnum > PageGetMaxOffsetNumber(page))
1826 {
1827 UnlockReleaseBuffer(buffer);
1828 break;
1829 }
1830 lp = PageGetItemId(page, offnum);
1831 if (!ItemIdIsNormal(lp))
1832 {
1833 UnlockReleaseBuffer(buffer);
1834 break;
1835 }
1836
1837 /* OK to access the tuple */
1838 tp.t_self = ctid;
1839 tp.t_data = (HeapTupleHeader) PageGetItem(page, lp);
1840 tp.t_len = ItemIdGetLength(lp);
1841 tp.t_tableOid = RelationGetRelid(relation);
1842
1843 /*
1844 * After following a t_ctid link, we might arrive at an unrelated
1845 * tuple. Check for XMIN match.
1846 */
1847 if (TransactionIdIsValid(priorXmax) &&
1849 {
1850 UnlockReleaseBuffer(buffer);
1851 break;
1852 }
1853
1854 /*
1855 * Check tuple visibility; if visible, set it as the new result
1856 * candidate.
1857 */
1858 valid = HeapTupleSatisfiesVisibility(&tp, snapshot, buffer);
1859 HeapCheckForSerializableConflictOut(valid, relation, &tp, buffer, snapshot);
1860 if (valid)
1861 *tid = ctid;
1862
1863 /*
1864 * If there's a valid t_ctid link, follow it, else we're done.
1865 */
1866 if ((tp.t_data->t_infomask & HEAP_XMAX_INVALID) ||
1870 {
1871 UnlockReleaseBuffer(buffer);
1872 break;
1873 }
1874
1875 ctid = tp.t_data->t_ctid;
1876 priorXmax = HeapTupleHeaderGetUpdateXid(tp.t_data);
1877 UnlockReleaseBuffer(buffer);
1878 } /* end of loop */
1879}
1880
1881
1882/*
1883 * UpdateXmaxHintBits - update tuple hint bits after xmax transaction ends
1884 *
1885 * This is called after we have waited for the XMAX transaction to terminate.
1886 * If the transaction aborted, we guarantee the XMAX_INVALID hint bit will
1887 * be set on exit. If the transaction committed, we set the XMAX_COMMITTED
1888 * hint bit if possible --- but beware that that may not yet be possible,
1889 * if the transaction committed asynchronously.
1890 *
1891 * Note that if the transaction was a locker only, we set HEAP_XMAX_INVALID
1892 * even if it commits.
1893 *
1894 * Hence callers should look only at XMAX_INVALID.
1895 *
1896 * Note this is not allowed for tuples whose xmax is a multixact.
1897 */
1898static void
1900{
1903
1905 {
1906 if (!HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask) &&
1909 xid);
1910 else
1913 }
1914}
1915
1916
1917/*
1918 * GetBulkInsertState - prepare status object for a bulk insert
1919 */
1922{
1923 BulkInsertState bistate;
1924
1925 bistate = (BulkInsertState) palloc(sizeof(BulkInsertStateData));
1927 bistate->current_buf = InvalidBuffer;
1928 bistate->next_free = InvalidBlockNumber;
1929 bistate->last_free = InvalidBlockNumber;
1930 bistate->already_extended_by = 0;
1931 return bistate;
1932}
1933
1934/*
1935 * FreeBulkInsertState - clean up after finishing a bulk insert
1936 */
1937void
1939{
1940 if (bistate->current_buf != InvalidBuffer)
1941 ReleaseBuffer(bistate->current_buf);
1942 FreeAccessStrategy(bistate->strategy);
1943 pfree(bistate);
1944}
1945
1946/*
1947 * ReleaseBulkInsertStatePin - release a buffer currently held in bistate
1948 */
1949void
1951{
1952 if (bistate->current_buf != InvalidBuffer)
1953 ReleaseBuffer(bistate->current_buf);
1954 bistate->current_buf = InvalidBuffer;
1955
1956 /*
1957 * Despite the name, we also reset bulk relation extension state.
1958 * Otherwise we can end up erroring out due to looking for free space in
1959 * ->next_free of one partition, even though ->next_free was set when
1960 * extending another partition. It could obviously also be bad for
1961 * efficiency to look at existing blocks at offsets from another
1962 * partition, even if we don't error out.
1963 */
1964 bistate->next_free = InvalidBlockNumber;
1965 bistate->last_free = InvalidBlockNumber;
1966}
1967
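
/*
 * Illustrative sketch only (not part of heapam.c): the intended life cycle
 * of a BulkInsertState around a batch of heap_insert() calls, as a loader
 * such as COPY would use it.  The function name, tuple source and the use of
 * default options are examples; real callers also deal with toasting,
 * indexes and WAL/visibility options.
 */
static void
example_bulk_insert(Relation rel, HeapTuple *tuples, int ntuples,
					CommandId cid)
{
	BulkInsertState bistate = GetBulkInsertState();

	for (int i = 0; i < ntuples; i++)
		heap_insert(rel, tuples[i], cid, 0, bistate);

	FreeBulkInsertState(bistate);
}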
1968
1969/*
1970 * heap_insert - insert tuple into a heap
1971 *
1972 * The new tuple is stamped with current transaction ID and the specified
1973 * command ID.
1974 *
1975 * See table_tuple_insert for comments about most of the input flags, except
1976 * that this routine directly takes a tuple rather than a slot.
1977 *
1978 * There's corresponding HEAP_INSERT_ options to all the TABLE_INSERT_
1979 * options, and there additionally is HEAP_INSERT_SPECULATIVE which is used to
1980 * implement table_tuple_insert_speculative().
1981 *
1982 * On return the header fields of *tup are updated to match the stored tuple;
1983 * in particular tup->t_self receives the actual TID where the tuple was
1984 * stored. But note that any toasting of fields within the tuple data is NOT
1985 * reflected into *tup.
1986 */
1987void
1988heap_insert(Relation relation, HeapTuple tup, CommandId cid,
1989 int options, BulkInsertState bistate)
1990{
1991 TransactionId xid = GetCurrentTransactionId();
1992 HeapTuple heaptup;
1993 Buffer buffer;
1994 Buffer vmbuffer = InvalidBuffer;
1995 bool all_visible_cleared = false;
1996
1997 /* Cheap, simplistic check that the tuple matches the rel's rowtype. */
1998 Assert(HeapTupleHeaderGetNatts(tup->t_data) <=
1999 RelationGetNumberOfAttributes(relation));
2000
2001 /*
2002 * Fill in tuple header fields and toast the tuple if necessary.
2003 *
2004 * Note: below this point, heaptup is the data we actually intend to store
2005 * into the relation; tup is the caller's original untoasted data.
2006 */
2007 heaptup = heap_prepare_insert(relation, tup, xid, cid, options);
2008
2009 /*
2010 * Find buffer to insert this tuple into. If the page is all visible,
2011 * this will also pin the requisite visibility map page.
2012 */
2013 buffer = RelationGetBufferForTuple(relation, heaptup->t_len,
2014 InvalidBuffer, options, bistate,
2015 &vmbuffer, NULL,
2016 0);
2017
2018 /*
2019 * We're about to do the actual insert -- but check for conflict first, to
2020 * avoid possibly having to roll back work we've just done.
2021 *
2022 * This is safe without a recheck as long as there is no possibility of
2023 * another process scanning the page between this check and the insert
2024 * being visible to the scan (i.e., an exclusive buffer content lock is
2025 * continuously held from this point until the tuple insert is visible).
2026 *
2027 * For a heap insert, we only need to check for table-level SSI locks. Our
2028 * new tuple can't possibly conflict with existing tuple locks, and heap
2029 * page locks are only consolidated versions of tuple locks; they do not
2030 * lock "gaps" as index page locks do. So we don't need to specify a
2031 * buffer when making the call, which makes for a faster check.
2032 */
2033 CheckForSerializableConflictIn(relation, NULL, InvalidBlockNumber);
2034
2035 /* NO EREPORT(ERROR) from here till changes are logged */
2036 START_CRIT_SECTION();
2037
2038 RelationPutHeapTuple(relation, buffer, heaptup,
2039 (options & HEAP_INSERT_SPECULATIVE) != 0);
2040
2041 if (PageIsAllVisible(BufferGetPage(buffer)))
2042 {
2043 all_visible_cleared = true;
2044 PageClearAllVisible(BufferGetPage(buffer));
2045 visibilitymap_clear(relation,
2046 ItemPointerGetBlockNumber(&(heaptup->t_self)),
2047 vmbuffer, VISIBILITYMAP_VALID_BITS);
2048 }
2049
2050 /*
2051 * XXX Should we set PageSetPrunable on this page ?
2052 *
2053 * The inserting transaction may eventually abort thus making this tuple
2054 * DEAD and hence available for pruning. Though we don't want to optimize
2055 * for aborts, if no other tuple in this page is UPDATEd/DELETEd, the
2056 * aborted tuple will never be pruned until next vacuum is triggered.
2057 *
2058 * If you do add PageSetPrunable here, add it in heap_xlog_insert too.
2059 */
2060
2061 MarkBufferDirty(buffer);
2062
2063 /* XLOG stuff */
2064 if (RelationNeedsWAL(relation))
2065 {
2066 xl_heap_insert xlrec;
2067 xl_heap_header xlhdr;
2068 XLogRecPtr recptr;
2069 Page page = BufferGetPage(buffer);
2070 uint8 info = XLOG_HEAP_INSERT;
2071 int bufflags = 0;
2072
2073 /*
2074 * If this is a catalog, we need to transmit combo CIDs to properly
2075 * decode, so log that as well.
2076 */
2077 if (RelationIsAccessibleInLogicalDecoding(relation))
2078 log_heap_new_cid(relation, heaptup);
2079
2080 /*
2081 * If this is the single and first tuple on page, we can reinit the
2082 * page instead of restoring the whole thing. Set flag, and hide
2083 * buffer references from XLogInsert.
2084 */
2085 if (ItemPointerGetOffsetNumber(&heaptup->t_self) == FirstOffsetNumber &&
2086 PageGetMaxOffsetNumber(page) == FirstOffsetNumber)
2087 {
2088 info |= XLOG_HEAP_INIT_PAGE;
2089 bufflags |= REGBUF_WILL_INIT;
2090 }
2091
2092 xlrec.offnum = ItemPointerGetOffsetNumber(&heaptup->t_self);
2093 xlrec.flags = 0;
2094 if (all_visible_cleared)
2095 xlrec.flags |= XLH_INSERT_ALL_VISIBLE_CLEARED;
2096 if (options & HEAP_INSERT_SPECULATIVE)
2097 xlrec.flags |= XLH_INSERT_IS_SPECULATIVE;
2098 Assert(ItemPointerGetBlockNumber(&heaptup->t_self) == BufferGetBlockNumber(buffer));
2099
2100 /*
2101 * For logical decoding, we need the tuple even if we're doing a full
2102 * page write, so make sure it's included even if we take a full-page
2103 * image. (XXX We could alternatively store a pointer into the FPW).
2104 */
2105 if (RelationIsLogicallyLogged(relation) &&
2106 !(options & HEAP_INSERT_NO_LOGICAL))
2107 {
2108 xlrec.flags |= XLH_INSERT_CONTAINS_NEW_TUPLE;
2109 bufflags |= REGBUF_KEEP_DATA;
2110
2111 if (IsToastRelation(relation))
2112 xlrec.flags |= XLH_INSERT_ON_TOAST_RELATION;
2113 }
2114
2115 XLogBeginInsert();
2116 XLogRegisterData((char *) &xlrec, SizeOfHeapInsert);
2117
2118 xlhdr.t_infomask2 = heaptup->t_data->t_infomask2;
2119 xlhdr.t_infomask = heaptup->t_data->t_infomask;
2120 xlhdr.t_hoff = heaptup->t_data->t_hoff;
2121
2122 /*
2123 * note we mark xlhdr as belonging to buffer; if XLogInsert decides to
2124 * write the whole page to the xlog, we don't need to store
2125 * xl_heap_header in the xlog.
2126 */
2127 XLogRegisterBuffer(0, buffer, REGBUF_STANDARD | bufflags);
2128 XLogRegisterBufData(0, (char *) &xlhdr, SizeOfHeapHeader);
2129 /* PG73FORMAT: write bitmap [+ padding] [+ oid] + data */
2130 XLogRegisterBufData(0,
2131 (char *) heaptup->t_data + SizeofHeapTupleHeader,
2132 heaptup->t_len - SizeofHeapTupleHeader);
2133
2134 /* filtering by origin on a row level is much more efficient */
2135 XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN);
2136
2137 recptr = XLogInsert(RM_HEAP_ID, info);
2138
2139 PageSetLSN(page, recptr);
2140 }
2141
2142 END_CRIT_SECTION();
2143
2144 UnlockReleaseBuffer(buffer);
2145 if (vmbuffer != InvalidBuffer)
2146 ReleaseBuffer(vmbuffer);
2147
2148 /*
2149 * If tuple is cachable, mark it for invalidation from the caches in case
2150 * we abort. Note it is OK to do this after releasing the buffer, because
2151 * the heaptup data structure is all in local memory, not in the shared
2152 * buffer.
2153 */
2154 CacheInvalidateHeapTuple(relation, heaptup, NULL);
2155
2156 /* Note: speculative insertions are counted too, even if aborted later */
2157 pgstat_count_heap_insert(relation, 1);
2158
2159 /*
2160 * If heaptup is a private copy, release it. Don't forget to copy t_self
2161 * back to the caller's image, too.
2162 */
2163 if (heaptup != tup)
2164 {
2165 tup->t_self = heaptup->t_self;
2166 heap_freetuple(heaptup);
2167 }
2168}
2169
2170/*
2171 * Subroutine for heap_insert(). Prepares a tuple for insertion. This sets the
2172 * tuple header fields and toasts the tuple if necessary. Returns a toasted
2173 * version of the tuple if it was toasted, or the original tuple if not. Note
2174 * that in any case, the header fields are also set in the original tuple.
2175 */
2176static HeapTuple
2177heap_prepare_insert(Relation relation, HeapTuple tup, TransactionId xid,
2178 CommandId cid, int options)
2179{
2180 /*
2181 * To allow parallel inserts, we need to ensure that they are safe to be
2182 * performed in workers. We have the infrastructure to allow parallel
2183 * inserts in general except for the cases where inserts generate a new
2184 * CommandId (eg. inserts into a table having a foreign key column).
2185 */
2186 if (IsParallelWorker())
2187 ereport(ERROR,
2188 (errcode(ERRCODE_INVALID_TRANSACTION_STATE),
2189 errmsg("cannot insert tuples in a parallel worker")));
2190
2191 tup->t_data->t_infomask &= ~(HEAP_XACT_MASK);
2192 tup->t_data->t_infomask2 &= ~(HEAP2_XACT_MASK);
2193 tup->t_data->t_infomask |= HEAP_XMAX_INVALID;
2194 HeapTupleHeaderSetXmin(tup->t_data, xid);
2195 if (options & HEAP_INSERT_FROZEN)
2196 HeapTupleHeaderSetXminFrozen(tup->t_data);
2197
2198 HeapTupleHeaderSetCmin(tup->t_data, cid);
2199 HeapTupleHeaderSetXmax(tup->t_data, 0); /* for cleanliness */
2200 tup->t_tableOid = RelationGetRelid(relation);
2201
2202 /*
2203 * If the new tuple is too big for storage or contains already toasted
2204 * out-of-line attributes from some other relation, invoke the toaster.
2205 */
2206 if (relation->rd_rel->relkind != RELKIND_RELATION &&
2207 relation->rd_rel->relkind != RELKIND_MATVIEW)
2208 {
2209 /* toast table entries should never be recursively toasted */
2210 Assert(!HeapTupleHasExternal(tup));
2211 return tup;
2212 }
2213 else if (HeapTupleHasExternal(tup) || tup->t_len > TOAST_TUPLE_THRESHOLD)
2214 return heap_toast_insert_or_update(relation, tup, NULL, options);
2215 else
2216 return tup;
2217}
2218
2219/*
2220 * Helper for heap_multi_insert() that computes the number of entire pages
2221 * that inserting the remaining heaptuples requires. Used to determine how
2222 * much the relation needs to be extended by.
2223 */
2224static int
2225heap_multi_insert_pages(HeapTuple *heaptuples, int done, int ntuples, Size saveFreeSpace)
2226{
2227 size_t page_avail = BLCKSZ - SizeOfPageHeaderData - saveFreeSpace;
2228 int npages = 1;
2229
2230 for (int i = done; i < ntuples; i++)
2231 {
2232 size_t tup_sz = sizeof(ItemIdData) + MAXALIGN(heaptuples[i]->t_len);
2233
2234 if (page_avail < tup_sz)
2235 {
2236 npages++;
2237 page_avail = BLCKSZ - SizeOfPageHeaderData - saveFreeSpace;
2238 }
2239 page_avail -= tup_sz;
2240 }
2241
2242 return npages;
2243}
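/*
 * Worked example (illustrative numbers): with BLCKSZ = 8192 and
 * SizeOfPageHeaderData = 24, a page with saveFreeSpace = 0 offers 8168 bytes.
 * A 400-byte tuple costs sizeof(ItemIdData) + MAXALIGN(400) = 404 bytes, so
 * 20 such tuples fit per page; with 100 tuples remaining the helper above
 * reports npages = 5.
 */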
2244
2245/*
2246 * heap_multi_insert - insert multiple tuples into a heap
2247 *
2248 * This is like heap_insert(), but inserts multiple tuples in one operation.
2249 * That's faster than calling heap_insert() in a loop, because when multiple
2250 * tuples can be inserted on a single page, we can write just a single WAL
2251 * record covering all of them, and only need to lock/unlock the page once.
2252 *
2253 * Note: this leaks memory into the current memory context. You can create a
2254 * temporary context before calling this, if that's a problem.
2255 */
2256void
2257heap_multi_insert(Relation relation, TupleTableSlot **slots, int ntuples,
2258 CommandId cid, int options, BulkInsertState bistate)
2259{
2260 TransactionId xid = GetCurrentTransactionId();
2261 HeapTuple *heaptuples;
2262 int i;
2263 int ndone;
2264 PGAlignedBlock scratch;
2265 Page page;
2266 Buffer vmbuffer = InvalidBuffer;
2267 bool needwal;
2268 Size saveFreeSpace;
2269 bool need_tuple_data = RelationIsLogicallyLogged(relation);
2270 bool need_cids = RelationIsAccessibleInLogicalDecoding(relation);
2271 bool starting_with_empty_page = false;
2272 int npages = 0;
2273 int npages_used = 0;
2274
2275 /* currently not needed (thus unsupported) for heap_multi_insert() */
2276 Assert(!(options & HEAP_INSERT_NO_LOGICAL));
2277
2278 needwal = RelationNeedsWAL(relation);
2279 saveFreeSpace = RelationGetTargetPageFreeSpace(relation,
2280 HEAP_DEFAULT_FILLFACTOR);
2281
2282 /* Toast and set header data in all the slots */
2283 heaptuples = palloc(ntuples * sizeof(HeapTuple));
2284 for (i = 0; i < ntuples; i++)
2285 {
2286 HeapTuple tuple;
2287
2288 tuple = ExecFetchSlotHeapTuple(slots[i], true, NULL);
2289 slots[i]->tts_tableOid = RelationGetRelid(relation);
2290 tuple->t_tableOid = slots[i]->tts_tableOid;
2291 heaptuples[i] = heap_prepare_insert(relation, tuple, xid, cid,
2292 options);
2293 }
2294
2295 /*
2296 * We're about to do the actual inserts -- but check for conflict first,
2297 * to minimize the possibility of having to roll back work we've just
2298 * done.
2299 *
2300 * A check here does not definitively prevent a serialization anomaly;
2301 * that check MUST be done at least past the point of acquiring an
2302 * exclusive buffer content lock on every buffer that will be affected,
2303 * and MAY be done after all inserts are reflected in the buffers and
2304 * those locks are released; otherwise there is a race condition. Since
2305 * multiple buffers can be locked and unlocked in the loop below, and it
2306 * would not be feasible to identify and lock all of those buffers before
2307 * the loop, we must do a final check at the end.
2308 *
2309 * The check here could be omitted with no loss of correctness; it is
2310 * present strictly as an optimization.
2311 *
2312 * For heap inserts, we only need to check for table-level SSI locks. Our
2313 * new tuples can't possibly conflict with existing tuple locks, and heap
2314 * page locks are only consolidated versions of tuple locks; they do not
2315 * lock "gaps" as index page locks do. So we don't need to specify a
2316 * buffer when making the call, which makes for a faster check.
2317 */
2318 CheckForSerializableConflictIn(relation, NULL, InvalidBlockNumber);
2319
2320 ndone = 0;
2321 while (ndone < ntuples)
2322 {
2323 Buffer buffer;
2324 bool all_visible_cleared = false;
2325 bool all_frozen_set = false;
2326 int nthispage;
2327
2328 CHECK_FOR_INTERRUPTS();
2329
2330 /*
2331 * Compute number of pages needed to fit the to-be-inserted tuples in
2332 * the worst case. This will be used to determine how much to extend
2333 * the relation by in RelationGetBufferForTuple(), if needed. If we
2334 * filled a prior page from scratch, we can just update our last
2335 * computation, but if we started with a partially filled page,
2336 * recompute from scratch, the number of potentially required pages
2337 * can vary due to tuples needing to fit onto the page, page headers
2338 * etc.
2339 */
2340 if (ndone == 0 || !starting_with_empty_page)
2341 {
2342 npages = heap_multi_insert_pages(heaptuples, ndone, ntuples,
2343 saveFreeSpace);
2344 npages_used = 0;
2345 }
2346 else
2347 npages_used++;
2348
2349 /*
2350 * Find buffer where at least the next tuple will fit. If the page is
2351 * all-visible, this will also pin the requisite visibility map page.
2352 *
2353 * Also pin visibility map page if COPY FREEZE inserts tuples into an
2354 * empty page. See all_frozen_set below.
2355 */
2356 buffer = RelationGetBufferForTuple(relation, heaptuples[ndone]->t_len,
2357 InvalidBuffer, options, bistate,
2358 &vmbuffer, NULL,
2359 npages - npages_used);
2360 page = BufferGetPage(buffer);
2361
2362 starting_with_empty_page = PageGetMaxOffsetNumber(page) == 0;
2363
2364 if (starting_with_empty_page && (options & HEAP_INSERT_FROZEN))
2365 all_frozen_set = true;
2366
2367 /* NO EREPORT(ERROR) from here till changes are logged */
2368 START_CRIT_SECTION();
2369
2370 /*
2371 * RelationGetBufferForTuple has ensured that the first tuple fits.
2372 * Put that on the page, and then as many other tuples as fit.
2373 */
2374 RelationPutHeapTuple(relation, buffer, heaptuples[ndone], false);
2375
2376 /*
2377 * For logical decoding we need combo CIDs to properly decode the
2378 * catalog.
2379 */
2380 if (needwal && need_cids)
2381 log_heap_new_cid(relation, heaptuples[ndone]);
2382
2383 for (nthispage = 1; ndone + nthispage < ntuples; nthispage++)
2384 {
2385 HeapTuple heaptup = heaptuples[ndone + nthispage];
2386
2387 if (PageGetHeapFreeSpace(page) < MAXALIGN(heaptup->t_len) + saveFreeSpace)
2388 break;
2389
2390 RelationPutHeapTuple(relation, buffer, heaptup, false);
2391
2392 /*
2393 * For logical decoding we need combo CIDs to properly decode the
2394 * catalog.
2395 */
2396 if (needwal && need_cids)
2397 log_heap_new_cid(relation, heaptup);
2398 }
2399
2400 /*
2401 * If the page is all visible, need to clear that, unless we're only
2402 * going to add further frozen rows to it.
2403 *
2404 * If we're only adding already frozen rows to a previously empty
2405 * page, mark it as all-visible.
2406 */
2407 if (PageIsAllVisible(page) && !(options & HEAP_INSERT_FROZEN))
2408 {
2409 all_visible_cleared = true;
2410 PageClearAllVisible(page);
2411 visibilitymap_clear(relation,
2412 BufferGetBlockNumber(buffer),
2413 vmbuffer, VISIBILITYMAP_VALID_BITS);
2414 }
2415 else if (all_frozen_set)
2416 PageSetAllVisible(page);
2417
2418 /*
2419 * XXX Should we set PageSetPrunable on this page ? See heap_insert()
2420 */
2421
2422 MarkBufferDirty(buffer);
2423
2424 /* XLOG stuff */
2425 if (needwal)
2426 {
2427 XLogRecPtr recptr;
2428 xl_heap_multi_insert *xlrec;
2429 uint8 info = XLOG_HEAP2_MULTI_INSERT;
2430 char *tupledata;
2431 int totaldatalen;
2432 char *scratchptr = scratch.data;
2433 bool init;
2434 int bufflags = 0;
2435
2436 /*
2437 * If the page was previously empty, we can reinit the page
2438 * instead of restoring the whole thing.
2439 */
2440 init = starting_with_empty_page;
2441
2442 /* allocate xl_heap_multi_insert struct from the scratch area */
2443 xlrec = (xl_heap_multi_insert *) scratchptr;
2444 scratchptr += SizeOfHeapMultiInsert;
2445
2446 /*
2447 * Allocate offsets array. Unless we're reinitializing the page,
2448 * in that case the tuples are stored in order starting at
2449 * FirstOffsetNumber and we don't need to store the offsets
2450 * explicitly.
2451 */
2452 if (!init)
2453 scratchptr += nthispage * sizeof(OffsetNumber);
2454
2455 /* the rest of the scratch space is used for tuple data */
2456 tupledata = scratchptr;
2457
2458 /* check that the mutually exclusive flags are not both set */
2459 Assert(!(all_visible_cleared && all_frozen_set));
2460
2461 xlrec->flags = 0;
2462 if (all_visible_cleared)
2463 xlrec->flags |= XLH_INSERT_ALL_VISIBLE_CLEARED;
2464 if (all_frozen_set)
2465 xlrec->flags |= XLH_INSERT_ALL_FROZEN_SET;
2466
2467 xlrec->ntuples = nthispage;
2468
2469 /*
2470 * Write out an xl_multi_insert_tuple and the tuple data itself
2471 * for each tuple.
2472 */
2473 for (i = 0; i < nthispage; i++)
2474 {
2475 HeapTuple heaptup = heaptuples[ndone + i];
2476 xl_multi_insert_tuple *tuphdr;
2477 int datalen;
2478
2479 if (!init)
2480 xlrec->offsets[i] = ItemPointerGetOffsetNumber(&heaptup->t_self);
2481 /* xl_multi_insert_tuple needs two-byte alignment. */
2482 tuphdr = (xl_multi_insert_tuple *) SHORTALIGN(scratchptr);
2483 scratchptr = ((char *) tuphdr) + SizeOfMultiInsertTuple;
2484
2485 tuphdr->t_infomask2 = heaptup->t_data->t_infomask2;
2486 tuphdr->t_infomask = heaptup->t_data->t_infomask;
2487 tuphdr->t_hoff = heaptup->t_data->t_hoff;
2488
2489 /* write bitmap [+ padding] [+ oid] + data */
2490 datalen = heaptup->t_len - SizeofHeapTupleHeader;
2491 memcpy(scratchptr,
2492 (char *) heaptup->t_data + SizeofHeapTupleHeader,
2493 datalen);
2494 tuphdr->datalen = datalen;
2495 scratchptr += datalen;
2496 }
2497 totaldatalen = scratchptr - tupledata;
2498 Assert((scratchptr - scratch.data) < BLCKSZ);
2499
2500 if (need_tuple_data)
2501 xlrec->flags |= XLH_INSERT_CONTAINS_NEW_TUPLE;
2502
2503 /*
2504 * Signal that this is the last xl_heap_multi_insert record
2505 * emitted by this call to heap_multi_insert(). Needed for logical
2506 * decoding so it knows when to cleanup temporary data.
2507 */
2508 if (ndone + nthispage == ntuples)
2509 xlrec->flags |= XLH_INSERT_LAST_IN_MULTI;
2510
2511 if (init)
2512 {
2513 info |= XLOG_HEAP_INIT_PAGE;
2514 bufflags |= REGBUF_WILL_INIT;
2515 }
2516
2517 /*
2518 * If we're doing logical decoding, include the new tuple data
2519 * even if we take a full-page image of the page.
2520 */
2521 if (need_tuple_data)
2522 bufflags |= REGBUF_KEEP_DATA;
2523
2524 XLogBeginInsert();
2525 XLogRegisterData((char *) xlrec, tupledata - scratch.data);
2526 XLogRegisterBuffer(0, buffer, REGBUF_STANDARD | bufflags);
2527
2528 XLogRegisterBufData(0, tupledata, totaldatalen);
2529
2530 /* filtering by origin on a row level is much more efficient */
2531 XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN);
2532
2533 recptr = XLogInsert(RM_HEAP2_ID, info);
2534
2535 PageSetLSN(page, recptr);
2536 }
2537
2538 END_CRIT_SECTION();
2539
2540 /*
2541 * If we've frozen everything on the page, update the visibilitymap.
2542 * We're already holding pin on the vmbuffer.
2543 */
2544 if (all_frozen_set)
2545 {
2546 Assert(PageIsAllVisible(page));
2547 Assert(visibilitymap_pin_ok(BufferGetBlockNumber(buffer), vmbuffer));
2548
2549 /*
2550 * It's fine to use InvalidTransactionId here - this is only used
2551 * when HEAP_INSERT_FROZEN is specified, which intentionally
2552 * violates visibility rules.
2553 */
2554 visibilitymap_set(relation, BufferGetBlockNumber(buffer), buffer,
2555 InvalidXLogRecPtr, vmbuffer,
2556 InvalidTransactionId,
2557 VISIBILITYMAP_ALL_VISIBLE | VISIBILITYMAP_ALL_FROZEN);
2558 }
2559
2560 UnlockReleaseBuffer(buffer);
2561 ndone += nthispage;
2562
2563 /*
2564 * NB: Only release vmbuffer after inserting all tuples - it's fairly
2565 * likely that we'll insert into subsequent heap pages that are likely
2566 * to use the same vm page.
2567 */
2568 }
2569
2570 /* We're done with inserting all tuples, so release the last vmbuffer. */
2571 if (vmbuffer != InvalidBuffer)
2572 ReleaseBuffer(vmbuffer);
2573
2574 /*
2575 * We're done with the actual inserts. Check for conflicts again, to
2576 * ensure that all rw-conflicts in to these inserts are detected. Without
2577 * this final check, a sequential scan of the heap may have locked the
2578 * table after the "before" check, missing one opportunity to detect the
2579 * conflict, and then scanned the table before the new tuples were there,
2580 * missing the other chance to detect the conflict.
2581 *
2582 * For heap inserts, we only need to check for table-level SSI locks. Our
2583 * new tuples can't possibly conflict with existing tuple locks, and heap
2584 * page locks are only consolidated versions of tuple locks; they do not
2585 * lock "gaps" as index page locks do. So we don't need to specify a
2586 * buffer when making the call.
2587 */
2588 CheckForSerializableConflictIn(relation, NULL, InvalidBlockNumber);
2589
2590 /*
2591 * If tuples are cachable, mark them for invalidation from the caches in
2592 * case we abort. Note it is OK to do this after releasing the buffer,
2593 * because the heaptuples data structure is all in local memory, not in
2594 * the shared buffer.
2595 */
2596 if (IsCatalogRelation(relation))
2597 {
2598 for (i = 0; i < ntuples; i++)
2599 CacheInvalidateHeapTuple(relation, heaptuples[i], NULL);
2600 }
2601
2602 /* copy t_self fields back to the caller's slots */
2603 for (i = 0; i < ntuples; i++)
2604 slots[i]->tts_tid = heaptuples[i]->t_self;
2605
2606 pgstat_count_heap_insert(relation, ntuples);
2607}
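/*
 * Illustrative caller sketch (hypothetical, kept under NOT_USED): wrapping
 * pre-built heap tuples in slots and inserting them in one call.  Assumes
 * "rel" is opened and locked; real callers such as COPY manage slots and
 * memory contexts more carefully.
 */
#ifdef NOT_USED
#include "executor/tuptable.h"

static void
example_multi_insert(Relation rel, HeapTuple *tuples, int ntuples)
{
	TupleTableSlot **slots = palloc(ntuples * sizeof(TupleTableSlot *));
	BulkInsertState bistate = GetBulkInsertState();

	for (int i = 0; i < ntuples; i++)
	{
		slots[i] = MakeSingleTupleTableSlot(RelationGetDescr(rel),
											&TTSOpsHeapTuple);
		ExecStoreHeapTuple(tuples[i], slots[i], false);
	}

	heap_multi_insert(rel, slots, ntuples, GetCurrentCommandId(true),
					  0, bistate);

	FreeBulkInsertState(bistate);
	for (int i = 0; i < ntuples; i++)
		ExecDropSingleTupleTableSlot(slots[i]);
	pfree(slots);
}
#endif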
2608
2609/*
2610 * simple_heap_insert - insert a tuple
2611 *
2612 * Currently, this routine differs from heap_insert only in supplying
2613 * a default command ID and not allowing access to the speedup options.
2614 *
2615 * This should be used rather than using heap_insert directly in most places
2616 * where we are modifying system catalogs.
2617 */
2618void
2619simple_heap_insert(Relation relation, HeapTuple tup)
2620{
2621 heap_insert(relation, tup, GetCurrentCommandId(true), 0, NULL);
2622}
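/*
 * Illustrative caller sketch (hypothetical, kept under NOT_USED): a typical
 * catalog-style caller hands a formed tuple to simple_heap_insert() and then
 * bumps the command counter so later commands in the same transaction see
 * the row.  Assumes "rel" is opened and locked and "tup" matches its rowtype.
 */
#ifdef NOT_USED
static void
example_simple_insert(Relation rel, HeapTuple tup)
{
	simple_heap_insert(rel, tup);	/* stamps xmin/cmin, fills tup->t_self */
	CommandCounterIncrement();		/* make the new row visible downstream */
}
#endif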
2623
2624/*
2625 * Given infomask/infomask2, compute the bits that must be saved in the
2626 * "infobits" field of xl_heap_delete, xl_heap_update, xl_heap_lock,
2627 * xl_heap_lock_updated WAL records.
2628 *
2629 * See fix_infomask_from_infobits.
2630 */
2631static uint8
2632compute_infobits(uint16 infomask, uint16 infomask2)
2633{
2634 return
2635 ((infomask & HEAP_XMAX_IS_MULTI) != 0 ? XLHL_XMAX_IS_MULTI : 0) |
2636 ((infomask & HEAP_XMAX_LOCK_ONLY) != 0 ? XLHL_XMAX_LOCK_ONLY : 0) |
2637 ((infomask & HEAP_XMAX_EXCL_LOCK) != 0 ? XLHL_XMAX_EXCL_LOCK : 0) |
2638 /* note we ignore HEAP_XMAX_SHR_LOCK here */
2639 ((infomask & HEAP_XMAX_KEYSHR_LOCK) != 0 ? XLHL_XMAX_KEYSHR_LOCK : 0) |
2640 ((infomask2 & HEAP_KEYS_UPDATED) != 0 ?
2641 XLHL_KEYS_UPDATED : 0);
2642}
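/*
 * Worked example (illustrative): for a tuple that is merely key-share
 * locked, e.g. by a single SELECT ... FOR KEY SHARE, t_infomask contains
 * HEAP_XMAX_KEYSHR_LOCK | HEAP_XMAX_LOCK_ONLY and t_infomask2 lacks
 * HEAP_KEYS_UPDATED, so compute_infobits() yields
 * XLHL_XMAX_KEYSHR_LOCK | XLHL_XMAX_LOCK_ONLY for the WAL record.
 */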
2643
2644/*
2645 * Given two versions of the same t_infomask for a tuple, compare them and
2646 * return whether the relevant status for a tuple Xmax has changed. This is
2647 * used after a buffer lock has been released and reacquired: we want to ensure
2648 * that the tuple state continues to be the same it was when we previously
2649 * examined it.
2650 *
2651 * Note the Xmax field itself must be compared separately.
2652 */
2653static inline bool
2654xmax_infomask_changed(uint16 new_infomask, uint16 old_infomask)
2655{
2656 const uint16 interesting =
2657 HEAP_XMAX_IS_MULTI | HEAP_XMAX_LOCK_ONLY | HEAP_LOCK_MASK;
2658
2659 if ((new_infomask & interesting) != (old_infomask & interesting))
2660 return true;
2661
2662 return false;
2663}
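/*
 * Illustrative sketch (hypothetical, kept under NOT_USED) of the recheck
 * pattern this helper supports: remember xmax and infomask, release the
 * buffer content lock, wait, re-lock, and only trust the earlier examination
 * if neither the xmax value nor the interesting infomask bits changed.
 */
#ifdef NOT_USED
static bool
example_xmax_unchanged(HeapTupleHeader tup, Buffer buffer)
{
	TransactionId xwait = HeapTupleHeaderGetRawXmax(tup);
	uint16		old_infomask = tup->t_infomask;

	LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
	/* ... sleep on xwait here, e.g. via XactLockTableWait() ... */
	LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);

	return !xmax_infomask_changed(tup->t_infomask, old_infomask) &&
		TransactionIdEquals(HeapTupleHeaderGetRawXmax(tup), xwait);
}
#endif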
2664
2665/*
2666 * heap_delete - delete a tuple
2667 *
2668 * See table_tuple_delete() for an explanation of the parameters, except that
2669 * this routine directly takes a tuple rather than a slot.
2670 *
2671 * In the failure cases, the routine fills *tmfd with the tuple's t_ctid,
2672 * t_xmax (resolving a possible MultiXact, if necessary), and t_cmax (the last
2673 * only for TM_SelfModified, since we cannot obtain cmax from a combo CID
2674 * generated by another transaction).
2675 */
2676TM_Result
2677heap_delete(Relation relation, ItemPointer tid,
2678 CommandId cid, Snapshot crosscheck, bool wait,
2679 TM_FailureData *tmfd, bool changingPart)
2680{
2681 TM_Result result;
2682 TransactionId xid = GetCurrentTransactionId();
2683 ItemId lp;
2684 HeapTupleData tp;
2685 Page page;
2686 BlockNumber block;
2687 Buffer buffer;
2688 Buffer vmbuffer = InvalidBuffer;
2689 TransactionId new_xmax;
2690 uint16 new_infomask,
2691 new_infomask2;
2692 bool have_tuple_lock = false;
2693 bool iscombo;
2694 bool all_visible_cleared = false;
2695 HeapTuple old_key_tuple = NULL; /* replica identity of the tuple */
2696 bool old_key_copied = false;
2697
2698 Assert(ItemPointerIsValid(tid));
2699
2700 /*
2701 * Forbid this during a parallel operation, lest it allocate a combo CID.
2702 * Other workers might need that combo CID for visibility checks, and we
2703 * have no provision for broadcasting it to them.
2704 */
2705 if (IsInParallelMode())
2706 ereport(ERROR,
2707 (errcode(ERRCODE_INVALID_TRANSACTION_STATE),
2708 errmsg("cannot delete tuples during a parallel operation")));
2709
2710 block = ItemPointerGetBlockNumber(tid);
2711 buffer = ReadBuffer(relation, block);
2712 page = BufferGetPage(buffer);
2713
2714 /*
2715 * Before locking the buffer, pin the visibility map page if it appears to
2716 * be necessary. Since we haven't got the lock yet, someone else might be
2717 * in the middle of changing this, so we'll need to recheck after we have
2718 * the lock.
2719 */
2720 if (PageIsAllVisible(page))
2721 visibilitymap_pin(relation, block, &vmbuffer);
2722
2723 LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
2724
2725 lp = PageGetItemId(page, ItemPointerGetOffsetNumber(tid));
2726 Assert(ItemIdIsNormal(lp));
2727
2728 tp.t_tableOid = RelationGetRelid(relation);
2729 tp.t_data = (HeapTupleHeader) PageGetItem(page, lp);
2730 tp.t_len = ItemIdGetLength(lp);
2731 tp.t_self = *tid;
2732
2733l1:
2734
2735 /*
2736 * If we didn't pin the visibility map page and the page has become all
2737 * visible while we were busy locking the buffer, we'll have to unlock and
2738 * re-lock, to avoid holding the buffer lock across an I/O. That's a bit
2739 * unfortunate, but hopefully shouldn't happen often.
2740 */
2741 if (vmbuffer == InvalidBuffer && PageIsAllVisible(page))
2742 {
2744 visibilitymap_pin(relation, block, &vmbuffer);
2746 }
2747
2748 result = HeapTupleSatisfiesUpdate(&tp, cid, buffer);
2749
2750 if (result == TM_Invisible)
2751 {
2752 UnlockReleaseBuffer(buffer);
2753 ereport(ERROR,
2754 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
2755 errmsg("attempted to delete invisible tuple")));
2756 }
2757 else if (result == TM_BeingModified && wait)
2758 {
2759 TransactionId xwait;
2760 uint16 infomask;
2761
2762 /* must copy state data before unlocking buffer */
2763 xwait = HeapTupleHeaderGetRawXmax(tp.t_data);
2764 infomask = tp.t_data->t_infomask;
2765
2766 /*
2767 * Sleep until concurrent transaction ends -- except when there's a
2768 * single locker and it's our own transaction. Note we don't care
2769 * which lock mode the locker has, because we need the strongest one.
2770 *
2771 * Before sleeping, we need to acquire tuple lock to establish our
2772 * priority for the tuple (see heap_lock_tuple). LockTuple will
2773 * release us when we are next-in-line for the tuple.
2774 *
2775 * If we are forced to "start over" below, we keep the tuple lock;
2776 * this arranges that we stay at the head of the line while rechecking
2777 * tuple state.
2778 */
2779 if (infomask & HEAP_XMAX_IS_MULTI)
2780 {
2781 bool current_is_member = false;
2782
2783 if (DoesMultiXactIdConflict((MultiXactId) xwait, infomask,
2784 LockTupleExclusive, &current_is_member))
2785 {
2786 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
2787
2788 /*
2789 * Acquire the lock, if necessary (but skip it when we're
2790 * requesting a lock and already have one; avoids deadlock).
2791 */
2792 if (!current_is_member)
2793 heap_acquire_tuplock(relation, &(tp.t_self), LockTupleExclusive,
2794 LockWaitBlock, &have_tuple_lock);
2795
2796 /* wait for multixact */
2797 MultiXactIdWait((MultiXactId) xwait, MultiXactStatusUpdate, infomask,
2798 relation, &(tp.t_self), XLTW_Delete,
2799 NULL);
2800 LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
2801
2802 /*
2803 * If xwait had just locked the tuple then some other xact
2804 * could update this tuple before we get to this point. Check
2805 * for xmax change, and start over if so.
2806 *
2807 * We also must start over if we didn't pin the VM page, and
2808 * the page has become all visible.
2809 */
2810 if ((vmbuffer == InvalidBuffer && PageIsAllVisible(page)) ||
2811 xmax_infomask_changed(tp.t_data->t_infomask, infomask) ||
2812 !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tp.t_data),
2813 xwait))
2814 goto l1;
2815 }
2816
2817 /*
2818 * You might think the multixact is necessarily done here, but not
2819 * so: it could have surviving members, namely our own xact or
2820 * other subxacts of this backend. It is legal for us to delete
2821 * the tuple in either case, however (the latter case is
2822 * essentially a situation of upgrading our former shared lock to
2823 * exclusive). We don't bother changing the on-disk hint bits
2824 * since we are about to overwrite the xmax altogether.
2825 */
2826 }
2827 else if (!TransactionIdIsCurrentTransactionId(xwait))
2828 {
2829 /*
2830 * Wait for regular transaction to end; but first, acquire tuple
2831 * lock.
2832 */
2834 heap_acquire_tuplock(relation, &(tp.t_self), LockTupleExclusive,
2835 LockWaitBlock, &have_tuple_lock);
2836 XactLockTableWait(xwait, relation, &(tp.t_self), XLTW_Delete);
2837 LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
2838
2839 /*
2840 * xwait is done, but if xwait had just locked the tuple then some
2841 * other xact could update this tuple before we get to this point.
2842 * Check for xmax change, and start over if so.
2843 *
2844 * We also must start over if we didn't pin the VM page, and the
2845 * page has become all visible.
2846 */
2847 if ((vmbuffer == InvalidBuffer && PageIsAllVisible(page)) ||
2848 xmax_infomask_changed(tp.t_data->t_infomask, infomask) ||
2849 !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tp.t_data),
2850 xwait))
2851 goto l1;
2852
2853 /* Otherwise check if it committed or aborted */
2854 UpdateXmaxHintBits(tp.t_data, buffer, xwait);
2855 }
2856
2857 /*
2858 * We may overwrite if previous xmax aborted, or if it committed but
2859 * only locked the tuple without updating it.
2860 */
2861 if ((tp.t_data->t_infomask & HEAP_XMAX_INVALID) ||
2862 HEAP_XMAX_IS_LOCKED_ONLY(tp.t_data->t_infomask) ||
2863 HeapTupleHeaderIsOnlyLocked(tp.t_data))
2864 result = TM_Ok;
2865 else if (!ItemPointerEquals(&tp.t_self, &tp.t_data->t_ctid))
2866 result = TM_Updated;
2867 else
2868 result = TM_Deleted;
2869 }
2870
2871 /* sanity check the result HeapTupleSatisfiesUpdate() and the logic above */
2872 if (result != TM_Ok)
2873 {
2874 Assert(result == TM_SelfModified ||
2875 result == TM_Updated ||
2876 result == TM_Deleted ||
2877 result == TM_BeingModified);
2878 Assert(!(tp.t_data->t_infomask & HEAP_XMAX_INVALID));
2879 Assert(result != TM_Updated ||
2880 !ItemPointerEquals(&tp.t_self, &tp.t_data->t_ctid));
2881 }
2882
2883 if (crosscheck != InvalidSnapshot && result == TM_Ok)
2884 {
2885 /* Perform additional check for transaction-snapshot mode RI updates */
2886 if (!HeapTupleSatisfiesVisibility(&tp, crosscheck, buffer))
2887 result = TM_Updated;
2888 }
2889
2890 if (result != TM_Ok)
2891 {
2892 tmfd->ctid = tp.t_data->t_ctid;
2893 tmfd->xmax = HeapTupleHeaderGetUpdateXid(tp.t_data);
2894 if (result == TM_SelfModified)
2895 tmfd->cmax = HeapTupleHeaderGetCmax(tp.t_data);
2896 else
2897 tmfd->cmax = InvalidCommandId;
2898 UnlockReleaseBuffer(buffer);
2899 if (have_tuple_lock)
2900 UnlockTupleTuplock(relation, &(tp.t_self), LockTupleExclusive);
2901 if (vmbuffer != InvalidBuffer)
2902 ReleaseBuffer(vmbuffer);
2903 return result;
2904 }
2905
2906 /*
2907 * We're about to do the actual delete -- check for conflict first, to
2908 * avoid possibly having to roll back work we've just done.
2909 *
2910 * This is safe without a recheck as long as there is no possibility of
2911 * another process scanning the page between this check and the delete
2912 * being visible to the scan (i.e., an exclusive buffer content lock is
2913 * continuously held from this point until the tuple delete is visible).
2914 */
2915 CheckForSerializableConflictIn(relation, &tp, BufferGetBlockNumber(buffer));
2916
2917 /* replace cid with a combo CID if necessary */
2918 HeapTupleHeaderAdjustCmax(tp.t_data, &cid, &iscombo);
2919
2920 /*
2921 * Compute replica identity tuple before entering the critical section so
2922 * we don't PANIC upon a memory allocation failure.
2923 */
2924 old_key_tuple = ExtractReplicaIdentity(relation, &tp, true, &old_key_copied);
2925
2926 /*
2927 * If this is the first possibly-multixact-able operation in the current
2928 * transaction, set my per-backend OldestMemberMXactId setting. We can be
2929 * certain that the transaction will never become a member of any older
2930 * MultiXactIds than that. (We have to do this even if we end up just
2931 * using our own TransactionId below, since some other backend could
2932 * incorporate our XID into a MultiXact immediately afterwards.)
2933 */
2934 MultiXactIdSetOldestMember();
2935
2936 compute_new_xmax_infomask(HeapTupleHeaderGetRawXmax(tp.t_data),
2937 tp.t_data->t_infomask, tp.t_data->t_infomask2,
2938 xid, LockTupleExclusive, true,
2939 &new_xmax, &new_infomask, &new_infomask2);
2940
2941 START_CRIT_SECTION();
2942
2943 /*
2944 * If this transaction commits, the tuple will become DEAD sooner or
2945 * later. Set flag that this page is a candidate for pruning once our xid
2946 * falls below the OldestXmin horizon. If the transaction finally aborts,
2947 * the subsequent page pruning will be a no-op and the hint will be
2948 * cleared.
2949 */
2950 PageSetPrunable(page, xid);
2951
2952 if (PageIsAllVisible(page))
2953 {
2954 all_visible_cleared = true;
2955 PageClearAllVisible(page);
2956 visibilitymap_clear(relation, BufferGetBlockNumber(buffer),
2957 vmbuffer, VISIBILITYMAP_VALID_BITS);
2958 }
2959
2960 /* store transaction information of xact deleting the tuple */
2961 tp.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
2962 tp.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
2963 tp.t_data->t_infomask |= new_infomask;
2964 tp.t_data->t_infomask2 |= new_infomask2;
2965 HeapTupleHeaderClearHotUpdated(tp.t_data);
2966 HeapTupleHeaderSetXmax(tp.t_data, new_xmax);
2967 HeapTupleHeaderSetCmax(tp.t_data, cid, iscombo);
2968 /* Make sure there is no forward chain link in t_ctid */
2969 tp.t_data->t_ctid = tp.t_self;
2970
2971 /* Signal that this is actually a move into another partition */
2972 if (changingPart)
2973 HeapTupleHeaderSetMovedPartitions(tp.t_data);
2974
2975 MarkBufferDirty(buffer);
2976
2977 /*
2978 * XLOG stuff
2979 *
2980 * NB: heap_abort_speculative() uses the same xlog record and replay
2981 * routines.
2982 */
2983 if (RelationNeedsWAL(relation))
2984 {
2985 xl_heap_delete xlrec;
2986 xl_heap_header xlhdr;
2987 XLogRecPtr recptr;
2988
2989 /*
2990 * For logical decode we need combo CIDs to properly decode the
2991 * catalog
2992 */
2993 if (RelationIsAccessibleInLogicalDecoding(relation))
2994 log_heap_new_cid(relation, &tp);
2995
2996 xlrec.flags = 0;
2997 if (all_visible_cleared)
2998 xlrec.flags |= XLH_DELETE_ALL_VISIBLE_CLEARED;
2999 if (changingPart)
3000 xlrec.flags |= XLH_DELETE_IS_PARTITION_MOVE;
3001 xlrec.infobits_set = compute_infobits(tp.t_data->t_infomask,
3002 tp.t_data->t_infomask2);
3003 xlrec.offnum = ItemPointerGetOffsetNumber(&tp.t_self);
3004 xlrec.xmax = new_xmax;
3005
3006 if (old_key_tuple != NULL)
3007 {
3008 if (relation->rd_rel->relreplident == REPLICA_IDENTITY_FULL)
3009 xlrec.flags |= XLH_DELETE_CONTAINS_OLD_TUPLE;
3010 else
3011 xlrec.flags |= XLH_DELETE_CONTAINS_OLD_KEY;
3012 }
3013
3014 XLogBeginInsert();
3015 XLogRegisterData((char *) &xlrec, SizeOfHeapDelete);
3016
3017 XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
3018
3019 /*
3020 * Log replica identity of the deleted tuple if there is one
3021 */
3022 if (old_key_tuple != NULL)
3023 {
3024 xlhdr.t_infomask2 = old_key_tuple->t_data->t_infomask2;
3025 xlhdr.t_infomask = old_key_tuple->t_data->t_infomask;
3026 xlhdr.t_hoff = old_key_tuple->t_data->t_hoff;
3027
3028 XLogRegisterData((char *) &xlhdr, SizeOfHeapHeader);
3029 XLogRegisterData((char *) old_key_tuple->t_data
3030 + SizeofHeapTupleHeader,
3031 old_key_tuple->t_len
3032 - SizeofHeapTupleHeader);
3033 }
3034
3035 /* filtering by origin on a row level is much more efficient */
3036 XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN);
3037
3038 recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_DELETE);
3039
3040 PageSetLSN(page, recptr);
3041 }
3042
3043 END_CRIT_SECTION();
3044
3045 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3046
3047 if (vmbuffer != InvalidBuffer)
3048 ReleaseBuffer(vmbuffer);
3049
3050 /*
3051 * If the tuple has toasted out-of-line attributes, we need to delete
3052 * those items too. We have to do this before releasing the buffer
3053 * because we need to look at the contents of the tuple, but it's OK to
3054 * release the content lock on the buffer first.
3055 */
3056 if (relation->rd_rel->relkind != RELKIND_RELATION &&
3057 relation->rd_rel->relkind != RELKIND_MATVIEW)
3058 {
3059 /* toast table entries should never be recursively toasted */
3060 Assert(!HeapTupleHasExternal(&tp));
3061 }
3062 else if (HeapTupleHasExternal(&tp))
3063 heap_toast_delete(relation, &tp, false);
3064
3065 /*
3066 * Mark tuple for invalidation from system caches at next command
3067 * boundary. We have to do this before releasing the buffer because we
3068 * need to look at the contents of the tuple.
3069 */
3070 CacheInvalidateHeapTuple(relation, &tp, NULL);
3071
3072 /* Now we can release the buffer */
3073 ReleaseBuffer(buffer);
3074
3075 /*
3076 * Release the lmgr tuple lock, if we had it.
3077 */
3078 if (have_tuple_lock)
3079 UnlockTupleTuplock(relation, &(tp.t_self), LockTupleExclusive);
3080
3081 pgstat_count_heap_delete(relation);
3082
3083 if (old_key_tuple != NULL && old_key_copied)
3084 heap_freetuple(old_key_tuple);
3085
3086 return TM_Ok;
3087}
3088
3089/*
3090 * simple_heap_delete - delete a tuple
3091 *
3092 * This routine may be used to delete a tuple when concurrent updates of
3093 * the target tuple are not expected (for example, because we have a lock
3094 * on the relation associated with the tuple). Any failure is reported
3095 * via ereport().
3096 */
3097void
3098simple_heap_delete(Relation relation, ItemPointer tid)
3099{
3100 TM_Result result;
3101 TM_FailureData tmfd;
3102
3103 result = heap_delete(relation, tid,
3104 GetCurrentCommandId(true), InvalidSnapshot,
3105 true /* wait for commit */ ,
3106 &tmfd, false /* changingPart */ );
3107 switch (result)
3108 {
3109 case TM_SelfModified:
3110 /* Tuple was already updated in current command? */
3111 elog(ERROR, "tuple already updated by self");
3112 break;
3113
3114 case TM_Ok:
3115 /* done successfully */
3116 break;
3117
3118 case TM_Updated:
3119 elog(ERROR, "tuple concurrently updated");
3120 break;
3121
3122 case TM_Deleted:
3123 elog(ERROR, "tuple concurrently deleted");
3124 break;
3125
3126 default:
3127 elog(ERROR, "unrecognized heap_delete status: %u", result);
3128 break;
3129 }
3130}
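/*
 * Illustrative caller sketch (hypothetical, kept under NOT_USED): deleting a
 * tuple by its TID, e.g. one just returned by a catalog scan, where no
 * concurrent update of the row is expected.
 */
#ifdef NOT_USED
static void
example_simple_delete(Relation rel, HeapTuple tup)
{
	simple_heap_delete(rel, &tup->t_self);
	CommandCounterIncrement();
}
#endif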
3131
3132/*
3133 * heap_update - replace a tuple
3134 *
3135 * See table_tuple_update() for an explanation of the parameters, except that
3136 * this routine directly takes a tuple rather than a slot.
3137 *
3138 * In the failure cases, the routine fills *tmfd with the tuple's t_ctid,
3139 * t_xmax (resolving a possible MultiXact, if necessary), and t_cmax (the last
3140 * only for TM_SelfModified, since we cannot obtain cmax from a combo CID
3141 * generated by another transaction).
3142 */
3143TM_Result
3144heap_update(Relation relation, ItemPointer otid, HeapTuple newtup,
3145 CommandId cid, Snapshot crosscheck, bool wait,
3146 TM_FailureData *tmfd, LockTupleMode *lockmode,
3147 TU_UpdateIndexes *update_indexes)
3148{
3149 TM_Result result;
3150 TransactionId xid = GetCurrentTransactionId();
3151 Bitmapset *hot_attrs;
3152 Bitmapset *sum_attrs;
3153 Bitmapset *key_attrs;
3154 Bitmapset *id_attrs;
3155 Bitmapset *interesting_attrs;
3156 Bitmapset *modified_attrs;
3157 ItemId lp;
3158 HeapTupleData oldtup;
3159 HeapTuple heaptup;
3160 HeapTuple old_key_tuple = NULL;
3161 bool old_key_copied = false;
3162 Page page;
3163 BlockNumber block;
3164 MultiXactStatus mxact_status;
3165 Buffer buffer,
3166 newbuf,
3167 vmbuffer = InvalidBuffer,
3168 vmbuffer_new = InvalidBuffer;
3169 bool need_toast;
3170 Size newtupsize,
3171 pagefree;
3172 bool have_tuple_lock = false;
3173 bool iscombo;
3174 bool use_hot_update = false;
3175 bool summarized_update = false;
3176 bool key_intact;
3177 bool all_visible_cleared = false;
3178 bool all_visible_cleared_new = false;
3179 bool checked_lockers;
3180 bool locker_remains;
3181 bool id_has_external = false;
3182 TransactionId xmax_new_tuple,
3183 xmax_old_tuple;
3184 uint16 infomask_old_tuple,
3185 infomask2_old_tuple,
3186 infomask_new_tuple,
3187 infomask2_new_tuple;
3188
3189 Assert(ItemPointerIsValid(otid));
3190
3191 /* Cheap, simplistic check that the tuple matches the rel's rowtype. */
3192 Assert(HeapTupleHeaderGetNatts(newtup->t_data) <=
3193 RelationGetNumberOfAttributes(relation));
3194
3195 /*
3196 * Forbid this during a parallel operation, lest it allocate a combo CID.
3197 * Other workers might need that combo CID for visibility checks, and we
3198 * have no provision for broadcasting it to them.
3199 */
3200 if (IsInParallelMode())
3201 ereport(ERROR,
3202 (errcode(ERRCODE_INVALID_TRANSACTION_STATE),
3203 errmsg("cannot update tuples during a parallel operation")));
3204
3205#ifdef USE_ASSERT_CHECKING
3206 check_lock_if_inplace_updateable_rel(relation, otid, newtup);
3207#endif
3208
3209 /*
3210 * Fetch the list of attributes to be checked for various operations.
3211 *
3212 * For HOT considerations, this is wasted effort if we fail to update or
3213 * have to put the new tuple on a different page. But we must compute the
3214 * list before obtaining buffer lock --- in the worst case, if we are
3215 * doing an update on one of the relevant system catalogs, we could
3216 * deadlock if we try to fetch the list later. In any case, the relcache
3217 * caches the data so this is usually pretty cheap.
3218 *
3219 * We also need columns used by the replica identity and columns that are
3220 * considered the "key" of rows in the table.
3221 *
3222 * Note that we get copies of each bitmap, so we need not worry about
3223 * relcache flush happening midway through.
3224 */
3225 hot_attrs = RelationGetIndexAttrBitmap(relation,
3226 INDEX_ATTR_BITMAP_HOT_BLOCKING);
3227 sum_attrs = RelationGetIndexAttrBitmap(relation,
3228 INDEX_ATTR_BITMAP_SUMMARIZED);
3229 key_attrs = RelationGetIndexAttrBitmap(relation, INDEX_ATTR_BITMAP_KEY);
3230 id_attrs = RelationGetIndexAttrBitmap(relation,
3231 INDEX_ATTR_BITMAP_IDENTITY_KEY);
3232 interesting_attrs = NULL;
3233 interesting_attrs = bms_add_members(interesting_attrs, hot_attrs);
3234 interesting_attrs = bms_add_members(interesting_attrs, sum_attrs);
3235 interesting_attrs = bms_add_members(interesting_attrs, key_attrs);
3236 interesting_attrs = bms_add_members(interesting_attrs, id_attrs);
3237
3238 block = ItemPointerGetBlockNumber(otid);
3239 buffer = ReadBuffer(relation, block);
3240 page = BufferGetPage(buffer);
3241
3242 /*
3243 * Before locking the buffer, pin the visibility map page if it appears to
3244 * be necessary. Since we haven't got the lock yet, someone else might be
3245 * in the middle of changing this, so we'll need to recheck after we have
3246 * the lock.
3247 */
3248 if (PageIsAllVisible(page))
3249 visibilitymap_pin(relation, block, &vmbuffer);
3250
3251 LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3252
3253 lp = PageGetItemId(page, ItemPointerGetOffsetNumber(otid));
3254 Assert(ItemIdIsNormal(lp));
3255
3256 /*
3257 * Fill in enough data in oldtup for HeapDetermineColumnsInfo to work
3258 * properly.
3259 */
3260 oldtup.t_tableOid = RelationGetRelid(relation);
3261 oldtup.t_data = (HeapTupleHeader) PageGetItem(page, lp);
3262 oldtup.t_len = ItemIdGetLength(lp);
3263 oldtup.t_self = *otid;
3264
3265 /* the new tuple is ready, except for this: */
3266 newtup->t_tableOid = RelationGetRelid(relation);
3267
3268 /*
3269 * Determine columns modified by the update. Additionally, identify
3270 * whether any of the unmodified replica identity key attributes in the
3271 * old tuple is externally stored or not. This is required because for
3272 * such attributes the flattened value won't be WAL logged as part of the
3273 * new tuple so we must include it as part of the old_key_tuple. See
3274 * ExtractReplicaIdentity.
3275 */
3276 modified_attrs = HeapDetermineColumnsInfo(relation, interesting_attrs,
3277 id_attrs, &oldtup,
3278 newtup, &id_has_external);
3279
3280 /*
3281 * If we're not updating any "key" column, we can grab a weaker lock type.
3282 * This allows for more concurrency when we are running simultaneously
3283 * with foreign key checks.
3284 *
3285 * Note that if a column gets detoasted while executing the update, but
3286 * the value ends up being the same, this test will fail and we will use
3287 * the stronger lock. This is acceptable; the important case to optimize
3288 * is updates that don't manipulate key columns, not those that
3289 * serendipitously arrive at the same key values.
3290 */
3291 if (!bms_overlap(modified_attrs, key_attrs))
3292 {
3293 *lockmode = LockTupleNoKeyExclusive;
3294 mxact_status = MultiXactStatusNoKeyUpdate;
3295 key_intact = true;
3296
3297 /*
3298 * If this is the first possibly-multixact-able operation in the
3299 * current transaction, set my per-backend OldestMemberMXactId
3300 * setting. We can be certain that the transaction will never become a
3301 * member of any older MultiXactIds than that. (We have to do this
3302 * even if we end up just using our own TransactionId below, since
3303 * some other backend could incorporate our XID into a MultiXact
3304 * immediately afterwards.)
3305 */
3306 MultiXactIdSetOldestMember();
3307 }
3308 else
3309 {
3310 *lockmode = LockTupleExclusive;
3311 mxact_status = MultiXactStatusUpdate;
3312 key_intact = false;
3313 }
3314
3315 /*
3316 * Note: beyond this point, use oldtup not otid to refer to old tuple.
3317 * otid may very well point at newtup->t_self, which we will overwrite
3318 * with the new tuple's location, so there's great risk of confusion if we
3319 * use otid anymore.
3320 */
3321
3322l2:
3323 checked_lockers = false;
3324 locker_remains = false;
3325 result = HeapTupleSatisfiesUpdate(&oldtup, cid, buffer);
3326
3327 /* see below about the "no wait" case */
3328 Assert(result != TM_BeingModified || wait);
3329
3330 if (result == TM_Invisible)
3331 {
3332 UnlockReleaseBuffer(buffer);
3333 ereport(ERROR,
3334 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
3335 errmsg("attempted to update invisible tuple")));
3336 }
3337 else if (result == TM_BeingModified && wait)
3338 {
3339 TransactionId xwait;
3340 uint16 infomask;
3341 bool can_continue = false;
3342
3343 /*
3344 * XXX note that we don't consider the "no wait" case here. This
3345 * isn't a problem currently because no caller uses that case, but it
3346 * should be fixed if such a caller is introduced. It wasn't a
3347 * problem previously because this code would always wait, but now
3348 * that some tuple locks do not conflict with one of the lock modes we
3349 * use, it is possible that this case is interesting to handle
3350 * specially.
3351 *
3352 * This may cause failures with third-party code that calls
3353 * heap_update directly.
3354 */
3355
3356 /* must copy state data before unlocking buffer */
3357 xwait = HeapTupleHeaderGetRawXmax(oldtup.t_data);
3358 infomask = oldtup.t_data->t_infomask;
3359
3360 /*
3361 * Now we have to do something about the existing locker. If it's a
3362 * multi, sleep on it; we might be awakened before it is completely
3363 * gone (or even not sleep at all in some cases); we need to preserve
3364 * it as locker, unless it is gone completely.
3365 *
3366 * If it's not a multi, we need to check for sleeping conditions
3367 * before actually going to sleep. If the update doesn't conflict
3368 * with the locks, we just continue without sleeping (but making sure
3369 * it is preserved).
3370 *
3371 * Before sleeping, we need to acquire tuple lock to establish our
3372 * priority for the tuple (see heap_lock_tuple). LockTuple will
3373 * release us when we are next-in-line for the tuple. Note we must
3374 * not acquire the tuple lock until we're sure we're going to sleep;
3375 * otherwise we're open for race conditions with other transactions
3376 * holding the tuple lock which sleep on us.
3377 *
3378 * If we are forced to "start over" below, we keep the tuple lock;
3379 * this arranges that we stay at the head of the line while rechecking
3380 * tuple state.
3381 */
3382 if (infomask & HEAP_XMAX_IS_MULTI)
3383 {
3384 TransactionId update_xact;
3385 int remain;
3386 bool current_is_member = false;
3387
3388 if (DoesMultiXactIdConflict((MultiXactId) xwait, infomask,
3389 *lockmode, &current_is_member))
3390 {
3391 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3392
3393 /*
3394 * Acquire the lock, if necessary (but skip it when we're
3395 * requesting a lock and already have one; avoids deadlock).
3396 */
3397 if (!current_is_member)
3398 heap_acquire_tuplock(relation, &(oldtup.t_self), *lockmode,
3399 LockWaitBlock, &have_tuple_lock);
3400
3401 /* wait for multixact */
3402 MultiXactIdWait((MultiXactId) xwait, mxact_status, infomask,
3403 relation, &oldtup.t_self, XLTW_Update,
3404 &remain);
3405 checked_lockers = true;
3406 locker_remains = remain != 0;
3407 LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3408
3409 /*
3410 * If xwait had just locked the tuple then some other xact
3411 * could update this tuple before we get to this point. Check
3412 * for xmax change, and start over if so.
3413 */
3414 if (xmax_infomask_changed(oldtup.t_data->t_infomask,
3415 infomask) ||
3416 !TransactionIdEquals(HeapTupleHeaderGetRawXmax(oldtup.t_data),
3417 xwait))
3418 goto l2;
3419 }
3420
3421 /*
3422 * Note that the multixact may not be done by now. It could have
3423 * surviving members; our own xact or other subxacts of this
3424 * backend, and also any other concurrent transaction that locked
3425 * the tuple with LockTupleKeyShare if we only got
3426 * LockTupleNoKeyExclusive. If this is the case, we have to be
3427 * careful to mark the updated tuple with the surviving members in
3428 * Xmax.
3429 *
3430 * Note that there could have been another update in the
3431 * MultiXact. In that case, we need to check whether it committed
3432 * or aborted. If it aborted we are safe to update it again;
3433 * otherwise there is an update conflict, and we have to return
3434 * TableTuple{Deleted, Updated} below.
3435 *
3436 * In the LockTupleExclusive case, we still need to preserve the
3437 * surviving members: those would include the tuple locks we had
3438 * before this one, which are important to keep in case this
3439 * subxact aborts.
3440 */
3441 if (!HEAP_XMAX_IS_LOCKED_ONLY(oldtup.t_data->t_infomask))
3442 update_xact = HeapTupleGetUpdateXid(oldtup.t_data);
3443 else
3444 update_xact = InvalidTransactionId;
3445
3446 /*
3447 * There was no UPDATE in the MultiXact; or it aborted. No
3448 * TransactionIdIsInProgress() call needed here, since we called
3449 * MultiXactIdWait() above.
3450 */
3451 if (!TransactionIdIsValid(update_xact) ||
3452 TransactionIdDidAbort(update_xact))
3453 can_continue = true;
3454 }
3455 else if (TransactionIdIsCurrentTransactionId(xwait))
3456 {
3457 /*
3458 * The only locker is ourselves; we can avoid grabbing the tuple
3459 * lock here, but must preserve our locking information.
3460 */
3461 checked_lockers = true;
3462 locker_remains = true;
3463 can_continue = true;
3464 }
3465 else if (HEAP_XMAX_IS_KEYSHR_LOCKED(infomask) && key_intact)
3466 {
3467 /*
3468 * If it's just a key-share locker, and we're not changing the key
3469 * columns, we don't need to wait for it to end; but we need to
3470 * preserve it as locker.
3471 */
3472 checked_lockers = true;
3473 locker_remains = true;
3474 can_continue = true;
3475 }
3476 else
3477 {
3478 /*
3479 * Wait for regular transaction to end; but first, acquire tuple
3480 * lock.
3481 */
3483 heap_acquire_tuplock(relation, &(oldtup.t_self), *lockmode,
3484 LockWaitBlock, &have_tuple_lock);
3485 XactLockTableWait(xwait, relation, &oldtup.t_self,
3486 XLTW_Update);
3487 checked_lockers = true;
3488 LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3489
3490 /*
3491 * xwait is done, but if xwait had just locked the tuple then some
3492 * other xact could update this tuple before we get to this point.
3493 * Check for xmax change, and start over if so.
3494 */
3495 if (xmax_infomask_changed(oldtup.t_data->t_infomask, infomask) ||
3496 !TransactionIdEquals(xwait,
3497 HeapTupleHeaderGetRawXmax(oldtup.t_data)))
3498 goto l2;
3499
3500 /* Otherwise check if it committed or aborted */
3501 UpdateXmaxHintBits(oldtup.t_data, buffer, xwait);
3502 if (oldtup.t_data->t_infomask & HEAP_XMAX_INVALID)
3503 can_continue = true;
3504 }
3505
3506 if (can_continue)
3507 result = TM_Ok;
3508 else if (!ItemPointerEquals(&oldtup.t_self, &oldtup.t_data->t_ctid))
3509 result = TM_Updated;
3510 else
3511 result = TM_Deleted;
3512 }
3513
3514 /* Sanity check the result HeapTupleSatisfiesUpdate() and the logic above */
3515 if (result != TM_Ok)
3516 {
3517 Assert(result == TM_SelfModified ||
3518 result == TM_Updated ||
3519 result == TM_Deleted ||
3520 result == TM_BeingModified);
3521 Assert(!(oldtup.t_data->t_infomask & HEAP_XMAX_INVALID));
3522 Assert(result != TM_Updated ||
3523 !ItemPointerEquals(&oldtup.t_self, &oldtup.t_data->t_ctid));
3524 }
3525
3526 if (crosscheck != InvalidSnapshot && result == TM_Ok)
3527 {
3528 /* Perform additional check for transaction-snapshot mode RI updates */
3529 if (!HeapTupleSatisfiesVisibility(&oldtup, crosscheck, buffer))
3530 result = TM_Updated;
3531 }
3532
3533 if (result != TM_Ok)
3534 {
3535 tmfd->ctid = oldtup.t_data->t_ctid;
3536 tmfd->xmax = HeapTupleHeaderGetUpdateXid(oldtup.t_data);
3537 if (result == TM_SelfModified)
3538 tmfd->cmax = HeapTupleHeaderGetCmax(oldtup.t_data);
3539 else
3540 tmfd->cmax = InvalidCommandId;
3541 UnlockReleaseBuffer(buffer);
3542 if (have_tuple_lock)
3543 UnlockTupleTuplock(relation, &(oldtup.t_self), *lockmode);
3544 if (vmbuffer != InvalidBuffer)
3545 ReleaseBuffer(vmbuffer);
3546 *update_indexes = TU_None;
3547
3548 bms_free(hot_attrs);
3549 bms_free(sum_attrs);
3550 bms_free(key_attrs);
3551 bms_free(id_attrs);
3552 bms_free(modified_attrs);
3553 bms_free(interesting_attrs);
3554 return result;
3555 }
3556
3557 /*
3558 * If we didn't pin the visibility map page and the page has become all
3559 * visible while we were busy locking the buffer, or during some
3560 * subsequent window during which we had it unlocked, we'll have to unlock
3561 * and re-lock, to avoid holding the buffer lock across an I/O. That's a
3562 * bit unfortunate, especially since we'll now have to recheck whether the
3563 * tuple has been locked or updated under us, but hopefully it won't
3564 * happen very often.
3565 */
3566 if (vmbuffer == InvalidBuffer && PageIsAllVisible(page))
3567 {
3568 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3569 visibilitymap_pin(relation, block, &vmbuffer);
3570 LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3571 goto l2;
3572 }
3573
3574 /* Fill in transaction status data */
3575
3576 /*
3577 * If the tuple we're updating is locked, we need to preserve the locking
3578 * info in the old tuple's Xmax. Prepare a new Xmax value for this.
3579 */
3580 compute_new_xmax_infomask(HeapTupleHeaderGetRawXmax(oldtup.t_data),
3581 oldtup.t_data->t_infomask,
3582 oldtup.t_data->t_infomask2,
3583 xid, *lockmode, true,
3584 &xmax_old_tuple, &infomask_old_tuple,
3585 &infomask2_old_tuple);
3586
3587 /*
3588 * And also prepare an Xmax value for the new copy of the tuple. If there
3589 * was no xmax previously, or there was one but all lockers are now gone,
3590 * then use InvalidTransactionId; otherwise, get the xmax from the old
3591 * tuple. (In rare cases that might also be InvalidTransactionId and yet
3592 * not have the HEAP_XMAX_INVALID bit set; that's fine.)
3593 */
3594 if ((oldtup.t_data->t_infomask & HEAP_XMAX_INVALID) ||
3595 HEAP_LOCKED_UPGRADED(oldtup.t_data->t_infomask) ||
3596 (checked_lockers && !locker_remains))
3597 xmax_new_tuple = InvalidTransactionId;
3598 else
3599 xmax_new_tuple = HeapTupleHeaderGetRawXmax(oldtup.t_data);
3600
3601 if (!TransactionIdIsValid(xmax_new_tuple))
3602 {
3603 infomask_new_tuple = HEAP_XMAX_INVALID;
3604 infomask2_new_tuple = 0;
3605 }
3606 else
3607 {
3608 /*
3609 * If we found a valid Xmax for the new tuple, then the infomask bits
3610 * to use on the new tuple depend on what was there on the old one.
3611 * Note that since we're doing an update, the only possibility is that
3612 * the lockers had FOR KEY SHARE lock.
3613 */
3614 if (oldtup.t_data->t_infomask & HEAP_XMAX_IS_MULTI)
3615 {
3616 GetMultiXactIdHintBits(xmax_new_tuple, &infomask_new_tuple,
3617 &infomask2_new_tuple);
3618 }
3619 else
3620 {
3621 infomask_new_tuple = HEAP_XMAX_KEYSHR_LOCK | HEAP_XMAX_LOCK_ONLY;
3622 infomask2_new_tuple = 0;
3623 }
3624 }
3625
3626 /*
3627 * Prepare the new tuple with the appropriate initial values of Xmin and
3628 * Xmax, as well as initial infomask bits as computed above.
3629 */
3630 newtup->t_data->t_infomask &= ~(HEAP_XACT_MASK);
3631 newtup->t_data->t_infomask2 &= ~(HEAP2_XACT_MASK);
3632 HeapTupleHeaderSetXmin(newtup->t_data, xid);
3633 HeapTupleHeaderSetCmin(newtup->t_data, cid);
3634 newtup->t_data->t_infomask |= HEAP_UPDATED | infomask_new_tuple;
3635 newtup->t_data->t_infomask2 |= infomask2_new_tuple;
3636 HeapTupleHeaderSetXmax(newtup->t_data, xmax_new_tuple);
3637
3638 /*
3639 * Replace cid with a combo CID if necessary. Note that we already put
3640 * the plain cid into the new tuple.
3641 */
3642 HeapTupleHeaderAdjustCmax(oldtup.t_data, &cid, &iscombo);
3643
3644 /*
3645 * If the toaster needs to be activated, OR if the new tuple will not fit
3646 * on the same page as the old, then we need to release the content lock
3647 * (but not the pin!) on the old tuple's buffer while we are off doing
3648 * TOAST and/or table-file-extension work. We must mark the old tuple to
3649 * show that it's locked, else other processes may try to update it
3650 * themselves.
3651 *
3652 * We need to invoke the toaster if there are already any out-of-line
3653 * toasted values present, or if the new tuple is over-threshold.
3654 */
3655 if (relation->rd_rel->relkind != RELKIND_RELATION &&
3656 relation->rd_rel->relkind != RELKIND_MATVIEW)
3657 {
3658 /* toast table entries should never be recursively toasted */
3659 Assert(!HeapTupleHasExternal(&oldtup));
3660 Assert(!HeapTupleHasExternal(newtup));
3661 need_toast = false;
3662 }
3663 else
3664 need_toast = (HeapTupleHasExternal(&oldtup) ||
3665 HeapTupleHasExternal(newtup) ||
3666 newtup->t_len > TOAST_TUPLE_THRESHOLD);
3667
3668 pagefree = PageGetHeapFreeSpace(page);
3669
3670 newtupsize = MAXALIGN(newtup->t_len);
3671
3672 if (need_toast || newtupsize > pagefree)
3673 {
3674 TransactionId xmax_lock_old_tuple;
3675 uint16 infomask_lock_old_tuple,
3676 infomask2_lock_old_tuple;
3677 bool cleared_all_frozen = false;
3678
3679 /*
3680 * To prevent concurrent sessions from updating the tuple, we have to
3681 * temporarily mark it locked, while we release the page-level lock.
3682 *
3683 * To satisfy the rule that any xid potentially appearing in a buffer
3684 * written out to disk must first be logged in WAL, we unfortunately
3685 * have to WAL log this temporary modification. We can reuse xl_heap_lock for this
3686 * purpose. If we crash/error before following through with the
3687 * actual update, xmax will be of an aborted transaction, allowing
3688 * other sessions to proceed.
3689 */
3690
3691 /*
3692 * Compute xmax / infomask appropriate for locking the tuple. This has
3693 * to be done separately from the combo that's going to be used for
3694 * updating, because the potentially created multixact would otherwise
3695 * be wrong.
3696 */
3697 compute_new_xmax_infomask(HeapTupleHeaderGetRawXmax(oldtup.t_data),
3698 oldtup.t_data->t_infomask,
3699 oldtup.t_data->t_infomask2,
3700 xid, *lockmode, false,
3701 &xmax_lock_old_tuple, &infomask_lock_old_tuple,
3702 &infomask2_lock_old_tuple);
3703
3704 Assert(HEAP_XMAX_IS_LOCKED_ONLY(infomask_lock_old_tuple));
3705
3706 START_CRIT_SECTION();
3707
3708 /* Clear obsolete visibility flags ... */
3709 oldtup.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
3710 oldtup.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
3711 HeapTupleClearHotUpdated(&oldtup);
3712 /* ... and store info about transaction updating this tuple */
3713 Assert(TransactionIdIsValid(xmax_lock_old_tuple));
3714 HeapTupleHeaderSetXmax(oldtup.t_data, xmax_lock_old_tuple);
3715 oldtup.t_data->t_infomask |= infomask_lock_old_tuple;
3716 oldtup.t_data->t_infomask2 |= infomask2_lock_old_tuple;
3717 HeapTupleHeaderSetCmax(oldtup.t_data, cid, iscombo);
3718
3719 /* temporarily make it look not-updated, but locked */
3720 oldtup.t_data->t_ctid = oldtup.t_self;
3721
3722 /*
3723 * Clear all-frozen bit on visibility map if needed. We could
3724 * immediately reset ALL_VISIBLE, but given that the WAL logging
3725 * overhead would be unchanged, that doesn't seem necessarily
3726 * worthwhile.
3727 */
3728 if (PageIsAllVisible(page) &&
3729 visibilitymap_clear(relation, block, vmbuffer,
3730 VISIBILITYMAP_ALL_FROZEN))
3731 cleared_all_frozen = true;
3732
3733 MarkBufferDirty(buffer);
3734
3735 if (RelationNeedsWAL(relation))
3736 {
3737 xl_heap_lock xlrec;
3738 XLogRecPtr recptr;
3739
3740 XLogBeginInsert();
3741 XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
3742
3743 xlrec.offnum = ItemPointerGetOffsetNumber(&oldtup.t_self);
3744 xlrec.xmax = xmax_lock_old_tuple;
3745 xlrec.infobits_set = compute_infobits(oldtup.t_data->t_infomask,
3746 oldtup.t_data->t_infomask2);
3747 xlrec.flags =
3748 cleared_all_frozen ? XLH_LOCK_ALL_FROZEN_CLEARED : 0;
3749 XLogRegisterData((char *) &xlrec, SizeOfHeapLock);
3750 recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_LOCK);
3751 PageSetLSN(page, recptr);
3752 }
3753
3754 END_CRIT_SECTION();
3755
3756 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3757
3758 /*
3759 * Let the toaster do its thing, if needed.
3760 *
3761 * Note: below this point, heaptup is the data we actually intend to
3762 * store into the relation; newtup is the caller's original untoasted
3763 * data.
3764 */
3765 if (need_toast)
3766 {
3767 /* Note we always use WAL and FSM during updates */
3768 heaptup = heap_toast_insert_or_update(relation, newtup, &oldtup, 0);
3769 newtupsize = MAXALIGN(heaptup->t_len);
3770 }
3771 else
3772 heaptup = newtup;
3773
3774 /*
3775 * Now, do we need a new page for the tuple, or not? This is a bit
3776 * tricky since someone else could have added tuples to the page while
3777 * we weren't looking. We have to recheck the available space after
3778 * reacquiring the buffer lock. But don't bother to do that if the
3779 * former amount of free space is still not enough; it's unlikely
3780 * there's more free now than before.
3781 *
3782 * What's more, if we need to get a new page, we will need to acquire
3783 * buffer locks on both old and new pages. To avoid deadlock against
3784 * some other backend trying to get the same two locks in the other
3785 * order, we must be consistent about the order we get the locks in.
3786 * We use the rule "lock the lower-numbered page of the relation
3787 * first". To implement this, we must do RelationGetBufferForTuple
3788 * while not holding the lock on the old page, and we must rely on it
3789 * to get the locks on both pages in the correct order.
3790 *
3791 * Another consideration is that we need visibility map page pin(s) if
3792 * we will have to clear the all-visible flag on either page. If we
3793 * call RelationGetBufferForTuple, we rely on it to acquire any such
3794 * pins; but if we don't, we have to handle that here. Hence we need
3795 * a loop.
3796 */
3797 for (;;)
3798 {
3799 if (newtupsize > pagefree)
3800 {
3801 /* It doesn't fit, must use RelationGetBufferForTuple. */
3802 newbuf = RelationGetBufferForTuple(relation, heaptup->t_len,
3803 buffer, 0, NULL,
3804 &vmbuffer_new, &vmbuffer,
3805 0);
3806 /* We're all done. */
3807 break;
3808 }
3809 /* Acquire VM page pin if needed and we don't have it. */
3810 if (vmbuffer == InvalidBuffer && PageIsAllVisible(page))
3811 visibilitymap_pin(relation, block, &vmbuffer);
3812 /* Re-acquire the lock on the old tuple's page. */
3813 LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3814 /* Re-check using the up-to-date free space */
3815 pagefree = PageGetHeapFreeSpace(page);
3816 if (newtupsize > pagefree ||
3817 (vmbuffer == InvalidBuffer && PageIsAllVisible(page)))
3818 {
3819 /*
3820 * Rats, it doesn't fit anymore, or somebody just now set the
3821 * all-visible flag. We must now unlock and loop to avoid
3822 * deadlock. Fortunately, this path should seldom be taken.
3823 */
3824 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
3825 }
3826 else
3827 {
3828 /* We're all done. */
3829 newbuf = buffer;
3830 break;
3831 }
3832 }
3833 }
3834 else
3835 {
3836 /* No TOAST work needed, and it'll fit on same page */
3837 newbuf = buffer;
3838 heaptup = newtup;
3839 }
3840
3841 /*
3842 * We're about to do the actual update -- check for conflict first, to
3843 * avoid possibly having to roll back work we've just done.
3844 *
3845 * This is safe without a recheck as long as there is no possibility of
3846 * another process scanning the pages between this check and the update
3847 * being visible to the scan (i.e., exclusive buffer content lock(s) are
3848 * continuously held from this point until the tuple update is visible).
3849 *
3850 * For the new tuple the only check needed is at the relation level, but
3851 * since both tuples are in the same relation and the check for oldtup
3852 * will include checking the relation level, there is no benefit to a
3853 * separate check for the new tuple.
3854 */
3855 CheckForSerializableConflictIn(relation, &oldtup.t_self,
3856 BufferGetBlockNumber(buffer));
3857
3858 /*
3859 * At this point newbuf and buffer are both pinned and locked, and newbuf
3860 * has enough space for the new tuple. If they are the same buffer, only
3861 * one pin is held.
3862 */
3863
3864 if (newbuf == buffer)
3865 {
3866 /*
3867 * Since the new tuple is going into the same page, we might be able
3868 * to do a HOT update. Check if any of the index columns have been
3869 * changed.
3870 */
3871 if (!bms_overlap(modified_attrs, hot_attrs))
3872 {
3873 use_hot_update = true;
3874
3875 /*
3876 * If none of the columns that are used in hot-blocking indexes
3877 * were updated, we can apply HOT, but we do still need to check
3878 * if we need to update the summarizing indexes, and update those
3879 * indexes if the columns were updated, or we may fail to detect
3880 * e.g. value bound changes in BRIN minmax indexes.
3881 */
3882 if (bms_overlap(modified_attrs, sum_attrs))
3883 summarized_update = true;
3884 }
3885 }
3886 else
3887 {
3888 /* Set a hint that the old page could use prune/defrag */
3889 PageSetFull(page);
3890 }
3891
3892 /*
3893 * Compute replica identity tuple before entering the critical section so
3894 * we don't PANIC upon a memory allocation failure.
3895 * ExtractReplicaIdentity() will return NULL if nothing needs to be
3896 * logged. Pass old key required as true only if the replica identity key
3897 * columns are modified or it has external data.
3898 */
3899 old_key_tuple = ExtractReplicaIdentity(relation, &oldtup,
3900 bms_overlap(modified_attrs, id_attrs) ||
3901 id_has_external,
3902 &old_key_copied);
3903
3904 /* NO EREPORT(ERROR) from here till changes are logged */
3905 START_CRIT_SECTION();
3906
3907 /*
3908 * If this transaction commits, the old tuple will become DEAD sooner or
3909 * later. Set flag that this page is a candidate for pruning once our xid
3910 * falls below the OldestXmin horizon. If the transaction finally aborts,
3911 * the subsequent page pruning will be a no-op and the hint will be
3912 * cleared.
3913 *
3914 * XXX Should we set hint on newbuf as well? If the transaction aborts,
3915 * there would be a prunable tuple in the newbuf; but for now we choose
3916 * not to optimize for aborts. Note that heap_xlog_update must be kept in
3917 * sync if this decision changes.
3918 */
3919 PageSetPrunable(page, xid);
3920
3921 if (use_hot_update)
3922 {
3923 /* Mark the old tuple as HOT-updated */
3924 HeapTupleSetHotUpdated(&oldtup);
3925 /* And mark the new tuple as heap-only */
3926 HeapTupleSetHeapOnly(heaptup);
3927 /* Mark the caller's copy too, in case different from heaptup */
3928 HeapTupleSetHeapOnly(newtup);
3929 }
3930 else
3931 {
3932 /* Make sure tuples are correctly marked as not-HOT */
3933 HeapTupleClearHotUpdated(&oldtup);
3934 HeapTupleClearHeapOnly(heaptup);
3935 HeapTupleClearHeapOnly(newtup);
3936 }
3937
3938 RelationPutHeapTuple(relation, newbuf, heaptup, false); /* insert new tuple */
3939
3940
3941 /* Clear obsolete visibility flags, possibly set by ourselves above... */
3942 oldtup.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
3943 oldtup.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
3944 /* ... and store info about transaction updating this tuple */
3945 Assert(TransactionIdIsValid(xmax_old_tuple));
3946 HeapTupleHeaderSetXmax(oldtup.t_data, xmax_old_tuple);
3947 oldtup.t_data->t_infomask |= infomask_old_tuple;
3948 oldtup.t_data->t_infomask2 |= infomask2_old_tuple;
3949 HeapTupleHeaderSetCmax(oldtup.t_data, cid, iscombo);
3950
3951 /* record address of new tuple in t_ctid of old one */
3952 oldtup.t_data->t_ctid = heaptup->t_self;
3953
3954 /* clear PD_ALL_VISIBLE flags, reset all visibilitymap bits */
3955 if (PageIsAllVisible(BufferGetPage(buffer)))
3956 {
3957 all_visible_cleared = true;
3958 PageClearAllVisible(BufferGetPage(buffer));
3959 visibilitymap_clear(relation, BufferGetBlockNumber(buffer),
3960 vmbuffer, VISIBILITYMAP_VALID_BITS);
3961 }
3962 if (newbuf != buffer && PageIsAllVisible(BufferGetPage(newbuf)))
3963 {
3964 all_visible_cleared_new = true;
3965 PageClearAllVisible(BufferGetPage(newbuf));
3966 visibilitymap_clear(relation, BufferGetBlockNumber(newbuf),
3967 vmbuffer_new, VISIBILITYMAP_VALID_BITS);
3968 }
3969
3970 if (newbuf != buffer)
3971 MarkBufferDirty(newbuf);
3972 MarkBufferDirty(buffer);
3973
3974 /* XLOG stuff */
3975 if (RelationNeedsWAL(relation))
3976 {
3977 XLogRecPtr recptr;
3978
3979 /*
3980 * For logical decoding we need combo CIDs to properly decode the
3981 * catalog.
3982 */
3983 if (RelationIsAccessibleInLogicalDecoding(relation))
3984 {
3985 log_heap_new_cid(relation, &oldtup);
3986 log_heap_new_cid(relation, heaptup);
3987 }
3988
3989 recptr = log_heap_update(relation, buffer,
3990 newbuf, &oldtup, heaptup,
3991 old_key_tuple,
3992 all_visible_cleared,
3993 all_visible_cleared_new);
3994 if (newbuf != buffer)
3995 {
3996 PageSetLSN(BufferGetPage(newbuf), recptr);
3997 }
3998 PageSetLSN(BufferGetPage(buffer), recptr);
3999 }
4000
4001 END_CRIT_SECTION();
4002
4003 if (newbuf != buffer)
4004 LockBuffer(newbuf, BUFFER_LOCK_UNLOCK);
4005 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
4006
4007 /*
4008 * Mark old tuple for invalidation from system caches at next command
4009 * boundary, and mark the new tuple for invalidation in case we abort. We
4010 * have to do this before releasing the buffer because oldtup is in the
4011 * buffer. (heaptup is all in local memory, but it's necessary to process
4012 * both tuple versions in one call to inval.c so we can avoid redundant
4013 * sinval messages.)
4014 */
4015 CacheInvalidateHeapTuple(relation, &oldtup, heaptup);
4016
4017 /* Now we can release the buffer(s) */
4018 if (newbuf != buffer)
4019 ReleaseBuffer(newbuf);
4020 ReleaseBuffer(buffer);
4021 if (BufferIsValid(vmbuffer_new))
4022 ReleaseBuffer(vmbuffer_new);
4023 if (BufferIsValid(vmbuffer))
4024 ReleaseBuffer(vmbuffer);
4025
4026 /*
4027 * Release the lmgr tuple lock, if we had it.
4028 */
4029 if (have_tuple_lock)
4030 UnlockTupleTuplock(relation, &(oldtup.t_self), *lockmode);
4031
4032 pgstat_count_heap_update(relation, use_hot_update, newbuf != buffer);
4033
4034 /*
4035 * If heaptup is a private copy, release it. Don't forget to copy t_self
4036 * back to the caller's image, too.
4037 */
4038 if (heaptup != newtup)
4039 {
4040 newtup->t_self = heaptup->t_self;
4041 heap_freetuple(heaptup);
4042 }
4043
4044 /*
4045 * If it is a HOT update, the update may still need to update summarized
4046 * indexes, lest we fail to update those summaries and get incorrect
4047 * results (for example, minmax bounds of the block may change with this
4048 * update).
4049 */
4050 if (use_hot_update)
4051 {
4052 if (summarized_update)
4053 *update_indexes = TU_Summarizing;
4054 else
4055 *update_indexes = TU_None;
4056 }
4057 else
4058 *update_indexes = TU_All;
4059
4060 if (old_key_tuple != NULL && old_key_copied)
4061 heap_freetuple(old_key_tuple);
4062
4063 bms_free(hot_attrs);
4064 bms_free(sum_attrs);
4065 bms_free(key_attrs);
4066 bms_free(id_attrs);
4067 bms_free(modified_attrs);
4068 bms_free(interesting_attrs);
4069
4070 return TM_Ok;
4071}
4072
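/*
 * Editorial sketch (not part of heapam.c): how a hypothetical caller might
 * act on the TU_UpdateIndexes value that heap_update() reports through
 * *update_indexes.  Real callers live in the table AM wrappers and the
 * executor; the function below only illustrates the three cases.
 */
static void
example_handle_update_indexes(TU_UpdateIndexes update_indexes)
{
	switch (update_indexes)
	{
		case TU_None:
			/* HOT update: no new index entries are needed at all */
			break;
		case TU_Summarizing:
			/* HOT update, but summarizing indexes (e.g. BRIN) still need entries */
			break;
		case TU_All:
			/* regular update: insert entries into every index on the relation */
			break;
	}
}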
4073#ifdef USE_ASSERT_CHECKING
4074/*
4075 * Confirm adequate lock held during heap_update(), per rules from
4076 * README.tuplock section "Locking to write inplace-updated tables".
4077 */
4078static void
4079check_lock_if_inplace_updateable_rel(Relation relation,
4080 ItemPointer otid,
4081 HeapTuple newtup)
4082{
4083 /* LOCKTAG_TUPLE acceptable for any catalog */
4084 switch (RelationGetRelid(relation))
4085 {
4086 case RelationRelationId:
4087 case DatabaseRelationId:
4088 {
4089 LOCKTAG tuptag;
4090
4091 SET_LOCKTAG_TUPLE(tuptag,
4092 relation->rd_lockInfo.lockRelId.dbId,
4093 relation->rd_lockInfo.lockRelId.relId,
4094 ItemPointerGetBlockNumber(otid),
4095 ItemPointerGetOffsetNumber(otid));
4096 if (LockHeldByMe(&tuptag, InplaceUpdateTupleLock, false))
4097 return;
4098 }
4099 break;
4100 default:
4101 Assert(!IsInplaceUpdateRelation(relation));
4102 return;
4103 }
4104
4105 switch (RelationGetRelid(relation))
4106 {
4107 case RelationRelationId:
4108 {
4109 /* LOCKTAG_TUPLE or LOCKTAG_RELATION ok */
4110 Form_pg_class classForm = (Form_pg_class) GETSTRUCT(newtup);
4111 Oid relid = classForm->oid;
4112 Oid dbid;
4113 LOCKTAG tag;
4114
4115 if (IsSharedRelation(relid))
4116 dbid = InvalidOid;
4117 else
4118 dbid = MyDatabaseId;
4119
4120 if (classForm->relkind == RELKIND_INDEX)
4121 {
4122 Relation irel = index_open(relid, AccessShareLock);
4123
4124 SET_LOCKTAG_RELATION(tag, dbid, irel->rd_index->indrelid);
4125 index_close(irel, AccessShareLock);
4126 }
4127 else
4128 SET_LOCKTAG_RELATION(tag, dbid, relid);
4129
4130 if (!LockHeldByMe(&tag, ShareUpdateExclusiveLock, false) &&
4131 !LockHeldByMe(&tag, ShareRowExclusiveLock, true))
4132 elog(WARNING,
4133 "missing lock for relation \"%s\" (OID %u, relkind %c) @ TID (%u,%u)",
4134 NameStr(classForm->relname),
4135 relid,
4136 classForm->relkind,
4137 ItemPointerGetBlockNumber(otid),
4138 ItemPointerGetOffsetNumber(otid));
4139 }
4140 break;
4141 case DatabaseRelationId:
4142 {
4143 /* LOCKTAG_TUPLE required */
4144 Form_pg_database dbForm = (Form_pg_database) GETSTRUCT(newtup);
4145
4146 elog(WARNING,
4147 "missing lock on database \"%s\" (OID %u) @ TID (%u,%u)",
4148 NameStr(dbForm->datname),
4149 dbForm->oid,
4150 ItemPointerGetBlockNumber(otid),
4151 ItemPointerGetOffsetNumber(otid));
4152 }
4153 break;
4154 }
4155}
4156
4157/*
4158 * Confirm adequate relation lock held, per rules from README.tuplock section
4159 * "Locking to write inplace-updated tables".
4160 */
4161static void
4162check_inplace_rel_lock(HeapTuple oldtup)
4163{
4164 Form_pg_class classForm = (Form_pg_class) GETSTRUCT(oldtup);
4165 Oid relid = classForm->oid;
4166 Oid dbid;
4167 LOCKTAG tag;
4168
4169 if (IsSharedRelation(relid))
4170 dbid = InvalidOid;
4171 else
4172 dbid = MyDatabaseId;
4173
4174 if (classForm->relkind == RELKIND_INDEX)
4175 {
4176 Relation irel = index_open(relid, AccessShareLock);
4177
4178 SET_LOCKTAG_RELATION(tag, dbid, irel->rd_index->indrelid);
4179 index_close(irel, AccessShareLock);
4180 }
4181 else
4182 SET_LOCKTAG_RELATION(tag, dbid, relid);
4183
4184 if (!LockHeldByMe(&tag, ShareUpdateExclusiveLock, true))
4185 elog(WARNING,
4186 "missing lock for relation \"%s\" (OID %u, relkind %c) @ TID (%u,%u)",
4187 NameStr(classForm->relname),
4188 relid,
4189 classForm->relkind,
4190 ItemPointerGetBlockNumber(&oldtup->t_self),
4191 ItemPointerGetOffsetNumber(&oldtup->t_self));
4192}
4193#endif
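/*
 * Editorial sketch (not part of heapam.c): the discipline the two assert-time
 * checks above enforce.  Per README.tuplock, a backend that heap_update()s a
 * pg_class or pg_database row is expected to hold either an adequate relation
 * lock or the heavyweight tuple lock on that row.  One hypothetical way to
 * satisfy the tuple-lock rule (function name and hold duration simplified):
 */
static void
example_lock_catalog_row(Relation catalog, HeapTuple oldtup)
{
	/* block concurrent inplace updaters of this catalog row */
	LockTuple(catalog, &oldtup->t_self, InplaceUpdateTupleLock);

	/* ... heap_update() of the row would happen here ... */

	UnlockTuple(catalog, &oldtup->t_self, InplaceUpdateTupleLock);
}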
4194
4195/*
4196 * Check if the specified attribute's values are the same. Subroutine for
4197 * HeapDetermineColumnsInfo.
4198 */
4199static bool
4200heap_attr_equals(TupleDesc tupdesc, int attrnum, Datum value1, Datum value2,
4201 bool isnull1, bool isnull2)
4202{
4203 /*
4204 * If one value is NULL and other is not, then they are certainly not
4205 * equal
4206 */
4207 if (isnull1 != isnull2)
4208 return false;
4209
4210 /*
4211 * If both are NULL, they can be considered equal.
4212 */
4213 if (isnull1)
4214 return true;
4215
4216 /*
4217 * We do simple binary comparison of the two datums. This may be overly
4218 * strict because there can be multiple binary representations for the
4219 * same logical value. But we should be OK as long as there are no false
4220 * positives. Using a type-specific equality operator is messy because
4221 * there could be multiple notions of equality in different operator
4222 * classes; furthermore, we cannot safely invoke user-defined functions
4223 * while holding exclusive buffer lock.
4224 */
4225 if (attrnum <= 0)
4226 {
4227 /* The only allowed system columns are OIDs, so do this */
4228 return (DatumGetObjectId(value1) == DatumGetObjectId(value2));
4229 }
4230 else
4231 {
4232 CompactAttribute *att;
4233
4234 Assert(attrnum <= tupdesc->natts);
4235 att = TupleDescCompactAttr(tupdesc, attrnum - 1);
4236 return datumIsEqual(value1, value2, att->attbyval, att->attlen);
4237 }
4238}
4239
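/*
 * Editorial sketch (not part of heapam.c): the "simple binary comparison"
 * notion of equality used above, expressed with datumIsEqual().  The helper
 * below is hypothetical.  Pass-by-value types compare their Datum bits;
 * varlena types compare the full stored representation, so two logically
 * equal values with different binary representations are reported as
 * "not equal", which errs on the safe side for this caller.
 */
static bool
example_binary_equal_int32(int32 a, int32 b)
{
	/* int4 is pass-by-value with length 4 */
	return datumIsEqual(Int32GetDatum(a), Int32GetDatum(b), true, sizeof(int32));
}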
4240/*
4241 * Check which columns are being updated.
4242 *
4243 * Given an updated tuple, determine (and return into the output bitmapset),
4244 * from those listed as interesting, the set of columns that changed.
4245 *
4246 * has_external indicates if any of the unmodified attributes (from those
4247 * listed as interesting) of the old tuple is a member of external_cols and is
4248 * stored externally.
4249 */
4250 static Bitmapset *
4251 HeapDetermineColumnsInfo(Relation relation,
4252 Bitmapset *interesting_cols,
4253 Bitmapset *external_cols,
4254 HeapTuple oldtup, HeapTuple newtup,
4255 bool *has_external)
4256{
4257 int attidx;
4258 Bitmapset *modified = NULL;
4259 TupleDesc tupdesc = RelationGetDescr(relation);
4260
4261 attidx = -1;
4262 while ((attidx = bms_next_member(interesting_cols, attidx)) >= 0)
4263 {
4264 /* attidx is zero-based, attrnum is the normal attribute number */
4265 AttrNumber attrnum = attidx + FirstLowInvalidHeapAttributeNumber;
4266 Datum value1,
4267 value2;
4268 bool isnull1,
4269 isnull2;
4270
4271 /*
4272 * If it's a whole-tuple reference, say "not equal". It's not really
4273 * worth supporting this case, since it could only succeed after a
4274 * no-op update, which is hardly a case worth optimizing for.
4275 */
4276 if (attrnum == 0)
4277 {
4278 modified = bms_add_member(modified, attidx);
4279 continue;
4280 }
4281
4282 /*
4283 * Likewise, automatically say "not equal" for any system attribute
4284 * other than tableOID; we cannot expect these to be consistent in a
4285 * HOT chain, or even to be set correctly yet in the new tuple.
4286 */
4287 if (attrnum < 0)
4288 {
4289 if (attrnum != TableOidAttributeNumber)
4290 {
4291 modified = bms_add_member(modified, attidx);
4292 continue;
4293 }
4294 }
4295
4296 /*
4297 * Extract the corresponding values. XXX this is pretty inefficient
4298 * if there are many indexed columns. Should we do a single
4299 * heap_deform_tuple call on each tuple, instead? But that doesn't
4300 * work for system columns ...
4301 */
4302 value1 = heap_getattr(oldtup, attrnum, tupdesc, &isnull1);
4303 value2 = heap_getattr(newtup, attrnum, tupdesc, &isnull2);
4304
4305 if (!heap_attr_equals(tupdesc, attrnum, value1,
4306 value2, isnull1, isnull2))
4307 {
4308 modified = bms_add_member(modified, attidx);
4309 continue;
4310 }
4311
4312 /*
4313 * No need to check attributes that can't be stored externally. Note
4314 * that system attributes can't be stored externally.
4315 */
4316 if (attrnum < 0 || isnull1 ||
4317 TupleDescCompactAttr(tupdesc, attrnum - 1)->attlen != -1)
4318 continue;
4319
4320 /*
4321 * Check if the old tuple's attribute is stored externally and is a
4322 * member of external_cols.
4323 */
4324 if (VARATT_IS_EXTERNAL((struct varlena *) DatumGetPointer(value1)) &&
4325 bms_is_member(attidx, external_cols))
4326 *has_external = true;
4327 }
4328
4329 return modified;
4330}
4331
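/*
 * Editorial sketch (not part of heapam.c): the attribute-number convention of
 * the bitmapsets consumed above.  Members are attribute numbers offset by
 * FirstLowInvalidHeapAttributeNumber (from access/sysattr.h), so that system
 * attributes, which have negative attnums, can be stored in a Bitmapset too.
 * The helper below is hypothetical.
 */
static Bitmapset *
example_add_interesting_column(Bitmapset *cols, AttrNumber attrnum)
{
	return bms_add_member(cols, attrnum - FirstLowInvalidHeapAttributeNumber);
}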
4332/*
4333 * simple_heap_update - replace a tuple
4334 *
4335 * This routine may be used to update a tuple when concurrent updates of
4336 * the target tuple are not expected (for example, because we have a lock
4337 * on the relation associated with the tuple). Any failure is reported
4338 * via ereport().
4339 */
4340 void
4341 simple_heap_update(Relation relation, ItemPointer otid, HeapTuple tup,
4342 TU_UpdateIndexes *update_indexes)
4343{
4344 TM_Result result;
4345 TM_FailureData tmfd;
4346 LockTupleMode lockmode;
4347
4348 result = heap_update(relation, otid, tup,
4349 GetCurrentCommandId(true), InvalidSnapshot,
4350 true /* wait for commit */ ,
4351 &tmfd, &lockmode, update_indexes);
4352 switch (result)
4353 {
4354 case TM_SelfModified:
4355 /* Tuple was already updated in current command? */
4356 elog(ERROR, "tuple already updated by self");
4357 break;
4358
4359 case TM_Ok:
4360 /* done successfully */
4361 break;
4362
4363 case TM_Updated:
4364 elog(ERROR, "tuple concurrently updated");
4365 break;
4366
4367 case TM_Deleted:
4368 elog(ERROR, "tuple concurrently deleted");
4369 break;
4370
4371 default:
4372 elog(ERROR, "unrecognized heap_update status: %u", result);
4373 break;
4374 }
4375}
4376
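/*
 * Editorial sketch (not part of heapam.c): a minimal, hypothetical caller of
 * simple_heap_update(), assuming the relation is locked strongly enough that
 * no concurrent update of the target tuple is possible.  The modified tuple's
 * t_self still carries the old tuple's TID, which is the usual catalog-update
 * idiom; index maintenance according to update_indexes is left to the caller.
 */
static void
example_simple_update(Relation rel, HeapTuple modified_tup)
{
	TU_UpdateIndexes update_indexes;

	simple_heap_update(rel, &modified_tup->t_self, modified_tup, &update_indexes);
	/* insert new index entries here as dictated by update_indexes */
}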
4377
4378/*
4379 * Return the MultiXactStatus corresponding to the given tuple lock mode.
4380 */
4381 static MultiXactStatus
4382 get_mxact_status_for_lock(LockTupleMode mode, bool is_update)
4383{
4384 int retval;
4385
4386 if (is_update)
4387 retval = tupleLockExtraInfo[mode].updstatus;
4388 else
4389 retval = tupleLockExtraInfo[mode].lockstatus;
4390
4391 if (retval == -1)
4392 elog(ERROR, "invalid lock tuple mode %d/%s", mode,
4393 is_update ? "true" : "false");
4394
4395 return (MultiXactStatus) retval;
4396}
4397
4398/*
4399 * heap_lock_tuple - lock a tuple in shared or exclusive mode
4400 *
4401 * Note that this acquires a buffer pin, which the caller must release.
4402 *
4403 * Input parameters:
4404 * relation: relation containing tuple (caller must hold suitable lock)
4405 * tid: TID of tuple to lock
4406 * cid: current command ID (used for visibility test, and stored into
4407 * tuple's cmax if lock is successful)
4408 * mode: indicates if shared or exclusive tuple lock is desired
4409 * wait_policy: what to do if tuple lock is not available
4410 * follow_updates: if true, follow the update chain to also lock descendant
4411 * tuples.
4412 *
4413 * Output parameters:
4414 * *tuple: all fields filled in
4415 * *buffer: set to buffer holding tuple (pinned but not locked at exit)
4416 * *tmfd: filled in failure cases (see below)
4417 *
4418 * Function results are the same as the ones for table_tuple_lock().
4419 *
4420 * In the failure cases other than TM_Invisible, the routine fills
4421 * *tmfd with the tuple's t_ctid, t_xmax (resolving a possible MultiXact,
4422 * if necessary), and t_cmax (the last only for TM_SelfModified,
4423 * since we cannot obtain cmax from a combo CID generated by another
4424 * transaction).
4425 * See comments for struct TM_FailureData for additional info.
4426 *
4427 * See README.tuplock for a thorough explanation of this mechanism.
4428 */
4429 TM_Result
4430 heap_lock_tuple(Relation relation, HeapTuple tuple,
4431 CommandId cid, LockTupleMode mode, LockWaitPolicy wait_policy,
4432 bool follow_updates,
4433 Buffer *buffer, TM_FailureData *tmfd)
4434{
4435 TM_Result result;
4436 ItemPointer tid = &(tuple->t_self);
4437 ItemId lp;
4438 Page page;
4439 Buffer vmbuffer = InvalidBuffer;
4440 BlockNumber block;
4441 TransactionId xid,
4442 xmax;
4443 uint16 old_infomask,
4444 new_infomask,
4445 new_infomask2;
4446 bool first_time = true;
4447 bool skip_tuple_lock = false;
4448 bool have_tuple_lock = false;
4449 bool cleared_all_frozen = false;
4450
4451 *buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid));
4452 block = ItemPointerGetBlockNumber(tid);
4453
4454 /*
4455 * Before locking the buffer, pin the visibility map page if it appears to
4456 * be necessary. Since we haven't got the lock yet, someone else might be
4457 * in the middle of changing this, so we'll need to recheck after we have
4458 * the lock.
4459 */
4460 if (PageIsAllVisible(BufferGetPage(*buffer)))
4461 visibilitymap_pin(relation, block, &vmbuffer);
4462
4463 LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4464
4465 page = BufferGetPage(*buffer);
4466 lp = PageGetItemId(page, ItemPointerGetOffsetNumber(tid));
4467 Assert(ItemIdIsNormal(lp));
4468
4469 tuple->t_data = (HeapTupleHeader) PageGetItem(page, lp);
4470 tuple->t_len = ItemIdGetLength(lp);
4471 tuple->t_tableOid = RelationGetRelid(relation);
4472
4473l3:
4474 result = HeapTupleSatisfiesUpdate(tuple, cid, *buffer);
4475
4476 if (result == TM_Invisible)
4477 {
4478 /*
4479 * This is possible, but only when locking a tuple for ON CONFLICT
4480 * UPDATE. We return this value here rather than throwing an error in
4481 * order to give that case the opportunity to throw a more specific
4482 * error.
4483 */
4484 result = TM_Invisible;
4485 goto out_locked;
4486 }
4487 else if (result == TM_BeingModified ||
4488 result == TM_Updated ||
4489 result == TM_Deleted)
4490 {
4491 TransactionId xwait;
4492 uint16 infomask;
4493 uint16 infomask2;
4494 bool require_sleep;
4495 ItemPointerData t_ctid;
4496
4497 /* must copy state data before unlocking buffer */
4498 xwait = HeapTupleHeaderGetRawXmax(tuple->t_data);
4499 infomask = tuple->t_data->t_infomask;
4500 infomask2 = tuple->t_data->t_infomask2;
4501 ItemPointerCopy(&tuple->t_data->t_ctid, &t_ctid);
4502
4503 LockBuffer(*buffer, BUFFER_LOCK_UNLOCK);
4504
4505 /*
4506 * If any subtransaction of the current top transaction already holds
4507 * a lock as strong as or stronger than what we're requesting, we
4508 * effectively hold the desired lock already. We *must* succeed
4509 * without trying to take the tuple lock, else we will deadlock
4510 * against anyone wanting to acquire a stronger lock.
4511 *
4512 * Note we only do this the first time we loop on the HTSU result;
4513 * there is no point in testing in subsequent passes, because
4514 * evidently our own transaction cannot have acquired a new lock after
4515 * the first time we checked.
4516 */
4517 if (first_time)
4518 {
4519 first_time = false;
4520
4521 if (infomask & HEAP_XMAX_IS_MULTI)
4522 {
4523 int i;
4524 int nmembers;
4525 MultiXactMember *members;
4526
4527 /*
4528 * We don't need to allow old multixacts here; if that had
4529 * been the case, HeapTupleSatisfiesUpdate would have returned
4530 * MayBeUpdated and we wouldn't be here.
4531 */
4532 nmembers =
4533 GetMultiXactIdMembers(xwait, &members, false,
4534 HEAP_XMAX_IS_LOCKED_ONLY(infomask));
4535
4536 for (i = 0; i < nmembers; i++)
4537 {
4538 /* only consider members of our own transaction */
4539 if (!TransactionIdIsCurrentTransactionId(members[i].xid))
4540 continue;
4541
4542 if (TUPLOCK_from_mxstatus(members[i].status) >= mode)
4543 {
4544 pfree(members);
4545 result = TM_Ok;
4546 goto out_unlocked;
4547 }
4548 else
4549 {
4550 /*
4551 * Disable acquisition of the heavyweight tuple lock.
4552 * Otherwise, when promoting a weaker lock, we might
4553 * deadlock with another locker that has acquired the
4554 * heavyweight tuple lock and is waiting for our
4555 * transaction to finish.
4556 *
4557 * Note that in this case we still need to wait for
4558 * the multixact if required, to avoid acquiring
4559 * conflicting locks.
4560 */
4561 skip_tuple_lock = true;
4562 }
4563 }
4564
4565 if (members)
4566 pfree(members);
4567 }
4568 else if (TransactionIdIsCurrentTransactionId(xwait))
4569 {
4570 switch (mode)
4571 {
4572 case LockTupleKeyShare:
4573 Assert(HEAP_XMAX_IS_KEYSHR_LOCKED(infomask) ||
4574 HEAP_XMAX_IS_SHR_LOCKED(infomask) ||
4575 HEAP_XMAX_IS_EXCL_LOCKED(infomask));
4576 result = TM_Ok;
4577 goto out_unlocked;
4578 case LockTupleShare:
4579 if (HEAP_XMAX_IS_SHR_LOCKED(infomask) ||
4580 HEAP_XMAX_IS_EXCL_LOCKED(infomask))
4581 {
4582 result = TM_Ok;
4583 goto out_unlocked;
4584 }
4585 break;
4586 case LockTupleNoKeyExclusive:
4587 if (HEAP_XMAX_IS_EXCL_LOCKED(infomask))
4588 {
4589 result = TM_Ok;
4590 goto out_unlocked;
4591 }
4592 break;
4593 case LockTupleExclusive:
4594 if (HEAP_XMAX_IS_EXCL_LOCKED(infomask) &&
4595 infomask2 & HEAP_KEYS_UPDATED)
4596 {
4597 result = TM_Ok;
4598 goto out_unlocked;
4599 }
4600 break;
4601 }
4602 }
4603 }
4604
4605 /*
4606 * Initially assume that we will have to wait for the locking
4607 * transaction(s) to finish. We check various cases below in which
4608 * this can be turned off.
4609 */
4610 require_sleep = true;
4611 if (mode == LockTupleKeyShare)
4612 {
4613 /*
4614 * If we're requesting KeyShare, and there's no update present, we
4615 * don't need to wait. Even if there is an update, we can still
4616 * continue if the key hasn't been modified.
4617 *
4618 * However, if there are updates, we need to walk the update chain
4619 * to mark future versions of the row as locked, too. That way,
4620 * if somebody deletes that future version, we're protected
4621 * against the key going away. This locking of future versions
4622 * could block momentarily, if a concurrent transaction is
4623 * deleting a key; or it could return a value to the effect that
4624 * the transaction deleting the key has already committed. So we
4625 * do this before re-locking the buffer; otherwise this would be
4626 * prone to deadlocks.
4627 *
4628 * Note that the TID we're locking was grabbed before we unlocked
4629 * the buffer. For it to change while we're not looking, the
4630 * other properties we're testing for below after re-locking the
4631 * buffer would also change, in which case we would restart this
4632 * loop above.
4633 */
4634 if (!(infomask2 & HEAP_KEYS_UPDATED))
4635 {
4636 bool updated;
4637
4638 updated = !HEAP_XMAX_IS_LOCKED_ONLY(infomask);
4639
4640 /*
4641 * If there are updates, follow the update chain; bail out if
4642 * that cannot be done.
4643 */
4644 if (follow_updates && updated)
4645 {
4646 TM_Result res;
4647
4648 res = heap_lock_updated_tuple(relation, tuple, &t_ctid,
4649 GetCurrentTransactionId(),
4650 mode);
4651 if (res != TM_Ok)
4652 {
4653 result = res;
4654 /* recovery code expects to have buffer lock held */
4655 LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4656 goto failed;
4657 }
4658 }
4659
4660 LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4661
4662 /*
4663 * Make sure it's still an appropriate lock, else start over.
4664 * Also, if it wasn't updated before we released the lock, but
4665 * is updated now, we start over too; the reason is that we
4666 * now need to follow the update chain to lock the new
4667 * versions.
4668 */
4669 if (!HeapTupleHeaderIsOnlyLocked(tuple->t_data) &&
4670 ((tuple->t_data->t_infomask2 & HEAP_KEYS_UPDATED) ||
4671 !updated))
4672 goto l3;
4673
4674 /* Things look okay, so we can skip sleeping */
4675 require_sleep = false;
4676
4677 /*
4678 * Note we allow Xmax to change here; other updaters/lockers
4679 * could have modified it before we grabbed the buffer lock.
4680 * However, this is not a problem, because with the recheck we
4681 * just did we ensure that they still don't conflict with the
4682 * lock we want.
4683 */
4684 }
4685 }
4686 else if (mode == LockTupleShare)
4687 {
4688 /*
4689 * If we're requesting Share, we can similarly avoid sleeping if
4690 * there's no update and no exclusive lock present.
4691 */
4692 if (HEAP_XMAX_IS_LOCKED_ONLY(infomask) &&
4693 !HEAP_XMAX_IS_EXCL_LOCKED(infomask))
4694 {
4695 LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4696
4697 /*
4698 * Make sure it's still an appropriate lock, else start over.
4699 * See above about allowing xmax to change.
4700 */
4701 if (!HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_data->t_infomask) ||
4702 HEAP_XMAX_IS_EXCL_LOCKED(tuple->t_data->t_infomask))
4703 goto l3;
4704 require_sleep = false;
4705 }
4706 }
4707 else if (mode == LockTupleNoKeyExclusive)
4708 {
4709 /*
4710 * If we're requesting NoKeyExclusive, we might also be able to
4711 * avoid sleeping; just ensure that there is no conflicting lock
4712 * already acquired.
4713 */
4714 if (infomask & HEAP_XMAX_IS_MULTI)
4715 {
4716 if (!DoesMultiXactIdConflict((MultiXactId) xwait, infomask,
4717 mode, NULL))
4718 {
4719 /*
4720 * No conflict, but if the xmax changed under us in the
4721 * meantime, start over.
4722 */
4723 LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4724 if (xmax_infomask_changed(tuple->t_data->t_infomask, infomask) ||
4725 !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple->t_data),
4726 xwait))
4727 goto l3;
4728
4729 /* otherwise, we're good */
4730 require_sleep = false;
4731 }
4732 }
4733 else if (HEAP_XMAX_IS_KEYSHR_LOCKED(infomask))
4734 {
4735 LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4736
4737 /* if the xmax changed in the meantime, start over */
4738 if (xmax_infomask_changed(tuple->t_data->t_infomask, infomask) ||
4739 !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple->t_data),
4740 xwait))
4741 goto l3;
4742 /* otherwise, we're good */
4743 require_sleep = false;
4744 }
4745 }
4746
4747 /*
4748 * As a check independent from those above, we can also avoid sleeping
4749 * if the current transaction is the sole locker of the tuple. Note
4750 * that the strength of the lock already held is irrelevant; this is
4751 * not about recording the lock in Xmax (which will be done regardless
4752 * of this optimization, below). Also, note that the cases where we
4753 * hold a lock stronger than we are requesting are already handled
4754 * above by not doing anything.
4755 *
4756 * Note we only deal with the non-multixact case here; MultiXactIdWait
4757 * is well equipped to deal with this situation on its own.
4758 */
4759 if (require_sleep && !(infomask & HEAP_XMAX_IS_MULTI) &&
4760 TransactionIdIsCurrentTransactionId(xwait))
4761 {
4762 /* ... but if the xmax changed in the meantime, start over */
4763 LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4764 if (xmax_infomask_changed(tuple->t_data->t_infomask, infomask) ||
4765 !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple->t_data),
4766 xwait))
4767 goto l3;
4769 require_sleep = false;
4770 }
4771
4772 /*
4773 * Time to sleep on the other transaction/multixact, if necessary.
4774 *
4775 * If the other transaction is an update/delete that's already
4776 * committed, then sleeping cannot possibly do any good: if we're
4777 * required to sleep, get out to raise an error instead.
4778 *
4779 * By here, we either have already acquired the buffer exclusive lock,
4780 * or we must wait for the locking transaction or multixact; so below
4781 * we ensure that we grab buffer lock after the sleep.
4782 */
4783 if (require_sleep && (result == TM_Updated || result == TM_Deleted))
4784 {
4785 LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4786 goto failed;
4787 }
4788 else if (require_sleep)
4789 {
4790 /*
4791 * Acquire tuple lock to establish our priority for the tuple, or
4792 * die trying. LockTuple will release us when we are next-in-line
4793 * for the tuple. We must do this even if we are share-locking,
4794 * but not if we already have a weaker lock on the tuple.
4795 *
4796 * If we are forced to "start over" below, we keep the tuple lock;
4797 * this arranges that we stay at the head of the line while
4798 * rechecking tuple state.
4799 */
4800 if (!skip_tuple_lock &&
4801 !heap_acquire_tuplock(relation, tid, mode, wait_policy,
4802 &have_tuple_lock))
4803 {
4804 /*
4805 * This can only happen if wait_policy is Skip and the lock
4806 * couldn't be obtained.
4807 */
4808 result = TM_WouldBlock;
4809 /* recovery code expects to have buffer lock held */
4810 LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4811 goto failed;
4812 }
4813
4814 if (infomask & HEAP_XMAX_IS_MULTI)
4815 {
4816 MultiXactStatus status = get_mxact_status_for_lock(mode, false);
4817
4818 /* We only ever lock tuples, never update them */
4819 if (status >= MultiXactStatusNoKeyUpdate)
4820 elog(ERROR, "invalid lock mode in heap_lock_tuple");
4821
4822 /* wait for multixact to end, or die trying */
4823 switch (wait_policy)
4824 {
4825 case LockWaitBlock:
4826 MultiXactIdWait((MultiXactId) xwait, status, infomask,
4827 relation, &tuple->t_self, XLTW_Lock, NULL);
4828 break;
4829 case LockWaitSkip:
4830 if (!ConditionalMultiXactIdWait((MultiXactId) xwait,
4831 status, infomask, relation,
4832 NULL))
4833 {
4834 result = TM_WouldBlock;
4835 /* recovery code expects to have buffer lock held */
4836 LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4837 goto failed;
4838 }
4839 break;
4840 case LockWaitError:
4841 if (!ConditionalMultiXactIdWait((MultiXactId) xwait,
4842 status, infomask, relation,
4843 NULL))
4844 ereport(ERROR,
4845 (errcode(ERRCODE_LOCK_NOT_AVAILABLE),
4846 errmsg("could not obtain lock on row in relation \"%s\"",
4847 RelationGetRelationName(relation))));
4848
4849 break;
4850 }
4851
4852 /*
4853 * Of course, the multixact might not be done here: if we're
4854 * requesting a light lock mode, other transactions with light
4855 * locks could still be alive, as well as locks owned by our
4856 * own xact or other subxacts of this backend. We need to
4857 * preserve the surviving MultiXact members. Note that it
4858 * isn't absolutely necessary in the latter case, but doing so
4859 * is simpler.
4860 */
4861 }
4862 else
4863 {
4864 /* wait for regular transaction to end, or die trying */
4865 switch (wait_policy)
4866 {
4867 case LockWaitBlock:
4868 XactLockTableWait(xwait, relation, &tuple->t_self,
4869 XLTW_Lock);
4870 break;
4871 case LockWaitSkip:
4872 if (!ConditionalXactLockTableWait(xwait))
4873 {
4874 result = TM_WouldBlock;
4875 /* recovery code expects to have buffer lock held */
4876 LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4877 goto failed;
4878 }
4879 break;
4880 case LockWaitError:
4881 if (!ConditionalXactLockTableWait(xwait))
4882 ereport(ERROR,
4883 (errcode(ERRCODE_LOCK_NOT_AVAILABLE),
4884 errmsg("could not obtain lock on row in relation \"%s\"",
4885 RelationGetRelationName(relation))));
4886 break;
4887 }
4888 }
4889
4890 /* if there are updates, follow the update chain */
4891 if (follow_updates && !HEAP_XMAX_IS_LOCKED_ONLY(infomask))
4892 {
4893 TM_Result res;
4894
4895 res = heap_lock_updated_tuple(relation, tuple, &t_ctid,
4896 GetCurrentTransactionId(),
4897 mode);
4898 if (res != TM_Ok)
4899 {
4900 result = res;
4901 /* recovery code expects to have buffer lock held */
4902 LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4903 goto failed;
4904 }
4905 }
4906
4907 LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4908
4909 /*
4910 * xwait is done, but if xwait had just locked the tuple then some
4911 * other xact could update this tuple before we get to this point.
4912 * Check for xmax change, and start over if so.
4913 */
4914 if (xmax_infomask_changed(tuple->t_data->t_infomask, infomask) ||
4915 !TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple->t_data),
4916 xwait))
4917 goto l3;
4918
4919 if (!(infomask & HEAP_XMAX_IS_MULTI))
4920 {
4921 /*
4922 * Otherwise check if it committed or aborted. Note we cannot
4923 * be here if the tuple was only locked by somebody who didn't
4924 * conflict with us; that would have been handled above. So
4925 * that transaction must necessarily be gone by now. But
4926 * don't check for this in the multixact case, because some
4927 * locker transactions might still be running.
4928 */
4929 UpdateXmaxHintBits(tuple->t_data, *buffer, xwait);
4930 }
4931 }
4932
4933 /* By here, we're certain that we hold buffer exclusive lock again */
4934
4935 /*
4936 * We may lock if previous xmax aborted, or if it committed but only
4937 * locked the tuple without updating it; or if we didn't have to wait
4938 * at all for whatever reason.
4939 */
4940 if (!require_sleep ||
4941 (tuple->t_data->t_infomask & HEAP_XMAX_INVALID) ||
4942 HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_data->t_infomask) ||
4943 HeapTupleHeaderIsOnlyLocked(tuple->t_data))
4944 result = TM_Ok;
4945 else if (!ItemPointerEquals(&tuple->t_self, &tuple->t_data->t_ctid))
4946 result = TM_Updated;
4947 else
4948 result = TM_Deleted;
4949 }
4950
4951failed:
4952 if (result != TM_Ok)
4953 {
4954 Assert(result == TM_SelfModified || result == TM_Updated ||
4955 result == TM_Deleted || result == TM_WouldBlock);
4956
4957 /*
4958 * When locking a tuple under LockWaitSkip semantics and we fail with
4959 * TM_WouldBlock above, it's possible for concurrent transactions to
4960 * release the lock and set HEAP_XMAX_INVALID in the meantime. So
4961 * this assert is slightly different from the equivalent one in
4962 * heap_delete and heap_update.
4963 */
4964 Assert((result == TM_WouldBlock) ||
4965 !(tuple->t_data->t_infomask & HEAP_XMAX_INVALID));
4966 Assert(result != TM_Updated ||
4967 !ItemPointerEquals(&tuple->t_self, &tuple->t_data->t_ctid));
4968 tmfd->ctid = tuple->t_data->t_ctid;
4969 tmfd->xmax = HeapTupleHeaderGetUpdateXid(tuple->t_data);
4970 if (result == TM_SelfModified)
4971 tmfd->cmax = HeapTupleHeaderGetCmax(tuple->t_data);
4972 else
4973 tmfd->cmax = InvalidCommandId;
4974 goto out_locked;
4975 }
4976
4977 /*
4978 * If we didn't pin the visibility map page and the page has become all
4979 * visible while we were busy locking the buffer, or during some
4980 * subsequent window during which we had it unlocked, we'll have to unlock
4981 * and re-lock, to avoid holding the buffer lock across I/O. That's a bit
4982 * unfortunate, especially since we'll now have to recheck whether the
4983 * tuple has been locked or updated under us, but hopefully it won't
4984 * happen very often.
4985 */
4986 if (vmbuffer == InvalidBuffer && PageIsAllVisible(page))
4987 {
4988 LockBuffer(*buffer, BUFFER_LOCK_UNLOCK);
4989 visibilitymap_pin(relation, block, &vmbuffer);
4990 LockBuffer(*buffer, BUFFER_LOCK_EXCLUSIVE);
4991 goto l3;
4992 }
4993
4994 xmax = HeapTupleHeaderGetRawXmax(tuple->t_data);
4995 old_infomask = tuple->t_data->t_infomask;
4996
4997 /*
4998 * If this is the first possibly-multixact-able operation in the current
4999 * transaction, set my per-backend OldestMemberMXactId setting. We can be
5000 * certain that the transaction will never become a member of any older
5001 * MultiXactIds than that. (We have to do this even if we end up just
5002 * using our own TransactionId below, since some other backend could
5003 * incorporate our XID into a MultiXact immediately afterwards.)
5004 */
5005 MultiXactIdSetOldestMember();
5006
5007 /*
5008 * Compute the new xmax and infomask to store into the tuple. Note we do
5009 * not modify the tuple just yet, because that would leave it in the wrong
5010 * state if multixact.c elogs.
5011 */
5012 compute_new_xmax_infomask(xmax, old_infomask, tuple->t_data->t_infomask2,
5013 GetCurrentTransactionId(), mode, false,
5014 &xid, &new_infomask, &new_infomask2);
5015
5016 START_CRIT_SECTION();
5017
5018 /*
5019 * Store transaction information of xact locking the tuple.
5020 *
5021 * Note: Cmax is meaningless in this context, so don't set it; this avoids
5022 * possibly generating a useless combo CID. Moreover, if we're locking a
5023 * previously updated tuple, it's important to preserve the Cmax.
5024 *
5025 * Also reset the HOT UPDATE bit, but only if there's no update; otherwise
5026 * we would break the HOT chain.
5027 */
5028 tuple->t_data->t_infomask &= ~HEAP_XMAX_BITS;
5029 tuple->t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
5030 tuple->t_data->t_infomask |= new_infomask;
5031 tuple->t_data->t_infomask2 |= new_infomask2;
5032 if (HEAP_XMAX_IS_LOCKED_ONLY(new_infomask))
5033 HeapTupleHeaderClearHotUpdated(tuple->t_data);
5034 HeapTupleHeaderSetXmax(tuple->t_data, xid);
5035
5036 /*
5037 * Make sure there is no forward chain link in t_ctid. Note that in the
5038 * cases where the tuple has been updated, we must not overwrite t_ctid,
5039 * because it was set by the updater. Moreover, if the tuple has been
5040 * updated, we need to follow the update chain to lock the new versions of
5041 * the tuple as well.
5042 */
5043 if (HEAP_XMAX_IS_LOCKED_ONLY(new_infomask))
5044 tuple->t_data->t_ctid = *tid;
5045
5046 /* Clear only the all-frozen bit on visibility map if needed */
5047 if (PageIsAllVisible(page) &&
5048 visibilitymap_clear(relation, block, vmbuffer,
5049 VISIBILITYMAP_ALL_FROZEN))
5050 cleared_all_frozen = true;
5051
5052
5053 MarkBufferDirty(*buffer);
5054
5055 /*
5056 * XLOG stuff. You might think that we don't need an XLOG record because
5057 * there is no state change worth restoring after a crash. You would be
5058 * wrong however: we have just written either a TransactionId or a
5059 * MultiXactId that may never have been seen on disk before, and we need
5060 * to make sure that there are XLOG entries covering those ID numbers.
5061 * Else the same IDs might be re-used after a crash, which would be
5062 * disastrous if this page made it to disk before the crash. Essentially
5063 * we have to enforce the WAL log-before-data rule even in this case.
5064 * (Also, in a PITR log-shipping or 2PC environment, we have to have XLOG
5065 * entries for everything anyway.)
5066 */
5067 if (RelationNeedsWAL(relation))
5068 {
5069 xl_heap_lock xlrec;
5070 XLogRecPtr recptr;
5071
5072 XLogBeginInsert();
5073 XLogRegisterBuffer(0, *buffer, REGBUF_STANDARD);
5074
5075 xlrec.offnum = ItemPointerGetOffsetNumber(&tuple->t_self);
5076 xlrec.xmax = xid;
5077 xlrec.infobits_set = compute_infobits(new_infomask,
5078 tuple->t_data->t_infomask2);
5079 xlrec.flags = cleared_all_frozen ? XLH_LOCK_ALL_FROZEN_CLEARED : 0;
5080 XLogRegisterData((char *) &xlrec, SizeOfHeapLock);
5081
5082 /* we don't decode row locks atm, so no need to log the origin */
5083
5084 recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_LOCK);
5085
5086 PageSetLSN(page, recptr);
5087 }
5088
5089 END_CRIT_SECTION();
5090
5091 result = TM_Ok;
5092
5093out_locked:
5094 LockBuffer(*buffer, BUFFER_LOCK_UNLOCK);
5095
5096out_unlocked:
5097 if (BufferIsValid(vmbuffer))
5098 ReleaseBuffer(vmbuffer);
5099
5100 /*
5101 * Don't update the visibility map here. Locking a tuple doesn't change
5102 * visibility info.
5103 */
5104
5105 /*
5106 * Now that we have successfully marked the tuple as locked, we can
5107 * release the lmgr tuple lock, if we had it.
5108 */
5109 if (have_tuple_lock)
5110 UnlockTupleTuplock(relation, tid, mode);
5111
5112 return result;
5113}
5114
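/*
 * Editorial sketch (not part of heapam.c): a minimal, hypothetical caller of
 * heap_lock_tuple(), locking one row FOR UPDATE-style and waiting for any
 * conflicting transaction.  The pin on the returned buffer is held on all
 * return paths and must be released by the caller.
 */
static TM_Result
example_lock_row(Relation rel, ItemPointer tid)
{
	HeapTupleData tuple;
	Buffer		buffer;
	TM_FailureData tmfd;
	TM_Result	result;

	tuple.t_self = *tid;
	result = heap_lock_tuple(rel, &tuple,
							 GetCurrentCommandId(true),
							 LockTupleExclusive, LockWaitBlock,
							 false /* follow_updates */ ,
							 &buffer, &tmfd);
	ReleaseBuffer(buffer);
	return result;
}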
5115/*
5116 * Acquire heavyweight lock on the given tuple, in preparation for acquiring
5117 * its normal, Xmax-based tuple lock.
5118 *
5119 * have_tuple_lock is an input and output parameter: on input, it indicates
5120 * whether the lock has previously been acquired (and this function does
5121 * nothing in that case). If this function returns success, have_tuple_lock
5122 * has been flipped to true.
5123 *
5124 * Returns false if it was unable to obtain the lock; this can only happen if
5125 * wait_policy is Skip.
5126 */
5127static bool
5128 heap_acquire_tuplock(Relation relation, ItemPointer tid, LockTupleMode mode,
5129 LockWaitPolicy wait_policy, bool *have_tuple_lock)
5130{
5131 if (*have_tuple_lock)
5132 return true;
5133
5134 switch (wait_policy)
5135 {
5136 case LockWaitBlock:
5137 LockTupleTuplock(relation, tid, mode);
5138 break;
5139
5140 case LockWaitSkip:
5141 if (!ConditionalLockTupleTuplock(relation, tid, mode))
5142 return false;
5143 break;
5144
5145 case LockWaitError:
5146 if (!ConditionalLockTupleTuplock(relation, tid, mode))
5147 ereport(ERROR,
5148 (errcode(ERRCODE_LOCK_NOT_AVAILABLE),
5149 errmsg("could not obtain lock on row in relation \"%s\"",
5150 RelationGetRelationName(relation))));
5151 break;
5152 }
5153 *have_tuple_lock = true;
5154
5155 return true;
5156}
5157
5158/*
5159 * Given an original set of Xmax and infomask, and a transaction (identified by
5160 * add_to_xmax) acquiring a new lock of some mode, compute the new Xmax and
5161 * corresponding infomasks to use on the tuple.
5162 *
5163 * Note that this might have side effects such as creating a new MultiXactId.
5164 *
5165 * Most callers will have called HeapTupleSatisfiesUpdate before this function;
5166 * that will have set the HEAP_XMAX_INVALID bit if the xmax was a MultiXactId
5167 * but it was not running anymore. There is a race condition, which is that the
5168 * MultiXactId may have finished since then, but that uncommon case is handled
5169 * either here, or within MultiXactIdExpand.
5170 *
5171 * There is a similar race condition possible when the old xmax was a regular
5172 * TransactionId. We test TransactionIdIsInProgress again just to narrow the
5173 * window, but it's still possible to end up creating an unnecessary
5174 * MultiXactId. Fortunately this is harmless.
5175 */
5176static void
5177 compute_new_xmax_infomask(TransactionId xmax, uint16 old_infomask,
5178 uint16 old_infomask2, TransactionId add_to_xmax,
5179 LockTupleMode mode, bool is_update,
5180 TransactionId *result_xmax, uint16 *result_infomask,
5181 uint16 *result_infomask2)
5182{
5183 TransactionId new_xmax;
5184 uint16 new_infomask,
5185 new_infomask2;
5186
5187 Assert(TransactionIdIsCurrentTransactionId(add_to_xmax));
5188
5189l5:
5190 new_infomask = 0;
5191 new_infomask2 = 0;
5192 if (old_infomask & HEAP_XMAX_INVALID)
5193 {
5194 /*
5195 * No previous locker; we just insert our own TransactionId.
5196 *
5197 * Note that it's critical that this case be the first one checked,
5198 * because there are several blocks below that come back to this one
5199 * to implement certain optimizations; old_infomask might contain
5200 * other dirty bits in those cases, but we don't really care.
5201 */
5202 if (is_update)
5203 {
5204 new_xmax = add_to_xmax;
5205 if (mode == LockTupleExclusive)
5206 new_infomask2 |= HEAP_KEYS_UPDATED;
5207 }
5208 else
5209 {
5210 new_infomask |= HEAP_XMAX_LOCK_ONLY;
5211 switch (mode)
5212 {
5213 case LockTupleKeyShare:
5214 new_xmax = add_to_xmax;
5215 new_infomask |= HEAP_XMAX_KEYSHR_LOCK;
5216 break;
5217 case LockTupleShare:
5218 new_xmax = add_to_xmax;
5219 new_infomask |= HEAP_XMAX_SHR_LOCK;
5220 break;
5221 case LockTupleNoKeyExclusive:
5222 new_xmax = add_to_xmax;
5223 new_infomask |= HEAP_XMAX_EXCL_LOCK;
5224 break;
5225 case LockTupleExclusive:
5226 new_xmax = add_to_xmax;
5227 new_infomask |= HEAP_XMAX_EXCL_LOCK;
5228 new_infomask2 |= HEAP_KEYS_UPDATED;
5229 break;
5230 default:
5231 new_xmax = InvalidTransactionId; /* silence compiler */
5232 elog(ERROR, "invalid lock mode");
5233 }
5234 }
5235 }
5236 else if (old_infomask & HEAP_XMAX_IS_MULTI)
5237 {
5238 MultiXactStatus new_status;
5239
5240 /*
5241 * Currently we don't allow XMAX_COMMITTED to be set for multis, so
5242 * cross-check.
5243 */
5244 Assert(!(old_infomask & HEAP_XMAX_COMMITTED));
5245
5246 /*
5247 * A multixact together with LOCK_ONLY set but neither lock bit set
5248 * (i.e. a pg_upgraded share locked tuple) cannot possibly be running
5249 * anymore. This check is critical for databases upgraded by
5250 * pg_upgrade; both MultiXactIdIsRunning and MultiXactIdExpand assume
5251 * that such multis are never passed.
5252 */
5253 if (HEAP_LOCKED_UPGRADED(old_infomask))
5254 {
5255 old_infomask &= ~HEAP_XMAX_IS_MULTI;
5256 old_infomask |= HEAP_XMAX_INVALID;
5257 goto l5;
5258 }
5259
5260 /*
5261 * If the XMAX is already a MultiXactId, then we need to expand it to
5262 * include add_to_xmax; but if all the members were lockers and are
5263 * all gone, we can do away with the IS_MULTI bit and just set
5264 * add_to_xmax as the only locker/updater. If all lockers are gone
5265 * and we have an updater that aborted, we can also do without a
5266 * multi.
5267 *
5268 * The cost of doing GetMultiXactIdMembers would be paid by
5269 * MultiXactIdExpand if we weren't to do this, so this check is not
5270 * incurring extra work anyhow.
5271 */
5272 if (!MultiXactIdIsRunning(xmax, HEAP_XMAX_IS_LOCKED_ONLY(old_infomask)))
5273 {
5274 if (HEAP_XMAX_IS_LOCKED_ONLY(old_infomask) ||
5275 !TransactionIdDidCommit(MultiXactIdGetUpdateXid(xmax,
5276 old_infomask)))
5277 {
5278 /*
5279 * Reset these bits and restart; otherwise fall through to
5280 * create a new multi below.
5281 */
5282 old_infomask &= ~HEAP_XMAX_IS_MULTI;
5283 old_infomask |= HEAP_XMAX_INVALID;
5284 goto l5;
5285 }
5286 }
5287
5288 new_status = get_mxact_status_for_lock(mode, is_update);
5289
5290 new_xmax = MultiXactIdExpand((MultiXactId) xmax, add_to_xmax,
5291 new_status);
5292 GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2);
5293 }
5294 else if (old_infomask & HEAP_XMAX_COMMITTED)
5295 {
5296 /*
5297 * It's a committed update, so we need to preserve him as updater of
5298 * the tuple.
5299 */
5300 MultiXactStatus status;
5301 MultiXactStatus new_status;
5302
5303 if (old_infomask2 & HEAP_KEYS_UPDATED)
5304 status = MultiXactStatusUpdate;
5305 else
5306 status = MultiXactStatusNoKeyUpdate;
5307
5308 new_status = get_mxact_status_for_lock(mode, is_update);
5309
5310 /*
5311 * since it's not running, it's obviously impossible for the old
5312 * updater to be identical to the current one, so we need not check
5313 * for that case as we do in the block above.
5314 */
5315 new_xmax = MultiXactIdCreate(xmax, status, add_to_xmax, new_status);
5316 GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2);
5317 }
5318 else if (TransactionIdIsInProgress(xmax))
5319 {
5320 /*
5321 * If the XMAX is a valid, in-progress TransactionId, then we need to
5322 * create a new MultiXactId that includes both the old locker or
5323 * updater and our own TransactionId.
5324 */
5325 MultiXactStatus new_status;
5326 MultiXactStatus old_status;
5327 LockTupleMode old_mode;
5328
5329 if (HEAP_XMAX_IS_LOCKED_ONLY(old_infomask))
5330 {
5331 if (HEAP_XMAX_IS_KEYSHR_LOCKED(old_infomask))
5332 old_status = MultiXactStatusForKeyShare;
5333 else if (HEAP_XMAX_IS_SHR_LOCKED(old_infomask))
5334 old_status = MultiXactStatusForShare;
5335 else if (HEAP_XMAX_IS_EXCL_LOCKED(old_infomask))
5336 {
5337 if (old_infomask2 & HEAP_KEYS_UPDATED)
5338 old_status = MultiXactStatusForUpdate;
5339 else
5340 old_status = MultiXactStatusForNoKeyUpdate;
5341 }
5342 else
5343 {
5344 /*
5345 * LOCK_ONLY can be present alone only when a page has been
5346 * upgraded by pg_upgrade. But in that case,
5347 * TransactionIdIsInProgress() should have returned false. We
5348 * assume it's no longer locked in this case.
5349 */
5350 elog(WARNING, "LOCK_ONLY found for Xid in progress %u", xmax);
5351 old_infomask |= HEAP_XMAX_INVALID;
5352 old_infomask &= ~HEAP_XMAX_LOCK_ONLY;
5353 goto l5;
5354 }
5355 }
5356 else
5357 {
5358 /* it's an update, but which kind? */
5359 if (old_infomask2 & HEAP_KEYS_UPDATED)
5360 old_status = MultiXactStatusUpdate;
5361 else
5362 old_status =