PostgreSQL Source Code git master
Loading...
Searching...
No Matches
heapam.c File Reference
#include "postgres.h"
#include "access/heapam.h"
#include "access/heaptoast.h"
#include "access/hio.h"
#include "access/multixact.h"
#include "access/subtrans.h"
#include "access/syncscan.h"
#include "access/valid.h"
#include "access/visibilitymap.h"
#include "access/xloginsert.h"
#include "catalog/pg_database.h"
#include "catalog/pg_database_d.h"
#include "commands/vacuum.h"
#include "pgstat.h"
#include "port/pg_bitutils.h"
#include "storage/lmgr.h"
#include "storage/predicate.h"
#include "storage/proc.h"
#include "storage/procarray.h"
#include "utils/datum.h"
#include "utils/injection_point.h"
#include "utils/inval.h"
#include "utils/spccache.h"
#include "utils/syscache.h"
Include dependency graph for heapam.c:

Go to the source code of this file.

Data Structures

struct  IndexDeleteCounts
 

Macros

#define LOCKMODE_from_mxstatus(status)    (tupleLockExtraInfo[TUPLOCK_from_mxstatus((status))].hwlock)
 
#define LockTupleTuplock(rel, tup, mode)    LockTuple((rel), (tup), tupleLockExtraInfo[mode].hwlock)
 
#define UnlockTupleTuplock(rel, tup, mode)    UnlockTuple((rel), (tup), tupleLockExtraInfo[mode].hwlock)
 
#define ConditionalLockTupleTuplock(rel, tup, mode, log)    ConditionalLockTuple((rel), (tup), tupleLockExtraInfo[mode].hwlock, (log))
 
#define BOTTOMUP_MAX_NBLOCKS   6
 
#define BOTTOMUP_TOLERANCE_NBLOCKS   3
 
#define TUPLOCK_from_mxstatus(status)    (MultiXactStatusLock[(status)])
 
#define FRM_NOOP   0x0001
 
#define FRM_INVALIDATE_XMAX   0x0002
 
#define FRM_RETURN_IS_XID   0x0004
 
#define FRM_RETURN_IS_MULTI   0x0008
 
#define FRM_MARK_COMMITTED   0x0010
 

Typedefs

typedef struct IndexDeleteCounts IndexDeleteCounts
 

Functions

static HeapTuple heap_prepare_insert (Relation relation, HeapTuple tup, TransactionId xid, CommandId cid, int options)
 
static XLogRecPtr log_heap_update (Relation reln, Buffer oldbuf, Buffer newbuf, HeapTuple oldtup, HeapTuple newtup, HeapTuple old_key_tuple, bool all_visible_cleared, bool new_all_visible_cleared)
 
static Bitmapset * HeapDetermineColumnsInfo (Relation relation, Bitmapset *interesting_cols, Bitmapset *external_cols, HeapTuple oldtup, HeapTuple newtup, bool *has_external)
 
static bool heap_acquire_tuplock (Relation relation, const ItemPointerData *tid, LockTupleMode mode, LockWaitPolicy wait_policy, bool *have_tuple_lock)
 
static BlockNumber heapgettup_advance_block (HeapScanDesc scan, BlockNumber block, ScanDirection dir)
 
static pg_noinline BlockNumber heapgettup_initial_block (HeapScanDesc scan, ScanDirection dir)
 
static void compute_new_xmax_infomask (TransactionId xmax, uint16 old_infomask, uint16 old_infomask2, TransactionId add_to_xmax, LockTupleMode mode, bool is_update, TransactionId *result_xmax, uint16 *result_infomask, uint16 *result_infomask2)
 
static TM_Result heap_lock_updated_tuple (Relation rel, uint16 prior_infomask, TransactionId prior_raw_xmax, const ItemPointerData *prior_ctid, TransactionId xid, LockTupleMode mode)
 
static void GetMultiXactIdHintBits (MultiXactId multi, uint16 *new_infomask, uint16 *new_infomask2)
 
static TransactionId MultiXactIdGetUpdateXid (TransactionId xmax, uint16 t_infomask)
 
static bool DoesMultiXactIdConflict (MultiXactId multi, uint16 infomask, LockTupleMode lockmode, bool *current_is_member)
 
static void MultiXactIdWait (MultiXactId multi, MultiXactStatus status, uint16 infomask, Relation rel, const ItemPointerData *ctid, XLTW_Oper oper, int *remaining)
 
static bool ConditionalMultiXactIdWait (MultiXactId multi, MultiXactStatus status, uint16 infomask, Relation rel, int *remaining, bool logLockFailure)
 
static void index_delete_sort (TM_IndexDeleteOp *delstate)
 
static int bottomup_sort_and_shrink (TM_IndexDeleteOp *delstate)
 
static XLogRecPtr log_heap_new_cid (Relation relation, HeapTuple tup)
 
static HeapTuple ExtractReplicaIdentity (Relation relation, HeapTuple tp, bool key_required, bool *copy)
 
static void AssertHasSnapshotForToast (Relation rel)
 
static BlockNumber heap_scan_stream_read_next_parallel (ReadStream *stream, void *callback_private_data, void *per_buffer_data)
 
static BlockNumber heap_scan_stream_read_next_serial (ReadStream *stream, void *callback_private_data, void *per_buffer_data)
 
static BlockNumber bitmapheap_stream_read_next (ReadStream *pgsr, void *private_data, void *per_buffer_data)
 
static void initscan (HeapScanDesc scan, ScanKey key, bool keep_startblock)
 
void heap_setscanlimits (TableScanDesc sscan, BlockNumber startBlk, BlockNumber numBlks)
 
static pg_attribute_always_inline int page_collect_tuples (HeapScanDesc scan, Snapshot snapshot, Page page, Buffer buffer, BlockNumber block, int lines, bool all_visible, bool check_serializable)
 
void heap_prepare_pagescan (TableScanDesc sscan)
 
static void heap_fetch_next_buffer (HeapScanDesc scan, ScanDirection dir)
 
static Page heapgettup_start_page (HeapScanDesc scan, ScanDirection dir, int *linesleft, OffsetNumber *lineoff)
 
static Page heapgettup_continue_page (HeapScanDesc scan, ScanDirection dir, int *linesleft, OffsetNumber *lineoff)
 
static void heapgettup (HeapScanDesc scan, ScanDirection dir, int nkeys, ScanKey key)
 
static void heapgettup_pagemode (HeapScanDesc scan, ScanDirection dir, int nkeys, ScanKey key)
 
TableScanDesc heap_beginscan (Relation relation, Snapshot snapshot, int nkeys, ScanKey key, ParallelTableScanDesc parallel_scan, uint32 flags)
 
void heap_rescan (TableScanDesc sscan, ScanKey key, bool set_params, bool allow_strat, bool allow_sync, bool allow_pagemode)
 
void heap_endscan (TableScanDesc sscan)
 
HeapTuple heap_getnext (TableScanDesc sscan, ScanDirection direction)
 
bool heap_getnextslot (TableScanDesc sscan, ScanDirection direction, TupleTableSlot *slot)
 
void heap_set_tidrange (TableScanDesc sscan, ItemPointer mintid, ItemPointer maxtid)
 
bool heap_getnextslot_tidrange (TableScanDesc sscan, ScanDirection direction, TupleTableSlot *slot)
 
bool heap_fetch (Relation relation, Snapshot snapshot, HeapTuple tuple, Buffer *userbuf, bool keep_buf)
 
bool heap_hot_search_buffer (ItemPointer tid, Relation relation, Buffer buffer, Snapshot snapshot, HeapTuple heapTuple, bool *all_dead, bool first_call)
 
void heap_get_latest_tid (TableScanDesc sscan, ItemPointer tid)
 
static void UpdateXmaxHintBits (HeapTupleHeader tuple, Buffer buffer, TransactionId xid)
 
BulkInsertState GetBulkInsertState (void)
 
void FreeBulkInsertState (BulkInsertState bistate)
 
void ReleaseBulkInsertStatePin (BulkInsertState bistate)
 
void heap_insert (Relation relation, HeapTuple tup, CommandId cid, int options, BulkInsertState bistate)
 
static int heap_multi_insert_pages (HeapTuple *heaptuples, int done, int ntuples, Size saveFreeSpace)
 
void heap_multi_insert (Relation relation, TupleTableSlot **slots, int ntuples, CommandId cid, int options, BulkInsertState bistate)
 
void simple_heap_insert (Relation relation, HeapTuple tup)
 
static uint8 compute_infobits (uint16 infomask, uint16 infomask2)
 
static bool xmax_infomask_changed (uint16 new_infomask, uint16 old_infomask)
 
TM_Result heap_delete (Relation relation, const ItemPointerData *tid, CommandId cid, Snapshot crosscheck, bool wait, TM_FailureData *tmfd, bool changingPart)
 
void simple_heap_delete (Relation relation, const ItemPointerData *tid)
 
TM_Result heap_update (Relation relation, const ItemPointerData *otid, HeapTuple newtup, CommandId cid, Snapshot crosscheck, bool wait, TM_FailureData *tmfd, LockTupleMode *lockmode, TU_UpdateIndexes *update_indexes)
 
static bool heap_attr_equals (TupleDesc tupdesc, int attrnum, Datum value1, Datum value2, bool isnull1, bool isnull2)
 
void simple_heap_update (Relation relation, const ItemPointerData *otid, HeapTuple tup, TU_UpdateIndexes *update_indexes)
 
static MultiXactStatus get_mxact_status_for_lock (LockTupleMode mode, bool is_update)
 
TM_Result heap_lock_tuple (Relation relation, HeapTuple tuple, CommandId cid, LockTupleMode mode, LockWaitPolicy wait_policy, bool follow_updates, Buffer *buffer, TM_FailureData *tmfd)
 
static TM_Result test_lockmode_for_conflict (MultiXactStatus status, TransactionId xid, LockTupleMode mode, HeapTuple tup, bool *needwait)
 
static TM_Result heap_lock_updated_tuple_rec (Relation rel, TransactionId priorXmax, const ItemPointerData *tid, TransactionId xid, LockTupleMode mode)
 
void heap_finish_speculative (Relation relation, const ItemPointerData *tid)
 
void heap_abort_speculative (Relation relation, const ItemPointerData *tid)
 
bool heap_inplace_lock (Relation relation, HeapTuple oldtup_ptr, Buffer buffer, void(*release_callback)(void *), void *arg)
 
void heap_inplace_update_and_unlock (Relation relation, HeapTuple oldtup, HeapTuple tuple, Buffer buffer)
 
void heap_inplace_unlock (Relation relation, HeapTuple oldtup, Buffer buffer)
 
static TransactionId FreezeMultiXactId (MultiXactId multi, uint16 t_infomask, const struct VacuumCutoffs *cutoffs, uint16 *flags, HeapPageFreeze *pagefrz)
 
bool heap_prepare_freeze_tuple (HeapTupleHeader tuple, const struct VacuumCutoffs *cutoffs, HeapPageFreeze *pagefrz, HeapTupleFreeze *frz, bool *totally_frozen)
 
void heap_pre_freeze_checks (Buffer buffer, HeapTupleFreeze *tuples, int ntuples)
 
void heap_freeze_prepared_tuples (Buffer buffer, HeapTupleFreeze *tuples, int ntuples)
 
bool heap_freeze_tuple (HeapTupleHeader tuple, TransactionId relfrozenxid, TransactionId relminmxid, TransactionId FreezeLimit, TransactionId MultiXactCutoff)
 
TransactionId HeapTupleGetUpdateXid (const HeapTupleHeaderData *tup)
 
static bool Do_MultiXactIdWait (MultiXactId multi, MultiXactStatus status, uint16 infomask, bool nowait, Relation rel, const ItemPointerData *ctid, XLTW_Oper oper, int *remaining, bool logLockFailure)
 
bool heap_tuple_needs_eventual_freeze (HeapTupleHeader tuple)
 
bool heap_tuple_should_freeze (HeapTupleHeader tuple, const struct VacuumCutoffs *cutoffs, TransactionId *NoFreezePageRelfrozenXid, MultiXactId *NoFreezePageRelminMxid)
 
void HeapTupleHeaderAdvanceConflictHorizon (HeapTupleHeader tuple, TransactionId *snapshotConflictHorizon)
 
static void index_delete_check_htid (TM_IndexDeleteOp *delstate, Page page, OffsetNumber maxoff, const ItemPointerData *htid, TM_IndexStatus *istatus)
 
TransactionId heap_index_delete_tuples (Relation rel, TM_IndexDeleteOp *delstate)
 
static int index_delete_sort_cmp (TM_IndexDelete *deltid1, TM_IndexDelete *deltid2)
 
static int bottomup_nblocksfavorable (IndexDeleteCounts *blockgroups, int nblockgroups, TM_IndexDelete *deltids)
 
static int bottomup_sort_and_shrink_cmp (const void *arg1, const void *arg2)
 
XLogRecPtr log_heap_visible (Relation rel, Buffer heap_buffer, Buffer vm_buffer, TransactionId snapshotConflictHorizon, uint8 vmflags)
 
void HeapCheckForSerializableConflictOut (bool visible, Relation relation, HeapTuple tuple, Buffer buffer, Snapshot snapshot)
 

Variables

struct { 
 
   LOCKMODE   hwlock 
 
   int   lockstatus 
 
   int   updstatus 
 
tupleLockExtraInfo [] 
 
static const int MultiXactStatusLock [MaxMultiXactStatus+1]
 

Macro Definition Documentation

◆ BOTTOMUP_MAX_NBLOCKS

#define BOTTOMUP_MAX_NBLOCKS   6

Definition at line 189 of file heapam.c.

◆ BOTTOMUP_TOLERANCE_NBLOCKS

#define BOTTOMUP_TOLERANCE_NBLOCKS   3

Definition at line 190 of file heapam.c.

◆ ConditionalLockTupleTuplock

#define ConditionalLockTupleTuplock (   rel,
  tup,
  mode,
  log 
)     ConditionalLockTuple((rel), (tup), tupleLockExtraInfo[mode].hwlock, (log))

Definition at line 171 of file heapam.c.

179{
181 int next_item;
182 int ndeltids;
183 TM_IndexDelete *deltids;
185#endif
186
187/* heap_index_delete_tuples bottom-up index deletion costing constants */
188#define BOTTOMUP_MAX_NBLOCKS 6
189#define BOTTOMUP_TOLERANCE_NBLOCKS 3
190
191/*
192 * heap_index_delete_tuples uses this when determining which heap blocks it
193 * must visit to help its bottom-up index deletion caller
194 */
195typedef struct IndexDeleteCounts
196{
197 int16 npromisingtids; /* Number of "promising" TIDs in group */
198 int16 ntids; /* Number of TIDs in group */
199 int16 ifirsttid; /* Offset to group's first deltid */
201
202/*
203 * This table maps tuple lock strength values for each particular
204 * MultiXactStatus value.
205 */
206static const int MultiXactStatusLock[MaxMultiXactStatus + 1] =
207{
208 LockTupleKeyShare, /* ForKeyShare */
209 LockTupleShare, /* ForShare */
210 LockTupleNoKeyExclusive, /* ForNoKeyUpdate */
211 LockTupleExclusive, /* ForUpdate */
212 LockTupleNoKeyExclusive, /* NoKeyUpdate */
213 LockTupleExclusive /* Update */
214};
215
216/* Get the LockTupleMode for a given MultiXactStatus */
217#define TUPLOCK_from_mxstatus(status) \
218 (MultiXactStatusLock[(status)])
219
220/*
221 * Check that we have a valid snapshot if we might need TOAST access.
222 */
223static inline void
225{
226#ifdef USE_ASSERT_CHECKING
227
228 /* bootstrap mode in particular breaks this rule */
230 return;
231
232 /* if the relation doesn't have a TOAST table, we are good */
233 if (!OidIsValid(rel->rd_rel->reltoastrelid))
234 return;
235
237
238#endif /* USE_ASSERT_CHECKING */
239}
240
241/* ----------------------------------------------------------------
242 * heap support routines
243 * ----------------------------------------------------------------
244 */
245
246/*
247 * Streaming read API callback for parallel sequential scans. Returns the next
248 * block the caller wants from the read stream or InvalidBlockNumber when done.
249 */
250static BlockNumber
252 void *callback_private_data,
253 void *per_buffer_data)
254{
255 HeapScanDesc scan = (HeapScanDesc) callback_private_data;
256
259
260 if (unlikely(!scan->rs_inited))
261 {
262 /* parallel scan */
266 scan->rs_startblock,
267 scan->rs_numblocks);
268
269 /* may return InvalidBlockNumber if there are no more blocks */
273 scan->rs_inited = true;
274 }
275 else
276 {
279 scan->rs_base.rs_parallel);
280 }
281
282 return scan->rs_prefetch_block;
283}
284
285/*
286 * Streaming read API callback for serial sequential and TID range scans.
287 * Returns the next block the caller wants from the read stream or
288 * InvalidBlockNumber when done.
289 */
290static BlockNumber
292 void *callback_private_data,
293 void *per_buffer_data)
294{
295 HeapScanDesc scan = (HeapScanDesc) callback_private_data;
296
297 if (unlikely(!scan->rs_inited))
298 {
300 scan->rs_inited = true;
301 }
302 else
304 scan->rs_prefetch_block,
305 scan->rs_dir);
306
307 return scan->rs_prefetch_block;
308}
309
310/*
311 * Read stream API callback for bitmap heap scans.
312 * Returns the next block the caller wants from the read stream or
313 * InvalidBlockNumber when done.
314 */
315static BlockNumber
316bitmapheap_stream_read_next(ReadStream *pgsr, void *private_data,
317 void *per_buffer_data)
318{
319 TBMIterateResult *tbmres = per_buffer_data;
322 TableScanDesc sscan = &hscan->rs_base;
323
324 for (;;)
325 {
327
328 /* no more entries in the bitmap */
329 if (!tbm_iterate(&sscan->st.rs_tbmiterator, tbmres))
330 return InvalidBlockNumber;
331
332 /*
333 * Ignore any claimed entries past what we think is the end of the
334 * relation. It may have been extended after the start of our scan (we
335 * only hold an AccessShareLock, and it could be inserts from this
336 * backend). We don't take this optimization in SERIALIZABLE
337 * isolation though, as we need to examine all invisible tuples
338 * reachable by the index.
339 */
341 tbmres->blockno >= hscan->rs_nblocks)
342 continue;
343
344 return tbmres->blockno;
345 }
346
347 /* not reachable */
348 Assert(false);
349}
350
351/* ----------------
352 * initscan - scan code common to heap_beginscan and heap_rescan
353 * ----------------
354 */
355static void
357{
359 bool allow_strat;
360 bool allow_sync;
361
362 /*
363 * Determine the number of blocks we have to scan.
364 *
365 * It is sufficient to do this once at scan start, since any tuples added
366 * while the scan is in progress will be invisible to my snapshot anyway.
367 * (That is not true when using a non-MVCC snapshot. However, we couldn't
368 * guarantee to return tuples added after scan start anyway, since they
369 * might go into pages we already scanned. To guarantee consistent
370 * results for a non-MVCC snapshot, the caller must hold some higher-level
371 * lock that ensures the interesting tuple(s) won't change.)
372 */
373 if (scan->rs_base.rs_parallel != NULL)
374 {
376 scan->rs_nblocks = bpscan->phs_nblocks;
377 }
378 else
380
381 /*
382 * If the table is large relative to NBuffers, use a bulk-read access
383 * strategy and enable synchronized scanning (see syncscan.c). Although
384 * the thresholds for these features could be different, we make them the
385 * same so that there are only two behaviors to tune rather than four.
386 * (However, some callers need to be able to disable one or both of these
387 * behaviors, independently of the size of the table; also there is a GUC
388 * variable that can disable synchronized scanning.)
389 *
390 * Note that table_block_parallelscan_initialize has a very similar test;
391 * if you change this, consider changing that one, too.
392 */
394 scan->rs_nblocks > NBuffers / 4)
395 {
397 allow_sync = (scan->rs_base.rs_flags & SO_ALLOW_SYNC) != 0;
398 }
399 else
400 allow_strat = allow_sync = false;
401
402 if (allow_strat)
403 {
404 /* During a rescan, keep the previous strategy object. */
405 if (scan->rs_strategy == NULL)
407 }
408 else
409 {
410 if (scan->rs_strategy != NULL)
412 scan->rs_strategy = NULL;
413 }
414
415 if (scan->rs_base.rs_parallel != NULL)
416 {
417 /* For parallel scan, believe whatever ParallelTableScanDesc says. */
420 else
422
423 /*
424 * If not rescanning, initialize the startblock. Finding the actual
425 * start location is done in table_block_parallelscan_startblock_init,
426 * based on whether an alternative start location has been set with
427 * heap_setscanlimits, or using the syncscan location, when syncscan
428 * is enabled.
429 */
430 if (!keep_startblock)
432 }
433 else
434 {
435 if (keep_startblock)
436 {
437 /*
438 * When rescanning, we want to keep the previous startblock
439 * setting, so that rewinding a cursor doesn't generate surprising
440 * results. Reset the active syncscan setting, though.
441 */
444 else
446 }
448 {
451 }
452 else
453 {
455 scan->rs_startblock = 0;
456 }
457 }
458
460 scan->rs_inited = false;
461 scan->rs_ctup.t_data = NULL;
463 scan->rs_cbuf = InvalidBuffer;
465 scan->rs_ntuples = 0;
466 scan->rs_cindex = 0;
467
468 /*
469 * Initialize to ForwardScanDirection because it is most common and
470 * because heap scans go forward before going backward (e.g. CURSORs).
471 */
474
475 /* page-at-a-time fields are always invalid when not rs_inited */
476
477 /*
478 * copy the scan key, if appropriate
479 */
480 if (key != NULL && scan->rs_base.rs_nkeys > 0)
481 memcpy(scan->rs_base.rs_key, key, scan->rs_base.rs_nkeys * sizeof(ScanKeyData));
482
483 /*
484 * Currently, we only have a stats counter for sequential heap scans (but
485 * e.g for bitmap scans the underlying bitmap index scans will be counted,
486 * and for sample scans we update stats for tuple fetches).
487 */
488 if (scan->rs_base.rs_flags & SO_TYPE_SEQSCAN)
490}
491
492/*
493 * heap_setscanlimits - restrict range of a heapscan
494 *
495 * startBlk is the page to start at
496 * numBlks is number of pages to scan (InvalidBlockNumber means "all")
497 */
498void
500{
502
503 Assert(!scan->rs_inited); /* else too late to change */
504 /* else rs_startblock is significant */
506
507 /* Check startBlk is valid (but allow case of zero blocks...) */
508 Assert(startBlk == 0 || startBlk < scan->rs_nblocks);
509
510 scan->rs_startblock = startBlk;
511 scan->rs_numblocks = numBlks;
512}
513
514/*
515 * Per-tuple loop for heap_prepare_pagescan(). Pulled out so it can be called
516 * multiple times, with constant arguments for all_visible,
517 * check_serializable.
518 */
520static int
522 Page page, Buffer buffer,
523 BlockNumber block, int lines,
524 bool all_visible, bool check_serializable)
525{
526 Oid relid = RelationGetRelid(scan->rs_base.rs_rd);
527 int ntup = 0;
528 int nvis = 0;
530
531 /* page at a time should have been disabled otherwise */
532 Assert(IsMVCCSnapshot(snapshot));
533
534 /* first find all tuples on the page */
536 {
539
541 continue;
542
543 /*
544 * If the page is not all-visible or we need to check serializability,
545 * maintain enough state to be able to refind the tuple efficiently,
546 * without again first needing to fetch the item and then via that the
547 * tuple.
548 */
549 if (!all_visible || check_serializable)
550 {
551 tup = &batchmvcc.tuples[ntup];
552
554 tup->t_len = ItemIdGetLength(lpp);
555 tup->t_tableOid = relid;
556 ItemPointerSet(&(tup->t_self), block, lineoff);
557 }
558
559 /*
560 * If the page is all visible, these fields otherwise won't be
561 * populated in loop below.
562 */
563 if (all_visible)
564 {
566 {
567 batchmvcc.visible[ntup] = true;
568 }
569 scan->rs_vistuples[ntup] = lineoff;
570 }
571
572 ntup++;
573 }
574
576
577 /*
 578 * Unless the page is all visible, test visibility for all tuples in one go.
579 * That is considerably more efficient than calling
580 * HeapTupleSatisfiesMVCC() one-by-one.
581 */
582 if (all_visible)
583 nvis = ntup;
584 else
585 nvis = HeapTupleSatisfiesMVCCBatch(snapshot, buffer,
586 ntup,
587 &batchmvcc,
588 scan->rs_vistuples);
589
590 /*
 591 * So far we don't have a batch API for testing serializability, so do so
592 * one-by-one.
593 */
595 {
596 for (int i = 0; i < ntup; i++)
597 {
599 scan->rs_base.rs_rd,
600 &batchmvcc.tuples[i],
601 buffer, snapshot);
602 }
603 }
604
605 return nvis;
606}
607
608/*
609 * heap_prepare_pagescan - Prepare current scan page to be scanned in pagemode
610 *
611 * Preparation currently consists of 1. prune the scan's rs_cbuf page, and 2.
612 * fill the rs_vistuples[] array with the OffsetNumbers of visible tuples.
613 */
614void
616{
618 Buffer buffer = scan->rs_cbuf;
619 BlockNumber block = scan->rs_cblock;
620 Snapshot snapshot;
621 Page page;
622 int lines;
623 bool all_visible;
625
626 Assert(BufferGetBlockNumber(buffer) == block);
627
628 /* ensure we're not accidentally being used when not in pagemode */
630 snapshot = scan->rs_base.rs_snapshot;
631
632 /*
633 * Prune and repair fragmentation for the whole page, if possible.
634 */
635 heap_page_prune_opt(scan->rs_base.rs_rd, buffer, &scan->rs_vmbuffer);
636
637 /*
638 * We must hold share lock on the buffer content while examining tuple
639 * visibility. Afterwards, however, the tuples we have found to be
640 * visible are guaranteed good as long as we hold the buffer pin.
641 */
643
644 page = BufferGetPage(buffer);
645 lines = PageGetMaxOffsetNumber(page);
646
647 /*
648 * If the all-visible flag indicates that all tuples on the page are
649 * visible to everyone, we can skip the per-tuple visibility tests.
650 *
651 * Note: In hot standby, a tuple that's already visible to all
652 * transactions on the primary might still be invisible to a read-only
653 * transaction in the standby. We partly handle this problem by tracking
654 * the minimum xmin of visible tuples as the cut-off XID while marking a
655 * page all-visible on the primary and WAL log that along with the
656 * visibility map SET operation. In hot standby, we wait for (or abort)
 657 * all transactions that may potentially not see one or more tuples on
658 * the page. That's how index-only scans work fine in hot standby. A
659 * crucial difference between index-only scans and heap scans is that the
 660 * index-only scan completely relies on the visibility map whereas heap
661 * scan looks at the page-level PD_ALL_VISIBLE flag. We are not sure if
662 * the page-level flag can be trusted in the same way, because it might
663 * get propagated somehow without being explicitly WAL-logged, e.g. via a
664 * full page write. Until we can prove that beyond doubt, let's check each
665 * tuple for visibility the hard way.
666 */
667 all_visible = PageIsAllVisible(page) && !snapshot->takenDuringRecovery;
670
671 /*
672 * We call page_collect_tuples() with constant arguments, to get the
673 * compiler to constant fold the constant arguments. Separate calls with
674 * constant arguments, rather than variables, are needed on several
675 * compilers to actually perform constant folding.
676 */
677 if (likely(all_visible))
678 {
680 scan->rs_ntuples = page_collect_tuples(scan, snapshot, page, buffer,
681 block, lines, true, false);
682 else
683 scan->rs_ntuples = page_collect_tuples(scan, snapshot, page, buffer,
684 block, lines, true, true);
685 }
686 else
687 {
689 scan->rs_ntuples = page_collect_tuples(scan, snapshot, page, buffer,
690 block, lines, false, false);
691 else
692 scan->rs_ntuples = page_collect_tuples(scan, snapshot, page, buffer,
693 block, lines, false, true);
694 }
695
697}
698
699/*
700 * heap_fetch_next_buffer - read and pin the next block from MAIN_FORKNUM.
701 *
702 * Read the next block of the scan relation from the read stream and save it
703 * in the scan descriptor. It is already pinned.
704 */
705static inline void
707{
708 Assert(scan->rs_read_stream);
709
710 /* release previous scan buffer, if any */
711 if (BufferIsValid(scan->rs_cbuf))
712 {
713 ReleaseBuffer(scan->rs_cbuf);
714 scan->rs_cbuf = InvalidBuffer;
715 }
716
717 /*
718 * Be sure to check for interrupts at least once per page. Checks at
719 * higher code levels won't be able to stop a seqscan that encounters many
720 * pages' worth of consecutive dead tuples.
721 */
723
724 /*
725 * If the scan direction is changing, reset the prefetch block to the
726 * current block. Otherwise, we will incorrectly prefetch the blocks
727 * between the prefetch block and the current block again before
728 * prefetching blocks in the new, correct scan direction.
729 */
730 if (unlikely(scan->rs_dir != dir))
731 {
732 scan->rs_prefetch_block = scan->rs_cblock;
734 }
735
736 scan->rs_dir = dir;
737
739 if (BufferIsValid(scan->rs_cbuf))
741}
742
743/*
744 * heapgettup_initial_block - return the first BlockNumber to scan
745 *
746 * Returns InvalidBlockNumber when there are no blocks to scan. This can
747 * occur with empty tables and in parallel scans when parallel workers get all
748 * of the pages before we can get a chance to get our first page.
749 */
752{
753 Assert(!scan->rs_inited);
754 Assert(scan->rs_base.rs_parallel == NULL);
755
756 /* When there are no pages to scan, return InvalidBlockNumber */
757 if (scan->rs_nblocks == 0 || scan->rs_numblocks == 0)
758 return InvalidBlockNumber;
759
760 if (ScanDirectionIsForward(dir))
761 {
762 return scan->rs_startblock;
763 }
764 else
765 {
766 /*
767 * Disable reporting to syncscan logic in a backwards scan; it's not
768 * very likely anyone else is doing the same thing at the same time,
769 * and much more likely that we'll just bollix things for forward
770 * scanners.
771 */
773
774 /*
775 * Start from last page of the scan. Ensure we take into account
776 * rs_numblocks if it's been adjusted by heap_setscanlimits().
777 */
778 if (scan->rs_numblocks != InvalidBlockNumber)
779 return (scan->rs_startblock + scan->rs_numblocks - 1) % scan->rs_nblocks;
780
781 if (scan->rs_startblock > 0)
782 return scan->rs_startblock - 1;
783
784 return scan->rs_nblocks - 1;
785 }
786}
787
788
789/*
790 * heapgettup_start_page - helper function for heapgettup()
791 *
792 * Return the next page to scan based on the scan->rs_cbuf and set *linesleft
793 * to the number of tuples on this page. Also set *lineoff to the first
794 * offset to scan with forward scans getting the first offset and backward
795 * getting the final offset on the page.
796 */
797static Page
800{
801 Page page;
802
803 Assert(scan->rs_inited);
805
806 /* Caller is responsible for ensuring buffer is locked if needed */
807 page = BufferGetPage(scan->rs_cbuf);
808
810
811 if (ScanDirectionIsForward(dir))
813 else
815
816 /* lineoff now references the physically previous or next tid */
817 return page;
818}
819
820
821/*
822 * heapgettup_continue_page - helper function for heapgettup()
823 *
824 * Return the next page to scan based on the scan->rs_cbuf and set *linesleft
825 * to the number of tuples left to scan on this page. Also set *lineoff to
826 * the next offset to scan according to the ScanDirection in 'dir'.
827 */
828static inline Page
831{
832 Page page;
833
834 Assert(scan->rs_inited);
836
837 /* Caller is responsible for ensuring buffer is locked if needed */
838 page = BufferGetPage(scan->rs_cbuf);
839
840 if (ScanDirectionIsForward(dir))
841 {
843 *linesleft = PageGetMaxOffsetNumber(page) - (*lineoff) + 1;
844 }
845 else
846 {
847 /*
848 * The previous returned tuple may have been vacuumed since the
849 * previous scan when we use a non-MVCC snapshot, so we must
850 * re-establish the lineoff <= PageGetMaxOffsetNumber(page) invariant
851 */
853 *linesleft = *lineoff;
854 }
855
856 /* lineoff now references the physically previous or next tid */
857 return page;
858}
859
860/*
861 * heapgettup_advance_block - helper for heap_fetch_next_buffer()
862 *
863 * Given the current block number, the scan direction, and various information
864 * contained in the scan descriptor, calculate the BlockNumber to scan next
865 * and return it. If there are no further blocks to scan, return
866 * InvalidBlockNumber to indicate this fact to the caller.
867 *
868 * This should not be called to determine the initial block number -- only for
869 * subsequent blocks.
870 *
871 * This also adjusts rs_numblocks when a limit has been imposed by
872 * heap_setscanlimits().
873 */
874static inline BlockNumber
876{
877 Assert(scan->rs_base.rs_parallel == NULL);
878
880 {
881 block++;
882
883 /* wrap back to the start of the heap */
884 if (block >= scan->rs_nblocks)
885 block = 0;
886
887 /*
888 * Report our new scan position for synchronization purposes. We don't
889 * do that when moving backwards, however. That would just mess up any
890 * other forward-moving scanners.
891 *
892 * Note: we do this before checking for end of scan so that the final
893 * state of the position hint is back at the start of the rel. That's
894 * not strictly necessary, but otherwise when you run the same query
895 * multiple times the starting position would shift a little bit
896 * backwards on every invocation, which is confusing. We don't
897 * guarantee any specific ordering in general, though.
898 */
899 if (scan->rs_base.rs_flags & SO_ALLOW_SYNC)
900 ss_report_location(scan->rs_base.rs_rd, block);
901
902 /* we're done if we're back at where we started */
903 if (block == scan->rs_startblock)
904 return InvalidBlockNumber;
905
906 /* check if the limit imposed by heap_setscanlimits() is met */
907 if (scan->rs_numblocks != InvalidBlockNumber)
908 {
909 if (--scan->rs_numblocks == 0)
910 return InvalidBlockNumber;
911 }
912
913 return block;
914 }
915 else
916 {
917 /* we're done if the last block is the start position */
918 if (block == scan->rs_startblock)
919 return InvalidBlockNumber;
920
921 /* check if the limit imposed by heap_setscanlimits() is met */
922 if (scan->rs_numblocks != InvalidBlockNumber)
923 {
924 if (--scan->rs_numblocks == 0)
925 return InvalidBlockNumber;
926 }
927
928 /* wrap to the end of the heap when the last page was page 0 */
929 if (block == 0)
930 block = scan->rs_nblocks;
931
932 block--;
933
934 return block;
935 }
936}
937
938/* ----------------
939 * heapgettup - fetch next heap tuple
940 *
941 * Initialize the scan if not already done; then advance to the next
942 * tuple as indicated by "dir"; return the next tuple in scan->rs_ctup,
943 * or set scan->rs_ctup.t_data = NULL if no more tuples.
944 *
945 * Note: the reason nkeys/key are passed separately, even though they are
946 * kept in the scan descriptor, is that the caller may not want us to check
947 * the scankeys.
948 *
949 * Note: when we fall off the end of the scan in either direction, we
950 * reset rs_inited. This means that a further request with the same
951 * scan direction will restart the scan, which is a bit odd, but a
952 * request with the opposite scan direction will start a fresh scan
953 * in the proper direction. The latter is required behavior for cursors,
954 * while the former case is generally undefined behavior in Postgres
955 * so we don't care too much.
956 * ----------------
957 */
/*
 * NOTE(review): scraped text with embedded upstream line numbers; gaps in
 * the numbering (e.g. 959, 966, 972, 989-991, 1005, 1018, 1028, 1032, 1041)
 * mean original lines are MISSING here (the function-name/first-parameter
 * line, the OffsetNumber/lineoff declaration, buffer lock/unlock calls,
 * presumably — TODO confirm against upstream heapam.c).  Code tokens below
 * are preserved byte-for-byte; only comments were added.
 *
 * Purpose (per the comment block above this function): advance a
 * non-pagemode heap scan and return the next qualifying tuple in
 * scan->rs_ctup, or set rs_ctup.t_data = NULL at end of scan.
 */
958static void
 960 ScanDirection dir,
 961 int nkeys,
 962 ScanKey key)
 963{
 964 HeapTuple tuple = &(scan->rs_ctup);
 965 Page page;
 967 int linesleft;
 968
 969 if (likely(scan->rs_inited))
 970 {
 971 /* continue from previously returned page/tuple */
 973 page = heapgettup_continue_page(scan, dir, &linesleft, &lineoff);
 974 goto continue_page;
 975 }
 976
 977 /*
 978 * advance the scan until we find a qualifying tuple or run out of stuff
 979 * to scan
 980 */
 981 while (true)
 982 {
 983 heap_fetch_next_buffer(scan, dir);
 984
 985 /* did we run out of blocks to scan? */
 986 if (!BufferIsValid(scan->rs_cbuf))
 987 break;
 988
 990
 992 page = heapgettup_start_page(scan, dir, &linesleft, &lineoff);
 994
 995 /*
 996 * Only continue scanning the page while we have lines left.
 997 *
 998 * Note that this protects us from accessing line pointers past
 999 * PageGetMaxOffsetNumber(); both for forward scans when we resume the
 1000 * table scan, and for when we start scanning a new page.
 1001 */
 1002 for (; linesleft > 0; linesleft--, lineoff += dir)
 1003 {
 1004 bool visible;
 1006
 1007 if (!ItemIdIsNormal(lpp))
 1008 continue;
 1009
 1010 tuple->t_data = (HeapTupleHeader) PageGetItem(page, lpp);
 1011 tuple->t_len = ItemIdGetLength(lpp);
 1012 ItemPointerSet(&(tuple->t_self), scan->rs_cblock, lineoff);
 1013
 1014 visible = HeapTupleSatisfiesVisibility(tuple,
 1015 scan->rs_base.rs_snapshot,
 1016 scan->rs_cbuf);
 1017
 1019 tuple, scan->rs_cbuf,
 1020 scan->rs_base.rs_snapshot);
 1021
 1022 /* skip tuples not visible to this snapshot */
 1023 if (!visible)
 1024 continue;
 1025
 1026 /* skip any tuples that don't match the scan key */
 1027 if (key != NULL &&
 1029 nkeys, key))
 1030 continue;
 1031
 1033 scan->rs_coffset = lineoff;
 1034 return;
 1035 }
 1036
 1037 /*
 1038 * if we get here, it means we've exhausted the items on this page and
 1039 * it's time to move to the next.
 1040 */
 1042 }
 1043
 1044 /* end of scan: drop the buffer pin and reset scan state */
 1045 if (BufferIsValid(scan->rs_cbuf))
 1046 ReleaseBuffer(scan->rs_cbuf);
 1047
 1048 scan->rs_cbuf = InvalidBuffer;
 1051 tuple->t_data = NULL;
 1052 scan->rs_inited = false;
 1053}
1054
1055/* ----------------
1056 * heapgettup_pagemode - fetch next heap tuple in page-at-a-time mode
1057 *
1058 * Same API as heapgettup, but used in page-at-a-time mode
1059 *
1060 * The internal logic is much the same as heapgettup's too, but there are some
1061 * differences: we do not take the buffer content lock (that only needs to
1062 * happen inside heap_prepare_pagescan), and we iterate through just the
1063 * tuples listed in rs_vistuples[] rather than all tuples on the page. Notice
1064 * that lineindex is 0-based, where the corresponding loop variable lineoff in
1065 * heapgettup is 1-based.
1066 * ----------------
1067 */
/*
 * NOTE(review): scraped text with embedded upstream line numbers; gaps
 * (e.g. 1069, 1076-1077, 1106, 1109, 1112, 1115, 1118, 1123, 1126, 1128)
 * indicate MISSING original lines (name/parameter line, lineindex/lineoff
 * declarations, heap_prepare_pagescan() call, presumably — TODO confirm
 * against upstream).  Code tokens preserved byte-for-byte; comments only.
 *
 * Purpose (per the comment block above): page-at-a-time variant of
 * heapgettup(); iterates only over rs_vistuples[] without taking the
 * buffer content lock.  lineindex is 0-based.
 */
1068static void
 1070 ScanDirection dir,
 1071 int nkeys,
 1072 ScanKey key)
 1073{
 1074 HeapTuple tuple = &(scan->rs_ctup);
 1075 Page page;
 1078
 1079 if (likely(scan->rs_inited))
 1080 {
 1081 /* continue from previously returned page/tuple */
 1082 page = BufferGetPage(scan->rs_cbuf);
 1083
 1084 lineindex = scan->rs_cindex + dir;
 1085 if (ScanDirectionIsForward(dir))
 1086 linesleft = scan->rs_ntuples - lineindex;
 1087 else
 1088 linesleft = scan->rs_cindex;
 1089 /* lineindex now references the next or previous visible tid */
 1090
 1091 goto continue_page;
 1092 }
 1093
 1094 /*
 1095 * advance the scan until we find a qualifying tuple or run out of stuff
 1096 * to scan
 1097 */
 1098 while (true)
 1099 {
 1100 heap_fetch_next_buffer(scan, dir);
 1101
 1102 /* did we run out of blocks to scan? */
 1103 if (!BufferIsValid(scan->rs_cbuf))
 1104 break;
 1105
 1107
 1108 /* prune the page and determine visible tuple offsets */
 1110 page = BufferGetPage(scan->rs_cbuf);
 1111 linesleft = scan->rs_ntuples;
 1113
 1114 /* block is the same for all tuples, set it once outside the loop */
 1116
 1117 /* lineindex now references the next or previous visible tid */
 1119
 1120 for (; linesleft > 0; linesleft--, lineindex += dir)
 1121 {
 1122 ItemId lpp;
 1124
 1125 Assert(lineindex < scan->rs_ntuples);
 1127 lpp = PageGetItemId(page, lineoff);
 1129
 1130 tuple->t_data = (HeapTupleHeader) PageGetItem(page, lpp);
 1131 tuple->t_len = ItemIdGetLength(lpp);
 1133
 1134 /* skip any tuples that don't match the scan key */
 1135 if (key != NULL &&
 1137 nkeys, key))
 1138 continue;
 1139
 1140 scan->rs_cindex = lineindex;
 1141 return;
 1142 }
 1143 }
 1144
 1145 /* end of scan: drop the pin and reset scan state */
 1146 if (BufferIsValid(scan->rs_cbuf))
 1147 ReleaseBuffer(scan->rs_cbuf);
 1148 scan->rs_cbuf = InvalidBuffer;
 1151 tuple->t_data = NULL;
 1152 scan->rs_inited = false;
 1153}
1154
1155
1156/* ----------------------------------------------------------------
1157 * heap access method interface
1158 * ----------------------------------------------------------------
1159 */
1160
1161
/*
 * heap_beginscan - begin a heap relation scan, returning a TableScanDesc.
 *
 * NOTE(review): scraped text with embedded upstream line numbers; gaps
 * (e.g. 1162, 1177, 1184, 1193, 1207, 1212, 1215, 1233, 1252, 1254,
 * 1275, 1277, 1280, 1282, 1291-1292, 1295, 1302-1303, 1306-1307) mean
 * original lines are MISSING (the TableScanDesc return-type line, the
 * RelationIncrementReferenceCount call, palloc of the descriptor, read
 * stream callback setup, presumably — TODO confirm against upstream).
 * Code tokens preserved byte-for-byte; comments only.
 */
1163heap_beginscan(Relation relation, Snapshot snapshot,
 1164 int nkeys, ScanKey key,
 1165 ParallelTableScanDesc parallel_scan,
 1166 uint32 flags)
 1167{
 1168 HeapScanDesc scan;
 1169
 1170 /*
 1171 * increment relation ref count while scanning relation
 1172 *
 1173 * This is just to make really sure the relcache entry won't go away while
 1174 * the scan has a pointer to it. Caller should be holding the rel open
 1175 * anyway, so this is redundant in all normal scenarios...
 1176 */
 1178
 1179 /*
 1180 * allocate and initialize scan descriptor
 1181 */
 1182 if (flags & SO_TYPE_BITMAPSCAN)
 1183 {
 1185
 1186 /*
 1187 * Bitmap Heap scans do not have any fields that a normal Heap Scan
 1188 * does not have, so no special initializations required here.
 1189 */
 1190 scan = (HeapScanDesc) bscan;
 1191 }
 1192 else
 1194
 1195 scan->rs_base.rs_rd = relation;
 1196 scan->rs_base.rs_snapshot = snapshot;
 1197 scan->rs_base.rs_nkeys = nkeys;
 1198 scan->rs_base.rs_flags = flags;
 1199 scan->rs_base.rs_parallel = parallel_scan;
 1200 scan->rs_strategy = NULL; /* set in initscan */
 1201 scan->rs_cbuf = InvalidBuffer;
 1202
 1203 /*
 1204 * Disable page-at-a-time mode if it's not a MVCC-safe snapshot.
 1205 */
 1206 if (!(snapshot && IsMVCCSnapshot(snapshot)))
 1208
 1209 /* Check that a historic snapshot is not used for non-catalog tables */
 1210 if (snapshot &&
 1211 IsHistoricMVCCSnapshot(snapshot) &&
 1213 {
 1214 ereport(ERROR,
 1216 errmsg("cannot query non-catalog table \"%s\" during logical decoding",
 1217 RelationGetRelationName(relation))));
 1218 }
 1219
 1220 /*
 1221 * For seqscan and sample scans in a serializable transaction, acquire a
 1222 * predicate lock on the entire relation. This is required not only to
 1223 * lock all the matching tuples, but also to conflict with new insertions
 1224 * into the table. In an indexscan, we take page locks on the index pages
 1225 * covering the range specified in the scan qual, but in a heap scan there
 1226 * is nothing more fine-grained to lock. A bitmap scan is a different
 1227 * story, there we have already scanned the index and locked the index
 1228 * pages covering the predicate. But in that case we still have to lock
 1229 * any matching heap tuples. For sample scan we could optimize the locking
 1230 * to be at least page-level granularity, but we'd need to add per-tuple
 1231 * locking for that.
 1232 */
 1234 {
 1235 /*
 1236 * Ensure a missing snapshot is noticed reliably, even if the
 1237 * isolation mode means predicate locking isn't performed (and
 1238 * therefore the snapshot isn't used here).
 1239 */
 1240 Assert(snapshot);
 1241 PredicateLockRelation(relation, snapshot);
 1242 }
 1243
 1244 /* we only need to set this up once */
 1245 scan->rs_ctup.t_tableOid = RelationGetRelid(relation);
 1246
 1247 /*
 1248 * Allocate memory to keep track of page allocation for parallel workers
 1249 * when doing a parallel scan.
 1250 */
 1251 if (parallel_scan != NULL)
 1253 else
 1255
 1256 /*
 1257 * we do this here instead of in initscan() because heap_rescan also calls
 1258 * initscan() and we don't want to allocate memory again
 1259 */
 1260 if (nkeys > 0)
 1261 scan->rs_base.rs_key = palloc_array(ScanKeyData, nkeys);
 1262 else
 1263 scan->rs_base.rs_key = NULL;
 1264
 1265 initscan(scan, key, false);
 1266
 1267 scan->rs_read_stream = NULL;
 1268
 1269 /*
 1270 * Set up a read stream for sequential scans and TID range scans. This
 1271 * should be done after initscan() because initscan() allocates the
 1272 * BufferAccessStrategy object passed to the read stream API.
 1273 */
 1274 if (scan->rs_base.rs_flags & SO_TYPE_SEQSCAN ||
 1276 {
 1278
 1279 if (scan->rs_base.rs_parallel)
 1281 else
 1283
 1284 /* ---
 1285 * It is safe to use batchmode as the only locks taken by `cb`
 1286 * are never taken while waiting for IO:
 1287 * - SyncScanLock is used in the non-parallel case
 1288 * - in the parallel case, only spinlocks and atomics are used
 1289 * ---
 1290 */
 1293 scan->rs_strategy,
 1294 scan->rs_base.rs_rd,
 1296 cb,
 1297 scan,
 1298 0);
 1299 }
 1300 else if (scan->rs_base.rs_flags & SO_TYPE_BITMAPSCAN)
 1301 {
 1304 scan->rs_strategy,
 1305 scan->rs_base.rs_rd,
 1308 scan,
 1309 sizeof(TBMIterateResult));
 1310 }
 1311
 1312 scan->rs_vmbuffer = InvalidBuffer;
 1313
 1314 return (TableScanDesc) scan;
 1315}
1316
/*
 * heap_rescan - restart a heap scan, optionally updating scan parameters.
 *
 * NOTE(review): scraped text; gaps in the embedded numbering (e.g. 1318,
 * 1321, 1326, 1328, 1331, 1333, 1336-1337, 1339, 1353, 1368) mean original
 * lines are MISSING (the function-name line, the flag set/clear statements
 * for SO_ALLOW_STRAT/SYNC/PAGEMODE, the vmbuffer release, the
 * read_stream_reset call, presumably — TODO confirm against upstream).
 * Code tokens preserved byte-for-byte; comments only.
 */
1317void
 1319 bool allow_strat, bool allow_sync, bool allow_pagemode)
 1320{
 1322
 1323 if (set_params)
 1324 {
 1325 if (allow_strat)
 1327 else
 1329
 1330 if (allow_sync)
 1332 else
 1334
 1335 if (allow_pagemode && scan->rs_base.rs_snapshot &&
 1338 else
 1340 }
 1341
 1342 /*
 1343 * unpin scan buffers
 1344 */
 1345 if (BufferIsValid(scan->rs_cbuf))
 1346 {
 1347 ReleaseBuffer(scan->rs_cbuf);
 1348 scan->rs_cbuf = InvalidBuffer;
 1349 }
 1350
 1351 if (BufferIsValid(scan->rs_vmbuffer))
 1352 {
 1354 scan->rs_vmbuffer = InvalidBuffer;
 1355 }
 1356
 1357 /*
 1358 * SO_TYPE_BITMAPSCAN would be cleaned up here, but it does not hold any
 1359 * additional data vs a normal HeapScan
 1360 */
 1361
 1362 /*
 1363 * The read stream is reset on rescan. This must be done before
 1364 * initscan(), as some state referred to by read_stream_reset() is reset
 1365 * in initscan().
 1366 */
 1367 if (scan->rs_read_stream)
 1369
 1370 /*
 1371 * reinitialize scan descriptor
 1372 */
 1373 initscan(scan, key, true);
 1374}
1375
/*
 * heap_endscan - release all resources held by a heap scan and free the
 * scan descriptor.
 *
 * NOTE(review): scraped text; gaps in the embedded numbering (e.g. 1377,
 * 1379, 1390, 1396, 1401, 1407, 1410, 1413) mean original lines are MISSING
 * (the function-name line, the vmbuffer release, read_stream_end,
 * RelationDecrementReferenceCount, FreeAccessStrategy, pfree of
 * parallel-worker data, snapshot unregister — presumably; TODO confirm
 * against upstream).  Code tokens preserved byte-for-byte; comments only.
 */
1376void
 1378{
 1380
 1381 /* Note: no locking manipulations needed */
 1382
 1383 /*
 1384 * unpin scan buffers
 1385 */
 1386 if (BufferIsValid(scan->rs_cbuf))
 1387 ReleaseBuffer(scan->rs_cbuf);
 1388
 1389 if (BufferIsValid(scan->rs_vmbuffer))
 1391
 1392 /*
 1393 * Must free the read stream before freeing the BufferAccessStrategy.
 1394 */
 1395 if (scan->rs_read_stream)
 1397
 1398 /*
 1399 * decrement relation reference count and free scan descriptor storage
 1400 */
 1402
 1403 if (scan->rs_base.rs_key)
 1404 pfree(scan->rs_base.rs_key);
 1405
 1406 if (scan->rs_strategy != NULL)
 1408
 1409 if (scan->rs_parallelworkerdata != NULL)
 1411
 1412 if (scan->rs_base.rs_flags & SO_TEMP_SNAPSHOT)
 1414
 1415 pfree(scan);
 1416}
1417
/*
 * NOTE(review): the entire function signature (upstream lines 1418-1419)
 * is MISSING from this scrape; from the body (returns &scan->rs_ctup or
 * NULL, dispatches on SO_ALLOW_PAGEMODE) this is presumably
 * HeapTuple heap_getnext(TableScanDesc sscan, ScanDirection direction)
 * — TODO confirm against upstream.  Also missing: the local descriptor
 * cast (1421), the ereport errcode line (1432), the pagemode flag test
 * (1437), and the pgstat counter (1452).  Code tokens preserved
 * byte-for-byte; comments only.
 */
1420{
 1422
 1423 /*
 1424 * This is still widely used directly, without going through table AM, so
 1425 * add a safety check. It's possible we should, at a later point,
 1426 * downgrade this to an assert. The reason for checking the AM routine,
 1427 * rather than the AM oid, is that this allows to write regression tests
 1428 * that create another AM reusing the heap handler.
 1429 */
 1430 if (unlikely(sscan->rs_rd->rd_tableam != GetHeapamTableAmRoutine()))
 1431 ereport(ERROR,
 1433 errmsg_internal("only heap AM is supported")));
 1434
 1435 /* Note: no locking manipulations needed */
 1436
 1438 heapgettup_pagemode(scan, direction,
 1439 scan->rs_base.rs_nkeys, scan->rs_base.rs_key);
 1440 else
 1441 heapgettup(scan, direction,
 1442 scan->rs_base.rs_nkeys, scan->rs_base.rs_key);
 1443
 1444 if (scan->rs_ctup.t_data == NULL)
 1445 return NULL;
 1446
 1447 /*
 1448 * if we get here it means we have a new current scan tuple, so point to
 1449 * the proper return buffer and return the tuple.
 1450 */
 1451
 1453
 1454 return &scan->rs_ctup;
 1455}
1456
/*
 * heap_getnextslot - fetch the next tuple into the given slot; returns
 * false (and clears the slot) at end of scan, true otherwise.
 *
 * NOTE(review): gaps in the embedded numbering (1458, 1460, 1479-1480)
 * mean original lines are MISSING (the name/parameter line, the local
 * descriptor cast, the pgstat counter — presumably; TODO confirm against
 * upstream).  Code tokens preserved byte-for-byte; comments only.
 */
1457bool
 1459{
 1461
 1462 /* Note: no locking manipulations needed */
 1463
 1464 if (sscan->rs_flags & SO_ALLOW_PAGEMODE)
 1465 heapgettup_pagemode(scan, direction, sscan->rs_nkeys, sscan->rs_key);
 1466 else
 1467 heapgettup(scan, direction, sscan->rs_nkeys, sscan->rs_key);
 1468
 1469 if (scan->rs_ctup.t_data == NULL)
 1470 {
 1471 ExecClearTuple(slot);
 1472 return false;
 1473 }
 1474
 1475 /*
 1476 * if we get here it means we have a new current scan tuple, so point to
 1477 * the proper return buffer and return the tuple.
 1478 */
 1479
 1481
 1482 ExecStoreBufferHeapTuple(&scan->rs_ctup, slot,
 1483 scan->rs_cbuf);
 1484 return true;
 1485}
1486
/*
 * heap_set_tidrange - constrain a TID-range scan to [mintid, maxtid],
 * storing the clamped range in sscan->st.tidrange and limiting the block
 * range to scan.
 *
 * NOTE(review): this scrape is heavily gapped — the name/parameter lines
 * and most of the computation lines (ItemPointerSet of lowest/highest
 * item, the ItemPointerCompare clamps, the startBlk/numBlks arithmetic,
 * the heap_setscanlimits calls) are MISSING (numbering jumps 1488-1489,
 * 1491-1495, 1509-1510, 1517-1518, 1524-1525, 1531, 1534, 1547,
 * 1549-1550, 1553).  Only the control skeleton and final ItemPointerCopy
 * calls survive.  Code tokens preserved byte-for-byte; comments only —
 * reconstruct from upstream before compiling.
 */
1487void
 1490{
 1496
 1497 /*
 1498 * For relations without any pages, we can simply leave the TID range
 1499 * unset. There will be no tuples to scan, therefore no tuples outside
 1500 * the given TID range.
 1501 */
 1502 if (scan->rs_nblocks == 0)
 1503 return;
 1504
 1505 /*
 1506 * Set up some ItemPointers which point to the first and last possible
 1507 * tuples in the heap.
 1508 */
 1511
 1512 /*
 1513 * If the given maximum TID is below the highest possible TID in the
 1514 * relation, then restrict the range to that, otherwise we scan to the end
 1515 * of the relation.
 1516 */
 1519
 1520 /*
 1521 * If the given minimum TID is above the lowest possible TID in the
 1522 * relation, then restrict the range to only scan for TIDs above that.
 1523 */
 1526
 1527 /*
 1528 * Check for an empty range and protect from would be negative results
 1529 * from the numBlks calculation below.
 1530 */
 1532 {
 1533 /* Set an empty range of blocks to scan */
 1535 return;
 1536 }
 1537
 1538 /*
 1539 * Calculate the first block and the number of blocks we must scan. We
 1540 * could be more aggressive here and perform some more validation to try
 1541 * and further narrow the scope of blocks to scan by checking if the
 1542 * lowestItem has an offset above MaxOffsetNumber. In this case, we could
 1543 * advance startBlk by one. Likewise, if highestItem has an offset of 0
 1544 * we could scan one fewer blocks. However, such an optimization does not
 1545 * seem worth troubling over, currently.
 1546 */
 1548
 1551
 1552 /* Set the start block and number of blocks to scan */
 1554
 1555 /* Finally, set the TID range in sscan */
 1556 ItemPointerCopy(&lowestItem, &sscan->st.tidrange.rs_mintid);
 1557 ItemPointerCopy(&highestItem, &sscan->st.tidrange.rs_maxtid);
 1558}
1559
/*
 * heap_getnextslot_tidrange - fetch the next tuple within the TID range
 * previously installed by heap_set_tidrange(); tuples outside
 * [mintid, maxtid] on the boundary pages are filtered out here.
 *
 * NOTE(review): gaps in the embedded numbering (1561, 1564, 1628) mean
 * the name/first-parameter line, the local descriptor cast, and the
 * pgstat counter are MISSING — presumably; TODO confirm against upstream.
 * Code tokens preserved byte-for-byte; comments only.
 */
1560bool
 1562 TupleTableSlot *slot)
 1563{
 1565 ItemPointer mintid = &sscan->st.tidrange.rs_mintid;
 1566 ItemPointer maxtid = &sscan->st.tidrange.rs_maxtid;
 1567
 1568 /* Note: no locking manipulations needed */
 1569 for (;;)
 1570 {
 1571 if (sscan->rs_flags & SO_ALLOW_PAGEMODE)
 1572 heapgettup_pagemode(scan, direction, sscan->rs_nkeys, sscan->rs_key);
 1573 else
 1574 heapgettup(scan, direction, sscan->rs_nkeys, sscan->rs_key);
 1575
 1576 if (scan->rs_ctup.t_data == NULL)
 1577 {
 1578 ExecClearTuple(slot);
 1579 return false;
 1580 }
 1581
 1582 /*
 1583 * heap_set_tidrange will have used heap_setscanlimits to limit the
 1584 * range of pages we scan to only ones that can contain the TID range
 1585 * we're scanning for. Here we must filter out any tuples from these
 1586 * pages that are outside of that range.
 1587 */
 1588 if (ItemPointerCompare(&scan->rs_ctup.t_self, mintid) < 0)
 1589 {
 1590 ExecClearTuple(slot);
 1591
 1592 /*
 1593 * When scanning backwards, the TIDs will be in descending order.
 1594 * Future tuples in this direction will be lower still, so we can
 1595 * just return false to indicate there will be no more tuples.
 1596 */
 1597 if (ScanDirectionIsBackward(direction))
 1598 return false;
 1599
 1600 continue;
 1601 }
 1602
 1603 /*
 1604 * Likewise for the final page, we must filter out TIDs greater than
 1605 * maxtid.
 1606 */
 1607 if (ItemPointerCompare(&scan->rs_ctup.t_self, maxtid) > 0)
 1608 {
 1609 ExecClearTuple(slot);
 1610
 1611 /*
 1612 * When scanning forward, the TIDs will be in ascending order.
 1613 * Future tuples in this direction will be higher still, so we can
 1614 * just return false to indicate there will be no more tuples.
 1615 */
 1616 if (ScanDirectionIsForward(direction))
 1617 return false;
 1618 continue;
 1619 }
 1620
 1621 break;
 1622 }
 1623
 1624 /*
 1625 * if we get here it means we have a new current scan tuple, so point to
 1626 * the proper return buffer and return the tuple.
 1627 */
 1629
 1630 ExecStoreBufferHeapTuple(&scan->rs_ctup, slot, scan->rs_cbuf);
 1631 return true;
 1632}
1633
1634/*
1635 * heap_fetch - retrieve tuple with given tid
1636 *
1637 * On entry, tuple->t_self is the TID to fetch. We pin the buffer holding
1638 * the tuple, fill in the remaining fields of *tuple, and check the tuple
1639 * against the specified snapshot.
1640 *
1641 * If successful (tuple found and passes snapshot time qual), then *userbuf
1642 * is set to the buffer holding the tuple and true is returned. The caller
1643 * must unpin the buffer when done with the tuple.
1644 *
1645 * If the tuple is not found (ie, item number references a deleted slot),
1646 * then tuple->t_data is set to NULL, *userbuf is set to InvalidBuffer,
1647 * and false is returned.
1648 *
1649 * If the tuple is found but fails the time qual check, then the behavior
1650 * depends on the keep_buf parameter. If keep_buf is false, the results
1651 * are the same as for the tuple-not-found case. If keep_buf is true,
1652 * then tuple->t_data and *userbuf are returned as for the success case,
1653 * and again the caller must unpin the buffer; but false is returned.
1654 *
1655 * heap_fetch does not follow HOT chains: only the exact TID requested will
1656 * be fetched.
1657 *
1658 * It is somewhat inconsistent that we ereport() on invalid block number but
1659 * return false on invalid item number. There are a couple of reasons though.
1660 * One is that the caller can relatively easily check the block number for
1661 * validity, but cannot check the item number without reading the page
1662 * himself. Another is that when we are following a t_ctid link, we can be
1663 * reasonably confident that the page number is valid (since VACUUM shouldn't
1664 * truncate off the destination page without having killed the referencing
1665 * tuple first), but the item number might well not be good.
1666 */
/*
 * NOTE(review): scraped text; gaps in the embedded numbering (1689, 1697,
 * 1699, 1701, 1716, 1718, 1737, 1741, 1760) mean original lines are
 * MISSING — presumably the buffer content-lock acquisition/release
 * (LockBuffer share/unlock), the out-of-range offnum condition, and the
 * *userbuf = InvalidBuffer assignments on the failure paths; TODO confirm
 * against upstream.  The full contract is documented in the comment block
 * immediately above this function.  Code tokens preserved byte-for-byte;
 * comments only.
 */
1667bool
1668heap_fetch(Relation relation,
 1669 Snapshot snapshot,
 1670 HeapTuple tuple,
 1671 Buffer *userbuf,
 1672 bool keep_buf)
 1673{
 1674 ItemPointer tid = &(tuple->t_self);
 1675 ItemId lp;
 1676 Buffer buffer;
 1677 Page page;
 1678 OffsetNumber offnum;
 1679 bool valid;
 1680
 1681 /*
 1682 * Fetch and pin the appropriate page of the relation.
 1683 */
 1684 buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid));
 1685
 1686 /*
 1687 * Need share lock on buffer to examine tuple commit status.
 1688 */
 1690 page = BufferGetPage(buffer);
 1691
 1692 /*
 1693 * We'd better check for out-of-range offnum in case of VACUUM since the
 1694 * TID was obtained.
 1695 */
 1696 offnum = ItemPointerGetOffsetNumber(tid);
 1698 {
 1700 ReleaseBuffer(buffer);
 1702 tuple->t_data = NULL;
 1703 return false;
 1704 }
 1705
 1706 /*
 1707 * get the item line pointer corresponding to the requested tid
 1708 */
 1709 lp = PageGetItemId(page, offnum);
 1710
 1711 /*
 1712 * Must check for deleted tuple.
 1713 */
 1714 if (!ItemIdIsNormal(lp))
 1715 {
 1717 ReleaseBuffer(buffer);
 1719 tuple->t_data = NULL;
 1720 return false;
 1721 }
 1722
 1723 /*
 1724 * fill in *tuple fields
 1725 */
 1726 tuple->t_data = (HeapTupleHeader) PageGetItem(page, lp);
 1727 tuple->t_len = ItemIdGetLength(lp);
 1728 tuple->t_tableOid = RelationGetRelid(relation);
 1729
 1730 /*
 1731 * check tuple visibility, then release lock
 1732 */
 1733 valid = HeapTupleSatisfiesVisibility(tuple, snapshot, buffer);
 1734
 1735 if (valid)
 1736 PredicateLockTID(relation, &(tuple->t_self), snapshot,
 1738
 1739 HeapCheckForSerializableConflictOut(valid, relation, tuple, buffer, snapshot);
 1740
 1742
 1743 if (valid)
 1744 {
 1745 /*
 1746 * All checks passed, so return the tuple as valid. Caller is now
 1747 * responsible for releasing the buffer.
 1748 */
 1749 *userbuf = buffer;
 1750
 1751 return true;
 1752 }
 1753
 1754 /* Tuple failed time qual, but maybe caller wants to see it anyway. */
 1755 if (keep_buf)
 1756 *userbuf = buffer;
 1757 else
 1758 {
 1759 ReleaseBuffer(buffer);
 1761 tuple->t_data = NULL;
 1762 }
 1763
 1764 return false;
 1765}
1766
1767/*
1768 * heap_hot_search_buffer - search HOT chain for tuple satisfying snapshot
1769 *
1770 * On entry, *tid is the TID of a tuple (either a simple tuple, or the root
1771 * of a HOT chain), and buffer is the buffer holding this tuple. We search
1772 * for the first chain member satisfying the given snapshot. If one is
1773 * found, we update *tid to reference that tuple's offset number, and
1774 * return true. If no match, return false without modifying *tid.
1775 *
1776 * heapTuple is a caller-supplied buffer. When a match is found, we return
1777 * the tuple here, in addition to updating *tid. If no match is found, the
1778 * contents of this buffer on return are undefined.
1779 *
1780 * If all_dead is not NULL, we check non-visible tuples to see if they are
1781 * globally dead; *all_dead is set true if all members of the HOT chain
1782 * are vacuumable, false if not.
1783 *
1784 * Unlike heap_fetch, the caller must already have pin and (at least) share
1785 * lock on the buffer; it is still pinned/locked at exit.
1786 */
/*
 * NOTE(review): scraped text; gaps in the embedded numbering (1788, 1793,
 * 1803, 1807, 1811, 1820, 1829, 1854, 1861-1863, 1877, 1884, 1913, 1919)
 * mean original lines are MISSING — presumably the name/first-parameter
 * line, the TransactionId prev_xmax declaration, the *all_dead = first_call
 * initialization, at_chain_start initialization, the offnum-range and
 * redirect conditions, the xmin-vs-prev_xmax comparison, the
 * serializable-conflict-out call, and the HOT-chain-continues condition
 * with the prev_xmax update; TODO confirm against upstream.  The full
 * contract is documented in the comment block immediately above this
 * function.  Code tokens preserved byte-for-byte; comments only.
 */
1787bool
 1789 Snapshot snapshot, HeapTuple heapTuple,
 1790 bool *all_dead, bool first_call)
 1791{
 1792 Page page = BufferGetPage(buffer);
 1794 BlockNumber blkno;
 1795 OffsetNumber offnum;
 1796 bool at_chain_start;
 1797 bool valid;
 1798 bool skip;
 1799 GlobalVisState *vistest = NULL;
 1800
 1801 /* If this is not the first call, previous call returned a (live!) tuple */
 1802 if (all_dead)
 1804
 1805 blkno = ItemPointerGetBlockNumber(tid);
 1806 offnum = ItemPointerGetOffsetNumber(tid);
 1808 skip = !first_call;
 1809
 1810 /* XXX: we should assert that a snapshot is pushed or registered */
 1812 Assert(BufferGetBlockNumber(buffer) == blkno);
 1813
 1814 /* Scan through possible multiple members of HOT-chain */
 1815 for (;;)
 1816 {
 1817 ItemId lp;
 1818
 1819 /* check for bogus TID */
 1821 break;
 1822
 1823 lp = PageGetItemId(page, offnum);
 1824
 1825 /* check for unused, dead, or redirected items */
 1826 if (!ItemIdIsNormal(lp))
 1827 {
 1828 /* We should only see a redirect at start of chain */
 1830 {
 1831 /* Follow the redirect */
 1832 offnum = ItemIdGetRedirect(lp);
 1833 at_chain_start = false;
 1834 continue;
 1835 }
 1836 /* else must be end of chain */
 1837 break;
 1838 }
 1839
 1840 /*
 1841 * Update heapTuple to point to the element of the HOT chain we're
 1842 * currently investigating. Having t_self set correctly is important
 1843 * because the SSI checks and the *Satisfies routine for historical
 1844 * MVCC snapshots need the correct tid to decide about the visibility.
 1845 */
 1846 heapTuple->t_data = (HeapTupleHeader) PageGetItem(page, lp);
 1847 heapTuple->t_len = ItemIdGetLength(lp);
 1848 heapTuple->t_tableOid = RelationGetRelid(relation);
 1849 ItemPointerSet(&heapTuple->t_self, blkno, offnum);
 1850
 1851 /*
 1852 * Shouldn't see a HEAP_ONLY tuple at chain start.
 1853 */
 1855 break;
 1856
 1857 /*
 1858 * The xmin should match the previous xmax value, else chain is
 1859 * broken.
 1860 */
 1864 break;
 1865
 1866 /*
 1867 * When first_call is true (and thus, skip is initially false) we'll
 1868 * return the first tuple we find. But on later passes, heapTuple
 1869 * will initially be pointing to the tuple we returned last time.
 1870 * Returning it again would be incorrect (and would loop forever), so
 1871 * we skip it and return the next match we find.
 1872 */
 1873 if (!skip)
 1874 {
 1875 /* If it's visible per the snapshot, we must return it */
 1876 valid = HeapTupleSatisfiesVisibility(heapTuple, snapshot, buffer);
 1878 buffer, snapshot);
 1879
 1880 if (valid)
 1881 {
 1882 ItemPointerSetOffsetNumber(tid, offnum);
 1883 PredicateLockTID(relation, &heapTuple->t_self, snapshot,
 1885 if (all_dead)
 1886 *all_dead = false;
 1887 return true;
 1888 }
 1889 }
 1890 skip = false;
 1891
 1892 /*
 1893 * If we can't see it, maybe no one else can either. At caller
 1894 * request, check whether all chain members are dead to all
 1895 * transactions.
 1896 *
 1897 * Note: if you change the criterion here for what is "dead", fix the
 1898 * planner's get_actual_variable_range() function to match.
 1899 */
 1900 if (all_dead && *all_dead)
 1901 {
 1902 if (!vistest)
 1903 vistest = GlobalVisTestFor(relation);
 1904
 1905 if (!HeapTupleIsSurelyDead(heapTuple, vistest))
 1906 *all_dead = false;
 1907 }
 1908
 1909 /*
 1910 * Check to see if HOT chain continues past this tuple; if so fetch
 1911 * the next offnum and loop around.
 1912 */
 1914 {
 1915 Assert(ItemPointerGetBlockNumber(&heapTuple->t_data->t_ctid) ==
 1916 blkno);
 1917 offnum = ItemPointerGetOffsetNumber(&heapTuple->t_data->t_ctid);
 1918 at_chain_start = false;
 1920 }
 1921 else
 1922 break; /* end of chain */
 1923 }
 1924
 1925 return false;
 1926}
1927
1928/*
1929 * heap_get_latest_tid - get the latest tid of a specified tuple
1930 *
1931 * Actually, this gets the latest version that is visible according to the
1932 * scan's snapshot. Create a scan using SnapshotDirty to get the very latest,
1933 * possibly uncommitted version.
1934 *
1935 * *tid is both an input and an output parameter: it is updated to
1936 * show the latest version of the row. Note that it will not be changed
1937 * if no version of the row passes the snapshot test.
1938 */
/*
 * NOTE(review): scraped text; gaps in the embedded numbering (1940, 1946,
 * 1953, 1979, 1988, 2010-2011, 2030-2032, 2039) mean original lines are
 * MISSING — presumably the name/first-parameter line, the TransactionId
 * priorXmax declaration, the ItemPointerIsValid assertion, the buffer
 * share-lock acquisition, the offnum-range condition, the
 * priorXmax-vs-xmin comparison, the remaining t_ctid-validity conditions,
 * and the priorXmax update; TODO confirm against upstream.  Contract is
 * documented in the comment block immediately above this function.
 * Code tokens preserved byte-for-byte; comments only.
 */
1939void
 1941 ItemPointer tid)
 1942{
 1943 Relation relation = sscan->rs_rd;
 1944 Snapshot snapshot = sscan->rs_snapshot;
 1945 ItemPointerData ctid;
 1947
 1948 /*
 1949 * table_tuple_get_latest_tid() verified that the passed in tid is valid.
 1950 * Assume that t_ctid links are valid however - there shouldn't be invalid
 1951 * ones in the table.
 1952 */
 1954
 1955 /*
 1956 * Loop to chase down t_ctid links. At top of loop, ctid is the tuple we
 1957 * need to examine, and *tid is the TID we will return if ctid turns out
 1958 * to be bogus.
 1959 *
 1960 * Note that we will loop until we reach the end of the t_ctid chain.
 1961 * Depending on the snapshot passed, there might be at most one visible
 1962 * version of the row, but we don't try to optimize for that.
 1963 */
 1964 ctid = *tid;
 1965 priorXmax = InvalidTransactionId; /* cannot check first XMIN */
 1966 for (;;)
 1967 {
 1968 Buffer buffer;
 1969 Page page;
 1970 OffsetNumber offnum;
 1971 ItemId lp;
 1972 HeapTupleData tp;
 1973 bool valid;
 1974
 1975 /*
 1976 * Read, pin, and lock the page.
 1977 */
 1978 buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(&ctid));
 1980 page = BufferGetPage(buffer);
 1981
 1982 /*
 1983 * Check for bogus item number. This is not treated as an error
 1984 * condition because it can happen while following a t_ctid link. We
 1985 * just assume that the prior tid is OK and return it unchanged.
 1986 */
 1987 offnum = ItemPointerGetOffsetNumber(&ctid);
 1989 {
 1990 UnlockReleaseBuffer(buffer);
 1991 break;
 1992 }
 1993 lp = PageGetItemId(page, offnum);
 1994 if (!ItemIdIsNormal(lp))
 1995 {
 1996 UnlockReleaseBuffer(buffer);
 1997 break;
 1998 }
 1999
 2000 /* OK to access the tuple */
 2001 tp.t_self = ctid;
 2002 tp.t_data = (HeapTupleHeader) PageGetItem(page, lp);
 2003 tp.t_len = ItemIdGetLength(lp);
 2004 tp.t_tableOid = RelationGetRelid(relation);
 2005
 2006 /*
 2007 * After following a t_ctid link, we might arrive at an unrelated
 2008 * tuple. Check for XMIN match.
 2009 */
 2012 {
 2013 UnlockReleaseBuffer(buffer);
 2014 break;
 2015 }
 2016
 2017 /*
 2018 * Check tuple visibility; if visible, set it as the new result
 2019 * candidate.
 2020 */
 2021 valid = HeapTupleSatisfiesVisibility(&tp, snapshot, buffer);
 2022 HeapCheckForSerializableConflictOut(valid, relation, &tp, buffer, snapshot);
 2023 if (valid)
 2024 *tid = ctid;
 2025
 2026 /*
 2027 * If there's a valid t_ctid link, follow it, else we're done.
 2028 */
 2029 if ((tp.t_data->t_infomask & HEAP_XMAX_INVALID) ||
 2033 {
 2034 UnlockReleaseBuffer(buffer);
 2035 break;
 2036 }
 2037
 2038 ctid = tp.t_data->t_ctid;
 2040 UnlockReleaseBuffer(buffer);
 2041 } /* end of loop */
 2042}
2043
2044
2045/*
2046 * UpdateXmaxHintBits - update tuple hint bits after xmax transaction ends
2047 *
2048 * This is called after we have waited for the XMAX transaction to terminate.
2049 * If the transaction aborted, we guarantee the XMAX_INVALID hint bit will
2050 * be set on exit. If the transaction committed, we set the XMAX_COMMITTED
2051 * hint bit if possible --- but beware that that may not yet be possible,
2052 * if the transaction committed asynchronously.
2053 *
2054 * Note that if the transaction was a locker only, we set HEAP_XMAX_INVALID
2055 * even if it commits.
2056 *
2057 * Hence callers should look only at XMAX_INVALID.
2058 *
2059 * Note this is not allowed for tuples whose xmax is a multixact.
2060 */
/*
 * NOTE(review): this scrape is heavily gapped — the name/parameter line
 * (2062), local declarations/assertions (2064-2065), the outer condition
 * (2067), the did-commit branch condition and both HeapTupleSetHintBits
 * calls (2070-2071, 2074-2075) are MISSING; only the control skeleton
 * survives.  Contract is documented in the comment block immediately
 * above.  Code tokens preserved byte-for-byte; comments only —
 * reconstruct from upstream before compiling.
 */
2061static void
 2063{
 2066
 2068 {
 2069 if (!HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask) &&
 2072 xid);
 2073 else
 2076 }
 2077}
2078
2079
2080/*
2081 * GetBulkInsertState - prepare status object for a bulk insert
2082 */
/*
 * GetBulkInsertState - allocate and initialize a BulkInsertState
 * (per the comment block immediately above).
 *
 * NOTE(review): the return-type/name lines (2083-2084) and the palloc +
 * strategy-allocation lines (2088-2089) are MISSING from this scrape —
 * presumably bistate = palloc(...) and bistate->strategy =
 * GetAccessStrategy(BAS_BULKWRITE); TODO confirm against upstream.
 * Code tokens preserved byte-for-byte; comments only.
 */
2085{
 2086 BulkInsertState bistate;
 2087
 2090 bistate->current_buf = InvalidBuffer;
 2091 bistate->next_free = InvalidBlockNumber;
 2092 bistate->last_free = InvalidBlockNumber;
 2093 bistate->already_extended_by = 0;
 2094 return bistate;
 2095}
2096
2097/*
2098 * FreeBulkInsertState - clean up after finishing a bulk insert
2099 */
/*
 * FreeBulkInsertState - release the pinned buffer (if any), the access
 * strategy, and the state object itself (per the comment block above).
 *
 * NOTE(review): the function-name line (2101) is MISSING from this
 * scrape; body is otherwise contiguous.  Code tokens preserved
 * byte-for-byte; comments only.
 */
2100void
 2102{
 2103 if (bistate->current_buf != InvalidBuffer)
 2104 ReleaseBuffer(bistate->current_buf);
 2105 FreeAccessStrategy(bistate->strategy);
 2106 pfree(bistate);
 2107}
2108
2109/*
2110 * ReleaseBulkInsertStatePin - release a buffer currently held in bistate
2111 */
/*
 * ReleaseBulkInsertStatePin - drop the currently pinned buffer and reset
 * bulk-extension bookkeeping (see the in-body comment for why next_free/
 * last_free must be cleared too).
 *
 * NOTE(review): the function-name line (2113) is MISSING from this
 * scrape; body is otherwise contiguous.  Code tokens preserved
 * byte-for-byte; comments only.
 */
2112void
 2114{
 2115 if (bistate->current_buf != InvalidBuffer)
 2116 ReleaseBuffer(bistate->current_buf);
 2117 bistate->current_buf = InvalidBuffer;
 2118
 2119 /*
 2120 * Despite the name, we also reset bulk relation extension state.
 2121 * Otherwise we can end up erroring out due to looking for free space in
 2122 * ->next_free of one partition, even though ->next_free was set when
 2123 * extending another partition. It could obviously also be bad for
 2124 * efficiency to look at existing blocks at offsets from another
 2125 * partition, even if we don't error out.
 2126 */
 2127 bistate->next_free = InvalidBlockNumber;
 2128 bistate->last_free = InvalidBlockNumber;
 2129}
2130
2131
2132/*
2133 * heap_insert - insert tuple into a heap
2134 *
2135 * The new tuple is stamped with current transaction ID and the specified
2136 * command ID.
2137 *
2138 * See table_tuple_insert for comments about most of the input flags, except
2139 * that this routine directly takes a tuple rather than a slot.
2140 *
2141 * There's corresponding HEAP_INSERT_ options to all the TABLE_INSERT_
2142 * options, and there additionally is HEAP_INSERT_SPECULATIVE which is used to
2143 * implement table_tuple_insert_speculative().
2144 *
2145 * On return the header fields of *tup are updated to match the stored tuple;
2146 * in particular tup->t_self receives the actual TID where the tuple was
2147 * stored. But note that any toasting of fields within the tuple data is NOT
2148 * reflected into *tup.
2149 */
2150void
2152 int options, BulkInsertState bistate)
2153{
 /*
  * NOTE(review): this listing is an HTML extraction with the original
  * source line numbers fused into the text; several lines are missing
  * here (the rest of the parameter list and local declarations such as
  * "heaptup" and "xid").  Confirm all details against the canonical file
  * before relying on this copy.
  */
2156 Buffer buffer;
2157 Buffer vmbuffer = InvalidBuffer;
2158 bool all_visible_cleared = false;
2159
2160 /* Cheap, simplistic check that the tuple matches the rel's rowtype. */
2163
2164 AssertHasSnapshotForToast(relation);
2165
2166 /*
2167 * Fill in tuple header fields and toast the tuple if necessary.
2168 *
2169 * Note: below this point, heaptup is the data we actually intend to store
2170 * into the relation; tup is the caller's original untoasted data.
2171 */
2172 heaptup = heap_prepare_insert(relation, tup, xid, cid, options);
2173
2174 /*
2175 * Find buffer to insert this tuple into. If the page is all visible,
2176 * this will also pin the requisite visibility map page.
2177 */
2178 buffer = RelationGetBufferForTuple(relation, heaptup->t_len,
2179 InvalidBuffer, options, bistate,
2180 &vmbuffer, NULL,
2181 0);
2182
2183 /*
2184 * We're about to do the actual insert -- but check for conflict first, to
2185 * avoid possibly having to roll back work we've just done.
2186 *
2187 * This is safe without a recheck as long as there is no possibility of
2188 * another process scanning the page between this check and the insert
2189 * being visible to the scan (i.e., an exclusive buffer content lock is
2190 * continuously held from this point until the tuple insert is visible).
2191 *
2192 * For a heap insert, we only need to check for table-level SSI locks. Our
2193 * new tuple can't possibly conflict with existing tuple locks, and heap
2194 * page locks are only consolidated versions of tuple locks; they do not
2195 * lock "gaps" as index page locks do. So we don't need to specify a
2196 * buffer when making the call, which makes for a faster check.
2197 */
2199
2200 /* NO EREPORT(ERROR) from here till changes are logged */
2202
2203 RelationPutHeapTuple(relation, buffer, heaptup,
2205
 /*
  * The insert invalidates the page's all-visible state, so clear the page
  * flag and the corresponding visibility-map bits.
  */
2206 if (PageIsAllVisible(BufferGetPage(buffer)))
2207 {
2208 all_visible_cleared = true;
2210 visibilitymap_clear(relation,
2212 vmbuffer, VISIBILITYMAP_VALID_BITS);
2213 }
2214
2215 /*
2216 * XXX Should we set PageSetPrunable on this page ?
2217 *
2218 * The inserting transaction may eventually abort thus making this tuple
2219 * DEAD and hence available for pruning. Though we don't want to optimize
2220 * for aborts, if no other tuple in this page is UPDATEd/DELETEd, the
2221 * aborted tuple will never be pruned until next vacuum is triggered.
2222 *
2223 * If you do add PageSetPrunable here, add it in heap_xlog_insert too.
2224 */
2225
2226 MarkBufferDirty(buffer);
2227
2228 /* XLOG stuff */
2229 if (RelationNeedsWAL(relation))
2230 {
2234 Page page = BufferGetPage(buffer);
2235 uint8 info = XLOG_HEAP_INSERT;
2236 int bufflags = 0;
2237
2238 /*
2239 * If this is a catalog, we need to transmit combo CIDs to properly
2240 * decode, so log that as well.
2241 */
2243 log_heap_new_cid(relation, heaptup);
2244
2245 /*
2246 * If this is the single and first tuple on page, we can reinit the
2247 * page instead of restoring the whole thing. Set flag, and hide
2248 * buffer references from XLogInsert.
2249 */
2252 {
2253 info |= XLOG_HEAP_INIT_PAGE;
2255 }
2256
2257 xlrec.offnum = ItemPointerGetOffsetNumber(&heaptup->t_self);
2258 xlrec.flags = 0;
2264
2265 /*
2266 * For logical decoding, we need the tuple even if we're doing a full
2267 * page write, so make sure it's included even if we take a full-page
2268 * image. (XXX We could alternatively store a pointer into the FPW).
2269 */
2270 if (RelationIsLogicallyLogged(relation) &&
2272 {
2275
2276 if (IsToastRelation(relation))
2278 }
2279
2282
2283 xlhdr.t_infomask2 = heaptup->t_data->t_infomask2;
2284 xlhdr.t_infomask = heaptup->t_data->t_infomask;
2285 xlhdr.t_hoff = heaptup->t_data->t_hoff;
2286
2287 /*
2288 * note we mark xlhdr as belonging to buffer; if XLogInsert decides to
2289 * write the whole page to the xlog, we don't need to store
2290 * xl_heap_header in the xlog.
2291 */
2294 /* PG73FORMAT: write bitmap [+ padding] [+ oid] + data */
2296 (char *) heaptup->t_data + SizeofHeapTupleHeader,
2298
2299 /* filtering by origin on a row level is much more efficient */
2301
2302 recptr = XLogInsert(RM_HEAP_ID, info);
2303
2304 PageSetLSN(page, recptr);
2305 }
2306
2308
2309 UnlockReleaseBuffer(buffer);
2310 if (vmbuffer != InvalidBuffer)
2311 ReleaseBuffer(vmbuffer);
2312
2313 /*
2314 * If tuple is cacheable, mark it for invalidation from the caches in case
2315 * we abort. Note it is OK to do this after releasing the buffer, because
2316 * the heaptup data structure is all in local memory, not in the shared
2317 * buffer.
2318 */
2320
2321 /* Note: speculative insertions are counted too, even if aborted later */
2322 pgstat_count_heap_insert(relation, 1);
2323
2324 /*
2325 * If heaptup is a private copy, release it. Don't forget to copy t_self
2326 * back to the caller's image, too.
2327 */
2328 if (heaptup != tup)
2329 {
2330 tup->t_self = heaptup->t_self;
2332 }
2333}
2334
2335/*
2336 * Subroutine for heap_insert(). Prepares a tuple for insertion. This sets the
2337 * tuple header fields and toasts the tuple if necessary. Returns a toasted
2338 * version of the tuple if it was toasted, or the original tuple if not. Note
2339 * that in any case, the header fields are also set in the original tuple.
2340 */
2341static HeapTuple
 /*
  * NOTE(review): the first line of the parameter list (relation/tup/xid)
  * is missing from this extraction -- confirm against the canonical file.
  */
2343 CommandId cid, int options)
2344{
2345 /*
2346 * To allow parallel inserts, we need to ensure that they are safe to be
2347 * performed in workers. We have the infrastructure to allow parallel
2348 * inserts in general except for the cases where inserts generate a new
2349 * CommandId (eg. inserts into a table having a foreign key column).
2350 */
2351 if (IsParallelWorker())
2352 ereport(ERROR,
2354 errmsg("cannot insert tuples in a parallel worker")));
2355
 /*
  * Reset all transaction-status bits, mark xmax invalid, then stamp the
  * header with the inserting XID and command id.
  */
2356 tup->t_data->t_infomask &= ~(HEAP_XACT_MASK);
2357 tup->t_data->t_infomask2 &= ~(HEAP2_XACT_MASK);
2358 tup->t_data->t_infomask |= HEAP_XMAX_INVALID;
2359 HeapTupleHeaderSetXmin(tup->t_data, xid);
2362
2363 HeapTupleHeaderSetCmin(tup->t_data, cid);
2364 HeapTupleHeaderSetXmax(tup->t_data, 0); /* for cleanliness */
2365 tup->t_tableOid = RelationGetRelid(relation);
2366
2367 /*
2368 * If the new tuple is too big for storage or contains already toasted
2369 * out-of-line attributes from some other relation, invoke the toaster.
2370 */
2371 if (relation->rd_rel->relkind != RELKIND_RELATION &&
2372 relation->rd_rel->relkind != RELKIND_MATVIEW)
2373 {
2374 /* toast table entries should never be recursively toasted */
2376 return tup;
2377 }
2378 else if (HeapTupleHasExternal(tup) || tup->t_len > TOAST_TUPLE_THRESHOLD)
2379 return heap_toast_insert_or_update(relation, tup, NULL, options);
2380 else
2381 return tup;
2382}
2383
2384/*
2385 * Helper for heap_multi_insert() that computes the number of entire pages
2386 * that inserting the remaining heaptuples requires. Used to determine how
2387 * much the relation needs to be extended by.
2388 */
2389static int
 /*
  * NOTE(review): the signature line and the declaration/reset of
  * "page_avail" are missing from this extraction; presumably page_avail
  * starts at (and resets to) the usable free space of an empty page --
  * TODO confirm against the canonical source.
  */
2391{
2393 int npages = 1;
2394
 /* Worst-case first-fit: each tuple costs its line pointer plus its
  * MAXALIGN'd length; open a new page whenever the current one is full. */
2395 for (int i = done; i < ntuples; i++)
2396 {
2397 size_t tup_sz = sizeof(ItemIdData) + MAXALIGN(heaptuples[i]->t_len);
2398
2399 if (page_avail < tup_sz)
2400 {
2401 npages++;
2403 }
2404 page_avail -= tup_sz;
2405 }
2406
2407 return npages;
2408}
2409
2410/*
2411 * heap_multi_insert - insert multiple tuples into a heap
2412 *
2413 * This is like heap_insert(), but inserts multiple tuples in one operation.
2414 * That's faster than calling heap_insert() in a loop, because when multiple
2415 * tuples can be inserted on a single page, we can write just a single WAL
2416 * record covering all of them, and only need to lock/unlock the page once.
2417 *
2418 * Note: this leaks memory into the current memory context. You can create a
2419 * temporary context before calling this, if that's a problem.
2420 */
2421void
2422heap_multi_insert(Relation relation, TupleTableSlot **slots, int ntuples,
2423 CommandId cid, int options, BulkInsertState bistate)
2424{
 /*
  * NOTE(review): this listing is an HTML extraction with gaps -- several
  * local declarations (e.g. xid, heaptuples, saveFreeSpace, need_cids,
  * need_tuple_data, scratch) and some statements are missing.  Verify
  * against the canonical source before relying on this copy.
  */
2427 int i;
2428 int ndone;
2430 Page page;
2431 Buffer vmbuffer = InvalidBuffer;
2432 bool needwal;
2436 bool starting_with_empty_page = false;
2437 int npages = 0;
2438 int npages_used = 0;
2439
2440 /* currently not needed (thus unsupported) for heap_multi_insert() */
2442
2443 AssertHasSnapshotForToast(relation);
2444
2445 needwal = RelationNeedsWAL(relation);
2448
2449 /* Toast and set header data in all the slots */
2450 heaptuples = palloc(ntuples * sizeof(HeapTuple));
2451 for (i = 0; i < ntuples; i++)
2452 {
2453 HeapTuple tuple;
2454
2455 tuple = ExecFetchSlotHeapTuple(slots[i], true, NULL);
2456 slots[i]->tts_tableOid = RelationGetRelid(relation);
2457 tuple->t_tableOid = slots[i]->tts_tableOid;
2458 heaptuples[i] = heap_prepare_insert(relation, tuple, xid, cid,
2459 options);
2460 }
2461
2462 /*
2463 * We're about to do the actual inserts -- but check for conflict first,
2464 * to minimize the possibility of having to roll back work we've just
2465 * done.
2466 *
2467 * A check here does not definitively prevent a serialization anomaly;
2468 * that check MUST be done at least past the point of acquiring an
2469 * exclusive buffer content lock on every buffer that will be affected,
2470 * and MAY be done after all inserts are reflected in the buffers and
2471 * those locks are released; otherwise there is a race condition. Since
2472 * multiple buffers can be locked and unlocked in the loop below, and it
2473 * would not be feasible to identify and lock all of those buffers before
2474 * the loop, we must do a final check at the end.
2475 *
2476 * The check here could be omitted with no loss of correctness; it is
2477 * present strictly as an optimization.
2478 *
2479 * For heap inserts, we only need to check for table-level SSI locks. Our
2480 * new tuples can't possibly conflict with existing tuple locks, and heap
2481 * page locks are only consolidated versions of tuple locks; they do not
2482 * lock "gaps" as index page locks do. So we don't need to specify a
2483 * buffer when making the call, which makes for a faster check.
2484 */
2486
 /* Outer loop: fill one heap page per iteration until all tuples are in. */
2487 ndone = 0;
2488 while (ndone < ntuples)
2489 {
2490 Buffer buffer;
2491 bool all_visible_cleared = false;
2492 bool all_frozen_set = false;
2493 int nthispage;
2494
2496
2497 /*
2498 * Compute number of pages needed to fit the to-be-inserted tuples in
2499 * the worst case. This will be used to determine how much to extend
2500 * the relation by in RelationGetBufferForTuple(), if needed. If we
2501 * filled a prior page from scratch, we can just update our last
2502 * computation, but if we started with a partially filled page,
2503 * recompute from scratch, the number of potentially required pages
2504 * can vary due to tuples needing to fit onto the page, page headers
2505 * etc.
2506 */
2507 if (ndone == 0 || !starting_with_empty_page)
2508 {
2509 npages = heap_multi_insert_pages(heaptuples, ndone, ntuples,
2511 npages_used = 0;
2512 }
2513 else
2514 npages_used++;
2515
2516 /*
2517 * Find buffer where at least the next tuple will fit. If the page is
2518 * all-visible, this will also pin the requisite visibility map page.
2519 *
2520 * Also pin visibility map page if COPY FREEZE inserts tuples into an
2521 * empty page. See all_frozen_set below.
2522 */
2523 buffer = RelationGetBufferForTuple(relation, heaptuples[ndone]->t_len,
2524 InvalidBuffer, options, bistate,
2525 &vmbuffer, NULL,
2526 npages - npages_used);
2527 page = BufferGetPage(buffer);
2528
2530
2532 {
2533 all_frozen_set = true;
2534 /* Lock the vmbuffer before entering the critical section */
2536 }
2537
2538 /* NO EREPORT(ERROR) from here till changes are logged */
2540
2541 /*
2542 * RelationGetBufferForTuple has ensured that the first tuple fits.
2543 * Put that on the page, and then as many other tuples as fit.
2544 */
2545 RelationPutHeapTuple(relation, buffer, heaptuples[ndone], false);
2546
2547 /*
2548 * For logical decoding we need combo CIDs to properly decode the
2549 * catalog.
2550 */
2551 if (needwal && need_cids)
2552 log_heap_new_cid(relation, heaptuples[ndone]);
2553
2554 for (nthispage = 1; ndone + nthispage < ntuples; nthispage++)
2555 {
2557
2558 if (PageGetHeapFreeSpace(page) < MAXALIGN(heaptup->t_len) + saveFreeSpace)
2559 break;
2560
2561 RelationPutHeapTuple(relation, buffer, heaptup, false);
2562
2563 /*
2564 * For logical decoding we need combo CIDs to properly decode the
2565 * catalog.
2566 */
2567 if (needwal && need_cids)
2568 log_heap_new_cid(relation, heaptup);
2569 }
2570
2571 /*
2572 * If the page is all visible, need to clear that, unless we're only
2573 * going to add further frozen rows to it.
2574 *
2575 * If we're only adding already frozen rows to a previously empty
2576 * page, mark it as all-frozen and update the visibility map. We're
2577 * already holding a pin on the vmbuffer.
2578 */
2580 {
2581 all_visible_cleared = true;
2582 PageClearAllVisible(page);
2583 visibilitymap_clear(relation,
2584 BufferGetBlockNumber(buffer),
2585 vmbuffer, VISIBILITYMAP_VALID_BITS);
2586 }
2587 else if (all_frozen_set)
2588 {
2589 PageSetAllVisible(page);
2590 PageClearPrunable(page);
2592 vmbuffer,
2595 relation->rd_locator);
2596 }
2597
2598 /*
2599 * XXX Should we set PageSetPrunable on this page ? See heap_insert()
2600 */
2601
2602 MarkBufferDirty(buffer);
2603
2604 /* XLOG stuff */
2605 if (needwal)
2606 {
2610 char *tupledata;
2611 int totaldatalen;
2612 char *scratchptr = scratch.data;
2613 bool init;
2614 int bufflags = 0;
2615
2616 /*
2617 * If the page was previously empty, we can reinit the page
2618 * instead of restoring the whole thing.
2619 */
2621
2622 /* allocate xl_heap_multi_insert struct from the scratch area */
2625
2626 /*
2627 * Allocate offsets array. Unless we're reinitializing the page,
2628 * in that case the tuples are stored in order starting at
2629 * FirstOffsetNumber and we don't need to store the offsets
2630 * explicitly.
2631 */
2632 if (!init)
2633 scratchptr += nthispage * sizeof(OffsetNumber);
2634
2635 /* the rest of the scratch space is used for tuple data */
2636 tupledata = scratchptr;
2637
2638 /* check that the mutually exclusive flags are not both set */
2640
2641 xlrec->flags = 0;
2644
2645 /*
2646 * We don't have to worry about including a conflict xid in the
2647 * WAL record, as HEAP_INSERT_FROZEN intentionally violates
2648 * visibility rules.
2649 */
2650 if (all_frozen_set)
2652
2653 xlrec->ntuples = nthispage;
2654
2655 /*
2656 * Write out an xl_multi_insert_tuple and the tuple data itself
2657 * for each tuple.
2658 */
2659 for (i = 0; i < nthispage; i++)
2660 {
2662 xl_multi_insert_tuple *tuphdr;
2663 int datalen;
2664
2665 if (!init)
2666 xlrec->offsets[i] = ItemPointerGetOffsetNumber(&heaptup->t_self);
2667 /* xl_multi_insert_tuple needs two-byte alignment. */
2669 scratchptr = ((char *) tuphdr) + SizeOfMultiInsertTuple;
2670
2671 tuphdr->t_infomask2 = heaptup->t_data->t_infomask2;
2672 tuphdr->t_infomask = heaptup->t_data->t_infomask;
2673 tuphdr->t_hoff = heaptup->t_data->t_hoff;
2674
2675 /* write bitmap [+ padding] [+ oid] + data */
2676 datalen = heaptup->t_len - SizeofHeapTupleHeader;
2678 (char *) heaptup->t_data + SizeofHeapTupleHeader,
2679 datalen);
2680 tuphdr->datalen = datalen;
2681 scratchptr += datalen;
2682 }
2683 totaldatalen = scratchptr - tupledata;
2684 Assert((scratchptr - scratch.data) < BLCKSZ);
2685
2686 if (need_tuple_data)
2688
2689 /*
2690 * Signal that this is the last xl_heap_multi_insert record
2691 * emitted by this call to heap_multi_insert(). Needed for logical
2692 * decoding so it knows when to cleanup temporary data.
2693 */
2694 if (ndone + nthispage == ntuples)
2696
2697 if (init)
2698 {
2699 info |= XLOG_HEAP_INIT_PAGE;
2701 }
2702
2703 /*
2704 * If we're doing logical decoding, include the new tuple data
2705 * even if we take a full-page image of the page.
2706 */
2707 if (need_tuple_data)
2709
2711 XLogRegisterData(xlrec, tupledata - scratch.data);
2713 if (all_frozen_set)
2714 XLogRegisterBuffer(1, vmbuffer, 0);
2715
2716 XLogRegisterBufData(0, tupledata, totaldatalen);
2717
2718 /* filtering by origin on a row level is much more efficient */
2720
2721 recptr = XLogInsert(RM_HEAP2_ID, info);
2722
2723 PageSetLSN(page, recptr);
2724 if (all_frozen_set)
2725 {
2726 Assert(BufferIsDirty(vmbuffer));
2727 PageSetLSN(BufferGetPage(vmbuffer), recptr);
2728 }
2729 }
2730
2732
2733 if (all_frozen_set)
2734 LockBuffer(vmbuffer, BUFFER_LOCK_UNLOCK);
2735
2736 UnlockReleaseBuffer(buffer);
2737 ndone += nthispage;
2738
2739 /*
2740 * NB: Only release vmbuffer after inserting all tuples - it's fairly
2741 * likely that we'll insert into subsequent heap pages that are likely
2742 * to use the same vm page.
2743 */
2744 }
2745
2746 /* We're done with inserting all tuples, so release the last vmbuffer. */
2747 if (vmbuffer != InvalidBuffer)
2748 ReleaseBuffer(vmbuffer);
2749
2750 /*
2751 * We're done with the actual inserts. Check for conflicts again, to
2752 * ensure that all rw-conflicts in to these inserts are detected. Without
2753 * this final check, a sequential scan of the heap may have locked the
2754 * table after the "before" check, missing one opportunity to detect the
2755 * conflict, and then scanned the table before the new tuples were there,
2756 * missing the other chance to detect the conflict.
2757 *
2758 * For heap inserts, we only need to check for table-level SSI locks. Our
2759 * new tuples can't possibly conflict with existing tuple locks, and heap
2760 * page locks are only consolidated versions of tuple locks; they do not
2761 * lock "gaps" as index page locks do. So we don't need to specify a
2762 * buffer when making the call.
2763 */
2765
2766 /*
2767 * If tuples are cacheable, mark them for invalidation from the caches in
2768 * case we abort. Note it is OK to do this after releasing the buffer,
2769 * because the heaptuples data structure is all in local memory, not in
2770 * the shared buffer.
2771 */
2772 if (IsCatalogRelation(relation))
2773 {
2774 for (i = 0; i < ntuples; i++)
2776 }
2777
2778 /* copy t_self fields back to the caller's slots */
2779 for (i = 0; i < ntuples; i++)
2780 slots[i]->tts_tid = heaptuples[i]->t_self;
2781
2782 pgstat_count_heap_insert(relation, ntuples);
2783}
2784
2785/*
2786 * simple_heap_insert - insert a tuple
2787 *
2788 * Currently, this routine differs from heap_insert only in supplying
2789 * a default command ID and not allowing access to the speedup options.
2790 *
2791 * This should be used rather than using heap_insert directly in most places
2792 * where we are modifying system catalogs.
2793 */
2794void
 /*
  * NOTE(review): the signature line is missing from this extraction, and
  * the heap_insert() call below may be incompletely rendered (heap_insert
  * in this file appears to take an xid argument) -- confirm against the
  * canonical source.
  */
2796{
2797 heap_insert(relation, tup, GetCurrentCommandId(true), 0, NULL);
2798}
2799
2800/*
2801 * Given infomask/infomask2, compute the bits that must be saved in the
2802 * "infobits" field of xl_heap_delete, xl_heap_update, xl_heap_lock,
2803 * xl_heap_lock_updated WAL records.
2804 *
2805 * See fix_infomask_from_infobits.
2806 */
2807static uint8
 /*
  * NOTE(review): the signature line and several terms of the bit
  * computation (the infomask-derived XLHL_* flags) are missing from this
  * extraction -- confirm against the canonical source.
  */
2809{
2810 return
2814 /* note we ignore HEAP_XMAX_SHR_LOCK here */
2816 ((infomask2 & HEAP_KEYS_UPDATED) != 0 ?
2817 XLHL_KEYS_UPDATED : 0);
2818}
2819
2820/*
2821 * Given two versions of the same t_infomask for a tuple, compare them and
2822 * return whether the relevant status for a tuple Xmax has changed. This is
2823 * used after a buffer lock has been released and reacquired: we want to ensure
2824 * that the tuple state continues to be the same it was when we previously
2825 * examined it.
2826 *
2827 * Note the Xmax field itself must be compared separately.
2828 */
2829static inline bool
 /*
  * NOTE(review): the signature line (old_infomask/new_infomask parameters)
  * and the initializer of "interesting" (the mask of xmax-related bits)
  * are missing from this extraction -- confirm against the canonical
  * source.
  */
2831{
2832 const uint16 interesting =
2834
 /* Only the masked (xmax-relevant) bits matter for this comparison. */
2835 if ((new_infomask & interesting) != (old_infomask & interesting))
2836 return true;
2837
2838 return false;
2839}
2840
2841/*
2842 * heap_delete - delete a tuple
2843 *
2844 * See table_tuple_delete() for an explanation of the parameters, except that
2845 * this routine directly takes a tuple rather than a slot.
2846 *
2847 * In the failure cases, the routine fills *tmfd with the tuple's t_ctid,
2848 * t_xmax (resolving a possible MultiXact, if necessary), and t_cmax (the last
2849 * only for TM_SelfModified, since we cannot obtain cmax from a combo CID
2850 * generated by another transaction).
2851 */
2853heap_delete(Relation relation, const ItemPointerData *tid,
2854 CommandId cid, Snapshot crosscheck, bool wait,
2855 TM_FailureData *tmfd, bool changingPart)
2856{
 /*
  * NOTE(review): this listing is an HTML extraction with gaps -- the
  * return-type line, several local declarations (e.g. xid, xwait,
  * new_infomask/new_infomask2), buffer lock/unlock calls, and parts of
  * the critical section are missing.  Verify against the canonical
  * source before relying on this copy.
  */
2857 TM_Result result;
2859 ItemId lp;
2860 HeapTupleData tp;
2861 Page page;
2862 BlockNumber block;
2863 Buffer buffer;
2864 Buffer vmbuffer = InvalidBuffer;
2865 TransactionId new_xmax;
2868 bool have_tuple_lock = false;
2869 bool iscombo;
2870 bool all_visible_cleared = false;
2871 HeapTuple old_key_tuple = NULL; /* replica identity of the tuple */
2872 bool old_key_copied = false;
2873
2875
2876 AssertHasSnapshotForToast(relation);
2877
2878 /*
2879 * Forbid this during a parallel operation, lest it allocate a combo CID.
2880 * Other workers might need that combo CID for visibility checks, and we
2881 * have no provision for broadcasting it to them.
2882 */
2883 if (IsInParallelMode())
2884 ereport(ERROR,
2886 errmsg("cannot delete tuples during a parallel operation")));
2887
2888 block = ItemPointerGetBlockNumber(tid);
2889 buffer = ReadBuffer(relation, block);
2890 page = BufferGetPage(buffer);
2891
2892 /*
2893 * Before locking the buffer, pin the visibility map page if it appears to
2894 * be necessary. Since we haven't got the lock yet, someone else might be
2895 * in the middle of changing this, so we'll need to recheck after we have
2896 * the lock.
2897 */
2898 if (PageIsAllVisible(page))
2899 visibilitymap_pin(relation, block, &vmbuffer);
2900
2902
2905
2906 tp.t_tableOid = RelationGetRelid(relation);
2907 tp.t_data = (HeapTupleHeader) PageGetItem(page, lp);
2908 tp.t_len = ItemIdGetLength(lp);
2909 tp.t_self = *tid;
2910
 /* Retry point: re-examine the tuple after any lock release/reacquire. */
2911l1:
2912
2913 /*
2914 * If we didn't pin the visibility map page and the page has become all
2915 * visible while we were busy locking the buffer, we'll have to unlock and
2916 * re-lock, to avoid holding the buffer lock across an I/O. That's a bit
2917 * unfortunate, but hopefully shouldn't happen often.
2918 */
2919 if (vmbuffer == InvalidBuffer && PageIsAllVisible(page))
2920 {
2922 visibilitymap_pin(relation, block, &vmbuffer);
2924 }
2925
2926 result = HeapTupleSatisfiesUpdate(&tp, cid, buffer);
2927
2928 if (result == TM_Invisible)
2929 {
2930 UnlockReleaseBuffer(buffer);
2931 ereport(ERROR,
2933 errmsg("attempted to delete invisible tuple")));
2934 }
2935 else if (result == TM_BeingModified && wait)
2936 {
2939
2940 /* must copy state data before unlocking buffer */
2943
2944 /*
2945 * Sleep until concurrent transaction ends -- except when there's a
2946 * single locker and it's our own transaction. Note we don't care
2947 * which lock mode the locker has, because we need the strongest one.
2948 *
2949 * Before sleeping, we need to acquire tuple lock to establish our
2950 * priority for the tuple (see heap_lock_tuple). LockTuple will
2951 * release us when we are next-in-line for the tuple.
2952 *
2953 * If we are forced to "start over" below, we keep the tuple lock;
2954 * this arranges that we stay at the head of the line while rechecking
2955 * tuple state.
2956 */
2958 {
2959 bool current_is_member = false;
2960
2963 {
2965
2966 /*
2967 * Acquire the lock, if necessary (but skip it when we're
2968 * requesting a lock and already have one; avoids deadlock).
2969 */
2970 if (!current_is_member)
2973
2974 /* wait for multixact */
2976 relation, &(tp.t_self), XLTW_Delete,
2977 NULL);
2979
2980 /*
2981 * If xwait had just locked the tuple then some other xact
2982 * could update this tuple before we get to this point. Check
2983 * for xmax change, and start over if so.
2984 *
2985 * We also must start over if we didn't pin the VM page, and
2986 * the page has become all visible.
2987 */
2988 if ((vmbuffer == InvalidBuffer && PageIsAllVisible(page)) ||
2991 xwait))
2992 goto l1;
2993 }
2994
2995 /*
2996 * You might think the multixact is necessarily done here, but not
2997 * so: it could have surviving members, namely our own xact or
2998 * other subxacts of this backend. It is legal for us to delete
2999 * the tuple in either case, however (the latter case is
3000 * essentially a situation of upgrading our former shared lock to
3001 * exclusive). We don't bother changing the on-disk hint bits
3002 * since we are about to overwrite the xmax altogether.
3003 */
3004 }
3006 {
3007 /*
3008 * Wait for regular transaction to end; but first, acquire tuple
3009 * lock.
3010 */
3014 XactLockTableWait(xwait, relation, &(tp.t_self), XLTW_Delete);
3016
3017 /*
3018 * xwait is done, but if xwait had just locked the tuple then some
3019 * other xact could update this tuple before we get to this point.
3020 * Check for xmax change, and start over if so.
3021 *
3022 * We also must start over if we didn't pin the VM page, and the
3023 * page has become all visible.
3024 */
3025 if ((vmbuffer == InvalidBuffer && PageIsAllVisible(page)) ||
3028 xwait))
3029 goto l1;
3030
3031 /* Otherwise check if it committed or aborted */
3032 UpdateXmaxHintBits(tp.t_data, buffer, xwait);
3033 }
3034
3035 /*
3036 * We may overwrite if previous xmax aborted, or if it committed but
3037 * only locked the tuple without updating it.
3038 */
3039 if ((tp.t_data->t_infomask & HEAP_XMAX_INVALID) ||
3042 result = TM_Ok;
3043 else if (!ItemPointerEquals(&tp.t_self, &tp.t_data->t_ctid))
3044 result = TM_Updated;
3045 else
3046 result = TM_Deleted;
3047 }
3048
3049 /* sanity check the result HeapTupleSatisfiesUpdate() and the logic above */
3050 if (result != TM_Ok)
3051 {
3052 Assert(result == TM_SelfModified ||
3053 result == TM_Updated ||
3054 result == TM_Deleted ||
3055 result == TM_BeingModified);
3057 Assert(result != TM_Updated ||
3059 }
3060
3061 if (crosscheck != InvalidSnapshot && result == TM_Ok)
3062 {
3063 /* Perform additional check for transaction-snapshot mode RI updates */
3064 if (!HeapTupleSatisfiesVisibility(&tp, crosscheck, buffer))
3065 result = TM_Updated;
3066 }
3067
 /* Failure path: report conflict details to the caller and clean up. */
3068 if (result != TM_Ok)
3069 {
3070 tmfd->ctid = tp.t_data->t_ctid;
3072 if (result == TM_SelfModified)
3074 else
3075 tmfd->cmax = InvalidCommandId;
3076 UnlockReleaseBuffer(buffer);
3077 if (have_tuple_lock)
3079 if (vmbuffer != InvalidBuffer)
3080 ReleaseBuffer(vmbuffer);
3081 return result;
3082 }
3083
3084 /*
3085 * We're about to do the actual delete -- check for conflict first, to
3086 * avoid possibly having to roll back work we've just done.
3087 *
3088 * This is safe without a recheck as long as there is no possibility of
3089 * another process scanning the page between this check and the delete
3090 * being visible to the scan (i.e., an exclusive buffer content lock is
3091 * continuously held from this point until the tuple delete is visible).
3092 */
3094
3095 /* replace cid with a combo CID if necessary */
3097
3098 /*
3099 * Compute replica identity tuple before entering the critical section so
3100 * we don't PANIC upon a memory allocation failure.
3101 */
3102 old_key_tuple = ExtractReplicaIdentity(relation, &tp, true, &old_key_copied);
3103
3104 /*
3105 * If this is the first possibly-multixact-able operation in the current
3106 * transaction, set my per-backend OldestMemberMXactId setting. We can be
3107 * certain that the transaction will never become a member of any older
3108 * MultiXactIds than that. (We have to do this even if we end up just
3109 * using our own TransactionId below, since some other backend could
3110 * incorporate our XID into a MultiXact immediately afterwards.)
3111 */
3113
3116 xid, LockTupleExclusive, true,
3117 &new_xmax, &new_infomask, &new_infomask2);
3118
3120
3121 /*
3122 * If this transaction commits, the tuple will become DEAD sooner or
3123 * later. Set flag that this page is a candidate for pruning once our xid
3124 * falls below the OldestXmin horizon. If the transaction finally aborts,
3125 * the subsequent page pruning will be a no-op and the hint will be
3126 * cleared.
3127 */
3128 PageSetPrunable(page, xid);
3129
3130 if (PageIsAllVisible(page))
3131 {
3132 all_visible_cleared = true;
3133 PageClearAllVisible(page);
3134 visibilitymap_clear(relation, BufferGetBlockNumber(buffer),
3135 vmbuffer, VISIBILITYMAP_VALID_BITS);
3136 }
3137
3138 /* store transaction information of xact deleting the tuple */
3144 HeapTupleHeaderSetXmax(tp.t_data, new_xmax);
3146 /* Make sure there is no forward chain link in t_ctid */
3147 tp.t_data->t_ctid = tp.t_self;
3148
3149 /* Signal that this is actually a move into another partition */
3150 if (changingPart)
3152
3153 MarkBufferDirty(buffer);
3154
3155 /*
3156 * XLOG stuff
3157 *
3158 * NB: heap_abort_speculative() uses the same xlog record and replay
3159 * routines.
3160 */
3161 if (RelationNeedsWAL(relation))
3162 {
3166
3167 /*
3168 * For logical decode we need combo CIDs to properly decode the
3169 * catalog
3170 */
3172 log_heap_new_cid(relation, &tp);
3173
3174 xlrec.flags = 0;
3177 if (changingPart)
3179 xlrec.infobits_set = compute_infobits(tp.t_data->t_infomask,
3180 tp.t_data->t_infomask2);
3182 xlrec.xmax = new_xmax;
3183
3184 if (old_key_tuple != NULL)
3185 {
3186 if (relation->rd_rel->relreplident == REPLICA_IDENTITY_FULL)
3188 else
3190 }
3191
3194
3196
3197 /*
3198 * Log replica identity of the deleted tuple if there is one
3199 */
3200 if (old_key_tuple != NULL)
3201 {
3202 xlhdr.t_infomask2 = old_key_tuple->t_data->t_infomask2;
3203 xlhdr.t_infomask = old_key_tuple->t_data->t_infomask;
3204 xlhdr.t_hoff = old_key_tuple->t_data->t_hoff;
3205
3207 XLogRegisterData((char *) old_key_tuple->t_data
3209 old_key_tuple->t_len
3211 }
3212
3213 /* filtering by origin on a row level is much more efficient */
3215
3217
3218 PageSetLSN(page, recptr);
3219 }
3220
3222
3224
3225 if (vmbuffer != InvalidBuffer)
3226 ReleaseBuffer(vmbuffer);
3227
3228 /*
3229 * If the tuple has toasted out-of-line attributes, we need to delete
3230 * those items too. We have to do this before releasing the buffer
3231 * because we need to look at the contents of the tuple, but it's OK to
3232 * release the content lock on the buffer first.
3233 */
3234 if (relation->rd_rel->relkind != RELKIND_RELATION &&
3235 relation->rd_rel->relkind != RELKIND_MATVIEW)
3236 {
3237 /* toast table entries should never be recursively toasted */
3239 }
3240 else if (HeapTupleHasExternal(&tp))
3241 heap_toast_delete(relation, &tp, false);
3242
3243 /*
3244 * Mark tuple for invalidation from system caches at next command
3245 * boundary. We have to do this before releasing the buffer because we
3246 * need to look at the contents of the tuple.
3247 */
3248 CacheInvalidateHeapTuple(relation, &tp, NULL);
3249
3250 /* Now we can release the buffer */
3251 ReleaseBuffer(buffer);
3252
3253 /*
3254 * Release the lmgr tuple lock, if we had it.
3255 */
3256 if (have_tuple_lock)
3258
3259 pgstat_count_heap_delete(relation);
3260
3263
3264 return TM_Ok;
3265}
3266
3267/*
3268 * simple_heap_delete - delete a tuple
3269 *
3270 * This routine may be used to delete a tuple when concurrent updates of
3271 * the target tuple are not expected (for example, because we have a lock
3272 * on the relation associated with the tuple). Any failure is reported
3273 * via ereport().
3274 */
3275void
3276simple_heap_delete(Relation relation, const ItemPointerData *tid)
3277{
 /*
  * NOTE(review): one argument line of the heap_delete() call below is
  * missing from this extraction (presumably the current command id and a
  * crosscheck snapshot, per the parameters documented above heap_delete)
  * -- confirm against the canonical source.
  */
3278 TM_Result result;
3279 TM_FailureData tmfd;
3280
3281 result = heap_delete(relation, tid,
3283 true /* wait for commit */ ,
3284 &tmfd, false /* changingPart */ );
 /* Any outcome other than TM_Ok is unexpected here and raised as an error. */
3285 switch (result)
3286 {
3287 case TM_SelfModified:
3288 /* Tuple was already updated in current command? */
3289 elog(ERROR, "tuple already updated by self");
3290 break;
3291
3292 case TM_Ok:
3293 /* done successfully */
3294 break;
3295
3296 case TM_Updated:
3297 elog(ERROR, "tuple concurrently updated");
3298 break;
3299
3300 case TM_Deleted:
3301 elog(ERROR, "tuple concurrently deleted");
3302 break;
3303
3304 default:
3305 elog(ERROR, "unrecognized heap_delete status: %u", result);
3306 break;
3307 }
3308}
3309
3310/*
3311 * heap_update - replace a tuple
3312 *
3313 * See table_tuple_update() for an explanation of the parameters, except that
3314 * this routine directly takes a tuple rather than a slot.
3315 *
3316 * In the failure cases, the routine fills *tmfd with the tuple's t_ctid,
3317 * t_xmax (resolving a possible MultiXact, if necessary), and t_cmax (the last
3318 * only for TM_SelfModified, since we cannot obtain cmax from a combo CID
3319 * generated by another transaction).
3320 */
3323 CommandId cid, Snapshot crosscheck, bool wait,
3324 TM_FailureData *tmfd, LockTupleMode *lockmode,
3326{
3327 TM_Result result;
3335 ItemId lp;
3339 bool old_key_copied = false;
3340 Page page,
3341 newpage;
3342 BlockNumber block;
3344 Buffer buffer,
3345 newbuf,
3346 vmbuffer = InvalidBuffer,
3348 bool need_toast;
3350 pagefree;
3351 bool have_tuple_lock = false;
3352 bool iscombo;
3353 bool use_hot_update = false;
3354 bool summarized_update = false;
3355 bool key_intact;
3356 bool all_visible_cleared = false;
3357 bool all_visible_cleared_new = false;
3358 bool checked_lockers;
3359 bool locker_remains;
3360 bool id_has_external = false;
3367
3369
3370 /* Cheap, simplistic check that the tuple matches the rel's rowtype. */
3373
3374 AssertHasSnapshotForToast(relation);
3375
3376 /*
3377 * Forbid this during a parallel operation, lest it allocate a combo CID.
3378 * Other workers might need that combo CID for visibility checks, and we
3379 * have no provision for broadcasting it to them.
3380 */
3381 if (IsInParallelMode())
3382 ereport(ERROR,
3384 errmsg("cannot update tuples during a parallel operation")));
3385
3386#ifdef USE_ASSERT_CHECKING
3388#endif
3389
3390 /*
3391 * Fetch the list of attributes to be checked for various operations.
3392 *
3393 * For HOT considerations, this is wasted effort if we fail to update or
3394 * have to put the new tuple on a different page. But we must compute the
3395 * list before obtaining buffer lock --- in the worst case, if we are
3396 * doing an update on one of the relevant system catalogs, we could
3397 * deadlock if we try to fetch the list later. In any case, the relcache
3398 * caches the data so this is usually pretty cheap.
3399 *
3400 * We also need columns used by the replica identity and columns that are
3401 * considered the "key" of rows in the table.
3402 *
3403 * Note that we get copies of each bitmap, so we need not worry about
3404 * relcache flush happening midway through.
3405 */
3418
3420 INJECTION_POINT("heap_update-before-pin", NULL);
3421 buffer = ReadBuffer(relation, block);
3422 page = BufferGetPage(buffer);
3423
3424 /*
3425 * Before locking the buffer, pin the visibility map page if it appears to
3426 * be necessary. Since we haven't got the lock yet, someone else might be
3427 * in the middle of changing this, so we'll need to recheck after we have
3428 * the lock.
3429 */
3430 if (PageIsAllVisible(page))
3431 visibilitymap_pin(relation, block, &vmbuffer);
3432
3434
3436
3437 /*
3438 * Usually, a buffer pin and/or snapshot blocks pruning of otid, ensuring
3439 * we see LP_NORMAL here. When the otid origin is a syscache, we may have
3440 * neither a pin nor a snapshot. Hence, we may see other LP_ states, each
3441 * of which indicates concurrent pruning.
3442 *
3443 * Failing with TM_Updated would be most accurate. However, unlike other
3444 * TM_Updated scenarios, we don't know the successor ctid in LP_UNUSED and
3445 * LP_DEAD cases. While the distinction between TM_Updated and TM_Deleted
3446 * does matter to SQL statements UPDATE and MERGE, those SQL statements
3447 * hold a snapshot that ensures LP_NORMAL. Hence, the choice between
3448 * TM_Updated and TM_Deleted affects only the wording of error messages.
3449 * Settle on TM_Deleted, for two reasons. First, it avoids complicating
3450 * the specification of when tmfd->ctid is valid. Second, it creates
3451 * error log evidence that we took this branch.
3452 *
3453 * Since it's possible to see LP_UNUSED at otid, it's also possible to see
3454 * LP_NORMAL for a tuple that replaced LP_UNUSED. If it's a tuple for an
3455 * unrelated row, we'll fail with "duplicate key value violates unique".
3456 * XXX if otid is the live, newer version of the newtup row, we'll discard
3457 * changes originating in versions of this catalog row after the version
3458 * the caller got from syscache. See syscache-update-pruned.spec.
3459 */
3460 if (!ItemIdIsNormal(lp))
3461 {
3463
3464 UnlockReleaseBuffer(buffer);
3466 if (vmbuffer != InvalidBuffer)
3467 ReleaseBuffer(vmbuffer);
3468 tmfd->ctid = *otid;
3469 tmfd->xmax = InvalidTransactionId;
3470 tmfd->cmax = InvalidCommandId;
3472
3477 /* modified_attrs not yet initialized */
3479 return TM_Deleted;
3480 }
3481
3482 /*
3483 * Fill in enough data in oldtup for HeapDetermineColumnsInfo to work
3484 * properly.
3485 */
3486 oldtup.t_tableOid = RelationGetRelid(relation);
3487 oldtup.t_data = (HeapTupleHeader) PageGetItem(page, lp);
3488 oldtup.t_len = ItemIdGetLength(lp);
3489 oldtup.t_self = *otid;
3490
3491 /* the new tuple is ready, except for this: */
3492 newtup->t_tableOid = RelationGetRelid(relation);
3493
3494 /*
3495 * Determine columns modified by the update. Additionally, identify
3496 * whether any of the unmodified replica identity key attributes in the
3497 * old tuple is externally stored or not. This is required because for
3498 * such attributes the flattened value won't be WAL logged as part of the
3499 * new tuple so we must include it as part of the old_key_tuple. See
3500 * ExtractReplicaIdentity.
3501 */
3503 id_attrs, &oldtup,
3505
3506 /*
3507 * If we're not updating any "key" column, we can grab a weaker lock type.
3508 * This allows for more concurrency when we are running simultaneously
3509 * with foreign key checks.
3510 *
3511 * Note that if a column gets detoasted while executing the update, but
3512 * the value ends up being the same, this test will fail and we will use
3513 * the stronger lock. This is acceptable; the important case to optimize
3514 * is updates that don't manipulate key columns, not those that
3515 * serendipitously arrive at the same key values.
3516 */
3518 {
3519 *lockmode = LockTupleNoKeyExclusive;
3521 key_intact = true;
3522
3523 /*
3524 * If this is the first possibly-multixact-able operation in the
3525 * current transaction, set my per-backend OldestMemberMXactId
3526 * setting. We can be certain that the transaction will never become a
3527 * member of any older MultiXactIds than that. (We have to do this
3528 * even if we end up just using our own TransactionId below, since
3529 * some other backend could incorporate our XID into a MultiXact
3530 * immediately afterwards.)
3531 */
3533 }
3534 else
3535 {
3536 *lockmode = LockTupleExclusive;
3538 key_intact = false;
3539 }
3540
3541 /*
3542 * Note: beyond this point, use oldtup not otid to refer to old tuple.
3543 * otid may very well point at newtup->t_self, which we will overwrite
3544 * with the new tuple's location, so there's great risk of confusion if we
3545 * use otid anymore.
3546 */
3547
3548l2:
3549 checked_lockers = false;
3550 locker_remains = false;
3551 result = HeapTupleSatisfiesUpdate(&oldtup, cid, buffer);
3552
3553 /* see below about the "no wait" case */
3554 Assert(result != TM_BeingModified || wait);
3555
3556 if (result == TM_Invisible)
3557 {
3558 UnlockReleaseBuffer(buffer);
3559 ereport(ERROR,
3561 errmsg("attempted to update invisible tuple")));
3562 }
3563 else if (result == TM_BeingModified && wait)
3564 {
3567 bool can_continue = false;
3568
3569 /*
3570 * XXX note that we don't consider the "no wait" case here. This
3571 * isn't a problem currently because no caller uses that case, but it
3572 * should be fixed if such a caller is introduced. It wasn't a
3573 * problem previously because this code would always wait, but now
3574 * that some tuple locks do not conflict with one of the lock modes we
3575 * use, it is possible that this case is interesting to handle
3576 * specially.
3577 *
3578 * This may cause failures with third-party code that calls
3579 * heap_update directly.
3580 */
3581
3582 /* must copy state data before unlocking buffer */
3584 infomask = oldtup.t_data->t_infomask;
3585
3586 /*
3587 * Now we have to do something about the existing locker. If it's a
3588 * multi, sleep on it; we might be awakened before it is completely
3589 * gone (or even not sleep at all in some cases); we need to preserve
3590 * it as locker, unless it is gone completely.
3591 *
3592 * If it's not a multi, we need to check for sleeping conditions
3593 * before actually going to sleep. If the update doesn't conflict
3594 * with the locks, we just continue without sleeping (but making sure
3595 * it is preserved).
3596 *
3597 * Before sleeping, we need to acquire tuple lock to establish our
3598 * priority for the tuple (see heap_lock_tuple). LockTuple will
3599 * release us when we are next-in-line for the tuple. Note we must
3600 * not acquire the tuple lock until we're sure we're going to sleep;
3601 * otherwise we're open for race conditions with other transactions
3602 * holding the tuple lock which sleep on us.
3603 *
3604 * If we are forced to "start over" below, we keep the tuple lock;
3605 * this arranges that we stay at the head of the line while rechecking
3606 * tuple state.
3607 */
3609 {
3611 int remain;
3612 bool current_is_member = false;
3613
3615 *lockmode, &current_is_member))
3616 {
3618
3619 /*
3620 * Acquire the lock, if necessary (but skip it when we're
3621 * requesting a lock and already have one; avoids deadlock).
3622 */
3623 if (!current_is_member)
3624 heap_acquire_tuplock(relation, &(oldtup.t_self), *lockmode,
3626
3627 /* wait for multixact */
3629 relation, &oldtup.t_self, XLTW_Update,
3630 &remain);
3631 checked_lockers = true;
3632 locker_remains = remain != 0;
3634
3635 /*
3636 * If xwait had just locked the tuple then some other xact
3637 * could update this tuple before we get to this point. Check
3638 * for xmax change, and start over if so.
3639 */
3640 if (xmax_infomask_changed(oldtup.t_data->t_infomask,
3641 infomask) ||
3643 xwait))
3644 goto l2;
3645 }
3646
3647 /*
3648 * Note that the multixact may not be done by now. It could have
3649 * surviving members; our own xact or other subxacts of this
3650 * backend, and also any other concurrent transaction that locked
3651 * the tuple with LockTupleKeyShare if we only got
3652 * LockTupleNoKeyExclusive. If this is the case, we have to be
3653 * careful to mark the updated tuple with the surviving members in
3654 * Xmax.
3655 *
3656 * Note that there could have been another update in the
3657 * MultiXact. In that case, we need to check whether it committed
3658 * or aborted. If it aborted we are safe to update it again;
3659 * otherwise there is an update conflict, and we have to return
3660 * TableTuple{Deleted, Updated} below.
3661 *
3662 * In the LockTupleExclusive case, we still need to preserve the
3663 * surviving members: those would include the tuple locks we had
3664 * before this one, which are important to keep in case this
3665 * subxact aborts.
3666 */
3667 if (!HEAP_XMAX_IS_LOCKED_ONLY(oldtup.t_data->t_infomask))
3669 else
3671
3672 /*
3673 * There was no UPDATE in the MultiXact; or it aborted. No
3674 * TransactionIdIsInProgress() call needed here, since we called
3675 * MultiXactIdWait() above.
3676 */
3679 can_continue = true;
3680 }
3682 {
3683 /*
3684 * The only locker is ourselves; we can avoid grabbing the tuple
3685 * lock here, but must preserve our locking information.
3686 */
3687 checked_lockers = true;
3688 locker_remains = true;
3689 can_continue = true;
3690 }
3692 {
3693 /*
3694 * If it's just a key-share locker, and we're not changing the key
3695 * columns, we don't need to wait for it to end; but we need to
3696 * preserve it as locker.
3697 */
3698 checked_lockers = true;
3699 locker_remains = true;
3700 can_continue = true;
3701 }
3702 else
3703 {
3704 /*
3705 * Wait for regular transaction to end; but first, acquire tuple
3706 * lock.
3707 */
3709 heap_acquire_tuplock(relation, &(oldtup.t_self), *lockmode,
3711 XactLockTableWait(xwait, relation, &oldtup.t_self,
3712 XLTW_Update);
3713 checked_lockers = true;
3715
3716 /*
3717 * xwait is done, but if xwait had just locked the tuple then some
3718 * other xact could update this tuple before we get to this point.
3719 * Check for xmax change, and start over if so.
3720 */
3721 if (xmax_infomask_changed(oldtup.t_data->t_infomask, infomask) ||
3724 goto l2;
3725
3726 /* Otherwise check if it committed or aborted */
3727 UpdateXmaxHintBits(oldtup.t_data, buffer, xwait);
3728 if (oldtup.t_data->t_infomask & HEAP_XMAX_INVALID)
3729 can_continue = true;
3730 }
3731
3732 if (can_continue)
3733 result = TM_Ok;
3734 else if (!ItemPointerEquals(&oldtup.t_self, &oldtup.t_data->t_ctid))
3735 result = TM_Updated;
3736 else
3737 result = TM_Deleted;
3738 }
3739
3740 /* Sanity check the result HeapTupleSatisfiesUpdate() and the logic above */
3741 if (result != TM_Ok)
3742 {
3743 Assert(result == TM_SelfModified ||
3744 result == TM_Updated ||
3745 result == TM_Deleted ||
3746 result == TM_BeingModified);
3747 Assert(!(oldtup.t_data->t_infomask & HEAP_XMAX_INVALID));
3748 Assert(result != TM_Updated ||
3749 !ItemPointerEquals(&oldtup.t_self, &oldtup.t_data->t_ctid));
3750 }
3751
3752 if (crosscheck != InvalidSnapshot && result == TM_Ok)
3753 {
3754 /* Perform additional check for transaction-snapshot mode RI updates */
3756 result = TM_Updated;
3757 }
3758
3759 if (result != TM_Ok)
3760 {
3761 tmfd->ctid = oldtup.t_data->t_ctid;
3762 tmfd->xmax = HeapTupleHeaderGetUpdateXid(oldtup.t_data);
3763 if (result == TM_SelfModified)
3764 tmfd->cmax = HeapTupleHeaderGetCmax(oldtup.t_data);
3765 else
3766 tmfd->cmax = InvalidCommandId;
3767 UnlockReleaseBuffer(buffer);
3768 if (have_tuple_lock)
3769 UnlockTupleTuplock(relation, &(oldtup.t_self), *lockmode);
3770 if (vmbuffer != InvalidBuffer)
3771 ReleaseBuffer(vmbuffer);
3773
3780 return result;
3781 }
3782
3783 /*
3784 * If we didn't pin the visibility map page and the page has become all
3785 * visible while we were busy locking the buffer, or during some
3786 * subsequent window during which we had it unlocked, we'll have to unlock
3787 * and re-lock, to avoid holding the buffer lock across an I/O. That's a
3788 * bit unfortunate, especially since we'll now have to recheck whether the
3789 * tuple has been locked or updated under us, but hopefully it won't
3790 * happen very often.
3791 */
3792 if (vmbuffer == InvalidBuffer && PageIsAllVisible(page))
3793 {
3795 visibilitymap_pin(relation, block, &vmbuffer);
3797 goto l2;
3798 }
3799
3800 /* Fill in transaction status data */
3801
3802 /*
3803 * If the tuple we're updating is locked, we need to preserve the locking
3804 * info in the old tuple's Xmax. Prepare a new Xmax value for this.
3805 */
3807 oldtup.t_data->t_infomask,
3808 oldtup.t_data->t_infomask2,
3809 xid, *lockmode, true,
3812
3813 /*
3814 * And also prepare an Xmax value for the new copy of the tuple. If there
3815 * was no xmax previously, or there was one but all lockers are now gone,
3816 * then use InvalidTransactionId; otherwise, get the xmax from the old
3817 * tuple. (In rare cases that might also be InvalidTransactionId and yet
3818 * not have the HEAP_XMAX_INVALID bit set; that's fine.)
3819 */
3820 if ((oldtup.t_data->t_infomask & HEAP_XMAX_INVALID) ||
3821 HEAP_LOCKED_UPGRADED(oldtup.t_data->t_infomask) ||
3824 else
3826
3828 {
3831 }
3832 else
3833 {
3834 /*
3835 * If we found a valid Xmax for the new tuple, then the infomask bits
3836 * to use on the new tuple depend on what was there on the old one.
3837 * Note that since we're doing an update, the only possibility is that
3838 * the lockers had FOR KEY SHARE lock.
3839 */
3840 if (oldtup.t_data->t_infomask & HEAP_XMAX_IS_MULTI)
3841 {
3844 }
3845 else
3846 {
3849 }
3850 }
3851
3852 /*
3853 * Prepare the new tuple with the appropriate initial values of Xmin and
3854 * Xmax, as well as initial infomask bits as computed above.
3855 */
3856 newtup->t_data->t_infomask &= ~(HEAP_XACT_MASK);
3857 newtup->t_data->t_infomask2 &= ~(HEAP2_XACT_MASK);
3858 HeapTupleHeaderSetXmin(newtup->t_data, xid);
3860 newtup->t_data->t_infomask |= HEAP_UPDATED | infomask_new_tuple;
3861 newtup->t_data->t_infomask2 |= infomask2_new_tuple;
3863
3864 /*
3865 * Replace cid with a combo CID if necessary. Note that we already put
3866 * the plain cid into the new tuple.
3867 */
3869
3870 /*
3871 * If the toaster needs to be activated, OR if the new tuple will not fit
3872 * on the same page as the old, then we need to release the content lock
3873 * (but not the pin!) on the old tuple's buffer while we are off doing
3874 * TOAST and/or table-file-extension work. We must mark the old tuple to
3875 * show that it's locked, else other processes may try to update it
3876 * themselves.
3877 *
3878 * We need to invoke the toaster if there are already any out-of-line
3879 * toasted values present, or if the new tuple is over-threshold.
3880 */
3881 if (relation->rd_rel->relkind != RELKIND_RELATION &&
3882 relation->rd_rel->relkind != RELKIND_MATVIEW)
3883 {
3884 /* toast table entries should never be recursively toasted */
3887 need_toast = false;
3888 }
3889 else
3892 newtup->t_len > TOAST_TUPLE_THRESHOLD);
3893
3895
3896 newtupsize = MAXALIGN(newtup->t_len);
3897
3899 {
3903 bool cleared_all_frozen = false;
3904
3905 /*
3906 * To prevent concurrent sessions from updating the tuple, we have to
3907 * temporarily mark it locked, while we release the page-level lock.
3908 *
3909 * To satisfy the rule that any xid potentially appearing in a buffer
3910 * written out to disk, we unfortunately have to WAL log this
3911 * temporary modification. We can reuse xl_heap_lock for this
3912 * purpose. If we crash/error before following through with the
3913 * actual update, xmax will be of an aborted transaction, allowing
3914 * other sessions to proceed.
3915 */
3916
3917 /*
3918 * Compute xmax / infomask appropriate for locking the tuple. This has
3919 * to be done separately from the combo that's going to be used for
3920 * updating, because the potentially created multixact would otherwise
3921 * be wrong.
3922 */
3924 oldtup.t_data->t_infomask,
3925 oldtup.t_data->t_infomask2,
3926 xid, *lockmode, false,
3929
3931
3933
3934 /* Clear obsolete visibility flags ... */
3935 oldtup.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
3936 oldtup.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
3938 /* ... and store info about transaction updating this tuple */
3941 oldtup.t_data->t_infomask |= infomask_lock_old_tuple;
3942 oldtup.t_data->t_infomask2 |= infomask2_lock_old_tuple;
3944
3945 /* temporarily make it look not-updated, but locked */
3946 oldtup.t_data->t_ctid = oldtup.t_self;
3947
3948 /*
3949 * Clear all-frozen bit on visibility map if needed. We could
3950 * immediately reset ALL_VISIBLE, but given that the WAL logging
3951 * overhead would be unchanged, that doesn't seem necessarily
3952 * worthwhile.
3953 */
3954 if (PageIsAllVisible(page) &&
3955 visibilitymap_clear(relation, block, vmbuffer,
3957 cleared_all_frozen = true;
3958
3959 MarkBufferDirty(buffer);
3960
3961 if (RelationNeedsWAL(relation))
3962 {
3965
3968
3969 xlrec.offnum = ItemPointerGetOffsetNumber(&oldtup.t_self);
3971 xlrec.infobits_set = compute_infobits(oldtup.t_data->t_infomask,
3972 oldtup.t_data->t_infomask2);
3973 xlrec.flags =
3977 PageSetLSN(page, recptr);
3978 }
3979
3981
3983
3984 /*
3985 * Let the toaster do its thing, if needed.
3986 *
3987 * Note: below this point, heaptup is the data we actually intend to
3988 * store into the relation; newtup is the caller's original untoasted
3989 * data.
3990 */
3991 if (need_toast)
3992 {
3993 /* Note we always use WAL and FSM during updates */
3995 newtupsize = MAXALIGN(heaptup->t_len);
3996 }
3997 else
3998 heaptup = newtup;
3999
4000 /*
4001 * Now, do we need a new page for the tuple, or not? This is a bit
4002 * tricky since someone else could have added tuples to the page while
4003 * we weren't looking. We have to recheck the available space after
4004 * reacquiring the buffer lock. But don't bother to do that if the
4005 * former amount of free space is still not enough; it's unlikely
4006 * there's more free now than before.
4007 *
4008 * What's more, if we need to get a new page, we will need to acquire
4009 * buffer locks on both old and new pages. To avoid deadlock against
4010 * some other backend trying to get the same two locks in the other
4011 * order, we must be consistent about the order we get the locks in.
4012 * We use the rule "lock the lower-numbered page of the relation
4013 * first". To implement this, we must do RelationGetBufferForTuple
4014 * while not holding the lock on the old page, and we must rely on it
4015 * to get the locks on both pages in the correct order.
4016 *
4017 * Another consideration is that we need visibility map page pin(s) if
4018 * we will have to clear the all-visible flag on either page. If we
4019 * call RelationGetBufferForTuple, we rely on it to acquire any such
4020 * pins; but if we don't, we have to handle that here. Hence we need
4021 * a loop.
4022 */
4023 for (;;)
4024 {
4025 if (newtupsize > pagefree)
4026 {
4027 /* It doesn't fit, must use RelationGetBufferForTuple. */
4028 newbuf = RelationGetBufferForTuple(relation, heaptup->t_len,
4029 buffer, 0, NULL,
4030 &vmbuffer_new, &vmbuffer,
4031 0);
4032 /* We're all done. */
4033 break;
4034 }
4035 /* Acquire VM page pin if needed and we don't have it. */
4036 if (vmbuffer == InvalidBuffer && PageIsAllVisible(page))
4037 visibilitymap_pin(relation, block, &vmbuffer);
4038 /* Re-acquire the lock on the old tuple's page. */
4040 /* Re-check using the up-to-date free space */
4042 if (newtupsize > pagefree ||
4043 (vmbuffer == InvalidBuffer && PageIsAllVisible(page)))
4044 {
4045 /*
4046 * Rats, it doesn't fit anymore, or somebody just now set the
4047 * all-visible flag. We must now unlock and loop to avoid
4048 * deadlock. Fortunately, this path should seldom be taken.
4049 */
4051 }
4052 else
4053 {
4054 /* We're all done. */
4055 newbuf = buffer;
4056 break;
4057 }
4058 }
4059 }
4060 else
4061 {
4062 /* No TOAST work needed, and it'll fit on same page */
4063 newbuf = buffer;
4064 heaptup = newtup;
4065 }
4066
4068
4069 /*
4070 * We're about to do the actual update -- check for conflict first, to
4071 * avoid possibly having to roll back work we've just done.
4072 *
4073 * This is safe without a recheck as long as there is no possibility of
4074 * another process scanning the pages between this check and the update
4075 * being visible to the scan (i.e., exclusive buffer content lock(s) are
4076 * continuously held from this point until the tuple update is visible).
4077 *
4078 * For the new tuple the only check needed is at the relation level, but
4079 * since both tuples are in the same relation and the check for oldtup
4080 * will include checking the relation level, there is no benefit to a
4081 * separate check for the new tuple.
4082 */
4083 CheckForSerializableConflictIn(relation, &oldtup.t_self,
4084 BufferGetBlockNumber(buffer));
4085
4086 /*
4087 * At this point newbuf and buffer are both pinned and locked, and newbuf
4088 * has enough space for the new tuple. If they are the same buffer, only
4089 * one pin is held.
4090 */
4091
4092 if (newbuf == buffer)
4093 {
4094 /*
4095 * Since the new tuple is going into the same page, we might be able
4096 * to do a HOT update. Check if any of the index columns have been
4097 * changed.
4098 */
4100 {
4101 use_hot_update = true;
4102
4103 /*
4104 * If none of the columns that are used in hot-blocking indexes
4105 * were updated, we can apply HOT, but we do still need to check
4106 * if we need to update the summarizing indexes, and update those
4107 * indexes if the columns were updated, or we may fail to detect
4108 * e.g. value bound changes in BRIN minmax indexes.
4109 */
4111 summarized_update = true;
4112 }
4113 }
4114 else
4115 {
4116 /* Set a hint that the old page could use prune/defrag */
4117 PageSetFull(page);
4118 }
4119
4120 /*
4121 * Compute replica identity tuple before entering the critical section so
4122 * we don't PANIC upon a memory allocation failure.
4123 * ExtractReplicaIdentity() will return NULL if nothing needs to be
4124 * logged. Pass old key required as true only if the replica identity key
4125 * columns are modified or it has external data.
4126 */
4131
4132 /* NO EREPORT(ERROR) from here till changes are logged */
4134
4135 /*
4136 * If this transaction commits, the old tuple will become DEAD sooner or
4137 * later. Set flag that this page is a candidate for pruning once our xid
4138 * falls below the OldestXmin horizon. If the transaction finally aborts,
4139 * the subsequent page pruning will be a no-op and the hint will be
4140 * cleared.
4141 *
4142 * XXX Should we set hint on newbuf as well? If the transaction aborts,
4143 * there would be a prunable tuple in the newbuf; but for now we choose
4144 * not to optimize for aborts. Note that heap_xlog_update must be kept in
4145 * sync if this decision changes.
4146 */
4147 PageSetPrunable(page, xid);
4148
4149 if (use_hot_update)
4150 {
4151 /* Mark the old tuple as HOT-updated */
4153 /* And mark the new tuple as heap-only */
4155 /* Mark the caller's copy too, in case different from heaptup */
4157 }
4158 else
4159 {
4160 /* Make sure tuples are correctly marked as not-HOT */
4164 }
4165
4166 RelationPutHeapTuple(relation, newbuf, heaptup, false); /* insert new tuple */
4167
4168
4169 /* Clear obsolete visibility flags, possibly set by ourselves above... */
4170 oldtup.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
4171 oldtup.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
4172 /* ... and store info about transaction updating this tuple */
4175 oldtup.t_data->t_infomask |= infomask_old_tuple;
4176 oldtup.t_data->t_infomask2 |= infomask2_old_tuple;
4178
4179 /* record address of new tuple in t_ctid of old one */
4180 oldtup.t_data->t_ctid = heaptup->t_self;
4181
4182 /* clear PD_ALL_VISIBLE flags, reset all visibilitymap bits */
4183 if (PageIsAllVisible(page))
4184 {
4185 all_visible_cleared = true;
4186 PageClearAllVisible(page);
4187 visibilitymap_clear(relation, BufferGetBlockNumber(buffer),
4188 vmbuffer, VISIBILITYMAP_VALID_BITS);
4189 }
4190 if (newbuf != buffer && PageIsAllVisible(newpage))
4191 {
4196 }
4197
4198 if (newbuf != buffer)
4200 MarkBufferDirty(buffer);
4201
4202 /* XLOG stuff */
4203 if (RelationNeedsWAL(relation))
4204 {
4206
4207 /*
4208 * For logical decoding we need combo CIDs to properly decode the
4209 * catalog.
4210 */
4212 {
4213 log_heap_new_cid(relation, &oldtup);
4214 log_heap_new_cid(relation, heaptup);
4215 }
4216
4217 recptr = log_heap_update(relation, buffer,
4222 if (newbuf != buffer)
4223 {
4225 }
4226 PageSetLSN(page, recptr);
4227 }
4228
4230
4231 if (newbuf != buffer)
4234
4235 /*
4236 * Mark old tuple for invalidation from system caches at next command
4237 * boundary, and mark the new tuple for invalidation in case we abort. We
4238 * have to do this before releasing the buffer because oldtup is in the
4239 * buffer. (heaptup is all in local memory, but it's necessary to process
4240 * both tuple versions in one call to inval.c so we can avoid redundant
4241 * sinval messages.)
4242 */
4244
4245 /* Now we can release the buffer(s) */
4246 if (newbuf != buffer)
4248 ReleaseBuffer(buffer);
4251 if (BufferIsValid(vmbuffer))
4252 ReleaseBuffer(vmbuffer);
4253
4254 /*
4255 * Release the lmgr tuple lock, if we had it.
4256 */
4257 if (have_tuple_lock)
4258 UnlockTupleTuplock(relation, &(oldtup.t_self), *lockmode);
4259
4260 pgstat_count_heap_update(relation, use_hot_update, newbuf != buffer);
4261
4262 /*
4263 * If heaptup is a private copy, release it. Don't forget to copy t_self
4264 * back to the caller's image, too.
4265 */
4266 if (heaptup != newtup)
4267 {
4268 newtup->t_self = heaptup->t_self;
4270 }
4271
4272 /*
4273 * If it is a HOT update, the update may still need to update summarized
4274 * indexes, lest we fail to update those summaries and get incorrect
4275 * results (for example, minmax bounds of the block may change with this
4276 * update).
4277 */
4278 if (use_hot_update)
4279 {
4282 else
4284 }
4285 else
4287
4290
4297
4298 return TM_Ok;
4299}
4300
4301#ifdef USE_ASSERT_CHECKING
4302/*
4303 * Confirm adequate lock held during heap_update(), per rules from
4304 * README.tuplock section "Locking to write inplace-updated tables".
4305 */
4306static void
4308 const ItemPointerData *otid,
4310{
4311 /* LOCKTAG_TUPLE acceptable for any catalog */
4312 switch (RelationGetRelid(relation))
4313 {
4314 case RelationRelationId:
4315 case DatabaseRelationId:
4316 {
4318
4320 relation->rd_lockInfo.lockRelId.dbId,
4321 relation->rd_lockInfo.lockRelId.relId,
4325 return;
4326 }
4327 break;
4328 default:
4329 Assert(!IsInplaceUpdateRelation(relation));
4330 return;
4331 }
4332
4333 switch (RelationGetRelid(relation))
4334 {
4335 case RelationRelationId:
4336 {
4337 /* LOCKTAG_TUPLE or LOCKTAG_RELATION ok */
4339 Oid relid = classForm->oid;
4340 Oid dbid;
4341 LOCKTAG tag;
4342
4343 if (IsSharedRelation(relid))
4344 dbid = InvalidOid;
4345 else
4346 dbid = MyDatabaseId;
4347
4348 if (classForm->relkind == RELKIND_INDEX)
4349 {
4350 Relation irel = index_open(relid, AccessShareLock);
4351
4352 SET_LOCKTAG_RELATION(tag, dbid, irel->rd_index->indrelid);
4354 }
4355 else
4356 SET_LOCKTAG_RELATION(tag, dbid, relid);
4357
4358 if (!LockHeldByMe(&tag, ShareUpdateExclusiveLock, false) &&
4359 !LockHeldByMe(&tag, ShareRowExclusiveLock, true))
4360 elog(WARNING,
4361 "missing lock for relation \"%s\" (OID %u, relkind %c) @ TID (%u,%u)",
4362 NameStr(classForm->relname),
4363 relid,
4364 classForm->relkind,
4367 }
4368 break;
4369 case DatabaseRelationId:
4370 {
4371 /* LOCKTAG_TUPLE required */
4373
4374 elog(WARNING,
4375 "missing lock on database \"%s\" (OID %u) @ TID (%u,%u)",
4376 NameStr(dbForm->datname),
4377 dbForm->oid,
4380 }
4381 break;
4382 }
4383}
4384
4385/*
4386 * Confirm adequate relation lock held, per rules from README.tuplock section
4387 * "Locking to write inplace-updated tables".
4388 */
4389static void
4391{
4393 Oid relid = classForm->oid;
4394 Oid dbid;
4395 LOCKTAG tag;
4396
4397 if (IsSharedRelation(relid))
4398 dbid = InvalidOid;
4399 else
4400 dbid = MyDatabaseId;
4401
4402 if (classForm->relkind == RELKIND_INDEX)
4403 {
4404 Relation irel = index_open(relid, AccessShareLock);
4405
4406 SET_LOCKTAG_RELATION(tag, dbid, irel->rd_index->indrelid);
4408 }
4409 else
4410 SET_LOCKTAG_RELATION(tag, dbid, relid);
4411
4412 if (!LockHeldByMe(&tag, ShareUpdateExclusiveLock, true))
4413 elog(WARNING,
4414 "missing lock for relation \"%s\" (OID %u, relkind %c) @ TID (%u,%u)",
4415 NameStr(classForm->relname),
4416 relid,
4417 classForm->relkind,
4420}
4421#endif
4422
4423/*
4424 * Check if the specified attribute's values are the same. Subroutine for
4425 * HeapDetermineColumnsInfo.
4426 */
4427static bool
4428heap_attr_equals(TupleDesc tupdesc, int attrnum, Datum value1, Datum value2,
4429 bool isnull1, bool isnull2)
4430{
4431 /*
4432 * If one value is NULL and other is not, then they are certainly not
4433 * equal
4434 */
4435 if (isnull1 != isnull2)
4436 return false;
4437
4438 /*
4439 * If both are NULL, they can be considered equal.
4440 */
4441 if (isnull1)
4442 return true;
4443
4444 /*
4445 * We do simple binary comparison of the two datums. This may be overly
4446 * strict because there can be multiple binary representations for the
4447 * same logical value. But we should be OK as long as there are no false
4448 * positives. Using a type-specific equality operator is messy because
4449 * there could be multiple notions of equality in different operator
4450 * classes; furthermore, we cannot safely invoke user-defined functions
4451 * while holding exclusive buffer lock.
4452 */
4453 if (attrnum <= 0)
4454 {
4455 /* The only allowed system columns are OIDs, so do this */
4457 }
4458 else
4459 {
4461
4463 att = TupleDescCompactAttr(tupdesc, attrnum - 1);
4464 return datumIsEqual(value1, value2, att->attbyval, att->attlen);
4465 }
4466}
4467
4468/*
4469 * Check which columns are being updated.
4470 *
4471 * Given an updated tuple, determine (and return into the output bitmapset),
4472 * from those listed as interesting, the set of columns that changed.
4473 *
4474 * has_external indicates if any of the unmodified attributes (from those
4475 * listed as interesting) of the old tuple is a member of external_cols and is
4476 * stored externally.
4477 */
4478static Bitmapset *
4483 bool *has_external)
4484{
4485 int attidx;
4487 TupleDesc tupdesc = RelationGetDescr(relation);
4488
4489 attidx = -1;
4490 while ((attidx = bms_next_member(interesting_cols, attidx)) >= 0)
4491 {
4492 /* attidx is zero-based, attrnum is the normal attribute number */
4494 Datum value1,
4495 value2;
4496 bool isnull1,
4497 isnull2;
4498
4499 /*
4500 * If it's a whole-tuple reference, say "not equal". It's not really
4501 * worth supporting this case, since it could only succeed after a
4502 * no-op update, which is hardly a case worth optimizing for.
4503 */
4504 if (attrnum == 0)
4505 {
4506 modified = bms_add_member(modified, attidx);
4507 continue;
4508 }
4509
4510 /*
4511 * Likewise, automatically say "not equal" for any system attribute
4512 * other than tableOID; we cannot expect these to be consistent in a
4513 * HOT chain, or even to be set correctly yet in the new tuple.
4514 */
4515 if (attrnum < 0)
4516 {
4517 if (attrnum != TableOidAttributeNumber)
4518 {
4519 modified = bms_add_member(modified, attidx);
4520 continue;
4521 }
4522 }
4523
4524 /*
4525 * Extract the corresponding values. XXX this is pretty inefficient
4526 * if there are many indexed columns. Should we do a single
4527 * heap_deform_tuple call on each tuple, instead? But that doesn't
4528 * work for system columns ...
4529 */
4530 value1 = heap_getattr(oldtup, attrnum, tupdesc, &isnull1);
4531 value2 = heap_getattr(newtup, attrnum, tupdesc, &isnull2);
4532
4533 if (!heap_attr_equals(tupdesc, attrnum, value1,
4534 value2, isnull1, isnull2))
4535 {
4536 modified = bms_add_member(modified, attidx);
4537 continue;
4538 }
4539
4540 /*
4541 * No need to check attributes that can't be stored externally. Note
4542 * that system attributes can't be stored externally.
4543 */
4544 if (attrnum < 0 || isnull1 ||
4545 TupleDescCompactAttr(tupdesc, attrnum - 1)->attlen != -1)
4546 continue;
4547
4548 /*
4549 * Check if the old tuple's attribute is stored externally and is a
4550 * member of external_cols.
4551 */
4554 *has_external = true;
4555 }
4556
4557 return modified;
4558}
4559
4560/*
4561 * simple_heap_update - replace a tuple
4562 *
4563 * This routine may be used to update a tuple when concurrent updates of
4564 * the target tuple are not expected (for example, because we have a lock
4565 * on the relation associated with the tuple). Any failure is reported
4566 * via ereport().
4567 */
4568void
4571{
4572 TM_Result result;
4573 TM_FailureData tmfd;
4574 LockTupleMode lockmode;
4575
4576 result = heap_update(relation, otid, tup,
4578 true /* wait for commit */ ,
4579 &tmfd, &lockmode, update_indexes);
4580 switch (result)
4581 {
4582 case TM_SelfModified:
4583 /* Tuple was already updated in current command? */
4584 elog(ERROR, "tuple already updated by self");
4585 break;
4586
4587 case TM_Ok:
4588 /* done successfully */
4589 break;
4590
4591 case TM_Updated:
4592 elog(ERROR, "tuple concurrently updated");
4593 break;
4594
4595 case TM_Deleted:
4596 elog(ERROR, "tuple concurrently deleted");
4597 break;
4598
4599 default:
4600 elog(ERROR, "unrecognized heap_update status: %u", result);
4601 break;
4602 }
4603}
4604
4605
4606/*
4607 * Return the MultiXactStatus corresponding to the given tuple lock mode.
4608 */
4609static MultiXactStatus
4611{
4612 int retval;
4613
4614 if (is_update)
4615 retval = tupleLockExtraInfo[mode].updstatus;
4616 else
4617 retval = tupleLockExtraInfo[mode].lockstatus;
4618
4619 if (retval == -1)
4620 elog(ERROR, "invalid lock tuple mode %d/%s", mode,
4621 is_update ? "true" : "false");
4622
4623 return (MultiXactStatus) retval;
4624}
4625
4626/*
4627 * heap_lock_tuple - lock a tuple in shared or exclusive mode
4628 *
4629 * Note that this acquires a buffer pin, which the caller must release.
4630 *
4631 * Input parameters:
4632 * relation: relation containing tuple (caller must hold suitable lock)
4633 * cid: current command ID (used for visibility test, and stored into
4634 * tuple's cmax if lock is successful)
4635 * mode: indicates if shared or exclusive tuple lock is desired
4636 * wait_policy: what to do if tuple lock is not available
4637 * follow_updates: if true, follow the update chain to also lock descendant
4638 * tuples.
4639 *
4640 * Output parameters:
4641 * *tuple: all fields filled in
4642 * *buffer: set to buffer holding tuple (pinned but not locked at exit)
4643 * *tmfd: filled in failure cases (see below)
4644 *
4645 * Function results are the same as the ones for table_tuple_lock().
4646 *
4647 * In the failure cases other than TM_Invisible, the routine fills
4648 * *tmfd with the tuple's t_ctid, t_xmax (resolving a possible MultiXact,
4649 * if necessary), and t_cmax (the last only for TM_SelfModified,
4650 * since we cannot obtain cmax from a combo CID generated by another
4651 * transaction).
4652 * See comments for struct TM_FailureData for additional info.
4653 *
4654 * See README.tuplock for a thorough explanation of this mechanism.
4655 */
4657heap_lock_tuple(Relation relation, HeapTuple tuple,
4659 bool follow_updates,
4660 Buffer *buffer, TM_FailureData *tmfd)
4661{
4662 TM_Result result;
4663 ItemPointer tid = &(tuple->t_self);
4664 ItemId lp;
4665 Page page;
4666 Buffer vmbuffer = InvalidBuffer;
4667 BlockNumber block;
4668 TransactionId xid,
4669 xmax;
4673 bool first_time = true;
4674 bool skip_tuple_lock = false;
4675 bool have_tuple_lock = false;
4676 bool cleared_all_frozen = false;
4677
4678 *buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid));
4679 block = ItemPointerGetBlockNumber(tid);
4680
4681 /*
4682 * Before locking the buffer, pin the visibility map page if it appears to
4683 * be necessary. Since we haven't got the lock yet, someone else might be
4684 * in the middle of changing this, so we'll need to recheck after we have
4685 * the lock.
4686 */
4687 if (PageIsAllVisible(BufferGetPage(*buffer)))
4688 visibilitymap_pin(relation, block, &vmbuffer);
4689
4691
4692 page = BufferGetPage(*buffer);
4695
4696 tuple->t_data = (HeapTupleHeader) PageGetItem(page, lp);
4697 tuple->t_len = ItemIdGetLength(lp);
4698 tuple->t_tableOid = RelationGetRelid(relation);
4699
4700l3:
4701 result = HeapTupleSatisfiesUpdate(tuple, cid, *buffer);
4702
4703 if (result == TM_Invisible)
4704 {
4705 /*
4706 * This is possible, but only when locking a tuple for ON CONFLICT DO
4707 * SELECT/UPDATE. We return this value here rather than throwing an
4708 * error in order to give that case the opportunity to throw a more
4709 * specific error.
4710 */
4711 result = TM_Invisible;
4712 goto out_locked;
4713 }
4714 else if (result == TM_BeingModified ||
4715 result == TM_Updated ||
4716 result == TM_Deleted)
4717 {
4721 bool require_sleep;
4722 ItemPointerData t_ctid;
4723
4724 /* must copy state data before unlocking buffer */
4726 infomask = tuple->t_data->t_infomask;
4727 infomask2 = tuple->t_data->t_infomask2;
4728 ItemPointerCopy(&tuple->t_data->t_ctid, &t_ctid);
4729
4731
4732 /*
4733 * If any subtransaction of the current top transaction already holds
4734 * a lock as strong as or stronger than what we're requesting, we
4735 * effectively hold the desired lock already. We *must* succeed
4736 * without trying to take the tuple lock, else we will deadlock
4737 * against anyone wanting to acquire a stronger lock.
4738 *
4739 * Note we only do this the first time we loop on the HTSU result;
4740 * there is no point in testing in subsequent passes, because
4741 * evidently our own transaction cannot have acquired a new lock after
4742 * the first time we checked.
4743 */
4744 if (first_time)
4745 {
4746 first_time = false;
4747
4749 {
4750 int i;
4751 int nmembers;
4752 MultiXactMember *members;
4753
4754 /*
4755 * We don't need to allow old multixacts here; if that had
4756 * been the case, HeapTupleSatisfiesUpdate would have returned
4757 * MayBeUpdated and we wouldn't be here.
4758 */
4759 nmembers =
4760 GetMultiXactIdMembers(xwait, &members, false,
4762
4763 for (i = 0; i < nmembers; i++)
4764 {
4765 /* only consider members of our own transaction */
4766 if (!TransactionIdIsCurrentTransactionId(members[i].xid))
4767 continue;
4768
4769 if (TUPLOCK_from_mxstatus(members[i].status) >= mode)
4770 {
4771 pfree(members);
4772 result = TM_Ok;
4773 goto out_unlocked;
4774 }
4775 else
4776 {
4777 /*
4778 * Disable acquisition of the heavyweight tuple lock.
4779 * Otherwise, when promoting a weaker lock, we might
4780 * deadlock with another locker that has acquired the
4781 * heavyweight tuple lock and is waiting for our
4782 * transaction to finish.
4783 *
4784 * Note that in this case we still need to wait for
4785 * the multixact if required, to avoid acquiring
4786 * conflicting locks.
4787 */
4788 skip_tuple_lock = true;
4789 }
4790 }
4791
4792 if (members)
4793 pfree(members);
4794 }
4796 {
4797 switch (mode)
4798 {
4799 case LockTupleKeyShare:
4803 result = TM_Ok;
4804 goto out_unlocked;
4805 case LockTupleShare:
4808 {
4809 result = TM_Ok;
4810 goto out_unlocked;
4811 }
4812 break;
4815 {
4816 result = TM_Ok;
4817 goto out_unlocked;
4818 }
4819 break;
4820 case LockTupleExclusive:
4823 {
4824 result = TM_Ok;
4825 goto out_unlocked;
4826 }
4827 break;
4828 }
4829 }
4830 }
4831
4832 /*
4833 * Initially assume that we will have to wait for the locking
4834 * transaction(s) to finish. We check various cases below in which
4835 * this can be turned off.
4836 */
4837 require_sleep = true;
4838 if (mode == LockTupleKeyShare)
4839 {
4840 /*
4841 * If we're requesting KeyShare, and there's no update present, we
4842 * don't need to wait. Even if there is an update, we can still
4843 * continue if the key hasn't been modified.
4844 *
4845 * However, if there are updates, we need to walk the update chain
4846 * to mark future versions of the row as locked, too. That way,
4847 * if somebody deletes that future version, we're protected
4848 * against the key going away. This locking of future versions
4849 * could block momentarily, if a concurrent transaction is
4850 * deleting a key; or it could return a value to the effect that
4851 * the transaction deleting the key has already committed. So we
4852 * do this before re-locking the buffer; otherwise this would be
4853 * prone to deadlocks.
4854 *
4855 * Note that the TID we're locking was grabbed before we unlocked
4856 * the buffer. For it to change while we're not looking, the
4857 * other properties we're testing for below after re-locking the
4858 * buffer would also change, in which case we would restart this
4859 * loop above.
4860 */
4862 {
4863 bool updated;
4864
4866
4867 /*
4868 * If there are updates, follow the update chain; bail out if
4869 * that cannot be done.
4870 */
4871 if (follow_updates && updated &&
4872 !ItemPointerEquals(&tuple->t_self, &t_ctid))
4873 {
4874 TM_Result res;
4875
4876 res = heap_lock_updated_tuple(relation,
4877 infomask, xwait, &t_ctid,
4879 mode);
4880 if (res != TM_Ok)
4881 {
4882 result = res;
4883 /* recovery code expects to have buffer lock held */
4885 goto failed;
4886 }
4887 }
4888
4890
4891 /*
4892 * Make sure it's still an appropriate lock, else start over.
4893 * Also, if it wasn't updated before we released the lock, but
4894 * is updated now, we start over too; the reason is that we
4895 * now need to follow the update chain to lock the new
4896 * versions.
4897 */
4898 if (!HeapTupleHeaderIsOnlyLocked(tuple->t_data) &&
4899 ((tuple->t_data->t_infomask2 & HEAP_KEYS_UPDATED) ||
4900 !updated))
4901 goto l3;
4902
4903 /* Things look okay, so we can skip sleeping */
4904 require_sleep = false;
4905
4906 /*
4907 * Note we allow Xmax to change here; other updaters/lockers
4908 * could have modified it before we grabbed the buffer lock.
4909 * However, this is not a problem, because with the recheck we
4910 * just did we ensure that they still don't conflict with the
4911 * lock we want.
4912 */
4913 }
4914 }
4915 else if (mode == LockTupleShare)
4916 {
4917 /*
4918 * If we're requesting Share, we can similarly avoid sleeping if
4919 * there's no update and no exclusive lock present.
4920 */
4923 {
4925
4926 /*
4927 * Make sure it's still an appropriate lock, else start over.
4928 * See above about allowing xmax to change.
4929 */
4932 goto l3;
4933 require_sleep = false;
4934 }
4935 }
4936 else if (mode == LockTupleNoKeyExclusive)
4937 {
4938 /*
4939 * If we're requesting NoKeyExclusive, we might also be able to
4940 * avoid sleeping; just ensure that there no conflicting lock
4941 * already acquired.
4942 */
4944 {
4946 mode, NULL))
4947 {
4948 /*
4949 * No conflict, but if the xmax changed under us in the
4950 * meantime, start over.
4951 */
4955 xwait))
4956 goto l3;
4957
4958 /* otherwise, we're good */
4959 require_sleep = false;
4960 }
4961 }
4963 {
4965
4966 /* if the xmax changed in the meantime, start over */
4969 xwait))
4970 goto l3;
4971 /* otherwise, we're good */
4972 require_sleep = false;
4973 }
4974 }
4975
4976 /*
4977 * As a check independent from those above, we can also avoid sleeping
4978 * if the current transaction is the sole locker of the tuple. Note
4979 * that the strength of the lock already held is irrelevant; this is
4980 * not about recording the lock in Xmax (which will be done regardless
4981 * of this optimization, below). Also, note that the cases where we
4982 * hold a lock stronger than we are requesting are already handled
4983 * above by not doing anything.
4984 *
4985 * Note we only deal with the non-multixact case here; MultiXactIdWait
4986 * is well equipped to deal with this situation on its own.
4987 */
4990 {
4991 /* ... but if the xmax changed in the meantime, start over */
4995 xwait))
4996 goto l3;
4998 require_sleep = false;
4999 }
5000
5001 /*
5002 * Time to sleep on the other transaction/multixact, if necessary.
5003 *
5004 * If the other transaction is an update/delete that's already
5005 * committed, then sleeping cannot possibly do any good: if we're
5006 * required to sleep, get out to raise an error instead.
5007 *
5008 * By here, we either have already acquired the buffer exclusive lock,
5009 * or we must wait for the locking transaction or multixact; so below
5010 * we ensure that we grab buffer lock after the sleep.
5011 */
5012 if (require_sleep && (result == TM_Updated || result == TM_Deleted))
5013 {
5015 goto failed;
5016 }
5017 else if (require_sleep)
5018 {
5019 /*
5020 * Acquire tuple lock to establish our priority for the tuple, or
5021 * die trying. LockTuple will release us when we are next-in-line
5022 * for the tuple. We must do this even if we are share-locking,
5023 * but not if we already have a weaker lock on the tuple.
5024 *
5025 * If we are forced to "start over" below, we keep the tuple lock;
5026 * this arranges that we stay at the head of the line while
5027 * rechecking tuple state.
5028 */
5029 if (!skip_tuple_lock &&
5030 !heap_acquire_tuplock(relation, tid, mode, wait_policy,
5032 {
5033 /*
5034 * This can only happen if wait_policy is Skip and the lock
5035 * couldn't be obtained.
5036 */
5037 result = TM_WouldBlock;
5038 /* recovery code expects to have buffer lock held */
5040 goto failed;
5041 }
5042
5044 {
5046
5047 /* We only ever lock tuples, never update them */
5048 if (status >= MultiXactStatusNoKeyUpdate)
5049 elog(ERROR, "invalid lock mode in heap_lock_tuple");
5050
5051 /* wait for multixact to end, or die trying */
5052 switch (wait_policy)
5053 {
5054 case LockWaitBlock:
5056 relation, &tuple->t_self, XLTW_Lock, NULL);
5057 break;
5058 case LockWaitSkip:
5060 status, infomask, relation,
5061 NULL, false))
5062 {
5063 result = TM_WouldBlock;
5064 /* recovery code expects to have buffer lock held */
5066 goto failed;
5067 }
5068 break;
5069 case LockWaitError:
5071 status, infomask, relation,
5073 ereport(ERROR,
5075 errmsg("could not obtain lock on row in relation \"%s\"",
5076 RelationGetRelationName(relation))));
5077
5078 break;
5079 }
5080
5081 /*
5082 * Of course, the multixact might not be done here: if we're
5083 * requesting a light lock mode, other transactions with light
5084 * locks could still be alive, as well as locks owned by our
5085 * own xact or other subxacts of this backend. We need to
5086 * preserve the surviving MultiXact members. Note that it
5087 * isn't absolutely necessary in the latter case, but doing so
5088 * is simpler.
5089 */
5090 }
5091 else
5092 {
5093 /* wait for regular transaction to end, or die trying */
5094 switch (wait_policy)
5095 {
5096 case LockWaitBlock:
5097 XactLockTableWait(xwait, relation, &tuple->t_self,
5098 XLTW_Lock);
5099 break;
5100 case LockWaitSkip:
5102 {
5103 result = TM_WouldBlock;
5104 /* recovery code expects to have buffer lock held */
5106 goto failed;
5107 }
5108 break;
5109 case LockWaitError:
5111 ereport(ERROR,
5113 errmsg("could not obtain lock on row in relation \"%s\"",
5114 RelationGetRelationName(relation))));
5115 break;
5116 }
5117 }
5118
5119 /* if there are updates, follow the update chain */
5121 !ItemPointerEquals(&tuple->t_self, &t_ctid))
5122 {
5123 TM_Result res;
5124
5125 res = heap_lock_updated_tuple(relation,
5126 infomask, xwait, &t_ctid,
5128 mode);
5129 if (res != TM_Ok)
5130 {
5131 result = res;
5132 /* recovery code expects to have buffer lock held */
5134 goto failed;
5135 }
5136 }
5137
5139
5140 /*
5141 * xwait is done, but if xwait had just locked the tuple then some
5142 * other xact could update this tuple before we get to this point.
5143 * Check for xmax change, and start over if so.
5144 */
5147 xwait))
5148 goto l3;
5149
5151 {
5152 /*
5153 * Otherwise check if it committed or aborted. Note we cannot
5154 * be here if the tuple was only locked by somebody who didn't
5155 * conflict with us; that would have been handled above. So
5156 * that transaction must necessarily be gone by now. But
5157 * don't check for this in the multixact case, because some
5158 * locker transactions might still be running.
5159 */
5160 UpdateXmaxHintBits(tuple->t_data, *buffer, xwait);
5161 }
5162 }
5163
5164 /* By here, we're certain that we hold buffer exclusive lock again */
5165
5166 /*
5167 * We may lock if previous xmax aborted, or if it committed but only
5168 * locked the tuple without updating it; or if we didn't have to wait
5169 * at all for whatever reason.
5170 */
5171 if (!require_sleep ||
5172 (tuple->t_data->t_infomask & HEAP_XMAX_INVALID) ||
5175 result = TM_Ok;
5176 else if (!ItemPointerEquals(&tuple->t_self, &tuple->t_data->t_ctid))
5177 result = TM_Updated;
5178 else
5179 result = TM_Deleted;
5180 }
5181
5182failed:
5183 if (result != TM_Ok)
5184 {
5185 Assert(result == TM_SelfModified || result == TM_Updated ||
5186 result == TM_Deleted || result == TM_WouldBlock);
5187
5188 /*
5189 * When locking a tuple under LockWaitSkip semantics and we fail with
5190 * TM_WouldBlock above, it's possible for concurrent transactions to
5191 * release the lock and set HEAP_XMAX_INVALID in the meantime. So
5192 * this assert is slightly different from the equivalent one in
5193 * heap_delete and heap_update.
5194 */
5195 Assert((result == TM_WouldBlock) ||
5196 !(tuple->t_data->t_infomask & HEAP_XMAX_INVALID));
5197 Assert(result != TM_Updated ||
5198 !ItemPointerEquals(&tuple->t_self, &tuple->t_data->t_ctid));
5199 tmfd->ctid = tuple->t_data->t_ctid;
5200 tmfd->xmax = HeapTupleHeaderGetUpdateXid(tuple->t_data);
5201 if (result == TM_SelfModified)
5202 tmfd->cmax = HeapTupleHeaderGetCmax(tuple->t_data);
5203 else
5204 tmfd->cmax = InvalidCommandId;
5205 goto out_locked;
5206 }
5207
5208 /*
5209 * If we didn't pin the visibility map page and the page has become all
5210 * visible while we were busy locking the buffer, or during some
5211 * subsequent window during which we had it unlocked, we'll have to unlock
5212 * and re-lock, to avoid holding the buffer lock across I/O. That's a bit
5213 * unfortunate, especially since we'll now have to recheck whether the
5214 * tuple has been locked or updated under us, but hopefully it won't
5215 * happen very often.
5216 */
5217 if (vmbuffer == InvalidBuffer && PageIsAllVisible(page))
5218 {
5220 visibilitymap_pin(relation, block, &vmbuffer);
5222 goto l3;
5223 }
5224
5225 xmax = HeapTupleHeaderGetRawXmax(tuple->t_data);
5226 old_infomask = tuple->t_data->t_infomask;
5227
5228 /*
5229 * If this is the first possibly-multixact-able operation in the current
5230 * transaction, set my per-backend OldestMemberMXactId setting. We can be
5231 * certain that the transaction will never become a member of any older
5232 * MultiXactIds than that. (We have to do this even if we end up just
5233 * using our own TransactionId below, since some other backend could
5234 * incorporate our XID into a MultiXact immediately afterwards.)
5235 */
5237
5238 /*
5239 * Compute the new xmax and infomask to store into the tuple. Note we do
5240 * not modify the tuple just yet, because that would leave it in the wrong
5241 * state if multixact.c elogs.
5242 */
5244 GetCurrentTransactionId(), mode, false,
5245 &xid, &new_infomask, &new_infomask2);
5246
5248
5249 /*
5250 * Store transaction information of xact locking the tuple.
5251 *
5252 * Note: Cmax is meaningless in this context, so don't set it; this avoids
5253 * possibly generating a useless combo CID. Moreover, if we're locking a
5254 * previously updated tuple, it's important to preserve the Cmax.
5255 *
5256 * Also reset the HOT UPDATE bit, but only if there's no update; otherwise
5257 * we would break the HOT chain.
5258 */
5261 tuple->t_data->t_infomask |= new_infomask;
5262 tuple->t_data->t_infomask2 |= new_infomask2;
5265 HeapTupleHeaderSetXmax(tuple->t_data, xid);
5266
5267 /*
5268 * Make sure there is no forward chain link in t_ctid. Note that in the
5269 * cases where the tuple has been updated, we must not overwrite t_ctid,
5270 * because it was set by the updater. Moreover, if the tuple has been
5271 * updated, we need to follow the update chain to lock the new versions of
5272 * the tuple as well.
5273 */
5275 tuple->t_data->t_ctid = *tid;
5276
5277 /* Clear only the all-frozen bit on visibility map if needed */
5278 if (PageIsAllVisible(page) &&
5279 visibilitymap_clear(relation, block, vmbuffer,
5281 cleared_all_frozen = true;
5282
5283
5284 MarkBufferDirty(*buffer);
5285
5286 /*
5287 * XLOG stuff. You might think that we don't need an XLOG record because
5288 * there is no state change worth restoring after a crash. You would be
5289 * wrong however: we have just written either a TransactionId or a
5290 * MultiXactId that may never have been seen on disk before, and we need
5291 * to make sure that there are XLOG entries covering those ID numbers.
5292 * Else the same IDs might be re-used after a crash, which would be
5293 * disastrous if this page made it to disk before the crash. Essentially
5294 * we have to enforce the WAL log-before-data rule even in this case.
5295 * (Also, in a PITR log-shipping or 2PC environment, we have to have XLOG
5296 * entries for everything anyway.)
5297 */
5298 if (RelationNeedsWAL(relation))
5299 {
5302
5305
5306 xlrec.offnum = ItemPointerGetOffsetNumber(&tuple->t_self);
5307 xlrec.xmax = xid;
5308 xlrec.infobits_set = compute_infobits(new_infomask,
5309 tuple->t_data->t_infomask2);
5312
5313 /* we don't decode row locks atm, so no need to log the origin */
5314
5316
5317 PageSetLSN(page, recptr);
5318 }
5319
5321
5322 result = TM_Ok;
5323
5326
5328 if (BufferIsValid(vmbuffer))
5329 ReleaseBuffer(vmbuffer);
5330
5331 /*
5332 * Don't update the visibility map here. Locking a tuple doesn't change
5333 * visibility info.
5334 */
5335
5336 /*
5337 * Now that we have successfully marked the tuple as locked, we can
5338 * release the lmgr tuple lock, if we had it.
5339 */
5340 if (have_tuple_lock)
5341 UnlockTupleTuplock(relation, tid, mode);
5342
5343 return result;
5344}
5345
5346/*
5347 * Acquire heavyweight lock on the given tuple, in preparation for acquiring
5348 * its normal, Xmax-based tuple lock.
5349 *
5350 * have_tuple_lock is an input and output parameter: on input, it indicates
5351 * whether the lock has previously been acquired (and this function does
5352 * nothing in that case). If this function returns success, have_tuple_lock
5353 * has been flipped to true.
5354 *
5355 * Returns false if it was unable to obtain the lock; this can only happen if
5356 * wait_policy is Skip.
5357 */
5358static bool
5361{
5362 if (*have_tuple_lock)
5363 return true;
5364
5365 switch (wait_policy)
5366 {
5367 case LockWaitBlock:
5368 LockTupleTuplock(relation, tid, mode);
5369 break;
5370
5371 case LockWaitSkip:
5372 if (!ConditionalLockTupleTuplock(relation, tid, mode, false))
5373 return false;
5374 break;
5375
5376 case LockWaitError:
5378 ereport(ERROR,
5380 errmsg("could not obtain lock on row in relation \"%s\"",
5381 RelationGetRelationName(relation))));
5382 break;
5383 }
5384 *have_tuple_lock = true;
5385
5386 return true;
5387}
5388
5389/*
5390 * Given an original set of Xmax and infomask, and a transaction (identified by
5391 * add_to_xmax) acquiring a new lock of some mode, compute the new Xmax and
5392 * corresponding infomasks to use on the tuple.
5393 *
5394 * Note that this might have side effects such as creating a new MultiXactId.
5395 *
5396 * Most callers will have called HeapTupleSatisfiesUpdate before this function;
5397 * that will have set the HEAP_XMAX_INVALID bit if the xmax was a MultiXactId
5398 * but it was not running anymore. There is a race condition, which is that the
5399 * MultiXactId may have finished since then, but that uncommon case is handled
5400 * either here, or within MultiXactIdExpand.
5401 *
5402 * There is a similar race condition possible when the old xmax was a regular
5403 * TransactionId. We test TransactionIdIsInProgress again just to narrow the
5404 * window, but it's still possible to end up creating an unnecessary
5405 * MultiXactId. Fortunately this is harmless.
5406 */
5407static void
5413{
5414 TransactionId new_xmax;
5417
5419
5420l5:
5421 new_infomask = 0;
5422 new_infomask2 = 0;
5424 {
5425 /*
5426 * No previous locker; we just insert our own TransactionId.
5427 *
5428 * Note that it's critical that this case be the first one checked,
5429 * because there are several blocks below that come back to this one
5430 * to implement certain optimizations; old_infomask might contain
5431 * other dirty bits in those cases, but we don't really care.
5432 */
5433 if (is_update)
5434 {
5435 new_xmax = add_to_xmax;
5436 if (mode == LockTupleExclusive)
5438 }
5439 else
5440 {
5442 switch (mode)
5443 {
5444 case LockTupleKeyShare:
5445 new_xmax = add_to_xmax;
5447 break;
5448 case LockTupleShare:
5449 new_xmax = add_to_xmax;
5451 break;
5453 new_xmax = add_to_xmax;
5455 break;
5456 case LockTupleExclusive:
5457 new_xmax = add_to_xmax;
5460 break;
5461 default:
5462 new_xmax = InvalidTransactionId; /* silence compiler */
5463 elog(ERROR, "invalid lock mode");
5464 }
5465 }
5466 }
5468 {
5470
5471 /*
5472 * Currently we don't allow XMAX_COMMITTED to be set for multis, so
5473 * cross-check.
5474 */
5476
5477 /*
5478 * A multixact together with LOCK_ONLY set but neither lock bit set
5479 * (i.e. a pg_upgraded share locked tuple) cannot possibly be running
5480 * anymore. This check is critical for databases upgraded by
5481 * pg_upgrade; both MultiXactIdIsRunning and MultiXactIdExpand assume
5482 * that such multis are never passed.
5483 */
5485 {
5488 goto l5;
5489 }
5490
5491 /*
5492 * If the XMAX is already a MultiXactId, then we need to expand it to
5493 * include add_to_xmax; but if all the members were lockers and are
5494 * all gone, we can do away with the IS_MULTI bit and just set
5495 * add_to_xmax as the only locker/updater. If all lockers are gone
5496 * and we have an updater that aborted, we can also do without a
5497 * multi.
5498 *
5499 * The cost of doing GetMultiXactIdMembers would be paid by
5500 * MultiXactIdExpand if we weren't to do this, so this check is not
5501 * incurring extra work anyhow.
5502 */
5504 {
5507 old_infomask)))
5508 {
5509 /*
5510 * Reset these bits and restart; otherwise fall through to
5511 * create a new multi below.
5512 */
5515 goto l5;
5516 }
5517 }
5518
5520
5521 new_xmax = MultiXactIdExpand((MultiXactId) xmax, add_to_xmax,
5522 new_status);
5524 }
5526 {
5527 /*
5528 * It's a committed update, so we need to preserve him as updater of
5529 * the tuple.
5530 */
5531 MultiXactStatus status;
5533
5535 status = MultiXactStatusUpdate;
5536 else
5538
5540
5541 /*
5542 * since it's not running, it's obviously impossible for the old
5543 * updater to be identical to the current one, so we need not check
5544 * for that case as we do in the block above.
5545 */
5546 new_xmax = MultiXactIdCreate(xmax, status, add_to_xmax, new_status);
5548 }
5549 else if (TransactionIdIsInProgress(xmax))
5550 {
5551 /*
5552 * If the XMAX is a valid, in-progress TransactionId, then we need to
5553 * create a new MultiXactId that includes both the old locker or
5554 * updater and our own TransactionId.
5555 */
5559
5561 {
5567 {
5570 else
5572 }
5573 else
5574 {
5575 /*
5576 * LOCK_ONLY can be present alone only when a page has been
5577 * upgraded by pg_upgrade. But in that case,
5578 * TransactionIdIsInProgress() should have returned false. We
5579 * assume it's no longer locked in this case.
5580 */
5581 elog(WARNING, "LOCK_ONLY found for Xid in progress %u", xmax);
5584 goto l5;
5585 }
5586 }
5587 else
5588 {
5589 /* it's an update, but which kind? */
5592 else
5594 }
5595
5597
5598 /*
5599 * If the lock to be acquired is for the same TransactionId as the
5600 * existing lock, there's an optimization possible: consider only the
5601 * strongest of both locks as the only one present, and restart.
5602 */
5603 if (xmax == add_to_xmax)
5604 {
5605 /*
5606 * Note that it's not possible for the original tuple to be
5607 * updated: we wouldn't be here because the tuple would have been
5608 * invisible and we wouldn't try to update it. As a subtlety,
5609 * this code can also run when traversing an update chain to lock
5610 * future versions of a tuple. But we wouldn't be here either,
5611 * because the add_to_xmax would be different from the original
5612 * updater.
5613 */
5615
5616 /* acquire the strongest of both */
5617 if (mode < old_mode)
5618 mode = old_mode;
5619 /* mustn't touch is_update */
5620
5622 goto l5;
5623 }
5624
5625 /* otherwise, just fall back to creating a new multixact */
5627 new_xmax = MultiXactIdCreate(xmax, old_status,
5630 }
5633 {
5634 /*
5635 * It's a committed update, so we gotta preserve him as updater of the
5636 * tuple.
5637 */
5638 MultiXactStatus status;
5640
5642 status = MultiXactStatusUpdate;
5643 else
5645
5647
5648 /*
5649 * since it's not running, it's obviously impossible for the old
5650 * updater to be identical to the current one, so we need not check
5651 * for that case as we do in the block above.
5652 */
5653 new_xmax = MultiXactIdCreate(xmax, status, add_to_xmax, new_status);
5655 }
5656 else
5657 {
5658 /*
5659 * Can get here iff the locking/updating transaction was running when
5660 * the infomask was extracted from the tuple, but finished before
5661 * TransactionIdIsInProgress got to run. Deal with it as if there was
5662 * no locker at all in the first place.
5663 */
5665 goto l5;
5666 }
5667
5670 *result_xmax = new_xmax;
5671}
5672
5673/*
5674 * Subroutine for heap_lock_updated_tuple_rec.
5675 *
5676 * Given a hypothetical multixact status held by the transaction identified
5677 * with the given xid, does the current transaction need to wait, fail, or can
5678 * it continue if it wanted to acquire a lock of the given mode? "needwait"
5679 * is set to true if waiting is necessary; if it can continue, then TM_Ok is
5680 * returned. If the lock is already held by the current transaction, return
5681 * TM_SelfModified. In case of a conflict with another transaction, a
5682 * different HeapTupleSatisfiesUpdate return code is returned.
5683 *
5684 * The held status is said to be hypothetical because it might correspond to a
5685 * lock held by a single Xid, i.e. not a real MultiXactId; we express it this
5686 * way for simplicity of API.
5687 */
5688static TM_Result
5691 bool *needwait)
5692{
 /*
  * NOTE(review): this copy of the file is an HTML rendering in which every
  * hyperlinked line was dropped, leaving only a bare source line number
  * (the stray numerals such as "5694" below).  The full parameter list
  * (per the header comment: a MultiXactStatus, the holder's xid, the
  * requested lock mode, the tuple, plus *needwait) and several conditions
  * are missing; verify against upstream heapam.c before relying on this.
  */
5694
5695 *needwait = false;
5697
5698 /*
5699 * Note: we *must* check TransactionIdIsInProgress before
5700 * TransactionIdDidAbort/Commit; see comment at top of heapam_visibility.c
5701 * for an explanation.
5702 */
 /*
  * NOTE(review): the if-condition on the dropped line presumably tested
  * whether xid is our own (current) transaction, per the comment just
  * below -- confirm upstream.
  */
5704 {
5705 /*
5706 * The tuple has already been locked by our own transaction. This is
5707 * very rare but can happen if multiple transactions are trying to
5708 * lock an ancient version of the same tuple.
5709 */
5710 return TM_SelfModified;
5711 }
5712 else if (TransactionIdIsInProgress(xid))
5713 {
5714 /*
5715 * If the locking transaction is running, what we do depends on
5716 * whether the lock modes conflict: if they do, then we must wait for
5717 * it to finish; otherwise we can fall through to lock this tuple
5718 * version without waiting.
5719 */
 /*
  * NOTE(review): dropped condition -- presumably a lock-mode conflict
  * test between the held status and the requested mode; confirm upstream.
  */
5722 {
5723 *needwait = true;
5724 }
5725
5726 /*
5727 * If we set needwait above, then this value doesn't matter;
5728 * otherwise, this value signals to caller that it's okay to proceed.
5729 */
5730 return TM_Ok;
5731 }
5732 else if (TransactionIdDidAbort(xid))
5733 return TM_Ok;
5734 else if (TransactionIdDidCommit(xid))
5735 {
5736 /*
5737 * The other transaction committed. If it was only a locker, then the
5738 * lock is completely gone now and we can return success; but if it
5739 * was an update, then what we do depends on whether the two lock
5740 * modes conflict. If they conflict, then we must report error to
5741 * caller. But if they don't, we can fall through to allow the current
5742 * transaction to lock the tuple.
5743 *
5744 * Note: the reason we worry about ISUPDATE here is because as soon as
5745 * a transaction ends, all its locks are gone and meaningless, and
5746 * thus we can ignore them; whereas its updates persist. In the
5747 * TransactionIdIsInProgress case, above, we don't need to check
5748 * because we know the lock is still "alive" and thus a conflict needs
5749 * always be checked.
5750 */
5751 if (!ISUPDATE_from_mxstatus(status))
5752 return TM_Ok;
5753
 /*
  * NOTE(review): dropped condition -- presumably tests whether the
  * committed update's lock mode conflicts with the requested mode
  * (per the comment above); confirm upstream.
  */
5756 {
5757 /* bummer */
5758 if (!ItemPointerEquals(&tup->t_self, &tup->t_data->t_ctid))
5759 return TM_Updated;
5760 else
5761 return TM_Deleted;
5762 }
5763
5764 return TM_Ok;
5765 }
5766
5767 /* Not in progress, not aborted, not committed -- must have crashed */
5768 return TM_Ok;
5769}
5770
5771
5772/*
5773 * Recursive part of heap_lock_updated_tuple
5774 *
5775 * Fetch the tuple pointed to by tid in rel, and mark it as locked by the given
5776 * xid with the given mode; if this tuple is updated, recurse to lock the new
5777 * version as well.
5778 */
5779static TM_Result
5781 const ItemPointerData *tid, TransactionId xid,
5783{
 /*
  * NOTE(review): hyperlinked lines were dropped from this rendering,
  * leaving bare source line numbers.  Missing here are, presumably, local
  * declarations (the tuple copy "mytup", "tupid", infomask variables,
  * "pinned_desired_page", "priorXmax", "rawxmax", etc.) and several
  * conditions -- the surviving code references them.  Verify against
  * upstream heapam.c before relying on this text.
  */
5784 TM_Result result;
5787 Buffer buf;
5792 TransactionId xmax,
5793 new_xmax;
5794 bool cleared_all_frozen = false;
5796 Buffer vmbuffer = InvalidBuffer;
5797 BlockNumber block;
5798
5799 ItemPointerCopy(tid, &tupid);
5800
5801 for (;;)
5802 {
5803 new_infomask = 0;
5804 new_xmax = InvalidTransactionId;
5806 ItemPointerCopy(&tupid, &(mytup.t_self));
5807
5808 if (!heap_fetch(rel, SnapshotAny, &mytup, &buf, false))
5809 {
5810 /*
5811 * if we fail to find the updated version of the tuple, it's
5812 * because it was vacuumed/pruned away after its creator
5813 * transaction aborted. So behave as if we got to the end of the
5814 * chain, and there's no further tuple to lock: return success to
5815 * caller.
5816 */
5817 result = TM_Ok;
5818 goto out_unlocked;
5819 }
5820
5821l4:
5823
5824 /*
5825 * Before locking the buffer, pin the visibility map page if it
5826 * appears to be necessary. Since we haven't got the lock yet,
5827 * someone else might be in the middle of changing this, so we'll need
5828 * to recheck after we have the lock.
5829 */
5831 {
5832 visibilitymap_pin(rel, block, &vmbuffer);
5833 pinned_desired_page = true;
5834 }
5835 else
5836 pinned_desired_page = false;
5837
5839
5840 /*
5841 * If we didn't pin the visibility map page and the page has become
5842 * all visible while we were busy locking the buffer, we'll have to
5843 * unlock and re-lock, to avoid holding the buffer lock across I/O.
5844 * That's a bit unfortunate, but hopefully shouldn't happen often.
5845 *
5846 * Note: in some paths through this function, we will reach here
5847 * holding a pin on a vm page that may or may not be the one matching
5848 * this page. If this page isn't all-visible, we won't use the vm
5849 * page, but we hold onto such a pin till the end of the function.
5850 */
5852 {
5854 visibilitymap_pin(rel, block, &vmbuffer);
5856 }
5857
5858 /*
5859 * Check the tuple XMIN against prior XMAX, if any. If we reached the
5860 * end of the chain, we're done, so return success.
5861 */
5864 priorXmax))
5865 {
5866 result = TM_Ok;
5867 goto out_locked;
5868 }
5869
5870 /*
5871 * Also check Xmin: if this tuple was created by an aborted
5872 * (sub)transaction, then we already locked the last live one in the
5873 * chain, thus we're done, so return success.
5874 */
5876 {
5877 result = TM_Ok;
5878 goto out_locked;
5879 }
5880
5881 old_infomask = mytup.t_data->t_infomask;
5882 old_infomask2 = mytup.t_data->t_infomask2;
5883 xmax = HeapTupleHeaderGetRawXmax(mytup.t_data);
5884
5885 /*
5886 * If this tuple version has been updated or locked by some concurrent
5887 * transaction(s), what we do depends on whether our lock mode
5888 * conflicts with what those other transactions hold, and also on the
5889 * status of them.
5890 */
5892 {
5894 bool needwait;
5895
 /*
  * NOTE(review): dropped condition -- presumably tests whether xmax
  * is a MultiXactId (this branch walks its members); confirm upstream.
  */
5898 {
5899 int nmembers;
5900 int i;
5901 MultiXactMember *members;
5902
5903 /*
5904 * We don't need a test for pg_upgrade'd tuples: this is only
5905 * applied to tuples after the first in an update chain. Said
5906 * first tuple in the chain may well be locked-in-9.2-and-
5907 * pg_upgraded, but that one was already locked by our caller,
5908 * not us; and any subsequent ones cannot be because our
5909 * caller must necessarily have obtained a snapshot later than
5910 * the pg_upgrade itself.
5911 */
5912 Assert(!HEAP_LOCKED_UPGRADED(mytup.t_data->t_infomask));
5913
5914 nmembers = GetMultiXactIdMembers(rawxmax, &members, false,
5916 for (i = 0; i < nmembers; i++)
5917 {
5918 result = test_lockmode_for_conflict(members[i].status,
5919 members[i].xid,
5920 mode,
5921 &mytup,
5922 &needwait);
5923
5924 /*
5925 * If the tuple was already locked by ourselves in a
5926 * previous iteration of this (say heap_lock_tuple was
5927 * forced to restart the locking loop because of a change
5928 * in xmax), then we hold the lock already on this tuple
5929 * version and we don't need to do anything; and this is
5930 * not an error condition either. We just need to skip
5931 * this tuple and continue locking the next version in the
5932 * update chain.
5933 */
5934 if (result == TM_SelfModified)
5935 {
5936 pfree(members);
5937 goto next;
5938 }
5939
5940 if (needwait)
5941 {
5943 XactLockTableWait(members[i].xid, rel,
5944 &mytup.t_self,
5946 pfree(members);
5947 goto l4;
5948 }
5949 if (result != TM_Ok)
5950 {
5951 pfree(members);
5952 goto out_locked;
5953 }
5954 }
5955 if (members)
5956 pfree(members);
5957 }
5958 else
5959 {
5960 MultiXactStatus status;
5961
5962 /*
5963 * For a non-multi Xmax, we first need to compute the
5964 * corresponding MultiXactStatus by using the infomask bits.
5965 */
5967 {
5971 status = MultiXactStatusForShare;
5973 {
5975 status = MultiXactStatusForUpdate;
5976 else
5978 }
5979 else
5980 {
5981 /*
5982 * LOCK_ONLY present alone (a pg_upgraded tuple marked
5983 * as share-locked in the old cluster) shouldn't be
5984 * seen in the middle of an update chain.
5985 */
5986 elog(ERROR, "invalid lock status in tuple");
5987 }
5988 }
5989 else
5990 {
5991 /* it's an update, but which kind? */
5993 status = MultiXactStatusUpdate;
5994 else
5996 }
5997
5998 result = test_lockmode_for_conflict(status, rawxmax, mode,
5999 &mytup, &needwait);
6000
6001 /*
6002 * If the tuple was already locked by ourselves in a previous
6003 * iteration of this (say heap_lock_tuple was forced to
6004 * restart the locking loop because of a change in xmax), then
6005 * we hold the lock already on this tuple version and we don't
6006 * need to do anything; and this is not an error condition
6007 * either. We just need to skip this tuple and continue
6008 * locking the next version in the update chain.
6009 */
6010 if (result == TM_SelfModified)
6011 goto next;
6012
6013 if (needwait)
6014 {
6016 XactLockTableWait(rawxmax, rel, &mytup.t_self,
6018 goto l4;
6019 }
6020 if (result != TM_Ok)
6021 {
6022 goto out_locked;
6023 }
6024 }
6025 }
6026
6027 /* compute the new Xmax and infomask values for the tuple ... */
6028 compute_new_xmax_infomask(xmax, old_infomask, mytup.t_data->t_infomask2,
6029 xid, mode, false,
6030 &new_xmax, &new_infomask, &new_infomask2);
6031
6033 visibilitymap_clear(rel, block, vmbuffer,
6035 cleared_all_frozen = true;
6036
6038
6039 /* ... and set them */
6040 HeapTupleHeaderSetXmax(mytup.t_data, new_xmax);
6041 mytup.t_data->t_infomask &= ~HEAP_XMAX_BITS;
6042 mytup.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
6043 mytup.t_data->t_infomask |= new_infomask;
6044 mytup.t_data->t_infomask2 |= new_infomask2;
6045
6047
6048 /* XLOG stuff */
6049 if (RelationNeedsWAL(rel))
6050 {
 /*
  * NOTE(review): the WAL-record assembly lines (xl_heap_lock_updated
  * setup, XLogBeginInsert/XLogRegister*/XLogInsert and the recptr
  * declaration) were mostly dropped here; confirm upstream.
  */
6053 Page page = BufferGetPage(buf);
6054
6057
6058 xlrec.offnum = ItemPointerGetOffsetNumber(&mytup.t_self);
6059 xlrec.xmax = new_xmax;
6061 xlrec.flags =
6063
6065
6067
6068 PageSetLSN(page, recptr);
6069 }
6070
6072
6073next:
6074 /* if we find the end of update chain, we're done. */
6075 if (mytup.t_data->t_infomask & HEAP_XMAX_INVALID ||
6077 ItemPointerEquals(&mytup.t_self, &mytup.t_data->t_ctid) ||
6079 {
6080 result = TM_Ok;
6081 goto out_locked;
6082 }
6083
6084 /* tail recursion */
6086 ItemPointerCopy(&(mytup.t_data->t_ctid), &tupid);
6088 }
6089
6090 result = TM_Ok;
6091
6094
6096 if (vmbuffer != InvalidBuffer)
6097 ReleaseBuffer(vmbuffer);
6098
6099 return result;
6100}
6101
6102/*
6103 * heap_lock_updated_tuple
6104 * Follow update chain when locking an updated tuple, acquiring locks (row
6105 * marks) on the updated versions.
6106 *
6107 * 'prior_infomask', 'prior_raw_xmax' and 'prior_ctid' are the corresponding
6108 * fields from the initial tuple. We will lock the tuples starting from the
6109 * one that 'prior_ctid' points to. Note: This function does not lock the
6110 * initial tuple itself.
6111 *
6112 * This function doesn't check visibility, it just unconditionally marks the
6113 * tuple(s) as locked. If any tuple in the updated chain is being deleted
6114 * concurrently (or updated with the key being modified), sleep until the
6115 * transaction doing it is finished.
6116 *
6117 * Note that we don't acquire heavyweight tuple locks on the tuples we walk
6118 * when we have to wait for other transactions to release them, as opposed to
6119 * what heap_lock_tuple does. The reason is that having more than one
6120 * transaction walking the chain is probably uncommon enough that risk of
6121 * starvation is not likely: one of the preconditions for being here is that
6122 * the snapshot in use predates the update that created this tuple (because we
6123 * started at an earlier version of the tuple), but at the same time such a
6124 * transaction cannot be using repeatable read or serializable isolation
6125 * levels, because that would lead to a serializability failure.
6126 */
6127static TM_Result
6133{
 /*
  * NOTE(review): the parameter list (per the header comment, presumably the
  * relation, prior_infomask, prior_raw_xmax, prior_ctid, the locker xid and
  * the lock mode) was on dropped hyperlinked lines in this rendering, as is
  * the guard condition below and the recursive call into
  * heap_lock_updated_tuple_rec; verify against upstream heapam.c.
  */
6134 INJECTION_POINT("heap_lock_updated_tuple", NULL);
6135
6136 /*
6137 * If the tuple has moved into another partition (effectively a delete)
6138 * stop here.
6139 */
6141 {
6143
6144 /*
6145 * If this is the first possibly-multixact-able operation in the
6146 * current transaction, set my per-backend OldestMemberMXactId
6147 * setting. We can be certain that the transaction will never become a
6148 * member of any older MultiXactIds than that. (We have to do this
6149 * even if we end up just using our own TransactionId below, since
6150 * some other backend could incorporate our XID into a MultiXact
6151 * immediately afterwards.)
6152 */
6154
6158 }
6159
6160 /* nothing to lock */
6161 return TM_Ok;
6162}
6163
6164/*
6165 * heap_finish_speculative - mark speculative insertion as successful
6166 *
6167 * To successfully finish a speculative insertion we have to clear speculative
6168 * token from tuple. To do so the t_ctid field, which will contain a
6169 * speculative token value, is modified in place to point to the tuple itself,
6170 * which is characteristic of a newly inserted ordinary tuple.
6171 *
6172 * NB: It is not ok to commit without either finishing or aborting a
6173 * speculative insertion. We could treat speculative tuples of committed
6174 * transactions implicitly as completed, but then we would have to be prepared
6175 * to deal with speculative tokens on committed tuples. That wouldn't be
6176 * difficult - no-one looks at the ctid field of a tuple with invalid xmax -
6177 * but clearing the token at completion isn't very expensive either.
6178 * An explicit confirmation WAL record also makes logical decoding simpler.
6179 */
6180void
6182{
 /*
  * NOTE(review): hyperlinked lines were dropped from this rendering;
  * missing here are the parameter list (the body uses "relation" and
  * "tid"), the buffer lock/critical-section calls, and most of the
  * WAL-record assembly inside the RelationNeedsWAL branch.  Verify
  * against upstream heapam.c before relying on this text.
  */
6183 Buffer buffer;
6184 Page page;
6185 OffsetNumber offnum;
6186 ItemId lp;
6187 HeapTupleHeader htup;
6188
6189 buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid));
6191 page = BufferGetPage(buffer);
6192
6193 offnum = ItemPointerGetOffsetNumber(tid);
 /* NOTE(review): dropped bounds-check condition on offnum; confirm upstream. */
6195 elog(ERROR, "offnum out of range");
6196 lp = PageGetItemId(page, offnum);
6197 if (!ItemIdIsNormal(lp))
6198 elog(ERROR, "invalid lp");
6199
6200 htup = (HeapTupleHeader) PageGetItem(page, lp);
6201
6202 /* NO EREPORT(ERROR) from here till changes are logged */
6204
6206
6207 MarkBufferDirty(buffer);
6208
6209 /*
6210 * Replace the speculative insertion token with a real t_ctid, pointing to
6211 * itself like it does on regular tuples.
6212 */
6213 htup->t_ctid = *tid;
6214
6215 /* XLOG stuff */
6216 if (RelationNeedsWAL(relation))
6217 {
6220
6222
6224
6225 /* We want the same filtering on this as on a plain insert */
6227
6230
6232
6233 PageSetLSN(page, recptr);
6234 }
6235
6237
6238 UnlockReleaseBuffer(buffer);
6239}
6240
6241/*
6242 * heap_abort_speculative - kill a speculatively inserted tuple
6243 *
6244 * Marks a tuple that was speculatively inserted in the same command as dead,
6245 * by setting its xmin as invalid. That makes it immediately appear as dead
6246 * to all transactions, including our own. In particular, it makes
6247 * HeapTupleSatisfiesDirty() regard the tuple as dead, so that another backend
6248 * inserting a duplicate key value won't unnecessarily wait for our whole
6249 * transaction to finish (it'll just wait for our speculative insertion to
6250 * finish).
6251 *
6252 * Killing the tuple prevents "unprincipled deadlocks", which are deadlocks
6253 * that arise due to a mutual dependency that is not user visible. By
6254 * definition, unprincipled deadlocks cannot be prevented by the user
6255 * reordering lock acquisition in client code, because the implementation level
6256 * lock acquisitions are not under the user's direct control. If speculative
6257 * inserters did not take this precaution, then under high concurrency they
6258 * could deadlock with each other, which would not be acceptable.
6259 *
6260 * This is somewhat redundant with heap_delete, but we prefer to have a
6261 * dedicated routine with stripped down requirements. Note that this is also
6262 * used to delete the TOAST tuples created during speculative insertion.
6263 *
6264 * This routine does not affect logical decoding as it only looks at
6265 * confirmation records.
6266 */
6267void
6269{
 /*
  * NOTE(review): hyperlinked lines were dropped from this rendering;
  * missing here are the parameter list (the body uses "relation" and
  * "tid"), the declaration/initialization of "xid" (compared against the
  * tuple's xmin below), buffer lock and critical-section calls, the
  * xmin/xmax header updates, and most of the WAL-record assembly.
  * Verify against upstream heapam.c before relying on this text.
  */
6271 ItemId lp;
6272 HeapTupleData tp;
6273 Page page;
6274 BlockNumber block;
6275 Buffer buffer;
6276
6278
6279 block = ItemPointerGetBlockNumber(tid);
6280 buffer = ReadBuffer(relation, block);
6281 page = BufferGetPage(buffer);
6282
6284
6285 /*
6286 * Page can't be all visible, we just inserted into it, and are still
6287 * running.
6288 */
6289 Assert(!PageIsAllVisible(page));
6290
6293
6294 tp.t_tableOid = RelationGetRelid(relation);
6295 tp.t_data = (HeapTupleHeader) PageGetItem(page, lp);
6296 tp.t_len = ItemIdGetLength(lp);
6297 tp.t_self = *tid;
6298
6299 /*
6300 * Sanity check that the tuple really is a speculatively inserted tuple,
6301 * inserted by us.
6302 */
6303 if (tp.t_data->t_choice.t_heap.t_xmin != xid)
6304 elog(ERROR, "attempted to kill a tuple inserted by another transaction")&#59;
6305 if (!(IsToastRelation(relation) || HeapTupleHeaderIsSpeculative(tp.t_data)))
6306 elog(ERROR, "attempted to kill a non-speculative tuple");
6308
6309 /*
6310 * No need to check for serializable conflicts here. There is never a
6311 * need for a combo CID, either. No need to extract replica identity, or
6312 * do anything special with infomask bits.
6313 */
6314
6316
6317 /*
6318 * The tuple will become DEAD immediately. Flag that this page is a
6319 * candidate for pruning by setting xmin to TransactionXmin. While not
6320 * immediately prunable, it is the oldest xid we can cheaply determine
6321 * that's safe against wraparound / being older than the table's
6322 * relfrozenxid. To defend against the unlikely case of a new relation
6323 * having a newer relfrozenxid than our TransactionXmin, use relfrozenxid
6324 * if so (vacuum can't subsequently move relfrozenxid to beyond
6325 * TransactionXmin, so there's no race here).
6326 */
6328 {
6329 TransactionId relfrozenxid = relation->rd_rel->relfrozenxid;
6331
6332 if (TransactionIdPrecedes(TransactionXmin, relfrozenxid))
6333 prune_xid = relfrozenxid;
6334 else
6337 }
6338
6339 /* store transaction information of xact deleting the tuple */
6342
6343 /*
6344 * Set the tuple header xmin to InvalidTransactionId. This makes the
6345 * tuple immediately invisible everyone. (In particular, to any
6346 * transactions waiting on the speculative token, woken up later.)
6347 */
6349
6350 /* Clear the speculative insertion token too */
6351 tp.t_data->t_ctid = tp.t_self;
6352
6353 MarkBufferDirty(buffer);
6354
6355 /*
6356 * XLOG stuff
6357 *
6358 * The WAL records generated here match heap_delete(). The same recovery
6359 * routines are used.
6360 */
6361 if (RelationNeedsWAL(relation))
6362 {
6365
6367 xlrec.infobits_set = compute_infobits(tp.t_data->t_infomask,
6368 tp.t_data->t_infomask2);
6370 xlrec.xmax = xid;
6371
6375
6376 /* No replica identity & replication origin logged */
6377
6379
6380 PageSetLSN(page, recptr);
6381 }
6382
6384
6386
6387 if (HeapTupleHasExternal(&tp))
6388 {
6389 Assert(!IsToastRelation(relation));
6390 heap_toast_delete(relation, &tp, true);
6391 }
6392
6393 /*
6394 * Never need to mark tuple for invalidation, since catalogs don't support
6395 * speculative insertion
6396 */
6397
6398 /* Now we can release the buffer */
6399 ReleaseBuffer(buffer);
6400
6401 /* count deletion, as we counted the insertion too */
6402 pgstat_count_heap_delete(relation);
6403}
6404
6405/*
6406 * heap_inplace_lock - protect inplace update from concurrent heap_update()
6407 *
6408 * Evaluate whether the tuple's state is compatible with a no-key update.
6409 * Current transaction rowmarks are fine, as is KEY SHARE from any
6410 * transaction. If compatible, return true with the buffer exclusive-locked,
6411 * and the caller must release that by calling
6412 * heap_inplace_update_and_unlock(), calling heap_inplace_unlock(), or raising
6413 * an error. Otherwise, call release_callback(arg), wait for blocking
6414 * transactions to end, and return false.
6415 *
6416 * Since this is intended for system catalogs and SERIALIZABLE doesn't cover
6417 * DDL, this doesn't guarantee any particular predicate locking.
6418 *
6419 * heap_delete() is a rarer source of blocking transactions (xwait). We'll
6420 * wait for such a transaction just like for the normal heap_update() case.
6421 * Normal concurrent DROP commands won't cause that, because all inplace
6422 * updaters take some lock that conflicts with DROP. An explicit SQL "DELETE
6423 * FROM pg_class" can cause it. By waiting, if the concurrent transaction
6424 * executed both "DELETE FROM pg_class" and "INSERT INTO pg_class", our caller
6425 * can find the successor tuple.
6426 *
6427 * Readers of inplace-updated fields expect changes to those fields are
6428 * durable. For example, vac_truncate_clog() reads datfrozenxid from
6429 * pg_database tuples via catalog snapshots. A future snapshot must not
6430 * return a lower datfrozenxid for the same database OID (lower in the
6431 * FullTransactionIdPrecedes() sense). We achieve that since no update of a
6432 * tuple can start while we hold a lock on its buffer. In cases like
6433 * BEGIN;GRANT;CREATE INDEX;COMMIT we're inplace-updating a tuple visible only
6434 * to this transaction. ROLLBACK then is one case where it's okay to lose
6435 * inplace updates. (Restoring relhasindex=false on ROLLBACK is fine, since
6436 * any concurrent CREATE INDEX would have blocked, then inplace-updated the
6437 * committed tuple.)
6438 *
6439 * In principle, we could avoid waiting by overwriting every tuple in the
6440 * updated tuple chain. Reader expectations permit updating a tuple only if
6441 * it's aborted, is the tail of the chain, or we already updated the tuple
6442 * referenced in its t_ctid. Hence, we would need to overwrite the tuples in
6443 * order from tail to head. That would imply either (a) mutating all tuples
6444 * in one critical section or (b) accepting a chance of partial completion.
6445 * Partial completion of a relfrozenxid update would have the weird
6446 * consequence that the table's next VACUUM could see the table's relfrozenxid
6447 * move forward between vacuum_get_cutoffs() and finishing.
6448 */
6449bool
6451 HeapTuple oldtup_ptr, Buffer buffer,
6452 void (*release_callback) (void *), void *arg)
6453{
 /*
  * NOTE(review): hyperlinked lines were dropped from this rendering,
  * leaving bare source line numbers.  Missing here are the first
  * parameter (the body uses "relation"), the buffer LockBuffer call,
  * the HeapTupleSatisfiesUpdate() call whose result is tested below,
  * declarations of "xwait"/"infomask"/"lockmode", and several branch
  * conditions in the TM_BeingModified arm.  Verify against upstream
  * heapam.c before relying on this text.
  */
6454 HeapTupleData oldtup = *oldtup_ptr; /* minimize diff vs. heap_update() */
6455 TM_Result result;
6456 bool ret;
6457
6458#ifdef USE_ASSERT_CHECKING
6459 if (RelationGetRelid(relation) == RelationRelationId)
6461#endif
6462
6463 Assert(BufferIsValid(buffer));
6464
6465 /*
6466 * Register shared cache invals if necessary. Other sessions may finish
6467 * inplace updates of this tuple between this step and LockTuple(). Since
6468 * inplace updates don't change cache keys, that's harmless.
6469 *
6470 * While it's tempting to register invals only after confirming we can
6471 * return true, the following obstacle precludes reordering steps that
6472 * way. Registering invals might reach a CatalogCacheInitializeCache()
6473 * that locks "buffer". That would hang indefinitely if running after our
6474 * own LockBuffer(). Hence, we must register invals before LockBuffer().
6475 */
6477
6478 LockTuple(relation, &oldtup.t_self, InplaceUpdateTupleLock);
6480
6481 /*----------
6482 * Interpret HeapTupleSatisfiesUpdate() like heap_update() does, except:
6483 *
6484 * - wait unconditionally
6485 * - already locked tuple above, since inplace needs that unconditionally
6486 * - don't recheck header after wait: simpler to defer to next iteration
6487 * - don't try to continue even if the updater aborts: likewise
6488 * - no crosscheck
6489 */
6491 buffer);
6492
6493 if (result == TM_Invisible)
6494 {
6495 /* no known way this can happen */
6496 ereport(ERROR,
6498 errmsg_internal("attempted to overwrite invisible tuple")));
6499 }
6500 else if (result == TM_SelfModified)
6501 {
6502 /*
6503 * CREATE INDEX might reach this if an expression is silly enough to
6504 * call e.g. SELECT ... FROM pg_class FOR SHARE. C code of other SQL
6505 * statements might get here after a heap_update() of the same row, in
6506 * the absence of an intervening CommandCounterIncrement().
6507 */
6508 ereport(ERROR,
6510 errmsg("tuple to be updated was already modified by an operation triggered by the current command")));
6511 }
6512 else if (result == TM_BeingModified)
6513 {
6516
6518 infomask = oldtup.t_data->t_infomask;
6519
6521 {
6524 int remain;
6525
6527 lockmode, NULL))
6528 {
6531 ret = false;
6533 relation, &oldtup.t_self, XLTW_Update,
6534 &remain);
6535 }
6536 else
6537 ret = true;
6538 }
6540 ret = true;
6542 ret = true;
6543 else
6544 {
6547 ret = false;
6548 XactLockTableWait(xwait, relation, &oldtup.t_self,
6549 XLTW_Update);
6550 }
6551 }
6552 else
6553 {
6554 ret = (result == TM_Ok);
6555 if (!ret)
6556 {
6559 }
6560 }
6561
6562 /*
6563 * GetCatalogSnapshot() relies on invalidation messages to know when to
6564 * take a new snapshot. COMMIT of xwait is responsible for sending the
6565 * invalidation. We're not acquiring heavyweight locks sufficient to
6566 * block if not yet sent, so we must take a new snapshot to ensure a later
6567 * attempt has a fair chance. While we don't need this if xwait aborted,
6568 * don't bother optimizing that.
6569 */
6570 if (!ret)
6571 {
6572 UnlockTuple(relation, &oldtup.t_self, InplaceUpdateTupleLock);
6575 }
6576 return ret;
6577}
6578
6579/*
6580 * heap_inplace_update_and_unlock - core of systable_inplace_update_finish
6581 *
6582 * The tuple cannot change size, and therefore its header fields and null
6583 * bitmap (if any) don't change either.
6584 *
6585 * Since we hold LOCKTAG_TUPLE, no updater has a local copy of this tuple.
6586 */
6587void
6589 HeapTuple oldtup, HeapTuple tuple,
6590 Buffer buffer)
6591{
 /*
  * NOTE(review): hyperlinked lines were dropped from this rendering;
  * missing here are the first parameter (the body uses "relation"), the
  * invalidation-message gathering call whose trailing argument list
  * survives at "6613", the critical-section begin/end, the temporary
  * page copy ("copied_buffer") setup inside the WAL branch, and the
  * buffer unlock before UnlockTuple().  Verify against upstream
  * heapam.c before relying on this text.
  */
6592 HeapTupleHeader htup = oldtup->t_data;
6593 uint32 oldlen;
6594 uint32 newlen;
6595 char *dst;
6596 char *src;
6597 int nmsgs = 0;
6599 bool RelcacheInitFileInval = false;
6600
6601 Assert(ItemPointerEquals(&oldtup->t_self, &tuple->t_self));
6602 oldlen = oldtup->t_len - htup->t_hoff;
6603 newlen = tuple->t_len - tuple->t_data->t_hoff;
6604 if (oldlen != newlen || htup->t_hoff != tuple->t_data->t_hoff)
6605 elog(ERROR, "wrong tuple length");
6606
6607 dst = (char *) htup + htup->t_hoff;
6608 src = (char *) tuple->t_data + tuple->t_data->t_hoff;
6609
6610 /* Like RecordTransactionCommit(), log only if needed */
6613 &RelcacheInitFileInval);
6614
6615 /*
6616 * Unlink relcache init files as needed. If unlinking, acquire
6617 * RelCacheInitLock until after associated invalidations. By doing this
6618 * in advance, if we checkpoint and then crash between inplace
6619 * XLogInsert() and inval, we don't rely on StartupXLOG() ->
6620 * RelationCacheInitFileRemove(). That uses elevel==LOG, so replay would
6621 * neglect to PANIC on EIO.
6622 */
6624
6625 /*----------
6626 * NO EREPORT(ERROR) from here till changes are complete
6627 *
6628 * Our exclusive buffer lock won't stop a reader having already pinned and
6629 * checked visibility for this tuple. With the usual order of changes
6630 * (i.e. updating the buffer contents before WAL logging), a reader could
6631 * observe our not-yet-persistent update to relfrozenxid and update
6632 * datfrozenxid based on that. A crash in that moment could allow
6633 * datfrozenxid to overtake relfrozenxid:
6634 *
6635 * ["D" is a VACUUM (ONLY_DATABASE_STATS)]
6636 * ["R" is a VACUUM tbl]
6637 * D: vac_update_datfrozenxid() -> systable_beginscan(pg_class)
6638 * D: systable_getnext() returns pg_class tuple of tbl
6639 * R: memcpy() into pg_class tuple of tbl
6640 * D: raise pg_database.datfrozenxid, XLogInsert(), finish
6641 * [crash]
6642 * [recovery restores datfrozenxid w/o relfrozenxid]
6643 *
6644 * We avoid that by using a temporary copy of the buffer to hide our
6645 * change from other backends until the change has been WAL-logged. We
6646 * apply our change to the temporary copy and WAL-log it, before modifying
6647 * the real page. That way any action a reader of the in-place-updated
6648 * value takes will be WAL logged after this change.
6649 */
6651
6652 MarkBufferDirty(buffer);
6653
6654 /* XLOG stuff */
6655 if (RelationNeedsWAL(relation))
6656 {
6659 char *origdata = (char *) BufferGetBlock(buffer);
6660 Page page = BufferGetPage(buffer);
6661 uint16 lower = ((PageHeader) page)->pd_lower;
6662 uint16 upper = ((PageHeader) page)->pd_upper;
6664 RelFileLocator rlocator;
6665 ForkNumber forkno;
6666 BlockNumber blkno;
6668
6669 xlrec.offnum = ItemPointerGetOffsetNumber(&tuple->t_self);
6670 xlrec.dbId = MyDatabaseId;
6672 xlrec.relcacheInitFileInval = RelcacheInitFileInval;
6673 xlrec.nmsgs = nmsgs;
6674
6677 if (nmsgs != 0)
6679 nmsgs * sizeof(SharedInvalidationMessage));
6680
6681 /* register block matching what buffer will look like after changes */
6686 BufferGetTag(buffer, &rlocator, &forkno, &blkno);
6687 Assert(forkno == MAIN_FORKNUM);
6688 XLogRegisterBlock(0, &rlocator, forkno, blkno, copied_buffer.data,
6690 XLogRegisterBufData(0, src, newlen);
6691
6692 /* inplace updates aren't decoded atm, don't log the origin */
6693
6695
6696 PageSetLSN(page, recptr);
6697 }
6698
6699 memcpy(dst, src, newlen);
6700
6702
6703 /*
6704 * Send invalidations to shared queue. SearchSysCacheLocked1() assumes we
6705 * do this before UnlockTuple().
6706 */
6708
6710 UnlockTuple(relation, &tuple->t_self, InplaceUpdateTupleLock);
6711
6712 AcceptInvalidationMessages(); /* local processing of just-sent inval */
6713
6714 /*
6715 * Queue a transactional inval, for logical decoding and for third-party
6716 * code that might have been relying on it since long before inplace
6717 * update adopted immediate invalidation. See README.tuplock section
6718 * "Reading inplace-updated columns" for logical decoding details.
6719 */
6721 CacheInvalidateHeapTuple(relation, tuple, NULL);
6722}
6723
6724/*
6725 * heap_inplace_unlock - reverse of heap_inplace_lock
6726 */
6727void
6729 HeapTuple oldtup, Buffer buffer)
6730{
 /*
  * NOTE(review): the first parameter (the body uses "relation") and the
  * line before UnlockTuple() -- presumably the buffer-lock release
  * matching heap_inplace_lock()'s exclusive buffer lock -- were dropped
  * from this rendering; verify against upstream heapam.c.
  */
6732 UnlockTuple(relation, &oldtup->t_self, InplaceUpdateTupleLock);
6734}
6735
6736#define FRM_NOOP 0x0001
6737#define FRM_INVALIDATE_XMAX 0x0002
6738#define FRM_RETURN_IS_XID 0x0004
6739#define FRM_RETURN_IS_MULTI 0x0008
6740#define FRM_MARK_COMMITTED 0x0010
6741
6742/*
6743 * FreezeMultiXactId
6744 * Determine what to do during freezing when a tuple is marked by a
6745 * MultiXactId.
6746 *
6747 * "flags" is an output value; it's used to tell caller what to do on return.
6748 * "pagefrz" is an input/output value, used to manage page level freezing.
6749 *
6750 * Possible values that we can set in "flags":
6751 * FRM_NOOP
6752 * don't do anything -- keep existing Xmax
6753 * FRM_INVALIDATE_XMAX
6754 * mark Xmax as InvalidTransactionId and set XMAX_INVALID flag.
6755 * FRM_RETURN_IS_XID
6756 * The Xid return value is a single update Xid to set as xmax.
6757 * FRM_MARK_COMMITTED
6758 * Xmax can be marked as HEAP_XMAX_COMMITTED
6759 * FRM_RETURN_IS_MULTI
6760 * The return value is a new MultiXactId to set as new Xmax.
6761 * (caller must obtain proper infomask bits using GetMultiXactIdHintBits)
6762 *
6763 * Caller delegates control of page freezing to us. In practice we always
6764 * force freezing of caller's page unless FRM_NOOP processing is indicated.
6765 * We help caller ensure that XIDs < FreezeLimit and MXIDs < MultiXactCutoff
6766 * can never be left behind. We freely choose when and how to process each
6767 * Multi, without ever violating the cutoff postconditions for freezing.
6768 *
6769 * It's useful to remove Multis on a proactive timeline (relative to freezing
6770 * XIDs) to keep MultiXact member SLRU buffer misses to a minimum. It can also
6771 * be cheaper in the short run, for us, since we too can avoid SLRU buffer
6772 * misses through eager processing.
6773 *
6774 * NB: Creates a _new_ MultiXactId when FRM_RETURN_IS_MULTI is set, though only
6775 * when FreezeLimit and/or MultiXactCutoff cutoffs leave us with no choice.
6776 * This can usually be put off, which is usually enough to avoid it altogether.
6777 * Allocating new multis during VACUUM should be avoided on general principle;
6778 * only VACUUM can advance relminmxid, so allocating new Multis here comes with
6779 * its own special risks.
6780 *
6781 * NB: Caller must maintain "no freeze" NewRelfrozenXid/NewRelminMxid trackers
6782 * using heap_tuple_should_freeze when we haven't forced page-level freezing.
6783 *
6784 * NB: Caller should avoid needlessly calling heap_tuple_should_freeze when we
6785 * have already forced page-level freezing, since that might incur the same
6786 * SLRU buffer misses that we specifically intended to avoid by freezing.
6787 */
6788static TransactionId
6789FreezeMultiXactId(MultiXactId multi, uint16 t_infomask,
6790 const struct VacuumCutoffs *cutoffs, uint16 *flags,
6791 HeapPageFreeze *pagefrz)
6792{
6794 MultiXactMember *members;
6795 int nmembers;
6796 bool need_replace;
6797 int nnewmembers;
6799 bool has_lockers;
6801 bool update_committed;
6802 TransactionId FreezePageRelfrozenXid;
6803
6804 *flags = 0;
6805
6806 /* We should only be called in Multis */
6807 Assert(t_infomask & HEAP_XMAX_IS_MULTI);
6808
6809 if (!MultiXactIdIsValid(multi) ||
6810 HEAP_LOCKED_UPGRADED(t_infomask))
6811 {
6812 *flags |= FRM_INVALIDATE_XMAX;
6813 pagefrz->freeze_required = true;
6814 return InvalidTransactionId;
6815 }
6816 else if (MultiXactIdPrecedes(multi, cutoffs->relminmxid))
6817 ereport(ERROR,
6819 errmsg_internal("found multixact %u from before relminmxid %u",
6820 multi, cutoffs->relminmxid)));
6821 else if (MultiXactIdPrecedes(multi, cutoffs->OldestMxact))
6822 {
6824
6825 /*
6826 * This old multi cannot possibly have members still running, but
6827 * verify just in case. If it was a locker only, it can be removed
6828 * without any further consideration; but if it contained an update,
6829 * we might need to preserve it.
6830 */
6831 if (MultiXactIdIsRunning(multi,
6832 HEAP_XMAX_IS_LOCKED_ONLY(t_infomask)))
6833 ereport(ERROR,
6835 errmsg_internal("multixact %u from before multi freeze cutoff %u found to be still running",
6836 multi, cutoffs->OldestMxact)));
6837
6838 if (HEAP_XMAX_IS_LOCKED_ONLY(t_infomask))
6839 {
6840 *flags |= FRM_INVALIDATE_XMAX;
6841 pagefrz->freeze_required = true;
6842 return InvalidTransactionId;
6843 }
6844
6845 /* replace multi with single XID for its updater? */
6846 update_xact = MultiXactIdGetUpdateXid(multi, t_infomask);
6848 ereport(ERROR,
6850 errmsg_internal("multixact %u contains update XID %u from before relfrozenxid %u",
6851 multi, update_xact,
6852 cutoffs->relfrozenxid)));
6853 else if (TransactionIdPrecedes(update_xact, cutoffs->OldestXmin))
6854 {
6855 /*
6856 * Updater XID has to have aborted (otherwise the tuple would have
6857 * been pruned away instead, since updater XID is < OldestXmin).
6858 * Just remove xmax.
6859 */
6861 ereport(ERROR,
6863 errmsg_internal("multixact %u contains committed update XID %u from before removable cutoff %u",
6864 multi, update_xact,
6865 cutoffs->OldestXmin)));
6866 *flags |= FRM_INVALIDATE_XMAX;
6867 pagefrz->freeze_required = true;
6868 return InvalidTransactionId;
6869 }
6870
6871 /* Have to keep updater XID as new xmax */
6872 *flags |= FRM_RETURN_IS_XID;
6873 pagefrz->freeze_required = true;
6874 return update_xact;
6875 }
6876
6877 /*
6878 * Some member(s) of this Multi may be below FreezeLimit xid cutoff, so we
6879 * need to walk the whole members array to figure out what to do, if
6880 * anything.
6881 */
6882 nmembers =
6883 GetMultiXactIdMembers(multi, &members, false,
6884 HEAP_XMAX_IS_LOCKED_ONLY(t_infomask));
6885 if (nmembers <= 0)
6886 {
6887 /* Nothing worth keeping */
6888 *flags |= FRM_INVALIDATE_XMAX;
6889 pagefrz->freeze_required = true;
6890 return InvalidTransactionId;
6891 }
6892
6893 /*
6894 * The FRM_NOOP case is the only case where we might need to ratchet back
6895 * FreezePageRelfrozenXid or FreezePageRelminMxid. It is also the only
6896 * case where our caller might ratchet back its NoFreezePageRelfrozenXid
6897 * or NoFreezePageRelminMxid "no freeze" trackers to deal with a multi.
6898 * FRM_NOOP handling should result in the NewRelfrozenXid/NewRelminMxid
6899 * trackers managed by VACUUM being ratcheting back by xmax to the degree
6900 * required to make it safe to leave xmax undisturbed, independent of
6901 * whether or not page freezing is triggered somewhere else.
6902 *
6903 * Our policy is to force freezing in every case other than FRM_NOOP,
6904 * which obviates the need to maintain either set of trackers, anywhere.
6905 * Every other case will reliably execute a freeze plan for xmax that
6906 * either replaces xmax with an XID/MXID >= OldestXmin/OldestMxact, or
6907 * sets xmax to an InvalidTransactionId XID, rendering xmax fully frozen.
6908 * (VACUUM's NewRelfrozenXid/NewRelminMxid trackers are initialized with
6909 * OldestXmin/OldestMxact, so later values never need to be tracked here.)
6910 */
6911 need_replace = false;
6912 FreezePageRelfrozenXid = pagefrz->FreezePageRelfrozenXid;
6913 for (int i = 0; i < nmembers; i++)
6914 {
6915 TransactionId xid = members[i].xid;
6916
6917 Assert(!TransactionIdPrecedes(xid, cutoffs->relfrozenxid));
6918
6919 if (TransactionIdPrecedes(xid, cutoffs->FreezeLimit))
6920 {
6921 /* Can't violate the FreezeLimit postcondition */
6922 need_replace = true;
6923 break;
6924 }
6925 if (TransactionIdPrecedes(xid, FreezePageRelfrozenXid))
6926 FreezePageRelfrozenXid = xid;
6927 }
6928
6929 /* Can't violate the MultiXactCutoff postcondition, either */
6930 if (!need_replace)
6932
6933 if (!need_replace)
6934 {
6935 /*
6936 * vacuumlazy.c might ratchet back NewRelminMxid, NewRelfrozenXid, or
6937 * both together to make it safe to retain this particular multi after
6938 * freezing its page
6939 */
6940 *flags |= FRM_NOOP;
6941 pagefrz->FreezePageRelfrozenXid = FreezePageRelfrozenXid;
6942 if (MultiXactIdPrecedes(multi, pagefrz->FreezePageRelminMxid))
6943 pagefrz->FreezePageRelminMxid = multi;
6944 pfree(members);
6945 return multi;
6946 }
6947
6948 /*
6949 * Do a more thorough second pass over the multi to figure out which
6950 * member XIDs actually need to be kept. Checking the precise status of
6951 * individual members might even show that we don't need to keep anything.
6952 * That is quite possible even though the Multi must be >= OldestMxact,
6953 * since our second pass only keeps member XIDs when it's truly necessary;
6954 * even member XIDs >= OldestXmin often won't be kept by second pass.
6955 */
6956 nnewmembers = 0;
6958 has_lockers = false;
6960 update_committed = false;
6961
6962 /*
6963 * Determine whether to keep each member xid, or to ignore it instead
6964 */
6965 for (int i = 0; i < nmembers; i++)
6966 {
6967 TransactionId xid = members[i].xid;
6968 MultiXactStatus mstatus = members[i].status;
6969
6970 Assert(!TransactionIdPrecedes(xid, cutoffs->relfrozenxid));
6971
6972 if (!ISUPDATE_from_mxstatus(mstatus))
6973 {
6974 /*
6975 * Locker XID (not updater XID). We only keep lockers that are
6976 * still running.
6977 */
6980 {
6981 if (TransactionIdPrecedes(xid, cutoffs->OldestXmin))
6982 ereport(ERROR,
6984 errmsg_internal("multixact %u contains running locker XID %u from before removable cutoff %u",
6985 multi, xid,
6986 cutoffs->OldestXmin)));
6987 newmembers[nnewmembers++] = members[i];
6988 has_lockers = true;
6989 }
6990
6991 continue;
6992 }
6993
6994 /*
6995 * Updater XID (not locker XID). Should we keep it?
6996 *
6997 * Since the tuple wasn't totally removed when vacuum pruned, the
6998 * update Xid cannot possibly be older than OldestXmin cutoff unless
6999 * the updater XID aborted. If the updater transaction is known
7000 * aborted or crashed then it's okay to ignore it, otherwise not.
7001 *
7002 * In any case the Multi should never contain two updaters, whatever
7003 * their individual commit status. Check for that first, in passing.
7004 */
7006 ereport(ERROR,
7008 errmsg_internal("multixact %u has two or more updating members",
7009 multi),
7010 errdetail_internal("First updater XID=%u second updater XID=%u.",
7011 update_xid, xid)));
7012
7013 /*
7014 * As with all tuple visibility routines, it's critical to test
7015 * TransactionIdIsInProgress before TransactionIdDidCommit, because of
7016 * race conditions explained in detail in heapam_visibility.c.
7017 */
7020 update_xid = xid;
7021 else if (TransactionIdDidCommit(xid))
7022 {
7023 /*
7024 * The transaction committed, so we can tell caller to set
7025 * HEAP_XMAX_COMMITTED. (We can only do this because we know the
7026 * transaction is not running.)
7027 */
7028 update_committed = true;
7029 update_xid = xid;
7030 }
7031 else
7032 {
7033 /*
7034 * Not in progress, not committed -- must be aborted or crashed;
7035 * we can ignore it.
7036 */
7037 continue;
7038 }
7039
7040 /*
7041 * We determined that updater must be kept -- add it to pending new
7042 * members list
7043 */
7044 if (TransactionIdPrecedes(xid, cutoffs->OldestXmin))
7045 ereport(ERROR,
7047 errmsg_internal("multixact %u contains committed update XID %u from before removable cutoff %u",
7048 multi, xid, cutoffs->OldestXmin)));
7049 newmembers[nnewmembers++] = members[i];
7050 }
7051
7052 pfree(members);
7053
7054 /*
7055 * Determine what to do with caller's multi based on information gathered
7056 * during our second pass
7057 */
7058 if (nnewmembers == 0)
7059 {
7060 /* Nothing worth keeping */
7061 *flags |= FRM_INVALIDATE_XMAX;
7063 }
7065 {
7066 /*
7067 * If there's a single member and it's an update, pass it back alone
7068 * without creating a new Multi. (XXX we could do this when there's a
7069 * single remaining locker, too, but that would complicate the API too
7070 * much; moreover, the case with the single updater is more
7071 * interesting, because those are longer-lived.)
7072 */
7073 Assert(nnewmembers == 1);
7074 *flags |= FRM_RETURN_IS_XID;
7075 if (update_committed)
7076 *flags |= FRM_MARK_COMMITTED;
7078 }
7079 else
7080 {
7081 /*
7082 * Create a new multixact with the surviving members of the previous
7083 * one, to set as new Xmax in the tuple
7084 */
7086 *flags |= FRM_RETURN_IS_MULTI;
7087 }
7088
7090
7091 pagefrz->freeze_required = true;
7092 return newxmax;
7093}
7094
7095/*
7096 * heap_prepare_freeze_tuple
7097 *
7098 * Check to see whether any of the XID fields of a tuple (xmin, xmax, xvac)
7099 * are older than the OldestXmin and/or OldestMxact freeze cutoffs. If so,
7100 * setup enough state (in the *frz output argument) to enable caller to
7101 * process this tuple as part of freezing its page, and return true. Return
7102 * false if nothing can be changed about the tuple right now.
7103 *
7104 * FreezePageConflictXid is advanced only for xmin/xvac freezing, not for xmax
7105 * changes. We only remove xmax state here when it is lock-only, or when the
7106 * updater XID (including an updater member of a MultiXact) must be aborted;
7107 * otherwise, the tuple would already be removable. Neither case affects
7108 * visibility on a standby.
7109 *
7110 * Also sets *totally_frozen to true if the tuple will be totally frozen once
7111 * caller executes returned freeze plan (or if the tuple was already totally
7112 * frozen by an earlier VACUUM). This indicates that there are no remaining
7113 * XIDs or MultiXactIds that will need to be processed by a future VACUUM.
7114 *
7115 * VACUUM caller must assemble HeapTupleFreeze freeze plan entries for every
7116 * tuple that we returned true for, and then execute freezing. Caller must
7117 * initialize pagefrz fields for page as a whole before first call here for
7118 * each heap page.
7119 *
7120 * VACUUM caller decides on whether or not to freeze the page as a whole.
7121 * We'll often prepare freeze plans for a page that caller just discards.
7122 * However, VACUUM doesn't always get to make a choice; it must freeze when
7123 * pagefrz.freeze_required is set, to ensure that any XIDs < FreezeLimit (and
7124 * MXIDs < MultiXactCutoff) can never be left behind. We help to make sure
7125 * that VACUUM always follows that rule.
7126 *
7127 * We sometimes force freezing of xmax MultiXactId values long before it is
7128 * strictly necessary to do so just to ensure the FreezeLimit postcondition.
7129 * It's worth processing MultiXactIds proactively when it is cheap to do so,
7130 * and it's convenient to make that happen by piggy-backing it on the "force
7131 * freezing" mechanism. Conversely, we sometimes delay freezing MultiXactIds
7132 * because it is expensive right now (though only when it's still possible to
7133 * do so without violating the FreezeLimit/MultiXactCutoff postcondition).
7134 *
7135 * It is assumed that the caller has checked the tuple with
7136 * HeapTupleSatisfiesVacuum() and determined that it is not HEAPTUPLE_DEAD
7137 * (else we should be removing the tuple, not freezing it).
7138 *
7139 * NB: This function has side effects: it might allocate a new MultiXactId.
7140 * It will be set as tuple's new xmax when our *frz output is processed within
7141 * heap_execute_freeze_tuple later on. If the tuple is in a shared buffer
7142 * then caller had better have an exclusive lock on it already.
7143 */
7144bool
7146 const struct VacuumCutoffs *cutoffs,
7147 HeapPageFreeze *pagefrz,
7149{
7150 bool xmin_already_frozen = false,
7151 xmax_already_frozen = false;
7152 bool freeze_xmin = false,
7153 replace_xvac = false,
7154 replace_xmax = false,
7155 freeze_xmax = false;
7156 TransactionId xid;
7157
7158 frz->xmax = HeapTupleHeaderGetRawXmax(tuple);
7159 frz->t_infomask2 = tuple->t_infomask2;
7160 frz->t_infomask = tuple->t_infomask;
7161 frz->frzflags = 0;
7162 frz->checkflags = 0;
7163
7164 /*
7165 * Process xmin, while keeping track of whether it's already frozen, or
7166 * will become frozen iff our freeze plan is executed by caller (could be
7167 * neither).
7168 */
7169 xid = HeapTupleHeaderGetXmin(tuple);
7170 if (!TransactionIdIsNormal(xid))
7171 xmin_already_frozen = true;
7172 else
7173 {
7174 if (TransactionIdPrecedes(xid, cutoffs->relfrozenxid))
7175 ereport(ERROR,
7177 errmsg_internal("found xmin %u from before relfrozenxid %u",
7178 xid, cutoffs->relfrozenxid)));
7179
7180 /* Will set freeze_xmin flags in freeze plan below */
7182
7183 /* Verify that xmin committed if and when freeze plan is executed */
7184 if (freeze_xmin)
7185 {
7188 pagefrz->FreezePageConflictXid = xid;
7189 }
7190 }
7191
7192 /*
7193 * Old-style VACUUM FULL is gone, but we have to process xvac for as long
7194 * as we support having MOVED_OFF/MOVED_IN tuples in the database
7195 */
7196 xid = HeapTupleHeaderGetXvac(tuple);
7197 if (TransactionIdIsNormal(xid))
7198 {
7200 Assert(TransactionIdPrecedes(xid, cutoffs->OldestXmin));
7201
7202 /*
7203 * For Xvac, we always freeze proactively. This allows totally_frozen
7204 * tracking to ignore xvac.
7205 */
7206 replace_xvac = pagefrz->freeze_required = true;
7207
7209 pagefrz->FreezePageConflictXid = xid;
7210
7211 /* Will set replace_xvac flags in freeze plan below */
7212 }
7213
7214 /* Now process xmax */
7215 xid = frz->xmax;
7216 if (tuple->t_infomask & HEAP_XMAX_IS_MULTI)
7217 {
7218 /* Raw xmax is a MultiXactId */
7220 uint16 flags;
7221
7222 /*
7223 * We will either remove xmax completely (in the "freeze_xmax" path),
7224 * process xmax by replacing it (in the "replace_xmax" path), or
7225 * perform no-op xmax processing. The only constraint is that the
7226 * FreezeLimit/MultiXactCutoff postcondition must never be violated.
7227 */
7228 newxmax = FreezeMultiXactId(xid, tuple->t_infomask, cutoffs,
7229 &flags, pagefrz);
7230
7231 if (flags & FRM_NOOP)
7232 {
7233 /*
7234 * xmax is a MultiXactId, and nothing about it changes for now.
7235 * This is the only case where 'freeze_required' won't have been
7236 * set for us by FreezeMultiXactId, as well as the only case where
7237 * neither freeze_xmax nor replace_xmax are set (given a multi).
7238 *
7239 * This is a no-op, but the call to FreezeMultiXactId might have
7240 * ratcheted back NewRelfrozenXid and/or NewRelminMxid trackers
7241 * for us (the "freeze page" variants, specifically). That'll
7242 * make it safe for our caller to freeze the page later on, while
7243 * leaving this particular xmax undisturbed.
7244 *
7245 * FreezeMultiXactId is _not_ responsible for the "no freeze"
7246 * NewRelfrozenXid/NewRelminMxid trackers, though -- that's our
7247 * job. A call to heap_tuple_should_freeze for this same tuple
7248 * will take place below if 'freeze_required' isn't set already.
7249 * (This repeats work from FreezeMultiXactId, but allows "no
7250 * freeze" tracker maintenance to happen in only one place.)
7251 */
7254 }
7255 else if (flags & FRM_RETURN_IS_XID)
7256 {
7257 /*
7258 * xmax will become an updater Xid (original MultiXact's updater
7259 * member Xid will be carried forward as a simple Xid in Xmax).
7260 */
7262
7263 /*
7264 * NB -- some of these transformations are only valid because we
7265 * know the return Xid is a tuple updater (i.e. not merely a
7266 * locker.) Also note that the only reason we don't explicitly
7267 * worry about HEAP_KEYS_UPDATED is because it lives in
7268 * t_infomask2 rather than t_infomask.
7269 */
7270 frz->t_infomask &= ~HEAP_XMAX_BITS;
7271 frz->xmax = newxmax;
7272 if (flags & FRM_MARK_COMMITTED)
7273 frz->t_infomask |= HEAP_XMAX_COMMITTED;
7274 replace_xmax = true;
7275 }
7276 else if (flags & FRM_RETURN_IS_MULTI)
7277 {
7280
7281 /*
7282 * xmax is an old MultiXactId that we have to replace with a new
7283 * MultiXactId, to carry forward two or more original member XIDs.
7284 */
7286
7287 /*
7288 * We can't use GetMultiXactIdHintBits directly on the new multi
7289 * here; that routine initializes the masks to all zeroes, which
7290 * would lose other bits we need. Doing it this way ensures all
7291 * unrelated bits remain untouched.
7292 */
7293 frz->t_infomask &= ~HEAP_XMAX_BITS;
7294 frz->t_infomask2 &= ~HEAP_KEYS_UPDATED;
7296 frz->t_infomask |= newbits;
7297 frz->t_infomask2 |= newbits2;
7298 frz->xmax = newxmax;
7299 replace_xmax = true;
7300 }
7301 else
7302 {
7303 /*
7304 * Freeze plan for tuple "freezes xmax" in the strictest sense:
7305 * it'll leave nothing in xmax (neither an Xid nor a MultiXactId).
7306 */
7307 Assert(flags & FRM_INVALIDATE_XMAX);
7309
7310 /* Will set freeze_xmax flags in freeze plan below */
7311 freeze_xmax = true;
7312 }
7313
7314 /* MultiXactId processing forces freezing (barring FRM_NOOP case) */
7315 Assert(pagefrz->freeze_required || (!freeze_xmax && !replace_xmax));
7316 }
7317 else if (TransactionIdIsNormal(xid))
7318 {
7319 /* Raw xmax is normal XID */
7320 if (TransactionIdPrecedes(xid, cutoffs->relfrozenxid))
7321 ereport(ERROR,
7323 errmsg_internal("found xmax %u from before relfrozenxid %u",
7324 xid, cutoffs->relfrozenxid)));
7325
7326 /* Will set freeze_xmax flags in freeze plan below */
7328
7329 /*
7330 * Verify that xmax aborted if and when freeze plan is executed,
7331 * provided it's from an update. (A lock-only xmax can be removed
7332 * independent of this, since the lock is released at xact end.)
7333 */
7335 frz->checkflags |= HEAP_FREEZE_CHECK_XMAX_ABORTED;
7336 }
7337 else if (!TransactionIdIsValid(xid))
7338 {
7339 /* Raw xmax is InvalidTransactionId XID */
7340 Assert((tuple->t_infomask & HEAP_XMAX_IS_MULTI) == 0);
7341 xmax_already_frozen = true;
7342 }
7343 else
7344 ereport(ERROR,
7346 errmsg_internal("found raw xmax %u (infomask 0x%04x) not invalid and not multi",
7347 xid, tuple->t_infomask)));
7348
7349 if (freeze_xmin)
7350 {
7352
7353 frz->t_infomask |= HEAP_XMIN_FROZEN;
7354 }
7355 if (replace_xvac)
7356 {
7357 /*
7358 * If a MOVED_OFF tuple is not dead, the xvac transaction must have
7359 * failed; whereas a non-dead MOVED_IN tuple must mean the xvac
7360 * transaction succeeded.
7361 */
7362 Assert(pagefrz->freeze_required);
7363 if (tuple->t_infomask & HEAP_MOVED_OFF)
7364 frz->frzflags |= XLH_INVALID_XVAC;
7365 else
7366 frz->frzflags |= XLH_FREEZE_XVAC;
7367 }
7368 if (replace_xmax)
7369 {
7371 Assert(pagefrz->freeze_required);
7372
7373 /* Already set replace_xmax flags in freeze plan earlier */
7374 }
7375 if (freeze_xmax)
7376 {
7378
7379 frz->xmax = InvalidTransactionId;
7380
7381 /*
7382 * The tuple might be marked either XMAX_INVALID or XMAX_COMMITTED +
7383 * LOCKED. Normalize to INVALID just to be sure no one gets confused.
7384 * Also get rid of the HEAP_KEYS_UPDATED bit.
7385 */
7386 frz->t_infomask &= ~HEAP_XMAX_BITS;
7387 frz->t_infomask |= HEAP_XMAX_INVALID;
7388 frz->t_infomask2 &= ~HEAP_HOT_UPDATED;
7389 frz->t_infomask2 &= ~HEAP_KEYS_UPDATED;
7390 }
7391
7392 /*
7393 * Determine if this tuple is already totally frozen, or will become
7394 * totally frozen (provided caller executes freeze plans for the page)
7395 */
7398
7399 if (!pagefrz->freeze_required && !(xmin_already_frozen &&
7401 {
7402 /*
7403 * So far no previous tuple from the page made freezing mandatory.
7404 * Does this tuple force caller to freeze the entire page?
7405 */
7406 pagefrz->freeze_required =
7407 heap_tuple_should_freeze(tuple, cutoffs,
7408 &pagefrz->NoFreezePageRelfrozenXid,
7409 &pagefrz->NoFreezePageRelminMxid);
7410 }
7411
7412 /* Tell caller if this tuple has a usable freeze plan set in *frz */
7414}
7415
7416/*
7417 * Perform xmin/xmax XID status sanity checks before actually executing freeze
7418 * plans.
7419 *
7420 * heap_prepare_freeze_tuple doesn't perform these checks directly because
7421 * pg_xact lookups are relatively expensive. They shouldn't be repeated by
7422 * successive VACUUMs that each decide against freezing the same page.
7423 */
7424void
7426 HeapTupleFreeze *tuples, int ntuples)
7427{
7428 Page page = BufferGetPage(buffer);
7429
7430 for (int i = 0; i < ntuples; i++)
7431 {
7432 HeapTupleFreeze *frz = tuples + i;
7433 ItemId itemid = PageGetItemId(page, frz->offset);
7434 HeapTupleHeader htup;
7435
7436 htup = (HeapTupleHeader) PageGetItem(page, itemid);
7437
7438 /* Deliberately avoid relying on tuple hint bits here */
7439 if (frz->checkflags & HEAP_FREEZE_CHECK_XMIN_COMMITTED)
7440 {
7442
7444 if (unlikely(!TransactionIdDidCommit(xmin)))
7445 ereport(ERROR,
7447 errmsg_internal("uncommitted xmin %u needs to be frozen",
7448 xmin)));
7449 }
7450
7451 /*
7452 * TransactionIdDidAbort won't work reliably in the presence of XIDs
7453 * left behind by transactions that were in progress during a crash,
7454 * so we can only check that xmax didn't commit
7455 */
7456 if (frz->checkflags & HEAP_FREEZE_CHECK_XMAX_ABORTED)
7457 {
7459
7462 ereport(ERROR,
7464 errmsg_internal("cannot freeze committed xmax %u",
7465 xmax)));
7466 }
7467 }
7468}
7469
7470/*
7471 * Helper which executes freezing of one or more heap tuples on a page on
7472 * behalf of caller. Caller passes an array of tuple plans from
7473 * heap_prepare_freeze_tuple. Caller must set 'offset' in each plan for us.
7474 * Must be called in a critical section that also marks the buffer dirty and,
7475 * if needed, emits WAL.
7476 */
7477void
7478heap_freeze_prepared_tuples(Buffer buffer, HeapTupleFreeze *tuples, int ntuples)
7479{
7480 Page page = BufferGetPage(buffer);
7481
7482 for (int i = 0; i < ntuples; i++)
7483 {
7484 HeapTupleFreeze *frz = tuples + i;
7485 ItemId itemid = PageGetItemId(page, frz->offset);
7486 HeapTupleHeader htup;
7487
7488 htup = (HeapTupleHeader) PageGetItem(page, itemid);
7490 }
7491}
7492
7493/*
7494 * heap_freeze_tuple
7495 * Freeze tuple in place, without WAL logging.
7496 *
7497 * Useful for callers like CLUSTER that perform their own WAL logging.
7498 */
7499bool
7501 TransactionId relfrozenxid, TransactionId relminmxid,
7502 TransactionId FreezeLimit, TransactionId MultiXactCutoff)
7503{
7505 bool do_freeze;
7506 bool totally_frozen;
7507 struct VacuumCutoffs cutoffs;
7508 HeapPageFreeze pagefrz;
7509
7510 cutoffs.relfrozenxid = relfrozenxid;
7511 cutoffs.relminmxid = relminmxid;
7512 cutoffs.OldestXmin = FreezeLimit;
7513 cutoffs.OldestMxact = MultiXactCutoff;
7514 cutoffs.FreezeLimit = FreezeLimit;
7516
7517 pagefrz.freeze_required = true;
7518 pagefrz.FreezePageRelfrozenXid = FreezeLimit;
7519 pagefrz.FreezePageRelminMxid = MultiXactCutoff;
7520 pagefrz.FreezePageConflictXid = InvalidTransactionId;
7521 pagefrz.NoFreezePageRelfrozenXid = FreezeLimit;
7522 pagefrz.NoFreezePageRelminMxid = MultiXactCutoff;
7523
7524 do_freeze = heap_prepare_freeze_tuple(tuple, &cutoffs,
7525 &pagefrz, &frz, &totally_frozen);
7526
7527 /*
7528 * Note that because this is not a WAL-logged operation, we don't need to
7529 * fill in the offset in the freeze record.
7530 */
7531
7532 if (do_freeze)
7534 return do_freeze;
7535}
7536
7537/*
7538 * For a given MultiXactId, return the hint bits that should be set in the
7539 * tuple's infomask.
7540 *
7541 * Normally this should be called for a multixact that was just created, and
7542 * so is on our local cache, so the GetMembers call is fast.
7543 */
7544static void
7547{
7548 int nmembers;
7549 MultiXactMember *members;
7550 int i;
7552 uint16 bits2 = 0;
7553 bool has_update = false;
7555
7556 /*
7557 * We only use this in multis we just created, so they cannot be values
7558 * pre-pg_upgrade.
7559 */
7560 nmembers = GetMultiXactIdMembers(multi, &members, false, false);
7561
7562 for (i = 0; i < nmembers; i++)
7563 {
7565
7566 /*
7567 * Remember the strongest lock mode held by any member of the
7568 * multixact.
7569 */
7570 mode = TUPLOCK_from_mxstatus(members[i].status);
7571 if (mode > strongest)
7572 strongest = mode;
7573
7574 /* See what other bits we need */
7575 switch (members[i].status)
7576 {
7580 break;
7581
7584 break;
7585
7587 has_update = true;
7588 break;
7589
7592 has_update = true;
7593 break;
7594 }
7595 }
7596
7599 bits |= HEAP_XMAX_EXCL_LOCK;
7600 else if (strongest == LockTupleShare)
7601 bits |= HEAP_XMAX_SHR_LOCK;
7602 else if (strongest == LockTupleKeyShare)
7603 bits |= HEAP_XMAX_KEYSHR_LOCK;
7604
7605 if (!has_update)
7606 bits |= HEAP_XMAX_LOCK_ONLY;
7607
7608 if (nmembers > 0)
7609 pfree(members);
7610
7611 *new_infomask = bits;
7613}
7614
7615/*
7616 * MultiXactIdGetUpdateXid
7617 *
7618 * Given a multixact Xmax and corresponding infomask, which does not have the
7619 * HEAP_XMAX_LOCK_ONLY bit set, obtain and return the Xid of the updating
7620 * transaction.
7621 *
7622 * Caller is expected to check the status of the updating transaction, if
7623 * necessary.
7624 */
7625static TransactionId
7627{
7629 MultiXactMember *members;
7630 int nmembers;
7631
7632 Assert(!(t_infomask & HEAP_XMAX_LOCK_ONLY));
7633 Assert(t_infomask & HEAP_XMAX_IS_MULTI);
7634
7635 /*
7636 * Since we know the LOCK_ONLY bit is not set, this cannot be a multi from
7637 * pre-pg_upgrade.
7638 */
7639 nmembers = GetMultiXactIdMembers(xmax, &members, false, false);
7640
7641 if (nmembers > 0)
7642 {
7643 int i;
7644
7645 for (i = 0; i < nmembers; i++)
7646 {
7647 /* Ignore lockers */
7648 if (!ISUPDATE_from_mxstatus(members[i].status))
7649 continue;
7650
7651 /* there can be at most one updater */
7653 update_xact = members[i].xid;
7654#ifndef USE_ASSERT_CHECKING
7655
7656 /*
7657 * in an assert-enabled build, walk the whole array to ensure
7658 * there's no other updater.
7659 */
7660 break;
7661#endif
7662 }
7663
7664 pfree(members);
7665 }
7666
7667 return update_xact;
7668}
7669
7670/*
7671 * HeapTupleGetUpdateXid
7672 * As above, but use a HeapTupleHeader
7673 *
7674 * See also HeapTupleHeaderGetUpdateXid, which can be used without previously
7675 * checking the hint bits.
7676 */
7679{
7681 tup->t_infomask);
7682}
7683
7684/*
7685 * Does the given multixact conflict with the current transaction grabbing a
7686 * tuple lock of the given strength?
7687 *
7688 * The passed infomask pairs up with the given multixact in the tuple header.
7689 *
7690 * If current_is_member is not NULL, it is set to 'true' if the current
7691 * transaction is a member of the given multixact.
7692 */
7693static bool
7695 LockTupleMode lockmode, bool *current_is_member)
7696{
7697 int nmembers;
7698 MultiXactMember *members;
7699 bool result = false;
7700 LOCKMODE wanted = tupleLockExtraInfo[lockmode].hwlock;
7701
7703 return false;
7704
7705 nmembers = GetMultiXactIdMembers(multi, &members, false,
7707 if (nmembers >= 0)
7708 {
7709 int i;
7710
7711 for (i = 0; i < nmembers; i++)
7712 {
7715
7716 if (result && (current_is_member == NULL || *current_is_member))
7717 break;
7718
7719 memlockmode = LOCKMODE_from_mxstatus(members[i].status);
7720
7721 /* ignore members from current xact (but track their presence) */
7722 memxid = members[i].xid;
7724 {
7725 if (current_is_member != NULL)
7726 *current_is_member = true;
7727 continue;
7728 }
7729 else if (result)
7730 continue;
7731
7732 /* ignore members that don't conflict with the lock we want */
7734 continue;
7735
7736 if (ISUPDATE_from_mxstatus(members[i].status))
7737 {
7738 /* ignore aborted updaters */
7740 continue;
7741 }
7742 else
7743 {
7744 /* ignore lockers-only that are no longer in progress */
7746 continue;
7747 }
7748
7749 /*
7750 * Whatever remains are either live lockers that conflict with our
7751 * wanted lock, and updaters that are not aborted. Those conflict
7752 * with what we want. Set up to return true, but keep going to
7753 * look for the current transaction among the multixact members,
7754 * if needed.
7755 */
7756 result = true;
7757 }
7758 pfree(members);
7759 }
7760
7761 return result;
7762}
7763
7764/*
7765 * Do_MultiXactIdWait
7766 * Actual implementation for the two functions below.
7767 *
7768 * 'multi', 'status' and 'infomask' indicate what to sleep on (the status is
7769 * needed to ensure we only sleep on conflicting members, and the infomask is
7770 * used to optimize multixact access in case it's a lock-only multi); 'nowait'
7771 * indicates whether to use conditional lock acquisition, to allow callers to
7772 * fail if lock is unavailable. 'rel', 'ctid' and 'oper' are used to set up
7773 * context information for error messages. 'remaining', if not NULL, receives
7774 * the number of members that are still running, including any (non-aborted)
7775 * subtransactions of our own transaction. 'logLockFailure' indicates whether
7776 * to log details when a lock acquisition fails with 'nowait' enabled.
7777 *
7778 * We do this by sleeping on each member using XactLockTableWait. Any
7779 * members that belong to the current backend are *not* waited for, however;
7780 * this would not merely be useless but would lead to Assert failure inside
7781 * XactLockTableWait. By the time this returns, it is certain that all
7782 * transactions *of other backends* that were members of the MultiXactId
7783 * that conflict with the requested status are dead (and no new ones can have
7784 * been added, since it is not legal to add members to an existing
7785 * MultiXactId).
7786 *
7787 * But by the time we finish sleeping, someone else may have changed the Xmax
7788 * of the containing tuple, so the caller needs to iterate on us somehow.
7789 *
7790 * Note that in case we return false, the number of remaining members is
7791 * not to be trusted.
7792 */
7793static bool
7795 uint16 infomask, bool nowait,
7796 Relation rel, const ItemPointerData *ctid, XLTW_Oper oper,
7797 int *remaining, bool logLockFailure)
7798{
7799 bool result = true;
7800 MultiXactMember *members;
7801 int nmembers;
7802 int remain = 0;
7803
7804 /* for pre-pg_upgrade tuples, no need to sleep at all */
7805 nmembers = HEAP_LOCKED_UPGRADED(infomask) ? -1 :
7806 GetMultiXactIdMembers(multi, &members, false,
7808
7809 if (nmembers >= 0)
7810 {
7811 int i;
7812
7813 for (i = 0; i < nmembers; i++)
7814 {
7815 TransactionId memxid = members[i].xid;
7816 MultiXactStatus memstatus = members[i].status;
7817
7819 {
7820 remain++;
7821 continue;
7822 }
7823
7825 LOCKMODE_from_mxstatus(status)))
7826 {
7828 remain++;
7829 continue;
7830 }
7831
7832 /*
7833 * This member conflicts with our multi, so we have to sleep (or
7834 * return failure, if asked to avoid waiting.)
7835 *
7836 * Note that we don't set up an error context callback ourselves,
7837 * but instead we pass the info down to XactLockTableWait. This
7838 * might seem a bit wasteful because the context is set up and
7839 * tore down for each member of the multixact, but in reality it
7840 * should be barely noticeable, and it avoids duplicate code.
7841 */
7842 if (nowait)
7843 {
7845 if (!result)
7846 break;
7847 }
7848 else
7849 XactLockTableWait(memxid, rel, ctid, oper);
7850 }
7851
7852 pfree(members);
7853 }
7854
7855 if (remaining)
7856 *remaining = remain;
7857
7858 return result;
7859}
7860
7861/*
7862 * MultiXactIdWait
7863 * Sleep on a MultiXactId.
7864 *
7865 * By the time we finish sleeping, someone else may have changed the Xmax
7866 * of the containing tuple, so the caller needs to iterate on us somehow.
7867 *
7868 * We return (in *remaining, if not NULL) the number of members that are still
7869 * running, including any (non-aborted) subtransactions of our own transaction.
7870 */
/*
 * MultiXactIdWait -- unconditional-wait wrapper: forwards to
 * Do_MultiXactIdWait() with nowait=false and logLockFailure=false, and
 * discards the boolean result (it can only be true in that mode).
 *
 * NOTE(review): doxygen extraction dropped the signature line (original
 * line 7872, "MultiXactIdWait(MultiXactId multi, ..."); verify against
 * upstream heapam.c.
 */
7871static void
 7873 Relation rel, const ItemPointerData *ctid, XLTW_Oper oper,
 7874 int *remaining)
 7875{
 7876 (void) Do_MultiXactIdWait(multi, status, infomask, false,
 7877 rel, ctid, oper, remaining, false);
 7878}
7879
7880/*
7881 * ConditionalMultiXactIdWait
7882 * As above, but only lock if we can get the lock without blocking.
7883 *
7884 * By the time we finish sleeping, someone else may have changed the Xmax
7885 * of the containing tuple, so the caller needs to iterate on us somehow.
7886 *
7887 * If the multixact is now all gone, return true. Returns false if some
7888 * transactions might still be running.
7889 *
7890 * We return (in *remaining, if not NULL) the number of members that are still
7891 * running, including any (non-aborted) subtransactions of our own transaction.
7892 */
/*
 * ConditionalMultiXactIdWait -- no-block wrapper: forwards to
 * Do_MultiXactIdWait() with nowait=true, so it returns false instead of
 * sleeping when a conflicting member lock cannot be acquired.
 *
 * NOTE(review): doxygen extraction dropped the signature line (original
 * line 7894) and the final-argument line of the call (original line 7899,
 * presumably passing rel/ctid/oper/remaining/logLockFailure); verify
 * against upstream heapam.c.
 */
7893static bool
 7895 uint16 infomask, Relation rel, int *remaining,
 7896 bool logLockFailure)
 7897{
 7898 return Do_MultiXactIdWait(multi, status, infomask, true,
 7900}
7901
7902/*
7903 * heap_tuple_needs_eventual_freeze
7904 *
7905 * Check to see whether any of the XID fields of a tuple (xmin, xmax, xvac)
7906 * will eventually require freezing (if tuple isn't removed by pruning first).
7907 */
/*
 * Returns true as soon as any XID field that would eventually require
 * freezing is found: a normal xmin, a valid multixact xmax, a normal
 * plain-XID xmax, or a normal xvac (HEAP_MOVED case); false otherwise.
 *
 * NOTE(review): doxygen extraction dropped the signature line (original
 * line 7909, "heap_tuple_needs_eventual_freeze(HeapTupleHeader tuple)");
 * the body below appears otherwise complete.
 */
7908bool
 7910{
 7911 TransactionId xid;
 7912
 7913 /*
 7914 * If xmin is a normal transaction ID, this tuple is definitely not
 7915 * frozen.
 7916 */
 7917 xid = HeapTupleHeaderGetXmin(tuple);
 7918 if (TransactionIdIsNormal(xid))
 7919 return true;
 7920
 7921 /*
 7922 * If xmax is a valid xact or multixact, this tuple is also not frozen.
 7923 */
 7924 if (tuple->t_infomask & HEAP_XMAX_IS_MULTI)
 7925 {
 7926 MultiXactId multi;
 7927
 7928 multi = HeapTupleHeaderGetRawXmax(tuple);
 7929 if (MultiXactIdIsValid(multi))
 7930 return true;
 7931 }
 7932 else
 7933 {
 7934 xid = HeapTupleHeaderGetRawXmax(tuple);
 7935 if (TransactionIdIsNormal(xid))
 7936 return true;
 7937 }
 7938
 7939 if (tuple->t_infomask & HEAP_MOVED)
 7940 {
 7941 xid = HeapTupleHeaderGetXvac(tuple);
 7942 if (TransactionIdIsNormal(xid))
 7943 return true;
 7944 }
 7945
 7946 return false;
 7947}
7948
7949/*
7950 * heap_tuple_should_freeze
7951 *
7952 * Return value indicates if heap_prepare_freeze_tuple sibling function would
7953 * (or should) force freezing of the heap page that contains caller's tuple.
7954 * Tuple header XIDs/MXIDs < FreezeLimit/MultiXactCutoff trigger freezing.
7955 * This includes (xmin, xmax, xvac) fields, as well as MultiXact member XIDs.
7956 *
7957 * The *NoFreezePageRelfrozenXid and *NoFreezePageRelminMxid input/output
7958 * arguments help VACUUM track the oldest extant XID/MXID remaining in rel.
7959 * Our working assumption is that caller won't decide to freeze this tuple.
7960 * It's up to caller to only ratchet back its own top-level trackers after the
7961 * point that it fully commits to not freezing the tuple/page in question.
7962 */
/*
 * Returns true when any of xmin, xmax (plain XID or multixact, including
 * every multixact member XID), or xvac precedes the relevant
 * FreezeLimit/MultiXactCutoff; in passing, ratchets back the caller's
 * *NoFreezePageRelfrozenXid / *NoFreezePageRelminMxid trackers to cover
 * every XID/MXID seen (see header comment above).
 *
 * NOTE(review): doxygen extraction dropped lines here: the signature line
 * (original line 7964), several Assert lines (7977, 7994, 8019, 8032),
 * the xid initialization before the xmax block (7985), and the final
 * argument line of GetMultiXactIdMembers (8027).  Verify against
 * upstream heapam.c.
 */
7963bool
 7965 const struct VacuumCutoffs *cutoffs,
 7966 TransactionId *NoFreezePageRelfrozenXid,
 7967 MultiXactId *NoFreezePageRelminMxid)
 7968{
 7969 TransactionId xid;
 7970 MultiXactId multi;
 7971 bool freeze = false;
 7972
 7973 /* First deal with xmin */
 7974 xid = HeapTupleHeaderGetXmin(tuple);
 7975 if (TransactionIdIsNormal(xid))
 7976 {
 7978 if (TransactionIdPrecedes(xid, *NoFreezePageRelfrozenXid))
 7979 *NoFreezePageRelfrozenXid = xid;
 7980 if (TransactionIdPrecedes(xid, cutoffs->FreezeLimit))
 7981 freeze = true;
 7982 }
 7983
 7984 /* Now deal with xmax */
 7986 multi = InvalidMultiXactId;
 7987 if (tuple->t_infomask & HEAP_XMAX_IS_MULTI)
 7988 multi = HeapTupleHeaderGetRawXmax(tuple);
 7989 else
 7990 xid = HeapTupleHeaderGetRawXmax(tuple);
 7991
 7992 if (TransactionIdIsNormal(xid))
 7993 {
 7995 /* xmax is a non-permanent XID */
 7996 if (TransactionIdPrecedes(xid, *NoFreezePageRelfrozenXid))
 7997 *NoFreezePageRelfrozenXid = xid;
 7998 if (TransactionIdPrecedes(xid, cutoffs->FreezeLimit))
 7999 freeze = true;
 8000 }
 8001 else if (!MultiXactIdIsValid(multi))
 8002 {
 8003 /* xmax is a permanent XID or invalid MultiXactId/XID */
 8004 }
 8005 else if (HEAP_LOCKED_UPGRADED(tuple->t_infomask))
 8006 {
 8007 /* xmax is a pg_upgrade'd MultiXact, which can't have updater XID */
 8008 if (MultiXactIdPrecedes(multi, *NoFreezePageRelminMxid))
 8009 *NoFreezePageRelminMxid = multi;
 8010 /* heap_prepare_freeze_tuple always freezes pg_upgrade'd xmax */
 8011 freeze = true;
 8012 }
 8013 else
 8014 {
 8015 /* xmax is a MultiXactId that may have an updater XID */
 8016 MultiXactMember *members;
 8017 int nmembers;
 8018
 8020 if (MultiXactIdPrecedes(multi, *NoFreezePageRelminMxid))
 8021 *NoFreezePageRelminMxid = multi;
 8022 if (MultiXactIdPrecedes(multi, cutoffs->MultiXactCutoff))
 8023 freeze = true;
 8024
 8025 /* need to check whether any member of the mxact is old */
 8026 nmembers = GetMultiXactIdMembers(multi, &members, false,
 8028
 8029 for (int i = 0; i < nmembers; i++)
 8030 {
 8031 xid = members[i].xid;
 8033 if (TransactionIdPrecedes(xid, *NoFreezePageRelfrozenXid))
 8034 *NoFreezePageRelfrozenXid = xid;
 8035 if (TransactionIdPrecedes(xid, cutoffs->FreezeLimit))
 8036 freeze = true;
 8037 }
 8038 if (nmembers > 0)
 8039 pfree(members);
 8040 }
 8041
 8042 if (tuple->t_infomask & HEAP_MOVED)
 8043 {
 8044 xid = HeapTupleHeaderGetXvac(tuple);
 8045 if (TransactionIdIsNormal(xid))
 8046 {
 8048 if (TransactionIdPrecedes(xid, *NoFreezePageRelfrozenXid))
 8049 *NoFreezePageRelfrozenXid = xid;
 8050 /* heap_prepare_freeze_tuple forces xvac freezing */
 8051 freeze = true;
 8052 }
 8053 }
 8054
 8055 return freeze;
 8056}
8057
8058/*
8059 * Maintain snapshotConflictHorizon for caller by ratcheting forward its value
8060 * using any committed XIDs contained in 'tuple', an obsolescent heap tuple
8061 * that caller is in the process of physically removing, e.g. via HOT pruning
8062 * or index deletion.
8063 *
8064 * Caller must initialize its value to InvalidTransactionId, which is
8065 * generally interpreted as "definitely no need for a recovery conflict".
8066 * Final value must reflect all heap tuples that caller will physically remove
8067 * (or remove TID references to) via its ongoing pruning/deletion operation.
8068 * ResolveRecoveryConflictWithSnapshot() is passed the final value (taken from
8069 * caller's WAL record) by REDO routine when it replays caller's operation.
8070 */
/*
 * Ratchets *snapshotConflictHorizon forward past the tuple's xvac (when
 * HEAP_MOVED is set) and past its xmax when xmin committed and the tuple
 * was not updated/deleted by its own inserter (see header comment above).
 *
 * NOTE(review): doxygen extraction dropped the function-name line
 * (original line 8072) and the local declarations/initializers for
 * xmin/xmax/xvac (original lines 8075-8077), plus the clog-check half of
 * the xmin-committed condition (original line 8092).  Verify against
 * upstream heapam.c.
 */
8071void
 8073 TransactionId *snapshotConflictHorizon)
 8074{
 8078
 8079 if (tuple->t_infomask & HEAP_MOVED)
 8080 {
 8081 if (TransactionIdPrecedes(*snapshotConflictHorizon, xvac))
 8082 *snapshotConflictHorizon = xvac;
 8083 }
 8084
 8085 /*
 8086 * Ignore tuples inserted by an aborted transaction or if the tuple was
 8087 * updated/deleted by the inserting transaction.
 8088 *
 8089 * Look for a committed hint bit, or if no xmin bit is set, check clog.
 8090 */
 8091 if (HeapTupleHeaderXminCommitted(tuple) ||
 8093 {
 8094 if (xmax != xmin &&
 8095 TransactionIdFollows(xmax, *snapshotConflictHorizon))
 8096 *snapshotConflictHorizon = xmax;
 8097 }
 8098}
8099
8100#ifdef USE_PREFETCH
8101/*
8102 * Helper function for heap_index_delete_tuples. Issues prefetch requests for
8103 * prefetch_count buffers. The prefetch_state keeps track of all the buffers
8104 * we can prefetch, and which have already been prefetched; each call to this
8105 * function picks up where the previous call left off.
8106 *
8107 * Note: we expect the deltids array to be sorted in an order that groups TIDs
8108 * by heap block, with all TIDs for each block appearing together in exactly
8109 * one group.
8110 */
/*
 * Walks deltids from prefetch_state->next_item, issuing up to
 * prefetch_count prefetch requests (one per new heap block encountered),
 * then saves next_item/cur_hblkno so the next call resumes where this one
 * stopped (see header comment above).
 *
 * NOTE(review): doxygen extraction dropped the signature line (original
 * lines 8112-8113), the cur_hblkno local (8116), the new-block condition
 * (8128-8129) and the PrefetchBuffer()/cur_hblkno-update lines inside the
 * loop (8131-8132).  Verify against upstream heapam.c.
 */
8111static void
 8114 int prefetch_count)
 8115{
 8117 int count = 0;
 8118 int i;
 8119 int ndeltids = prefetch_state->ndeltids;
 8120 TM_IndexDelete *deltids = prefetch_state->deltids;
 8121
 8122 for (i = prefetch_state->next_item;
 8123 i < ndeltids && count < prefetch_count;
 8124 i++)
 8125 {
 8126 ItemPointer htid = &deltids[i].tid;
 8127
 8130 {
 8133 count++;
 8134 }
 8135 }
 8136
 8137 /*
 8138 * Save the prefetch position so that next time we can continue from that
 8139 * position.
 8140 */
 8141 prefetch_state->next_item = i;
 8142 prefetch_state->cur_hblkno = cur_hblkno;
 8143}
8144#endif
8145
8146/*
8147 * Helper function for heap_index_delete_tuples. Checks for index corruption
8148 * involving an invalid TID in index AM caller's index page.
8149 *
8150 * This is an ideal place for these checks. The index AM must hold a buffer
8151 * lock on the index page containing the TIDs we examine here, so we don't
8152 * have to worry about concurrent VACUUMs at all. We can be sure that the
8153 * index is corrupt when htid points directly to an LP_UNUSED item or
8154 * heap-only tuple, which is not the case during standard index scans.
8155 */
/*
 * Index-corruption cross-check: ereport(ERROR)s when the index TID points
 * past the heap page's line pointer array, at an unused item, or at a
 * heap-only tuple (see header comment above for why the held index buffer
 * lock makes these checks safe).
 *
 * NOTE(review): doxygen extraction dropped lines throughout: parts of the
 * signature (original lines 8157, 8159), locals (8161), the
 * PageGetItemId() assignment (8175), errcode/errmsg argument lines in
 * each ereport (8168, 8170-8171, 8173, 8178, 8180-8181, 8183, 8189,
 * 8192, 8194, 8196-8197, 8199).  Verify against upstream heapam.c.
 */
8156static inline void
 8158 Page page, OffsetNumber maxoff,
 8160{
 8162 ItemId iid;
 8163
 8164 Assert(OffsetNumberIsValid(istatus->idxoffnum));
 8165
 8166 if (unlikely(indexpagehoffnum > maxoff))
 8167 ereport(ERROR,
 8169 errmsg_internal("heap tid from index tuple (%u,%u) points past end of heap page line pointer array at offset %u of block %u in index \"%s\"",
 8172 istatus->idxoffnum, delstate->iblknum,
 8174
 8176 if (unlikely(!ItemIdIsUsed(iid)))
 8177 ereport(ERROR,
 8179 errmsg_internal("heap tid from index tuple (%u,%u) points to unused heap page item at offset %u of block %u in index \"%s\"",
 8182 istatus->idxoffnum, delstate->iblknum,
 8184
 8185 if (ItemIdHasStorage(iid))
 8186 {
 8187 HeapTupleHeader htup;
 8188
 8190 htup = (HeapTupleHeader) PageGetItem(page, iid);
 8191
 8193 ereport(ERROR,
 8195 errmsg_internal("heap tid from index tuple (%u,%u) points to heap-only tuple at offset %u of block %u in index \"%s\"",
 8198 istatus->idxoffnum, delstate->iblknum,
 8200 }
 8201}
8202
8203/*
8204 * heapam implementation of tableam's index_delete_tuples interface.
8205 *
8206 * This helper function is called by index AMs during index tuple deletion.
8207 * See tableam header comments for an explanation of the interface implemented
8208 * here and a general theory of operation. Note that each call here is either
8209 * a simple index deletion call, or a bottom-up index deletion call.
8210 *
8211 * It's possible for this to generate a fair amount of I/O, since we may be
8212 * deleting hundreds of tuples from a single index block. To amortize that
8213 * cost to some degree, this uses prefetching and combines repeat accesses to
8214 * the same heap block.
8215 */
/*
 * heapam's index_delete_tuples implementation (see header comment above):
 * sorts deltids by TID (bottom-up callers get the promisingness-ordered
 * resort/shrink via bottomup_sort_and_shrink()), then walks the entries
 * block by block -- with prefetching -- marking entries knowndeletable
 * when their whole HOT chain is vacuumable, maintaining the operation's
 * snapshotConflictHorizon from the tuple headers it visits, applying the
 * bottom-up early-exit/free-space-decay heuristics, and finally shrinking
 * delstate->ndeltids to exclude trailing non-deletable entries.
 *
 * NOTE(review): this block is a doxygen-rendered extraction with many
 * dropped lines, including the signature (original lines 8216-8217),
 * numerous local declarations (blkno/buf/maxoff/priorXmax/prefetch
 * state), condition halves (e.g. the new-block test at 8303, the
 * bottomup_final_block test at 8328, the free-space test at 8337, the
 * HeapTupleHeaderGetXmin-vs-priorXmax test at 8482-8483), and several
 * statement lines (ReleaseBuffer, LockBuffer, prefetch calls,
 * heap_hot_search_buffer's first argument line, priorXmax update).
 * Verify every detail against upstream heapam.c.
 */
8218{
 8219 /* Initial assumption is that earlier pruning took care of conflict */
 8220 TransactionId snapshotConflictHorizon = InvalidTransactionId;
 8223 Page page = NULL;
 8226#ifdef USE_PREFETCH
 8229#endif
 8231 int finalndeltids = 0,
 8232 nblocksaccessed = 0;
 8233
 8234 /* State that's only used in bottom-up index deletion case */
 8235 int nblocksfavorable = 0;
 8236 int curtargetfreespace = delstate->bottomupfreespace,
 8237 lastfreespace = 0,
 8238 actualfreespace = 0;
 8239 bool bottomup_final_block = false;
 8240
 8242
 8243 /* Sort caller's deltids array by TID for further processing */
 8245
 8246 /*
 8247 * Bottom-up case: resort deltids array in an order attuned to where the
 8248 * greatest number of promising TIDs are to be found, and determine how
 8249 * many blocks from the start of sorted array should be considered
 8250 * favorable. This will also shrink the deltids array in order to
 8251 * eliminate completely unfavorable blocks up front.
 8252 */
 8253 if (delstate->bottomup)
 8255
 8256#ifdef USE_PREFETCH
 8257 /* Initialize prefetch state. */
 8259 prefetch_state.next_item = 0;
 8260 prefetch_state.ndeltids = delstate->ndeltids;
 8261 prefetch_state.deltids = delstate->deltids;
 8262
 8263 /*
 8264 * Determine the prefetch distance that we will attempt to maintain.
 8265 *
 8266 * Since the caller holds a buffer lock somewhere in rel, we'd better make
 8267 * sure that isn't a catalog relation before we call code that does
 8268 * syscache lookups, to avoid risk of deadlock.
 8269 */
 8270 if (IsCatalogRelation(rel))
 8272 else
 8275
 8276 /* Cap initial prefetch distance for bottom-up deletion caller */
 8277 if (delstate->bottomup)
 8278 {
 8282 }
 8283
 8284 /* Start prefetching. */
 8286#endif
 8287
 8288 /* Iterate over deltids, determine which to delete, check their horizon */
 8289 Assert(delstate->ndeltids > 0);
 8290 for (int i = 0; i < delstate->ndeltids; i++)
 8291 {
 8292 TM_IndexDelete *ideltid = &delstate->deltids[i];
 8293 TM_IndexStatus *istatus = delstate->status + ideltid->id;
 8294 ItemPointer htid = &ideltid->tid;
 8295 OffsetNumber offnum;
 8296
 8297 /*
 8298 * Read buffer, and perform required extra steps each time a new block
 8299 * is encountered. Avoid refetching if it's the same block as the one
 8300 * from the last htid.
 8301 */
 8302 if (blkno == InvalidBlockNumber ||
 8304 {
 8305 /*
 8306 * Consider giving up early for bottom-up index deletion caller
 8307 * first. (Only prefetch next-next block afterwards, when it
 8308 * becomes clear that we're at least going to access the next
 8309 * block in line.)
 8310 *
 8311 * Sometimes the first block frees so much space for bottom-up
 8312 * caller that the deletion process can end without accessing any
 8313 * more blocks. It is usually necessary to access 2 or 3 blocks
 8314 * per bottom-up deletion operation, though.
 8315 */
 8316 if (delstate->bottomup)
 8317 {
 8318 /*
 8319 * We often allow caller to delete a few additional items
 8320 * whose entries we reached after the point that space target
 8321 * from caller was satisfied. The cost of accessing the page
 8322 * was already paid at that point, so it made sense to finish
 8323 * it off. When that happened, we finalize everything here
 8324 * (by finishing off the whole bottom-up deletion operation
 8325 * without needlessly paying the cost of accessing any more
 8326 * blocks).
 8327 */
 8329 break;
 8330
 8331 /*
 8332 * Give up when we didn't enable our caller to free any
 8333 * additional space as a result of processing the page that we
 8334 * just finished up with. This rule is the main way in which
 8335 * we keep the cost of bottom-up deletion under control.
 8336 */
 8338 break;
 8339 lastfreespace = actualfreespace; /* for next time */
 8340
 8341 /*
 8342 * Deletion operation (which is bottom-up) will definitely
 8343 * access the next block in line. Prepare for that now.
 8344 *
 8345 * Decay target free space so that we don't hang on for too
 8346 * long with a marginal case. (Space target is only truly
 8347 * helpful when it allows us to recognize that we don't need
 8348 * to access more than 1 or 2 blocks to satisfy caller due to
 8349 * agreeable workload characteristics.)
 8350 *
 8351 * We are a bit more patient when we encounter contiguous
 8352 * blocks, though: these are treated as favorable blocks. The
 8353 * decay process is only applied when the next block in line
 8354 * is not a favorable/contiguous block. This is not an
 8355 * exception to the general rule; we still insist on finding
 8356 * at least one deletable item per block accessed. See
 8357 * bottomup_nblocksfavorable() for full details of the theory
 8358 * behind favorable blocks and heap block locality in general.
 8359 *
 8360 * Note: The first block in line is always treated as a
 8361 * favorable block, so the earliest possible point that the
 8362 * decay can be applied is just before we access the second
 8363 * block in line. The Assert() verifies this for us.
 8364 */
 8366 if (nblocksfavorable > 0)
 8368 else
 8369 curtargetfreespace /= 2;
 8370 }
 8371
 8372 /* release old buffer */
 8373 if (BufferIsValid(buf))
 8375
 8377 buf = ReadBuffer(rel, blkno);
 8379 Assert(!delstate->bottomup ||
 8381
 8382#ifdef USE_PREFETCH
 8383
 8384 /*
 8385 * To maintain the prefetch distance, prefetch one more page for
 8386 * each page we read.
 8387 */
 8389#endif
 8390
 8392
 8393 page = BufferGetPage(buf);
 8394 maxoff = PageGetMaxOffsetNumber(page);
 8395 }
 8396
 8397 /*
 8398 * In passing, detect index corruption involving an index page with a
 8399 * TID that points to a location in the heap that couldn't possibly be
 8400 * correct. We only do this with actual TIDs from caller's index page
 8401 * (not items reached by traversing through a HOT chain).
 8402 */
 8404
 8405 if (istatus->knowndeletable)
 8406 Assert(!delstate->bottomup && !istatus->promising);
 8407 else
 8408 {
 8409 ItemPointerData tmp = *htid;
 8411
 8412 /* Are any tuples from this HOT chain non-vacuumable? */
 8414 &heapTuple, NULL, true))
 8415 continue; /* can't delete entry */
 8416
 8417 /* Caller will delete, since whole HOT chain is vacuumable */
 8418 istatus->knowndeletable = true;
 8419
 8420 /* Maintain index free space info for bottom-up deletion case */
 8421 if (delstate->bottomup)
 8422 {
 8423 Assert(istatus->freespace > 0);
 8424 actualfreespace += istatus->freespace;
 8426 bottomup_final_block = true;
 8427 }
 8428 }
 8429
 8430 /*
 8431 * Maintain snapshotConflictHorizon value for deletion operation as a
 8432 * whole by advancing current value using heap tuple headers. This is
 8433 * loosely based on the logic for pruning a HOT chain.
 8434 */
 8436 priorXmax = InvalidTransactionId; /* cannot check first XMIN */
 8437 for (;;)
 8438 {
 8439 ItemId lp;
 8440 HeapTupleHeader htup;
 8441
 8442 /* Sanity check (pure paranoia) */
 8443 if (offnum < FirstOffsetNumber)
 8444 break;
 8445
 8446 /*
 8447 * An offset past the end of page's line pointer array is possible
 8448 * when the array was truncated
 8449 */
 8450 if (offnum > maxoff)
 8451 break;
 8452
 8453 lp = PageGetItemId(page, offnum);
 8455 {
 8456 offnum = ItemIdGetRedirect(lp);
 8457 continue;
 8458 }
 8459
 8460 /*
 8461 * We'll often encounter LP_DEAD line pointers (especially with an
 8462 * entry marked knowndeletable by our caller up front). No heap
 8463 * tuple headers get examined for an htid that leads us to an
 8464 * LP_DEAD item. This is okay because the earlier pruning
 8465 * operation that made the line pointer LP_DEAD in the first place
 8466 * must have considered the original tuple header as part of
 8467 * generating its own snapshotConflictHorizon value.
 8468 *
 8469 * Relying on XLOG_HEAP2_PRUNE_VACUUM_SCAN records like this is
 8470 * the same strategy that index vacuuming uses in all cases. Index
 8471 * VACUUM WAL records don't even have a snapshotConflictHorizon
 8472 * field of their own for this reason.
 8473 */
 8474 if (!ItemIdIsNormal(lp))
 8475 break;
 8476
 8477 htup = (HeapTupleHeader) PageGetItem(page, lp);
 8478
 8479 /*
 8480 * Check the tuple XMIN against prior XMAX, if any
 8481 */
 8484 break;
 8485
 8487 &snapshotConflictHorizon);
 8488
 8489 /*
 8490 * If the tuple is not HOT-updated, then we are at the end of this
 8491 * HOT-chain. No need to visit later tuples from the same update
 8492 * chain (they get their own index entries) -- just move on to
 8493 * next htid from index AM caller.
 8494 */
 8495 if (!HeapTupleHeaderIsHotUpdated(htup))
 8496 break;
 8497
 8498 /* Advance to next HOT chain member */
 8499 Assert(ItemPointerGetBlockNumber(&htup->t_ctid) == blkno);
 8500 offnum = ItemPointerGetOffsetNumber(&htup->t_ctid);
 8502 }
 8503
 8504 /* Enable further/final shrinking of deltids for caller */
 8505 finalndeltids = i + 1;
 8506 }
 8507
 8509
 8510 /*
 8511 * Shrink deltids array to exclude non-deletable entries at the end. This
 8512 * is not just a minor optimization. Final deltids array size might be
 8513 * zero for a bottom-up caller. Index AM is explicitly allowed to rely on
 8514 * ndeltids being zero in all cases with zero total deletable entries.
 8515 */
 8516 Assert(finalndeltids > 0 || delstate->bottomup);
 8517 delstate->ndeltids = finalndeltids;
 8518
 8519 return snapshotConflictHorizon;
 8520}
8521
8522/*
8523 * Specialized inlineable comparison function for index_delete_sort()
8524 */
/*
 * TID comparator for index_delete_sort(): orders by block number, then by
 * offset within block.  The trailing Assert(false) documents that two
 * equal TIDs are never expected in the deltids array.
 *
 * NOTE(review): doxygen extraction dropped the signature line (original
 * line 8526, taking two TM_IndexDelete pointers) and the lines that
 * extract blk1/blk2 (8532-8533) and pos1/pos2 (8539-8540) from the TIDs.
 * Verify against upstream heapam.c.
 */
8525static inline int
 8527{
 8528 ItemPointer tid1 = &deltid1->tid;
 8529 ItemPointer tid2 = &deltid2->tid;
 8530
 8531 {
 8534
 8535 if (blk1 != blk2)
 8536 return (blk1 < blk2) ? -1 : 1;
 8537 }
 8538 {
 8541
 8542 if (pos1 != pos2)
 8543 return (pos1 < pos2) ? -1 : 1;
 8544 }
 8545
 8546 Assert(false);
 8547
 8548 return 0;
 8549}
8550
8551/*
8552 * Sort deltids array from delstate by TID. This prepares it for further
8553 * processing by heap_index_delete_tuples().
8554 *
8555 * This operation becomes a noticeable consumer of CPU cycles with some
8556 * workloads, so we go to the trouble of specialization/micro optimization.
8557 * We use shellsort for this because it's easy to specialize, compiles to
8558 * relatively few instructions, and is adaptive to presorted inputs/subsets
8559 * (which are typical here).
8560 */
/*
 * In-place shellsort of delstate->deltids by TID, using the
 * Sedgewick-Incerpi gap sequence and index_delete_sort_cmp() (see header
 * comment above for the rationale).
 *
 * NOTE(review): doxygen extraction dropped the signature line (original
 * line 8562, "index_delete_sort(TM_IndexDeleteState *delstate)"); the
 * body below appears otherwise complete.
 */
8561static void
 8563{
 8564 TM_IndexDelete *deltids = delstate->deltids;
 8565 int ndeltids = delstate->ndeltids;
 8566
 8567 /*
 8568 * Shellsort gap sequence (taken from Sedgewick-Incerpi paper).
 8569 *
 8570 * This implementation is fast with array sizes up to ~4500. This covers
 8571 * all supported BLCKSZ values.
 8572 */
 8573 const int gaps[9] = {1968, 861, 336, 112, 48, 21, 7, 3, 1};
 8574
 8575 /* Think carefully before changing anything here -- keep swaps cheap */
 8576 StaticAssertDecl(sizeof(TM_IndexDelete) <= 8,
 8577 "element size exceeds 8 bytes");
 8578
 8579 for (int g = 0; g < lengthof(gaps); g++)
 8580 {
 8581 for (int hi = gaps[g], i = hi; i < ndeltids; i++)
 8582 {
 8583 TM_IndexDelete d = deltids[i];
 8584 int j = i;
 8585
 8586 while (j >= hi && index_delete_sort_cmp(&deltids[j - hi], &d) >= 0)
 8587 {
 8588 deltids[j] = deltids[j - hi];
 8589 j -= hi;
 8590 }
 8591 deltids[j] = d;
 8592 }
 8593 }
 8594}
8595
8596/*
8597 * Returns how many blocks should be considered favorable/contiguous for a
8598 * bottom-up index deletion pass. This is a number of heap blocks that starts
8599 * from and includes the first block in line.
8600 *
8601 * There is always at least one favorable block during bottom-up index
8602 * deletion. In the worst case (i.e. with totally random heap blocks) the
8603 * first block in line (the only favorable block) can be thought of as a
8604 * degenerate array of contiguous blocks that consists of a single block.
8605 * heap_index_delete_tuples() will expect this.
8606 *
8607 * Caller passes blockgroups, a description of the final order that deltids
8608 * will be sorted in for heap_index_delete_tuples() bottom-up index deletion
8609 * processing. Note that deltids need not actually be sorted just yet (caller
8610 * only passes deltids to us so that we can interpret blockgroups).
8611 *
8612 * You might guess that the existence of contiguous blocks cannot matter much,
8613 * since in general the main factor that determines which blocks we visit is
8614 * the number of promising TIDs, which is a fixed hint from the index AM.
8615 * We're not really targeting the general case, though -- the actual goal is
8616 * to adapt our behavior to a wide variety of naturally occurring conditions.
8617 * The effects of most of the heuristics we apply are only noticeable in the
8618 * aggregate, over time and across many _related_ bottom-up index deletion
8619 * passes.
8620 *
8621 * Deeming certain blocks favorable allows heapam to recognize and adapt to
8622 * workloads where heap blocks visited during bottom-up index deletion can be
8623 * accessed contiguously, in the sense that each newly visited block is the
8624 * neighbor of the block that bottom-up deletion just finished processing (or
8625 * close enough to it). It will likely be cheaper to access more favorable
8626 * blocks sooner rather than later (e.g. in this pass, not across a series of
8627 * related bottom-up passes). Either way it is probably only a matter of time
8628 * (or a matter of further correlated version churn) before all blocks that
8629 * appear together as a single large batch of favorable blocks get accessed by
8630 * _some_ bottom-up pass. Large batches of favorable blocks tend to either
8631 * appear almost constantly or not even once (it all depends on per-index
8632 * workload characteristics).
8633 *
8634 * Note that the blockgroups sort order applies a power-of-two bucketing
8635 * scheme that creates opportunities for contiguous groups of blocks to get
8636 * batched together, at least with workloads that are naturally amenable to
8637 * being driven by heap block locality. This doesn't just enhance the spatial
8638 * locality of bottom-up heap block processing in the obvious way. It also
8639 * enables temporal locality of access, since sorting by heap block number
8640 * naturally tends to make the bottom-up processing order deterministic.
8641 *
8642 * Consider the following example to get a sense of how temporal locality
8643 * might matter: There is a heap relation with several indexes, each of which
8644 * is low to medium cardinality. It is subject to constant non-HOT updates.
8645 * The updates are skewed (in one part of the primary key, perhaps). None of
8646 * the indexes are logically modified by the UPDATE statements (if they were
8647 * then bottom-up index deletion would not be triggered in the first place).
8648 * Naturally, each new round of index tuples (for each heap tuple that gets a
8649 * heap_update() call) will have the same heap TID in each and every index.
8650 * Since these indexes are low cardinality and never get logically modified,
8651 * heapam processing during bottom-up deletion passes will access heap blocks
8652 * in approximately sequential order. Temporal locality of access occurs due
8653 * to bottom-up deletion passes behaving very similarly across each of the
8654 * indexes at any given moment. This keeps the number of buffer misses needed
8655 * to visit heap blocks to a minimum.
8656 */
/*
 * Counts how many leading block groups form a (nearly) contiguous run of
 * heap blocks, stopping at the first group whose block is more than the
 * tolerance away from the previous one; always reports at least one
 * favorable block (see header comment above for the full theory).
 *
 * NOTE(review): doxygen extraction dropped lines: part of the signature
 * (original line 8658), an Assert (8665), the 'block' declaration (8680),
 * the distance-tolerance half of the break condition (8683-8684,
 * presumably using BOTTOMUP_TOLERANCE_NBLOCKS), the nblocksfavorable
 * increment (8687), and the Max()-style clamp before return (8692).
 * Verify against upstream heapam.c.
 */
8657static int
 8659 TM_IndexDelete *deltids)
 8660{
 8661 int64 lastblock = -1;
 8662 int nblocksfavorable = 0;
 8663
 8664 Assert(nblockgroups >= 1);
 8666
 8667 /*
 8668 * We tolerate heap blocks that will be accessed only slightly out of
 8669 * physical order. Small blips occur when a pair of almost-contiguous
 8670 * blocks happen to fall into different buckets (perhaps due only to a
 8671 * small difference in npromisingtids that the bucketing scheme didn't
 8672 * quite manage to ignore). We effectively ignore these blips by applying
 8673 * a small tolerance. The precise tolerance we use is a little arbitrary,
 8674 * but it works well enough in practice.
 8675 */
 8676 for (int b = 0; b < nblockgroups; b++)
 8677 {
 8678 IndexDeleteCounts *group = blockgroups + b;
 8679 TM_IndexDelete *firstdtid = deltids + group->ifirsttid;
 8681
 8682 if (lastblock != -1 &&
 8685 break;
 8686
 8688 lastblock = block;
 8689 }
 8690
 8691 /* Always indicate that there is at least 1 favorable block */
 8693
 8694 return nblocksfavorable;
 8695}
8696
8697/*
8698 * qsort comparison function for bottomup_sort_and_shrink()
8699 */
/*
 * qsort comparator for bottomup_sort_and_shrink(): sorts block groups by
 * npromisingtids descending (values pre-bucketed to powers of two by
 * caller), then by power-of-two-rounded ntids descending, then by
 * ifirsttid ascending (a proxy for ascending heap block number).
 *
 * NOTE(review): doxygen extraction dropped lines: the group1/group2 casts
 * from the void* arguments (original lines 8703-8704), the rounded
 * ntids1/ntids2 declarations (8726-8727, presumably pg_nextpower2_32 on
 * ntids), and the pg_unreachable()/Assert before the final return (8750).
 * Verify against upstream heapam.c.
 */
8700static int
8701bottomup_sort_and_shrink_cmp(const void *arg1, const void *arg2)
 8702{
 8705
 8706 /*
 8707 * Most significant field is npromisingtids (which we invert the order of
 8708 * so as to sort in desc order).
 8709 *
 8710 * Caller should have already normalized npromisingtids fields into
 8711 * power-of-two values (buckets).
 8712 */
 8713 if (group1->npromisingtids > group2->npromisingtids)
 8714 return -1;
 8715 if (group1->npromisingtids < group2->npromisingtids)
 8716 return 1;
 8717
 8718 /*
 8719 * Tiebreak: desc ntids sort order.
 8720 *
 8721 * We cannot expect power-of-two values for ntids fields. We should
 8722 * behave as if they were already rounded up for us instead.
 8723 */
 8724 if (group1->ntids != group2->ntids)
 8725 {
 8728
 8729 if (ntids1 > ntids2)
 8730 return -1;
 8731 if (ntids1 < ntids2)
 8732 return 1;
 8733 }
 8734
 8735 /*
 8736 * Tiebreak: asc offset-into-deltids-for-block (offset to first TID for
 8737 * block in deltids array) order.
 8738 *
 8739 * This is equivalent to sorting in ascending heap block number order
 8740 * (among otherwise equal subsets of the array). This approach allows us
 8741 * to avoid accessing the out-of-line TID. (We rely on the assumption
 8742 * that the deltids array was sorted in ascending heap TID order when
 8743 * these offsets to the first TID from each heap block group were formed.)
 8744 */
 8745 if (group1->ifirsttid > group2->ifirsttid)
 8746 return 1;
 8747 if (group1->ifirsttid < group2->ifirsttid)
 8748 return -1;
 8749
 8751
 8752 return 0;
 8753}
8754
8755/*
8756 * heap_index_delete_tuples() helper function for bottom-up deletion callers.
8757 *
8758 * Sorts deltids array in the order needed for useful processing by bottom-up
8759 * deletion. The array should already be sorted in TID order when we're
8760 * called. The sort process groups heap TIDs from deltids into heap block
8761 * groupings. Earlier/more-promising groups/blocks are usually those that are
8762 * known to have the most "promising" TIDs.
8763 *
8764 * Sets new size of deltids array (ndeltids) in state. deltids will only have
8765 * TIDs from the BOTTOMUP_MAX_NBLOCKS most promising heap blocks when we
8766 * return. This often means that deltids will be shrunk to a small fraction
8767 * of its original size (we eliminate many heap blocks from consideration for
8768 * caller up front).
8769 *
8770 * Returns the number of "favorable" blocks. See bottomup_nblocksfavorable()
8771 * for a definition and full details.
8772 */
/*
 * Groups the TID-sorted deltids into per-heap-block IndexDeleteCounts,
 * buckets each group's npromisingtids up to a power of two (minimum 4),
 * sorts groups with bottomup_sort_and_shrink_cmp(), rebuilds deltids in
 * the resulting group order (capped at BOTTOMUP_MAX_NBLOCKS groups per
 * the header comment above), updates delstate->ndeltids, and returns
 * bottomup_nblocksfavorable() for the new ordering.
 *
 * NOTE(review): doxygen extraction dropped lines throughout: part of the
 * signature (original line 8774), the blockgroups/reordereddeltids/
 * curblock locals (8776-8778), the blockgroups palloc (8787), the
 * new-block condition and BOTTOMUP_MAX_NBLOCKS break plus curblock
 * update (8795, 8800-8801, 8803), the pg_nextpower2_32 rounding (8854),
 * the qsort call (8858-8859), the bottomup_nblocksfavorable call (8862,
 * 8864), the per-group memcpy into reordereddeltids (8872), the final
 * memcpy back (8878), and the pfrees (8882-8883).  Verify against
 * upstream heapam.c.
 */
8773static int
 8775{
 8779 int nblockgroups = 0;
 8780 int ncopied = 0;
 8781 int nblocksfavorable = 0;
 8782
 8783 Assert(delstate->bottomup);
 8784 Assert(delstate->ndeltids > 0);
 8785
 8786 /* Calculate per-heap-block count of TIDs */
 8788 for (int i = 0; i < delstate->ndeltids; i++)
 8789 {
 8790 TM_IndexDelete *ideltid = &delstate->deltids[i];
 8791 TM_IndexStatus *istatus = delstate->status + ideltid->id;
 8792 ItemPointer htid = &ideltid->tid;
 8793 bool promising = istatus->promising;
 8794
 8796 {
 8797 /* New block group */
 8798 nblockgroups++;
 8799
 8802
 8804 blockgroups[nblockgroups - 1].ifirsttid = i;
 8805 blockgroups[nblockgroups - 1].ntids = 1;
 8806 blockgroups[nblockgroups - 1].npromisingtids = 0;
 8807 }
 8808 else
 8809 {
 8810 blockgroups[nblockgroups - 1].ntids++;
 8811 }
 8812
 8813 if (promising)
 8814 blockgroups[nblockgroups - 1].npromisingtids++;
 8815 }
 8816
 8817 /*
 8818 * We're about ready to sort block groups to determine the optimal order
 8819 * for visiting heap blocks. But before we do, round the number of
 8820 * promising tuples for each block group up to the next power-of-two,
 8821 * unless it is very low (less than 4), in which case we round up to 4.
 8822 * npromisingtids is far too noisy to trust when choosing between a pair
 8823 * of block groups that both have very low values.
 8824 *
 8825 * This scheme divides heap blocks/block groups into buckets. Each bucket
 8826 * contains blocks that have _approximately_ the same number of promising
 8827 * TIDs as each other. The goal is to ignore relatively small differences
 8828 * in the total number of promising entries, so that the whole process can
 8829 * give a little weight to heapam factors (like heap block locality)
 8830 * instead. This isn't a trade-off, really -- we have nothing to lose. It
 8831 * would be foolish to interpret small differences in npromisingtids
 8832 * values as anything more than noise.
 8833 *
 8834 * We tiebreak on nhtids when sorting block group subsets that have the
 8835 * same npromisingtids, but this has the same issues as npromisingtids,
 8836 * and so nhtids is subject to the same power-of-two bucketing scheme. The
 8837 * only reason that we don't fix nhtids in the same way here too is that
 8838 * we'll need accurate nhtids values after the sort. We handle nhtids
 8839 * bucketization dynamically instead (in the sort comparator).
 8840 *
 8841 * See bottomup_nblocksfavorable() for a full explanation of when and how
 8842 * heap locality/favorable blocks can significantly influence when and how
 8843 * heap blocks are accessed.
 8844 */
 8845 for (int b = 0; b < nblockgroups; b++)
 8846 {
 8847 IndexDeleteCounts *group = blockgroups + b;
 8848
 8849 /* Better off falling back on nhtids with low npromisingtids */
 8850 if (group->npromisingtids <= 4)
 8851 group->npromisingtids = 4;
 8852 else
 8853 group->npromisingtids =
 8855 }
 8856
 8857 /* Sort groups and rearrange caller's deltids array */
 8860 reordereddeltids = palloc(delstate->ndeltids * sizeof(TM_IndexDelete));
 8861
 8863 /* Determine number of favorable blocks at the start of final deltids */
 8865 delstate->deltids);
 8866
 8867 for (int b = 0; b < nblockgroups; b++)
 8868 {
 8869 IndexDeleteCounts *group = blockgroups + b;
 8870 TM_IndexDelete *firstdtid = delstate->deltids + group->ifirsttid;
 8871
 8873 sizeof(TM_IndexDelete) * group->ntids);
 8874 ncopied += group->ntids;
 8875 }
 8876
 8877 /* Copy final grouped and sorted TIDs back into start of caller's array */
 8879 sizeof(TM_IndexDelete) * ncopied);
 8880 delstate->ndeltids = ncopied;
 8881
 8884
 8885 return nblocksfavorable;
 8886}
8887
8888/*
8889 * Perform XLogInsert for a heap-visible operation. 'block' is the block
8890 * being marked all-visible, and vm_buffer is the buffer containing the
8891 * corresponding visibility map block. Both should have already been modified
8892 * and dirtied.
8893 *
8894 * snapshotConflictHorizon comes from the largest xmin on the page being
8895 * marked all-visible. REDO routine uses it to generate recovery conflicts.
8896 *
8897 * If checksums or wal_log_hints are enabled, we may also generate a full-page
8898 * image of heap_buffer. Otherwise, we optimize away the FPI (by specifying
8899 * REGBUF_NO_IMAGE for the heap buffer), in which case the caller should *not*
8900 * update the heap page's LSN.
8901 */
/*
 * NOTE(review): the fused gutter numbers below are not contiguous (8904,
 * 8905, 8908, ...).  The doc extraction dropped the opening signature line
 * and, presumably, the xl_heap_visible/XLogRecPtr declarations and the
 * XLogBeginInsert/XLogRegisterData/XLogRegisterBuffer/XLogInsert calls that
 * 'xlrec' and 'recptr' refer to -- confirm against upstream heapam.c before
 * treating this text as complete source.
 */
8904 TransactionId snapshotConflictHorizon, uint8 vmflags)
8905{
8908 uint8 flags;
8909
8912
 /* Record payload: conflict horizon for recovery, plus the VM bits set. */
8913 xlrec.snapshotConflictHorizon = snapshotConflictHorizon;
8914 xlrec.flags = vmflags;
8919
8921
 /*
  * Per the header comment above: only force a full-page image of the heap
  * buffer when checksums/wal_log_hints require hint-bit logging.
  */
8922 flags = REGBUF_STANDARD;
8923 if (!XLogHintBitIsNeeded())
8924 flags |= REGBUF_NO_IMAGE;
8926
8928
8929 return recptr;
8930}
8931
8932/*
8933 * Perform XLogInsert for a heap-update operation. Caller must already
8934 * have modified the buffer(s) and marked them dirty.
8935 */
8936static XLogRecPtr
/*
 * NOTE(review): gutter numbers are non-contiguous throughout this function.
 * The extraction dropped the parameter list (orig. lines 8937-8940), several
 * local declarations (xlrec, xlhdr, xlhdr_idx, recptr, need_tuple_data,
 * prefix_suffix, ...), various 'if' condition lines, and most of the
 * XLogBeginInsert/XLogRegisterData/XLogRegisterBuffer/XLogRegisterBufData
 * calls.  Consult upstream heapam.c; do not treat this text as compilable.
 */
8941{
8945 uint8 info;
8947 uint16 prefixlen = 0,
8948 suffixlen = 0;
8950 Page page = BufferGetPage(newbuf);
8952 bool init;
8953 int bufflags;
8954
8955 /* Caller should not call me on a non-WAL-logged relation */
8957
8959
 /* Choose the record type (the HOT-update test line was elided above). */
8961 info = XLOG_HEAP_HOT_UPDATE;
8962 else
8963 info = XLOG_HEAP_UPDATE;
8964
8965 /*
8966 * If the old and new tuple are on the same page, we only need to log the
8967 * parts of the new tuple that were changed. That saves on the amount of
8968 * WAL we need to write. Currently, we just count any unchanged bytes in
8969 * the beginning and end of the tuple. That's quick to check, and
8970 * perfectly covers the common case that only one field is updated.
8971 *
8972 * We could do this even if the old and new tuple are on different pages,
8973 * but only if we don't make a full-page image of the old page, which is
8974 * difficult to know in advance. Also, if the old tuple is corrupt for
8975 * some reason, it would allow the corruption to propagate the new page,
8976 * so it seems best to avoid. Under the general assumption that most
8977 * updates tend to create the new tuple version on the same page, there
8978 * isn't much to be gained by doing this across pages anyway.
8979 *
8980 * Skip this if we're taking a full-page image of the new page, as we
8981 * don't include the new tuple in the WAL record in that case. Also
8982 * disable if effective_wal_level='logical', as logical decoding needs to
8983 * be able to read the new tuple in whole from the WAL record alone.
8984 */
8985 if (oldbuf == newbuf && !need_tuple_data &&
8987 {
 /* Compare only the user-data portions, skipping the tuple headers. */
8988 char *oldp = (char *) oldtup->t_data + oldtup->t_data->t_hoff;
8989 char *newp = (char *) newtup->t_data + newtup->t_data->t_hoff;
8990 int oldlen = oldtup->t_len - oldtup->t_data->t_hoff;
8991 int newlen = newtup->t_len - newtup->t_data->t_hoff;
8992
8993 /* Check for common prefix between old and new tuple */
8994 for (prefixlen = 0; prefixlen < Min(oldlen, newlen); prefixlen++)
8995 {
8996 if (newp[prefixlen] != oldp[prefixlen])
8997 break;
8998 }
8999
9000 /*
9001 * Storing the length of the prefix takes 2 bytes, so we need to save
9002 * at least 3 bytes or there's no point.
9003 */
9004 if (prefixlen < 3)
9005 prefixlen = 0;
9006
9007 /* Same for suffix */
 /* (the 'for' header of the suffix scan was elided by the extraction) */
9009 {
9010 if (newp[newlen - suffixlen - 1] != oldp[oldlen - suffixlen - 1])
9011 break;
9012 }
9013 if (suffixlen < 3)
9014 suffixlen = 0;
9015 }
9016
9017 /* Prepare main WAL data chain */
9018 xlrec.flags = 0;
9023 if (prefixlen > 0)
9025 if (suffixlen > 0)
9027 if (need_tuple_data)
9028 {
9030 if (old_key_tuple)
9031 {
 /* Distinguish full-row vs. key-only old-tuple logging for decoding. */
9032 if (reln->rd_rel->relreplident == REPLICA_IDENTITY_FULL)
9034 else
9036 }
9037 }
9038
9039 /* If new tuple is the single and first tuple on page... */
9042 {
9043 info |= XLOG_HEAP_INIT_PAGE;
9044 init = true;
9045 }
9046 else
9047 init = false;
9048
9049 /* Prepare WAL data for the old page */
9050 xlrec.old_offnum = ItemPointerGetOffsetNumber(&oldtup->t_self);
9051 xlrec.old_xmax = HeapTupleHeaderGetRawXmax(oldtup->t_data);
9052 xlrec.old_infobits_set = compute_infobits(oldtup->t_data->t_infomask,
9053 oldtup->t_data->t_infomask2);
9054
9055 /* Prepare WAL data for the new page */
9056 xlrec.new_offnum = ItemPointerGetOffsetNumber(&newtup->t_self);
9057 xlrec.new_xmax = HeapTupleHeaderGetRawXmax(newtup->t_data);
9058
9060 if (init)
9062 if (need_tuple_data)
9064
9066 if (oldbuf != newbuf)
9068
9070
9071 /*
9072 * Prepare WAL data for the new tuple.
9073 */
 /* The prefix/suffix lengths precede the tuple data in the buffer data. */
9074 if (prefixlen > 0 || suffixlen > 0)
9075 {
9076 if (prefixlen > 0 && suffixlen > 0)
9077 {
9080 XLogRegisterBufData(0, &prefix_suffix, sizeof(uint16) * 2);
9081 }
9082 else if (prefixlen > 0)
9083 {
9084 XLogRegisterBufData(0, &prefixlen, sizeof(uint16));
9085 }
9086 else
9087 {
9088 XLogRegisterBufData(0, &suffixlen, sizeof(uint16));
9089 }
9090 }
9091
9092 xlhdr.t_infomask2 = newtup->t_data->t_infomask2;
9093 xlhdr.t_infomask = newtup->t_data->t_infomask;
9094 xlhdr.t_hoff = newtup->t_data->t_hoff;
9096
9097 /*
9098 * PG73FORMAT: write bitmap [+ padding] [+ oid] + data
9099 *
9100 * The 'data' doesn't include the common prefix or suffix.
9101 */
9103 if (prefixlen == 0)
9104 {
9106 (char *) newtup->t_data + SizeofHeapTupleHeader,
9108 }
9109 else
9110 {
9111 /*
9112 * Have to write the null bitmap and data after the common prefix as
9113 * two separate rdata entries.
9114 */
9115 /* bitmap [+ padding] [+ oid] */
9116 if (newtup->t_data->t_hoff - SizeofHeapTupleHeader > 0)
9117 {
9119 (char *) newtup->t_data + SizeofHeapTupleHeader,
9120 newtup->t_data->t_hoff - SizeofHeapTupleHeader);
9121 }
9122
9123 /* data after common prefix */
9125 (char *) newtup->t_data + newtup->t_data->t_hoff + prefixlen,
9126 newtup->t_len - newtup->t_data->t_hoff - prefixlen - suffixlen);
9127 }
9128
9129 /* We need to log a tuple identity */
 /* (the guarding 'if (need_tuple_data && old_key_tuple)'-style condition
  * was elided by the extraction -- confirm against upstream) */
9131 {
9132 /* don't really need this, but its more comfy to decode */
9133 xlhdr_idx.t_infomask2 = old_key_tuple->t_data->t_infomask2;
9134 xlhdr_idx.t_infomask = old_key_tuple->t_data->t_infomask;
9135 xlhdr_idx.t_hoff = old_key_tuple->t_data->t_hoff;
9136
9138
9139 /* PG73FORMAT: write bitmap [+ padding] [+ oid] + data */
9142 }
9143
9144 /* filtering by origin on a row level is much more efficient */
9146
9147 recptr = XLogInsert(RM_HEAP_ID, info);
9148
9149 return recptr;
9150}
9151
9152/*
9153 * Perform XLogInsert of an XLOG_HEAP2_NEW_CID record
9154 *
9155 * This is only used when effective_wal_level is logical, and only for
9156 * catalog tuples.
9157 */
9158static XLogRecPtr
/*
 * NOTE(review): the extraction dropped the signature parameters (orig. line
 * 9159) and the declarations of 'xlrec' and 'recptr' (9161-9163), as well as
 * the XLogBeginInsert/XLogRegisterData/XLogInsert lines near the end --
 * consult upstream heapam.c for the complete function.
 */
9160{
9162
9164 HeapTupleHeader hdr = tup->t_data;
9165
9166 Assert(ItemPointerIsValid(&tup->t_self));
9167 Assert(tup->t_tableOid != InvalidOid);
9168
 /* Identify the affected tuple for the logical-decoding REDO routine. */
9169 xlrec.top_xid = GetTopTransactionId();
9170 xlrec.target_locator = relation->rd_locator;
9171 xlrec.target_tid = tup->t_self;
9172
9173 /*
9174 * If the tuple got inserted & deleted in the same TX we definitely have a
9175 * combo CID, set cmin and cmax.
9176 */
9177 if (hdr->t_infomask & HEAP_COMBOCID)
9178 {
9181 xlrec.cmin = HeapTupleHeaderGetCmin(hdr);
9182 xlrec.cmax = HeapTupleHeaderGetCmax(hdr);
9183 xlrec.combocid = HeapTupleHeaderGetRawCommandId(hdr);
9184 }
9185 /* No combo CID, so only cmin or cmax can be set by this TX */
9186 else
9187 {
9188 /*
9189 * Tuple inserted.
9190 *
9191 * We need to check for LOCK ONLY because multixacts might be
9192 * transferred to the new tuple in case of FOR KEY SHARE updates in
9193 * which case there will be an xmax, although the tuple just got
9194 * inserted.
9195 */
9196 if (hdr->t_infomask & HEAP_XMAX_INVALID ||
9198 {
 /* (the cmin assignment at orig. 9199 was elided by the extraction) */
9200 xlrec.cmax = InvalidCommandId;
9201 }
9202 /* Tuple from a different tx updated or deleted. */
9203 else
9204 {
9205 xlrec.cmin = InvalidCommandId;
9207 }
9208 xlrec.combocid = InvalidCommandId;
9209 }
9210
9211 /*
9212 * Note that we don't need to register the buffer here, because this
9213 * operation does not modify the page. The insert/update/delete that
9214 * called us certainly did, but that's WAL-logged separately.
9215 */
9218
9219 /* will be looked at irrespective of origin */
9220
9222
9223 return recptr;
9224}
9225
9226/*
9227 * Build a heap tuple representing the configured REPLICA IDENTITY to represent
9228 * the old tuple in an UPDATE or DELETE.
9229 *
9230 * Returns NULL if there's no need to log an identity or if there's no suitable
9231 * key defined.
9232 *
9233 * Pass key_required true if any replica identity columns changed value, or if
9234 * any of them have any external data. Delete must always pass true.
9235 *
9236 * *copy is set to true if the returned tuple is a modified copy rather than
9237 * the same tuple that was passed in.
9238 */
9239static HeapTuple
/*
 * NOTE(review): the extraction dropped the first signature line (orig. 9240)
 * and the declarations of 'values[]', 'key_tuple', and 'idattrs' (9245-9248),
 * plus the RelationGetIndexAttrBitmap and toast-flattening call lines --
 * confirm against upstream heapam.c.
 */
9241 bool *copy)
9242{
9243 TupleDesc desc = RelationGetDescr(relation);
9244 char replident = relation->rd_rel->relreplident;
9247 bool nulls[MaxHeapAttributeNumber];
9249
9250 *copy = false;
9251
9252 if (!RelationIsLogicallyLogged(relation))
9253 return NULL;
9254
9255 if (replident == REPLICA_IDENTITY_NOTHING)
9256 return NULL;
9257
9258 if (replident == REPLICA_IDENTITY_FULL)
9259 {
9260 /*
9261 * When logging the entire old tuple, it very well could contain
9262 * toasted columns. If so, force them to be inlined.
9263 */
9264 if (HeapTupleHasExternal(tp))
9265 {
9266 *copy = true;
9267 tp = toast_flatten_tuple(tp, desc);
9268 }
9269 return tp;
9270 }
9271
9272 /* if the key isn't required and we're only logging the key, we're done */
9273 if (!key_required)
9274 return NULL;
9275
9276 /* find out the replica identity columns */
9279
9280 /*
9281 * If there's no defined replica identity columns, treat as !key_required.
9282 * (This case should not be reachable from heap_update, since that should
9283 * calculate key_required accurately. But heap_delete just passes
9284 * constant true for key_required, so we can hit this case in deletes.)
9285 */
9286 if (bms_is_empty(idattrs))
9287 return NULL;
9288
9289 /*
9290 * Construct a new tuple containing only the replica identity columns,
9291 * with nulls elsewhere. While we're at it, assert that the replica
9292 * identity columns aren't null.
9293 */
9294 heap_deform_tuple(tp, desc, values, nulls);
9295
9296 for (int i = 0; i < desc->natts; i++)
9297 {
 /* (the bms_is_member test head at orig. 9298 was elided) */
9299 idattrs))
9300 Assert(!nulls[i]);
9301 else
9302 nulls[i] = true;
9303 }
9304
9305 key_tuple = heap_form_tuple(desc, values, nulls);
9306 *copy = true;
9307
9309
9310 /*
9311 * If the tuple, which by here only contains indexed columns, still has
9312 * toasted columns, force them to be inlined. This is somewhat unlikely
9313 * since there's limits on the size of indexed columns, so we don't
9314 * duplicate toast_flatten_tuple()s functionality in the above loop over
9315 * the indexed columns, even if it would be more efficient.
9316 */
9318 {
9320
9323 }
9324
9325 return key_tuple;
9326}
9327
9328/*
9329 * HeapCheckForSerializableConflictOut
9330 * We are reading a tuple. If it's not visible, there may be a
9331 * rw-conflict out with the inserter. Otherwise, if it is visible to us
9332 * but has been deleted, there may be a rw-conflict out with the deleter.
9333 *
9334 * We will determine the top level xid of the writing transaction with which
9335 * we may be in conflict, and ask CheckForSerializableConflictOut() to check
9336 * for overlap with our own transaction.
9337 *
9338 * This function should be called just about anywhere in heapam.c where a
9339 * tuple has been read. The caller must hold at least a shared lock on the
9340 * buffer, because this function might set hint bits on the tuple. There is
9341 * currently no known reason to call this function from an index AM.
9342 */
9343void
9344HeapCheckForSerializableConflictOut(bool visible, Relation relation,
9345 HeapTuple tuple, Buffer buffer,
9346 Snapshot snapshot)
9347{
/*
 * NOTE(review): the extraction dropped the 'htsvResult' declaration (orig.
 * 9349), the HeapTupleSatisfiesVacuum call (9365), several 'case' labels in
 * the switch (9374, 9387), and the early-exit xid checks near the end
 * (9418-9421) -- confirm against upstream heapam.c.
 */
9348 TransactionId xid;
9350
 /* Cheap short-circuit when SSI conflict tracking isn't active. */
9351 if (!CheckForSerializableConflictOutNeeded(relation, snapshot))
9352 return;
9353
9354 /*
9355 * Check to see whether the tuple has been written to by a concurrent
9356 * transaction, either to create it not visible to us, or to delete it
9357 * while it is visible to us. The "visible" bool indicates whether the
9358 * tuple is visible to us, while HeapTupleSatisfiesVacuum checks what else
9359 * is going on with it.
9360 *
9361 * In the event of a concurrently inserted tuple that also happens to have
9362 * been concurrently updated (by a separate transaction), the xmin of the
9363 * tuple will be used -- not the updater's xid.
9364 */
9366 switch (htsvResult)
9367 {
9368 case HEAPTUPLE_LIVE:
9369 if (visible)
9370 return;
9371 xid = HeapTupleHeaderGetXmin(tuple->t_data);
9372 break;
 /* (case label elided by extraction -- likely HEAPTUPLE_RECENTLY_DEAD
  * / HEAPTUPLE_DELETE_IN_PROGRESS; verify against upstream) */
9375 if (visible)
9376 xid = HeapTupleHeaderGetUpdateXid(tuple->t_data);
9377 else
9378 xid = HeapTupleHeaderGetXmin(tuple->t_data);
9379
9381 {
9382 /* This is like the HEAPTUPLE_DEAD case */
9383 Assert(!visible);
9384 return;
9385 }
9386 break;
 /* (case label elided by extraction -- likely
  * HEAPTUPLE_INSERT_IN_PROGRESS; verify against upstream) */
9388 xid = HeapTupleHeaderGetXmin(tuple->t_data);
9389 break;
9390 case HEAPTUPLE_DEAD:
9391 Assert(!visible);
9392 return;
9393 default:
9394
9395 /*
9396 * The only way to get to this default clause is if a new value is
9397 * added to the enum type without adding it to this switch
9398 * statement. That's a bug, so elog.
9399 */
9400 elog(ERROR, "unrecognized return value from HeapTupleSatisfiesVacuum: %u", htsvResult);
9401
9402 /*
9403 * In spite of having all enum values covered and calling elog on
9404 * this default, some compilers think this is a code path which
9405 * allows xid to be used below without initialization. Silence
9406 * that warning.
9407 */
9409 }
9410
9413
9414 /*
9415 * Find top level xid. Bail out if xid is too early to be a conflict, or
9416 * if it's our own xid.
9417 */
9419 return;
9422 return;
9423
9424 CheckForSerializableConflictOut(relation, xid, snapshot);
9425}
int16 AttrNumber
Definition attnum.h:21
int bms_next_member(const Bitmapset *a, int prevbit)
Definition bitmapset.c:1290
void bms_free(Bitmapset *a)
Definition bitmapset.c:239
bool bms_is_member(int x, const Bitmapset *a)
Definition bitmapset.c:510
Bitmapset * bms_add_member(Bitmapset *a, int x)
Definition bitmapset.c:799
Bitmapset * bms_add_members(Bitmapset *a, const Bitmapset *b)
Definition bitmapset.c:901
bool bms_overlap(const Bitmapset *a, const Bitmapset *b)
Definition bitmapset.c:575
#define bms_is_empty(a)
Definition bitmapset.h:118
uint32 BlockNumber
Definition block.h:31
#define InvalidBlockNumber
Definition block.h:33
static bool BlockNumberIsValid(BlockNumber blockNumber)
Definition block.h:71
static int32 next
Definition blutils.c:225
static Datum values[MAXATTR]
Definition bootstrap.c:188
int Buffer
Definition buf.h:23
#define InvalidBuffer
Definition buf.h:25
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition bufmgr.c:4357
PrefetchBufferResult PrefetchBuffer(Relation reln, ForkNumber forkNum, BlockNumber blockNum)
Definition bufmgr.c:782
void BufferGetTag(Buffer buffer, RelFileLocator *rlocator, ForkNumber *forknum, BlockNumber *blknum)
Definition bufmgr.c:4378
bool BufferIsDirty(Buffer buffer)
Definition bufmgr.c:3030
void ReleaseBuffer(Buffer buffer)
Definition bufmgr.c:5505
void UnlockReleaseBuffer(Buffer buffer)
Definition bufmgr.c:5522
void MarkBufferDirty(Buffer buffer)
Definition bufmgr.c:3063
int maintenance_io_concurrency
Definition bufmgr.c:207
Buffer ReadBuffer(Relation reln, BlockNumber blockNum)
Definition bufmgr.c:874
@ BAS_BULKREAD
Definition bufmgr.h:37
@ BAS_BULKWRITE
Definition bufmgr.h:39
#define RelationGetNumberOfBlocks(reln)
Definition bufmgr.h:307
static Page BufferGetPage(Buffer buffer)
Definition bufmgr.h:470
static Block BufferGetBlock(Buffer buffer)
Definition bufmgr.h:437
@ BUFFER_LOCK_SHARE
Definition bufmgr.h:210
@ BUFFER_LOCK_EXCLUSIVE
Definition bufmgr.h:220
@ BUFFER_LOCK_UNLOCK
Definition bufmgr.h:205
static void LockBuffer(Buffer buffer, BufferLockMode mode)
Definition bufmgr.h:332
static bool BufferIsValid(Buffer bufnum)
Definition bufmgr.h:421
Size PageGetHeapFreeSpace(const PageData *page)
Definition bufpage.c:990
PageHeaderData * PageHeader
Definition bufpage.h:199
static bool PageIsAllVisible(const PageData *page)
Definition bufpage.h:455
static void PageClearAllVisible(Page page)
Definition bufpage.h:465
#define SizeOfPageHeaderData
Definition bufpage.h:242
static void PageSetAllVisible(Page page)
Definition bufpage.h:460
static ItemId PageGetItemId(Page page, OffsetNumber offsetNumber)
Definition bufpage.h:269
static void * PageGetItem(PageData *page, const ItemIdData *itemId)
Definition bufpage.h:379
static void PageSetFull(Page page)
Definition bufpage.h:444
static void PageSetLSN(Page page, XLogRecPtr lsn)
Definition bufpage.h:417
PageData * Page
Definition bufpage.h:81
#define PageClearPrunable(page)
Definition bufpage.h:486
#define PageSetPrunable(page, xid)
Definition bufpage.h:479
static OffsetNumber PageGetMaxOffsetNumber(const PageData *page)
Definition bufpage.h:397
#define NameStr(name)
Definition c.h:837
#define InvalidCommandId
Definition c.h:755
#define pg_noinline
Definition c.h:315
#define Min(x, y)
Definition c.h:1093
#define likely(x)
Definition c.h:431
#define MAXALIGN(LEN)
Definition c.h:898
uint8_t uint8
Definition c.h:616
#define Assert(condition)
Definition c.h:945
int64_t int64
Definition c.h:615
TransactionId MultiXactId
Definition c.h:748
#define pg_attribute_always_inline
Definition c.h:299
int16_t int16
Definition c.h:613
#define SHORTALIGN(LEN)
Definition c.h:894
uint16_t uint16
Definition c.h:617
#define pg_unreachable()
Definition c.h:361
#define unlikely(x)
Definition c.h:432
uint32_t uint32
Definition c.h:618
#define lengthof(array)
Definition c.h:875
#define StaticAssertDecl(condition, errmessage)
Definition c.h:1010
uint32 CommandId
Definition c.h:752
uint32 TransactionId
Definition c.h:738
#define OidIsValid(objectId)
Definition c.h:860
size_t Size
Definition c.h:691
bool IsToastRelation(Relation relation)
Definition catalog.c:206
bool IsCatalogRelation(Relation relation)
Definition catalog.c:104
bool IsSharedRelation(Oid relationId)
Definition catalog.c:304
bool IsInplaceUpdateRelation(Relation relation)
Definition catalog.c:183
CommandId HeapTupleHeaderGetCmin(const HeapTupleHeaderData *tup)
Definition combocid.c:104
void HeapTupleHeaderAdjustCmax(const HeapTupleHeaderData *tup, CommandId *cmax, bool *iscombo)
Definition combocid.c:153
CommandId HeapTupleHeaderGetCmax(const HeapTupleHeaderData *tup)
Definition combocid.c:118
bool datumIsEqual(Datum value1, Datum value2, bool typByVal, int typLen)
Definition datum.c:223
Datum arg
Definition elog.c:1322
int errcode(int sqlerrcode)
Definition elog.c:874
int int errdetail_internal(const char *fmt,...) pg_attribute_printf(1
int int errmsg_internal(const char *fmt,...) pg_attribute_printf(1
#define WARNING
Definition elog.h:36
#define ERROR
Definition elog.h:39
#define elog(elevel,...)
Definition elog.h:226
#define ereport(elevel,...)
Definition elog.h:150
HeapTuple ExecFetchSlotHeapTuple(TupleTableSlot *slot, bool materialize, bool *shouldFree)
TupleTableSlot * ExecStoreBufferHeapTuple(HeapTuple tuple, TupleTableSlot *slot, Buffer buffer)
#define palloc_object(type)
Definition fe_memutils.h:74
#define palloc_array(type, count)
Definition fe_memutils.h:76
BufferAccessStrategy GetAccessStrategy(BufferAccessStrategyType btype)
Definition freelist.c:461
void FreeAccessStrategy(BufferAccessStrategy strategy)
Definition freelist.c:643
int NBuffers
Definition globals.c:142
Oid MyDatabaseTableSpace
Definition globals.c:96
Oid MyDatabaseId
Definition globals.c:94
void simple_heap_update(Relation relation, const ItemPointerData *otid, HeapTuple tup, TU_UpdateIndexes *update_indexes)
Definition heapam.c:4570
static bool DoesMultiXactIdConflict(MultiXactId multi, uint16 infomask, LockTupleMode lockmode, bool *current_is_member)
Definition heapam.c:7695
void heap_insert(Relation relation, HeapTuple tup, CommandId cid, int options, BulkInsertState bistate)
Definition heapam.c:2152
static XLogRecPtr log_heap_new_cid(Relation relation, HeapTuple tup)
Definition heapam.c:9160
XLogRecPtr log_heap_visible(Relation rel, Buffer heap_buffer, Buffer vm_buffer, TransactionId snapshotConflictHorizon, uint8 vmflags)
Definition heapam.c:8904
static void compute_new_xmax_infomask(TransactionId xmax, uint16 old_infomask, uint16 old_infomask2, TransactionId add_to_xmax, LockTupleMode mode, bool is_update, TransactionId *result_xmax, uint16 *result_infomask, uint16 *result_infomask2)
Definition heapam.c:5409
static TM_Result heap_lock_updated_tuple_rec(Relation rel, TransactionId priorXmax, const ItemPointerData *tid, TransactionId xid, LockTupleMode mode)
Definition heapam.c:5781
static void heap_fetch_next_buffer(HeapScanDesc scan, ScanDirection dir)
Definition heapam.c:707
bool heap_inplace_lock(Relation relation, HeapTuple oldtup_ptr, Buffer buffer, void(*release_callback)(void *), void *arg)
Definition heapam.c:6451
bool heap_fetch(Relation relation, Snapshot snapshot, HeapTuple tuple, Buffer *userbuf, bool keep_buf)
Definition heapam.c:1669
#define BOTTOMUP_TOLERANCE_NBLOCKS
Definition heapam.c:190
static HeapTuple heap_prepare_insert(Relation relation, HeapTuple tup, TransactionId xid, CommandId cid, int options)
Definition heapam.c:2343
static BlockNumber heap_scan_stream_read_next_parallel(ReadStream *stream, void *callback_private_data, void *per_buffer_data)
Definition heapam.c:252
static int bottomup_sort_and_shrink(TM_IndexDeleteOp *delstate)
Definition heapam.c:8775
static bool heap_acquire_tuplock(Relation relation, const ItemPointerData *tid, LockTupleMode mode, LockWaitPolicy wait_policy, bool *have_tuple_lock)
Definition heapam.c:5360
static int heap_multi_insert_pages(HeapTuple *heaptuples, int done, int ntuples, Size saveFreeSpace)
Definition heapam.c:2391
static pg_attribute_always_inline int page_collect_tuples(HeapScanDesc scan, Snapshot snapshot, Page page, Buffer buffer, BlockNumber block, int lines, bool all_visible, bool check_serializable)
Definition heapam.c:522
static BlockNumber heap_scan_stream_read_next_serial(ReadStream *stream, void *callback_private_data, void *per_buffer_data)
Definition heapam.c:292
static void GetMultiXactIdHintBits(MultiXactId multi, uint16 *new_infomask, uint16 *new_infomask2)
Definition heapam.c:7546
void heap_finish_speculative(Relation relation, const ItemPointerData *tid)
Definition heapam.c:6182
void HeapTupleHeaderAdvanceConflictHorizon(HeapTupleHeader tuple, TransactionId *snapshotConflictHorizon)
Definition heapam.c:8073
bool heap_getnextslot(TableScanDesc sscan, ScanDirection direction, TupleTableSlot *slot)
Definition heapam.c:1459
#define LOCKMODE_from_mxstatus(status)
Definition heapam.c:159
void heap_endscan(TableScanDesc sscan)
Definition heapam.c:1378
#define FRM_RETURN_IS_XID
Definition heapam.c:6739
#define TUPLOCK_from_mxstatus(status)
Definition heapam.c:218
void heap_rescan(TableScanDesc sscan, ScanKey key, bool set_params, bool allow_strat, bool allow_sync, bool allow_pagemode)
Definition heapam.c:1319
void heap_inplace_unlock(Relation relation, HeapTuple oldtup, Buffer buffer)
Definition heapam.c:6729
TM_Result heap_update(Relation relation, const ItemPointerData *otid, HeapTuple newtup, CommandId cid, Snapshot crosscheck, bool wait, TM_FailureData *tmfd, LockTupleMode *lockmode, TU_UpdateIndexes *update_indexes)
Definition heapam.c:3323
static int index_delete_sort_cmp(TM_IndexDelete *deltid1, TM_IndexDelete *deltid2)
Definition heapam.c:8527
static bool ConditionalMultiXactIdWait(MultiXactId multi, MultiXactStatus status, uint16 infomask, Relation rel, int *remaining, bool logLockFailure)
Definition heapam.c:7895
bool heap_tuple_needs_eventual_freeze(HeapTupleHeader tuple)
Definition heapam.c:7910
TM_Result heap_delete(Relation relation, const ItemPointerData *tid, CommandId cid, Snapshot crosscheck, bool wait, TM_FailureData *tmfd, bool changingPart)
Definition heapam.c:2854
static TransactionId FreezeMultiXactId(MultiXactId multi, uint16 t_infomask, const struct VacuumCutoffs *cutoffs, uint16 *flags, HeapPageFreeze *pagefrz)
Definition heapam.c:6790
static HeapTuple ExtractReplicaIdentity(Relation relation, HeapTuple tp, bool key_required, bool *copy)
Definition heapam.c:9241
static pg_noinline BlockNumber heapgettup_initial_block(HeapScanDesc scan, ScanDirection dir)
Definition heapam.c:752
static TM_Result heap_lock_updated_tuple(Relation rel, uint16 prior_infomask, TransactionId prior_raw_xmax, const ItemPointerData *prior_ctid, TransactionId xid, LockTupleMode mode)
Definition heapam.c:6129
#define LockTupleTuplock(rel, tup, mode)
Definition heapam.c:167
bool heap_tuple_should_freeze(HeapTupleHeader tuple, const struct VacuumCutoffs *cutoffs, TransactionId *NoFreezePageRelfrozenXid, MultiXactId *NoFreezePageRelminMxid)
Definition heapam.c:7965
bool heap_freeze_tuple(HeapTupleHeader tuple, TransactionId relfrozenxid, TransactionId relminmxid, TransactionId FreezeLimit, TransactionId MultiXactCutoff)
Definition heapam.c:7501
void heap_inplace_update_and_unlock(Relation relation, HeapTuple oldtup, HeapTuple tuple, Buffer buffer)
Definition heapam.c:6589
static BlockNumber heapgettup_advance_block(HeapScanDesc scan, BlockNumber block, ScanDirection dir)
Definition heapam.c:876
static TransactionId MultiXactIdGetUpdateXid(TransactionId xmax, uint16 t_infomask)
Definition heapam.c:7627
#define BOTTOMUP_MAX_NBLOCKS
Definition heapam.c:189
void ReleaseBulkInsertStatePin(BulkInsertState bistate)
Definition heapam.c:2114
#define FRM_MARK_COMMITTED
Definition heapam.c:6741
#define FRM_NOOP
Definition heapam.c:6737
static void index_delete_check_htid(TM_IndexDeleteOp *delstate, Page page, OffsetNumber maxoff, const ItemPointerData *htid, TM_IndexStatus *istatus)
Definition heapam.c:8158
HeapTuple heap_getnext(TableScanDesc sscan, ScanDirection direction)
Definition heapam.c:1420
bool heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer, Snapshot snapshot, HeapTuple heapTuple, bool *all_dead, bool first_call)
Definition heapam.c:1789
void heap_freeze_prepared_tuples(Buffer buffer, HeapTupleFreeze *tuples, int ntuples)
Definition heapam.c:7479
bool heap_getnextslot_tidrange(TableScanDesc sscan, ScanDirection direction, TupleTableSlot *slot)
Definition heapam.c:1562
static void MultiXactIdWait(MultiXactId multi, MultiXactStatus status, uint16 infomask, Relation rel, const ItemPointerData *ctid, XLTW_Oper oper, int *remaining)
Definition heapam.c:7873
void heap_set_tidrange(TableScanDesc sscan, ItemPointer mintid, ItemPointer maxtid)
Definition heapam.c:1489
void heap_abort_speculative(Relation relation, const ItemPointerData *tid)
Definition heapam.c:6269
static BlockNumber bitmapheap_stream_read_next(ReadStream *pgsr, void *private_data, void *per_buffer_data)
Definition heapam.c:317
TableScanDesc heap_beginscan(Relation relation, Snapshot snapshot, int nkeys, ScanKey key, ParallelTableScanDesc parallel_scan, uint32 flags)
Definition heapam.c:1164
static void heapgettup(HeapScanDesc scan, ScanDirection dir, int nkeys, ScanKey key)
Definition heapam.c:960
static Page heapgettup_continue_page(HeapScanDesc scan, ScanDirection dir, int *linesleft, OffsetNumber *lineoff)
Definition heapam.c:830
static uint8 compute_infobits(uint16 infomask, uint16 infomask2)
Definition heapam.c:2809
#define FRM_RETURN_IS_MULTI
Definition heapam.c:6740
#define FRM_INVALIDATE_XMAX
Definition heapam.c:6738
static bool heap_attr_equals(TupleDesc tupdesc, int attrnum, Datum value1, Datum value2, bool isnull1, bool isnull2)
Definition heapam.c:4429
static void index_delete_sort(TM_IndexDeleteOp *delstate)
Definition heapam.c:8563
void heap_prepare_pagescan(TableScanDesc sscan)
Definition heapam.c:616
static Bitmapset * HeapDetermineColumnsInfo(Relation relation, Bitmapset *interesting_cols, Bitmapset *external_cols, HeapTuple oldtup, HeapTuple newtup, bool *has_external)
Definition heapam.c:4480
static const int MultiXactStatusLock[MaxMultiXactStatus+1]
Definition heapam.c:207
void simple_heap_insert(Relation relation, HeapTuple tup)
Definition heapam.c:2796
static bool xmax_infomask_changed(uint16 new_infomask, uint16 old_infomask)
Definition heapam.c:2831
#define UnlockTupleTuplock(rel, tup, mode)
Definition heapam.c:169
static TM_Result test_lockmode_for_conflict(MultiXactStatus status, TransactionId xid, LockTupleMode mode, HeapTuple tup, bool *needwait)
Definition heapam.c:5690
bool heap_prepare_freeze_tuple(HeapTupleHeader tuple, const struct VacuumCutoffs *cutoffs, HeapPageFreeze *pagefrz, HeapTupleFreeze *frz, bool *totally_frozen)
Definition heapam.c:7146
static void AssertHasSnapshotForToast(Relation rel)
Definition heapam.c:225
void simple_heap_delete(Relation relation, const ItemPointerData *tid)
Definition heapam.c:3277
static const struct @15 tupleLockExtraInfo[]
static XLogRecPtr log_heap_update(Relation reln, Buffer oldbuf, Buffer newbuf, HeapTuple oldtup, HeapTuple newtup, HeapTuple old_key_tuple, bool all_visible_cleared, bool new_all_visible_cleared)
Definition heapam.c:8938
TransactionId HeapTupleGetUpdateXid(const HeapTupleHeaderData *tup)
Definition heapam.c:7679
TransactionId heap_index_delete_tuples(Relation rel, TM_IndexDeleteOp *delstate)
Definition heapam.c:8218
void heap_multi_insert(Relation relation, TupleTableSlot **slots, int ntuples, CommandId cid, int options, BulkInsertState bistate)
Definition heapam.c:2423
#define ConditionalLockTupleTuplock(rel, tup, mode, log)
Definition heapam.c:171
static void initscan(HeapScanDesc scan, ScanKey key, bool keep_startblock)
Definition heapam.c:357
static int bottomup_nblocksfavorable(IndexDeleteCounts *blockgroups, int nblockgroups, TM_IndexDelete *deltids)
Definition heapam.c:8659
static void heapgettup_pagemode(HeapScanDesc scan, ScanDirection dir, int nkeys, ScanKey key)
Definition heapam.c:1070
TM_Result heap_lock_tuple(Relation relation, HeapTuple tuple, CommandId cid, LockTupleMode mode, LockWaitPolicy wait_policy, bool follow_updates, Buffer *buffer, TM_FailureData *tmfd)
Definition heapam.c:4658
static void UpdateXmaxHintBits(HeapTupleHeader tuple, Buffer buffer, TransactionId xid)
Definition heapam.c:2063
static bool Do_MultiXactIdWait(MultiXactId multi, MultiXactStatus status, uint16 infomask, bool nowait, Relation rel, const ItemPointerData *ctid, XLTW_Oper oper, int *remaining, bool logLockFailure)
Definition heapam.c:7795
static int bottomup_sort_and_shrink_cmp(const void *arg1, const void *arg2)
Definition heapam.c:8702
void heap_get_latest_tid(TableScanDesc sscan, ItemPointer tid)
Definition heapam.c:1941
void heap_setscanlimits(TableScanDesc sscan, BlockNumber startBlk, BlockNumber numBlks)
Definition heapam.c:500
void HeapCheckForSerializableConflictOut(bool visible, Relation relation, HeapTuple tuple, Buffer buffer, Snapshot snapshot)
Definition heapam.c:9345
static Page heapgettup_start_page(HeapScanDesc scan, ScanDirection dir, int *linesleft, OffsetNumber *lineoff)
Definition heapam.c:799
static MultiXactStatus get_mxact_status_for_lock(LockTupleMode mode, bool is_update)
Definition heapam.c:4611
void heap_pre_freeze_checks(Buffer buffer, HeapTupleFreeze *tuples, int ntuples)
Definition heapam.c:7426
BulkInsertState GetBulkInsertState(void)
Definition heapam.c:2085
void FreeBulkInsertState(BulkInsertState bistate)
Definition heapam.c:2102
#define HEAP_INSERT_SPECULATIVE
Definition heapam.h:40
#define HEAP_FREEZE_CHECK_XMAX_ABORTED
Definition heapam.h:150
struct HeapScanDescData * HeapScanDesc
Definition heapam.h:108
HTSV_Result
Definition heapam.h:137
@ HEAPTUPLE_RECENTLY_DEAD
Definition heapam.h:140
@ HEAPTUPLE_INSERT_IN_PROGRESS
Definition heapam.h:141
@ HEAPTUPLE_LIVE
Definition heapam.h:139
@ HEAPTUPLE_DELETE_IN_PROGRESS
Definition heapam.h:142
@ HEAPTUPLE_DEAD
Definition heapam.h:138
struct BitmapHeapScanDescData * BitmapHeapScanDesc
Definition heapam.h:116
#define HEAP_INSERT_FROZEN
Definition heapam.h:38
static void heap_execute_freeze_tuple(HeapTupleHeader tuple, HeapTupleFreeze *frz)
Definition heapam.h:518
#define HEAP_FREEZE_CHECK_XMIN_COMMITTED
Definition heapam.h:149
#define HEAP_INSERT_NO_LOGICAL
Definition heapam.h:39
struct BulkInsertStateData * BulkInsertState
Definition heapam.h:46
const TableAmRoutine * GetHeapamTableAmRoutine(void)
void HeapTupleSetHintBits(HeapTupleHeader tuple, Buffer buffer, uint16 infomask, TransactionId xid)
bool HeapTupleSatisfiesVisibility(HeapTuple htup, Snapshot snapshot, Buffer buffer)
bool HeapTupleIsSurelyDead(HeapTuple htup, GlobalVisState *vistest)
HTSV_Result HeapTupleSatisfiesVacuum(HeapTuple htup, TransactionId OldestXmin, Buffer buffer)
int HeapTupleSatisfiesMVCCBatch(Snapshot snapshot, Buffer buffer, int ntups, BatchMVCCState *batchmvcc, OffsetNumber *vistuples_dense)
bool HeapTupleHeaderIsOnlyLocked(HeapTupleHeader tuple)
TM_Result HeapTupleSatisfiesUpdate(HeapTuple htup, CommandId curcid, Buffer buffer)
#define XLH_INSERT_ON_TOAST_RELATION
Definition heapam_xlog.h:76
#define SizeOfHeapMultiInsert
#define XLOG_HEAP2_MULTI_INSERT
Definition heapam_xlog.h:64
#define SizeOfHeapUpdate
#define XLH_INVALID_XVAC
#define XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED
Definition heapam_xlog.h:87
#define SizeOfHeapVisible
#define XLOG_HEAP_HOT_UPDATE
Definition heapam_xlog.h:37
#define XLOG_HEAP_DELETE
Definition heapam_xlog.h:34
#define XLH_INSERT_IS_SPECULATIVE
Definition heapam_xlog.h:74
#define XLH_LOCK_ALL_FROZEN_CLEARED
#define XLH_DELETE_CONTAINS_OLD_KEY
#define XLH_UPDATE_CONTAINS_NEW_TUPLE
Definition heapam_xlog.h:90
#define XLH_INSERT_LAST_IN_MULTI
Definition heapam_xlog.h:73
#define XLH_INSERT_ALL_FROZEN_SET
Definition heapam_xlog.h:79
#define XLH_FREEZE_XVAC
#define XLOG_HEAP_UPDATE
Definition heapam_xlog.h:35
#define XLHL_XMAX_KEYSHR_LOCK
#define XLH_DELETE_ALL_VISIBLE_CLEARED
#define XLH_UPDATE_CONTAINS_OLD_TUPLE
Definition heapam_xlog.h:88
#define SizeOfHeapNewCid
#define SizeOfHeapLockUpdated
#define XLHL_XMAX_IS_MULTI
#define XLH_INSERT_ALL_VISIBLE_CLEARED
Definition heapam_xlog.h:72
#define SizeOfHeapHeader
#define XLH_DELETE_IS_PARTITION_MOVE
#define MinSizeOfHeapInplace
#define XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED
Definition heapam_xlog.h:85
#define XLHL_XMAX_LOCK_ONLY
#define XLOG_HEAP_INPLACE
Definition heapam_xlog.h:40
#define XLOG_HEAP2_LOCK_UPDATED
Definition heapam_xlog.h:65
#define XLH_UPDATE_SUFFIX_FROM_OLD
Definition heapam_xlog.h:92
#define XLH_UPDATE_PREFIX_FROM_OLD
Definition heapam_xlog.h:91
#define SizeOfMultiInsertTuple
#define XLHL_XMAX_EXCL_LOCK
#define XLOG_HEAP2_NEW_CID
Definition heapam_xlog.h:66
#define XLH_DELETE_CONTAINS_OLD_TUPLE
#define XLOG_HEAP_LOCK
Definition heapam_xlog.h:39
#define XLOG_HEAP_INSERT
Definition heapam_xlog.h:33
#define SizeOfHeapInsert
#define SizeOfHeapDelete
#define XLH_DELETE_IS_SUPER
#define XLH_UPDATE_CONTAINS_OLD_KEY
Definition heapam_xlog.h:89
#define XLHL_KEYS_UPDATED
#define XLOG_HEAP2_VISIBLE
Definition heapam_xlog.h:63
#define XLH_INSERT_CONTAINS_NEW_TUPLE
Definition heapam_xlog.h:75
#define XLOG_HEAP_INIT_PAGE
Definition heapam_xlog.h:47
#define SizeOfHeapConfirm
#define SizeOfHeapLock
#define XLOG_HEAP_CONFIRM
Definition heapam_xlog.h:38
void heap_toast_delete(Relation rel, HeapTuple oldtup, bool is_speculative)
Definition heaptoast.c:43
HeapTuple heap_toast_insert_or_update(Relation rel, HeapTuple newtup, HeapTuple oldtup, int options)
Definition heaptoast.c:96
HeapTuple toast_flatten_tuple(HeapTuple tup, TupleDesc tupleDesc)
Definition heaptoast.c:350
#define TOAST_TUPLE_THRESHOLD
Definition heaptoast.h:48
HeapTuple heap_form_tuple(TupleDesc tupleDescriptor, const Datum *values, const bool *isnull)
Definition heaptuple.c:1037
void heap_deform_tuple(HeapTuple tuple, TupleDesc tupleDesc, Datum *values, bool *isnull)
Definition heaptuple.c:1266
void heap_freetuple(HeapTuple htup)
Definition heaptuple.c:1384
void RelationPutHeapTuple(Relation relation, Buffer buffer, HeapTuple tuple, bool token)
Definition hio.c:35
Buffer RelationGetBufferForTuple(Relation relation, Size len, Buffer otherBuffer, int options, BulkInsertState bistate, Buffer *vmbuffer, Buffer *vmbuffer_other, int num_pages)
Definition hio.c:500
HeapTupleHeaderData * HeapTupleHeader
Definition htup.h:23
#define HEAP_MOVED_OFF
#define HEAP_XMAX_SHR_LOCK
static bool HeapTupleIsHotUpdated(const HeapTupleData *tuple)
#define HEAP_XMIN_FROZEN
static Datum heap_getattr(HeapTuple tup, int attnum, TupleDesc tupleDesc, bool *isnull)
static bool HeapTupleHeaderXminFrozen(const HeapTupleHeaderData *tup)
#define HeapTupleHeaderGetNatts(tup)
static void HeapTupleHeaderSetXminFrozen(HeapTupleHeaderData *tup)
#define SizeofHeapTupleHeader
#define HEAP_KEYS_UPDATED
static bool HEAP_XMAX_IS_SHR_LOCKED(uint16 infomask)
static bool HEAP_XMAX_IS_LOCKED_ONLY(uint16 infomask)
static bool HeapTupleHeaderXminInvalid(const HeapTupleHeaderData *tup)
static void HeapTupleClearHotUpdated(const HeapTupleData *tuple)
static bool HeapTupleHasExternal(const HeapTupleData *tuple)
static TransactionId HeapTupleHeaderGetXvac(const HeapTupleHeaderData *tup)
#define HEAP2_XACT_MASK
static void HeapTupleHeaderSetCmax(HeapTupleHeaderData *tup, CommandId cid, bool iscombo)
#define HEAP_XMAX_LOCK_ONLY
static void HeapTupleHeaderClearHotUpdated(HeapTupleHeaderData *tup)
static void HeapTupleHeaderSetCmin(HeapTupleHeaderData *tup, CommandId cid)
#define HEAP_XMAX_BITS
#define HEAP_LOCK_MASK
static CommandId HeapTupleHeaderGetRawCommandId(const HeapTupleHeaderData *tup)
static TransactionId HeapTupleHeaderGetRawXmax(const HeapTupleHeaderData *tup)
static bool HeapTupleHeaderIsHeapOnly(const HeapTupleHeaderData *tup)
static bool HeapTupleIsHeapOnly(const HeapTupleData *tuple)
#define HEAP_MOVED
static void HeapTupleSetHeapOnly(const HeapTupleData *tuple)
#define HEAP_XMAX_IS_MULTI
static bool HEAP_XMAX_IS_KEYSHR_LOCKED(uint16 infomask)
#define HEAP_XMAX_COMMITTED
static TransactionId HeapTupleHeaderGetXmin(const HeapTupleHeaderData *tup)
#define HEAP_COMBOCID
#define HEAP_XACT_MASK
static bool HeapTupleHeaderIndicatesMovedPartitions(const HeapTupleHeaderData *tup)
static void HeapTupleSetHotUpdated(const HeapTupleData *tuple)
#define HEAP_XMAX_EXCL_LOCK
static bool HeapTupleHeaderIsHotUpdated(const HeapTupleHeaderData *tup)
#define HEAP_XMAX_INVALID
static TransactionId HeapTupleHeaderGetRawXmin(const HeapTupleHeaderData *tup)
static void * GETSTRUCT(const HeapTupleData *tuple)
static void HeapTupleClearHeapOnly(const HeapTupleData *tuple)
#define MaxHeapAttributeNumber
static bool HeapTupleHeaderIsSpeculative(const HeapTupleHeaderData *tup)
static TransactionId HeapTupleHeaderGetUpdateXid(const HeapTupleHeaderData *tup)
#define MaxHeapTuplesPerPage
static bool HEAP_XMAX_IS_EXCL_LOCKED(uint16 infomask)
static void HeapTupleHeaderSetXmin(HeapTupleHeaderData *tup, TransactionId xid)
static bool HEAP_LOCKED_UPGRADED(uint16 infomask)
#define HEAP_UPDATED
#define HEAP_XMAX_KEYSHR_LOCK
static void HeapTupleHeaderSetMovedPartitions(HeapTupleHeaderData *tup)
static void HeapTupleHeaderSetXmax(HeapTupleHeaderData *tup, TransactionId xid)
static bool HeapTupleHeaderXminCommitted(const HeapTupleHeaderData *tup)
#define IsParallelWorker()
Definition parallel.h:62
void index_close(Relation relation, LOCKMODE lockmode)
Definition indexam.c:177
Relation index_open(Oid relationId, LOCKMODE lockmode)
Definition indexam.c:133
int remaining
Definition informix.c:692
#define INJECTION_POINT(name, arg)
void AcceptInvalidationMessages(void)
Definition inval.c:930
int inplaceGetInvalidationMessages(SharedInvalidationMessage **msgs, bool *RelcacheInitFileInval)
Definition inval.c:1088
void PreInplace_Inval(void)
Definition inval.c:1250
void CacheInvalidateHeapTupleInplace(Relation relation, HeapTuple key_equivalent_tuple)
Definition inval.c:1593
void AtInplace_Inval(void)
Definition inval.c:1263
void ForgetInplace_Inval(void)
Definition inval.c:1286
void CacheInvalidateHeapTuple(Relation relation, HeapTuple tuple, HeapTuple newtuple)
Definition inval.c:1571
int b
Definition isn.c:74
int j
Definition isn.c:78
int i
Definition isn.c:77
#define ItemIdGetLength(itemId)
Definition itemid.h:59
#define ItemIdIsNormal(itemId)
Definition itemid.h:99
#define ItemIdGetRedirect(itemId)
Definition itemid.h:78
#define ItemIdIsUsed(itemId)
Definition itemid.h:92
#define ItemIdIsRedirected(itemId)
Definition itemid.h:106
#define ItemIdHasStorage(itemId)
Definition itemid.h:120
int32 ItemPointerCompare(const ItemPointerData *arg1, const ItemPointerData *arg2)
Definition itemptr.c:51
bool ItemPointerEquals(const ItemPointerData *pointer1, const ItemPointerData *pointer2)
Definition itemptr.c:35
static void ItemPointerSet(ItemPointerData *pointer, BlockNumber blockNumber, OffsetNumber offNum)
Definition itemptr.h:135
static void ItemPointerSetInvalid(ItemPointerData *pointer)
Definition itemptr.h:184
static void ItemPointerSetOffsetNumber(ItemPointerData *pointer, OffsetNumber offsetNumber)
Definition itemptr.h:158
static void ItemPointerSetBlockNumber(ItemPointerData *pointer, BlockNumber blockNumber)
Definition itemptr.h:147
static OffsetNumber ItemPointerGetOffsetNumber(const ItemPointerData *pointer)
Definition itemptr.h:124
static bool ItemPointerIndicatesMovedPartitions(const ItemPointerData *pointer)
Definition itemptr.h:197
static BlockNumber ItemPointerGetBlockNumber(const ItemPointerData *pointer)
Definition itemptr.h:103
static BlockNumber ItemPointerGetBlockNumberNoCheck(const ItemPointerData *pointer)
Definition itemptr.h:93
static void ItemPointerCopy(const ItemPointerData *fromPointer, ItemPointerData *toPointer)
Definition itemptr.h:172
static bool ItemPointerIsValid(const ItemPointerData *pointer)
Definition itemptr.h:83
void UnlockTuple(Relation relation, const ItemPointerData *tid, LOCKMODE lockmode)
Definition lmgr.c:601
bool ConditionalXactLockTableWait(TransactionId xid, bool logLockFailure)
Definition lmgr.c:739
void LockTuple(Relation relation, const ItemPointerData *tid, LOCKMODE lockmode)
Definition lmgr.c:562
void XactLockTableWait(TransactionId xid, Relation rel, const ItemPointerData *ctid, XLTW_Oper oper)
Definition lmgr.c:663
XLTW_Oper
Definition lmgr.h:25
@ XLTW_None
Definition lmgr.h:26
@ XLTW_Lock
Definition lmgr.h:29
@ XLTW_Delete
Definition lmgr.h:28
@ XLTW_LockUpdated
Definition lmgr.h:30
@ XLTW_Update
Definition lmgr.h:27
bool LockHeldByMe(const LOCKTAG *locktag, LOCKMODE lockmode, bool orstronger)
Definition lock.c:643
bool DoLockModesConflict(LOCKMODE mode1, LOCKMODE mode2)
Definition lock.c:623
bool log_lock_failures
Definition lock.c:54
int LOCKMODE
Definition lockdefs.h:26
#define ShareRowExclusiveLock
Definition lockdefs.h:41
#define AccessShareLock
Definition lockdefs.h:36
#define InplaceUpdateTupleLock
Definition lockdefs.h:48
#define ShareUpdateExclusiveLock
Definition lockdefs.h:39
LockWaitPolicy
Definition lockoptions.h:38
@ LockWaitSkip
Definition lockoptions.h:42
@ LockWaitBlock
Definition lockoptions.h:40
@ LockWaitError
Definition lockoptions.h:44
LockTupleMode
Definition lockoptions.h:51
@ LockTupleExclusive
Definition lockoptions.h:59
@ LockTupleNoKeyExclusive
Definition lockoptions.h:57
@ LockTupleShare
Definition lockoptions.h:55
@ LockTupleKeyShare
Definition lockoptions.h:53
#define SET_LOCKTAG_RELATION(locktag, dboid, reloid)
Definition locktag.h:81
#define SET_LOCKTAG_TUPLE(locktag, dboid, reloid, blocknum, offnum)
Definition locktag.h:117
void pfree(void *pointer)
Definition mcxt.c:1616
void * palloc(Size size)
Definition mcxt.c:1387
#define IsBootstrapProcessingMode()
Definition miscadmin.h:477
#define START_CRIT_SECTION()
Definition miscadmin.h:150
#define CHECK_FOR_INTERRUPTS()
Definition miscadmin.h:123
#define IsNormalProcessingMode()
Definition miscadmin.h:479
#define END_CRIT_SECTION()
Definition miscadmin.h:152
MultiXactId MultiXactIdExpand(MultiXactId multi, TransactionId xid, MultiXactStatus status)
Definition multixact.c:400
bool MultiXactIdPrecedes(MultiXactId multi1, MultiXactId multi2)
Definition multixact.c:2857
bool MultiXactIdPrecedesOrEquals(MultiXactId multi1, MultiXactId multi2)
Definition multixact.c:2871
bool MultiXactIdIsRunning(MultiXactId multi, bool isLockOnly)
Definition multixact.c:511
void MultiXactIdSetOldestMember(void)
Definition multixact.c:585
MultiXactId MultiXactIdCreateFromMembers(int nmembers, MultiXactMember *members)
Definition multixact.c:704
MultiXactId MultiXactIdCreate(TransactionId xid1, MultiXactStatus status1, TransactionId xid2, MultiXactStatus status2)
Definition multixact.c:347
int GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members, bool from_pgupgrade, bool isLockOnly)
Definition multixact.c:1161
#define MultiXactIdIsValid(multi)
Definition multixact.h:29
MultiXactStatus
Definition multixact.h:37
@ MultiXactStatusForShare
Definition multixact.h:39
@ MultiXactStatusForNoKeyUpdate
Definition multixact.h:40
@ MultiXactStatusNoKeyUpdate
Definition multixact.h:43
@ MultiXactStatusUpdate
Definition multixact.h:45
@ MultiXactStatusForUpdate
Definition multixact.h:41
@ MultiXactStatusForKeyShare
Definition multixact.h:38
#define ISUPDATE_from_mxstatus(status)
Definition multixact.h:51
#define InvalidMultiXactId
Definition multixact.h:25
#define MaxMultiXactStatus
Definition multixact.h:48
static char * errmsg
#define InvalidOffsetNumber
Definition off.h:26
#define OffsetNumberIsValid(offsetNumber)
Definition off.h:39
#define OffsetNumberNext(offsetNumber)
Definition off.h:52
uint16 OffsetNumber
Definition off.h:24
#define FirstOffsetNumber
Definition off.h:27
#define OffsetNumberPrev(offsetNumber)
Definition off.h:54
#define MaxOffsetNumber
Definition off.h:28
Datum lower(PG_FUNCTION_ARGS)
Datum upper(PG_FUNCTION_ARGS)
Operator oper(ParseState *pstate, List *opname, Oid ltypeId, Oid rtypeId, bool noError, int location)
Definition parse_oper.c:372
int16 attlen
#define ERRCODE_DATA_CORRUPTED
static uint32 pg_nextpower2_32(uint32 num)
static PgChecksumMode mode
static const struct exclude_list_item skip[]
FormData_pg_class * Form_pg_class
Definition pg_class.h:160
END_CATALOG_STRUCT typedef FormData_pg_database * Form_pg_database
static char buf[DEFAULT_XLOG_SEG_SIZE]
#define pgstat_count_heap_getnext(rel)
Definition pgstat.h:698
#define pgstat_count_heap_scan(rel)
Definition pgstat.h:693
void pgstat_count_heap_update(Relation rel, bool hot, bool newpage)
void pgstat_count_heap_delete(Relation rel)
void pgstat_count_heap_insert(Relation rel, PgStat_Counter n)
#define qsort(a, b, c, d)
Definition port.h:495
static Oid DatumGetObjectId(Datum X)
Definition postgres.h:242
uint64_t Datum
Definition postgres.h:70
static Pointer DatumGetPointer(Datum X)
Definition postgres.h:332
#define InvalidOid
unsigned int Oid
void CheckForSerializableConflictIn(Relation relation, const ItemPointerData *tid, BlockNumber blkno)
Definition predicate.c:4345
void CheckForSerializableConflictOut(Relation relation, TransactionId xid, Snapshot snapshot)
Definition predicate.c:4032
void PredicateLockRelation(Relation relation, Snapshot snapshot)
Definition predicate.c:2585
void PredicateLockTID(Relation relation, const ItemPointerData *tid, Snapshot snapshot, TransactionId tuple_xid)
Definition predicate.c:2630
bool CheckForSerializableConflictOutNeeded(Relation relation, Snapshot snapshot)
Definition predicate.c:4000
static int fb(int x)
GlobalVisState * GlobalVisTestFor(Relation rel)
Definition procarray.c:4114
bool TransactionIdIsInProgress(TransactionId xid)
Definition procarray.c:1401
void heap_page_prune_opt(Relation relation, Buffer buffer, Buffer *vmbuffer)
Definition pruneheap.c:216
void read_stream_reset(ReadStream *stream)
Buffer read_stream_next_buffer(ReadStream *stream, void **per_buffer_data)
ReadStream * read_stream_begin_relation(int flags, BufferAccessStrategy strategy, Relation rel, ForkNumber forknum, ReadStreamBlockNumberCB callback, void *callback_private_data, size_t per_buffer_data_size)
void read_stream_end(ReadStream *stream)
#define READ_STREAM_USE_BATCHING
Definition read_stream.h:64
BlockNumber(* ReadStreamBlockNumberCB)(ReadStream *stream, void *callback_private_data, void *per_buffer_data)
Definition read_stream.h:77
#define READ_STREAM_DEFAULT
Definition read_stream.h:21
#define READ_STREAM_SEQUENTIAL
Definition read_stream.h:36
#define RelationGetRelid(relation)
Definition rel.h:514
#define RelationIsLogicallyLogged(relation)
Definition rel.h:710
#define RelationGetTargetPageFreeSpace(relation, defaultff)
Definition rel.h:389
#define RelationGetDescr(relation)
Definition rel.h:540
#define RelationGetNumberOfAttributes(relation)
Definition rel.h:520
#define RelationGetRelationName(relation)
Definition rel.h:548
#define RelationIsAccessibleInLogicalDecoding(relation)
Definition rel.h:693
#define RelationNeedsWAL(relation)
Definition rel.h:637
#define RelationUsesLocalBuffers(relation)
Definition rel.h:646
#define HEAP_DEFAULT_FILLFACTOR
Definition rel.h:360
void RelationDecrementReferenceCount(Relation rel)
Definition relcache.c:2189
Bitmapset * RelationGetIndexAttrBitmap(Relation relation, IndexAttrBitmapKind attrKind)
Definition relcache.c:5293
void RelationIncrementReferenceCount(Relation rel)
Definition relcache.c:2176
@ INDEX_ATTR_BITMAP_KEY
Definition relcache.h:69
@ INDEX_ATTR_BITMAP_HOT_BLOCKING
Definition relcache.h:72
@ INDEX_ATTR_BITMAP_SUMMARIZED
Definition relcache.h:73
@ INDEX_ATTR_BITMAP_IDENTITY_KEY
Definition relcache.h:71
ForkNumber
Definition relpath.h:56
@ MAIN_FORKNUM
Definition relpath.h:58
struct ParallelBlockTableScanDescData * ParallelBlockTableScanDesc
Definition relscan.h:103
#define ScanDirectionIsForward(direction)
Definition sdir.h:64
#define ScanDirectionIsBackward(direction)
Definition sdir.h:50
ScanDirection
Definition sdir.h:25
@ ForwardScanDirection
Definition sdir.h:28
TransactionId RecentXmin
Definition snapmgr.c:160
void UnregisterSnapshot(Snapshot snapshot)
Definition snapmgr.c:866
TransactionId TransactionXmin
Definition snapmgr.c:159
bool HaveRegisteredOrActiveSnapshot(void)
Definition snapmgr.c:1644
void InvalidateCatalogSnapshot(void)
Definition snapmgr.c:455
#define IsHistoricMVCCSnapshot(snapshot)
Definition snapmgr.h:67
#define SnapshotAny
Definition snapmgr.h:33
#define InitNonVacuumableSnapshot(snapshotdata, vistestp)
Definition snapmgr.h:50
#define IsMVCCSnapshot(snapshot)
Definition snapmgr.h:59
#define InvalidSnapshot
Definition snapshot.h:119
int get_tablespace_maintenance_io_concurrency(Oid spcid)
Definition spccache.c:230
#define init()
BlockNumber last_free
Definition hio.h:49
BufferAccessStrategy strategy
Definition hio.h:31
uint32 already_extended_by
Definition hio.h:50
BlockNumber next_free
Definition hio.h:48
Buffer current_buf
Definition hio.h:32
MultiXactId NoFreezePageRelminMxid
Definition heapam.h:244
TransactionId FreezePageConflictXid
Definition heapam.h:233
TransactionId FreezePageRelfrozenXid
Definition heapam.h:220
bool freeze_required
Definition heapam.h:194
MultiXactId FreezePageRelminMxid
Definition heapam.h:221
TransactionId NoFreezePageRelfrozenXid
Definition heapam.h:243
Buffer rs_vmbuffer
Definition heapam.h:101
BufferAccessStrategy rs_strategy
Definition heapam.h:73
ScanDirection rs_dir
Definition heapam.h:88
uint32 rs_ntuples
Definition heapam.h:105
OffsetNumber rs_coffset
Definition heapam.h:68
Buffer rs_cbuf
Definition heapam.h:70
ParallelBlockTableScanWorkerData * rs_parallelworkerdata
Definition heapam.h:95
BlockNumber rs_startblock
Definition heapam.h:62
HeapTupleData rs_ctup
Definition heapam.h:75
OffsetNumber rs_vistuples[MaxHeapTuplesPerPage]
Definition heapam.h:106
BlockNumber rs_numblocks
Definition heapam.h:63
BlockNumber rs_nblocks
Definition heapam.h:61
ReadStream * rs_read_stream
Definition heapam.h:78
uint32 rs_cindex
Definition heapam.h:104
BlockNumber rs_prefetch_block
Definition heapam.h:89
BlockNumber rs_cblock
Definition heapam.h:69
TableScanDescData rs_base
Definition heapam.h:58
ItemPointerData t_self
Definition htup.h:65
uint32 t_len
Definition htup.h:64
HeapTupleHeader t_data
Definition htup.h:68
Oid t_tableOid
Definition htup.h:66
TransactionId t_xmin
union HeapTupleHeaderData::@51 t_choice
ItemPointerData t_ctid
HeapTupleFields t_heap
int16 npromisingtids
Definition heapam.c:198
LockRelId lockRelId
Definition rel.h:46
Oid relId
Definition rel.h:40
Oid dbId
Definition rel.h:41
TransactionId xid
Definition multixact.h:57
MultiXactStatus status
Definition multixact.h:58
LockInfoData rd_lockInfo
Definition rel.h:114
Form_pg_index rd_index
Definition rel.h:192
RelFileLocator rd_locator
Definition rel.h:57
Form_pg_class rd_rel
Definition rel.h:111
bool takenDuringRecovery
Definition snapshot.h:180
TransactionId xmax
Definition tableam.h:150
CommandId cmax
Definition tableam.h:151
ItemPointerData ctid
Definition tableam.h:149
ItemPointerData tid
Definition tableam.h:212
Relation rs_rd
Definition relscan.h:35
uint32 rs_flags
Definition relscan.h:63
struct ScanKeyData * rs_key
Definition relscan.h:38
struct SnapshotData * rs_snapshot
Definition relscan.h:36
struct ParallelTableScanDescData * rs_parallel
Definition relscan.h:65
TransactionId FreezeLimit
Definition vacuum.h:289
TransactionId OldestXmin
Definition vacuum.h:279
TransactionId relfrozenxid
Definition vacuum.h:263
MultiXactId relminmxid
Definition vacuum.h:264
MultiXactId MultiXactCutoff
Definition vacuum.h:290
MultiXactId OldestMxact
Definition vacuum.h:280
Definition c.h:778
OffsetNumber offnum
TransactionId SubTransGetTopmostTransaction(TransactionId xid)
Definition subtrans.c:163
void ss_report_location(Relation rel, BlockNumber location)
Definition syncscan.c:289
BlockNumber ss_get_location(Relation rel, BlockNumber relnblocks)
Definition syncscan.c:254
#define FirstLowInvalidHeapAttributeNumber
Definition sysattr.h:27
#define TableOidAttributeNumber
Definition sysattr.h:26
bool RelationSupportsSysCache(Oid relid)
Definition syscache.c:762
void table_block_parallelscan_startblock_init(Relation rel, ParallelBlockTableScanWorker pbscanwork, ParallelBlockTableScanDesc pbscan, BlockNumber startblock, BlockNumber numblocks)
Definition tableam.c:451
BlockNumber table_block_parallelscan_nextpage(Relation rel, ParallelBlockTableScanWorker pbscanwork, ParallelBlockTableScanDesc pbscan)
Definition tableam.c:546
bool synchronize_seqscans
Definition tableam.c:50
@ SO_ALLOW_STRAT
Definition tableam.h:58
@ SO_TYPE_TIDRANGESCAN
Definition tableam.h:53
@ SO_TEMP_SNAPSHOT
Definition tableam.h:65
@ SO_ALLOW_PAGEMODE
Definition tableam.h:62
@ SO_TYPE_SAMPLESCAN
Definition tableam.h:51
@ SO_ALLOW_SYNC
Definition tableam.h:60
@ SO_TYPE_SEQSCAN
Definition tableam.h:49
@ SO_TYPE_BITMAPSCAN
Definition tableam.h:50
TU_UpdateIndexes
Definition tableam.h:111
@ TU_Summarizing
Definition tableam.h:119
@ TU_All
Definition tableam.h:116
@ TU_None
Definition tableam.h:113
TM_Result
Definition tableam.h:73
@ TM_Ok
Definition tableam.h:78
@ TM_BeingModified
Definition tableam.h:100
@ TM_Deleted
Definition tableam.h:93
@ TM_WouldBlock
Definition tableam.h:103
@ TM_Updated
Definition tableam.h:90
@ TM_SelfModified
Definition tableam.h:84
@ TM_Invisible
Definition tableam.h:81
bool tbm_iterate(TBMIterator *iterator, TBMIterateResult *tbmres)
Definition tidbitmap.c:1614
bool TransactionIdDidCommit(TransactionId transactionId)
Definition transam.c:126
bool TransactionIdDidAbort(TransactionId transactionId)
Definition transam.c:188
static bool TransactionIdFollows(TransactionId id1, TransactionId id2)
Definition transam.h:297
#define InvalidTransactionId
Definition transam.h:31
static bool TransactionIdPrecedesOrEquals(TransactionId id1, TransactionId id2)
Definition transam.h:282
static bool TransactionIdFollowsOrEquals(TransactionId id1, TransactionId id2)
Definition transam.h:312
#define TransactionIdEquals(id1, id2)
Definition transam.h:43
#define TransactionIdIsValid(xid)
Definition transam.h:41
#define TransactionIdIsNormal(xid)
Definition transam.h:42
static bool TransactionIdPrecedes(TransactionId id1, TransactionId id2)
Definition transam.h:263
static CompactAttribute * TupleDescCompactAttr(TupleDesc tupdesc, int i)
Definition tupdesc.h:193
static TupleTableSlot * ExecClearTuple(TupleTableSlot *slot)
Definition tuptable.h:476
static bool HeapKeyTest(HeapTuple tuple, TupleDesc tupdesc, int nkeys, ScanKey keys)
Definition valid.h:28
static bool VARATT_IS_EXTERNAL(const void *PTR)
Definition varatt.h:354
bool visibilitymap_clear(Relation rel, BlockNumber heapBlk, Buffer vmbuf, uint8 flags)
void visibilitymap_pin(Relation rel, BlockNumber heapBlk, Buffer *vmbuf)
void visibilitymap_set_vmbits(BlockNumber heapBlk, Buffer vmBuf, uint8 flags, const RelFileLocator rlocator)
#define VISIBILITYMAP_VALID_BITS
#define VISIBILITYMAP_ALL_FROZEN
#define VISIBILITYMAP_XLOG_CATALOG_REL
#define VISIBILITYMAP_ALL_VISIBLE
TransactionId GetTopTransactionId(void)
Definition xact.c:428
TransactionId GetTopTransactionIdIfAny(void)
Definition xact.c:443
bool TransactionIdIsCurrentTransactionId(TransactionId xid)
Definition xact.c:943
bool IsInParallelMode(void)
Definition xact.c:1091
TransactionId GetCurrentTransactionId(void)
Definition xact.c:456
CommandId GetCurrentCommandId(bool used)
Definition xact.c:831
#define IsolationIsSerializable()
Definition xact.h:53
#define XLOG_INCLUDE_ORIGIN
Definition xlog.h:165
#define XLogHintBitIsNeeded()
Definition xlog.h:122
#define XLogStandbyInfoActive()
Definition xlog.h:125
uint64 XLogRecPtr
Definition xlogdefs.h:21
XLogRecPtr XLogInsert(RmgrId rmid, uint8 info)
Definition xloginsert.c:479
void XLogRegisterBufData(uint8 block_id, const void *data, uint32 len)
Definition xloginsert.c:410
bool XLogCheckBufferNeedsBackup(Buffer buffer)
void XLogRegisterData(const void *data, uint32 len)
Definition xloginsert.c:369
void XLogSetRecordFlags(uint8 flags)
Definition xloginsert.c:461
void XLogRegisterBlock(uint8 block_id, RelFileLocator *rlocator, ForkNumber forknum, BlockNumber blknum, const PageData *page, uint8 flags)
Definition xloginsert.c:314
void XLogRegisterBuffer(uint8 block_id, Buffer buffer, uint8 flags)
Definition xloginsert.c:246
void XLogBeginInsert(void)
Definition xloginsert.c:153
#define REGBUF_STANDARD
Definition xloginsert.h:35
#define REGBUF_NO_IMAGE
Definition xloginsert.h:33
#define REGBUF_KEEP_DATA
Definition xloginsert.h:36
#define REGBUF_WILL_INIT
Definition xloginsert.h:34

◆ FRM_INVALIDATE_XMAX

#define FRM_INVALIDATE_XMAX   0x0002

Definition at line 6738 of file heapam.c.

◆ FRM_MARK_COMMITTED

#define FRM_MARK_COMMITTED   0x0010

Definition at line 6741 of file heapam.c.

◆ FRM_NOOP

#define FRM_NOOP   0x0001

Definition at line 6737 of file heapam.c.

◆ FRM_RETURN_IS_MULTI

#define FRM_RETURN_IS_MULTI   0x0008

Definition at line 6740 of file heapam.c.

◆ FRM_RETURN_IS_XID

#define FRM_RETURN_IS_XID   0x0004

Definition at line 6739 of file heapam.c.

◆ LOCKMODE_from_mxstatus

#define LOCKMODE_from_mxstatus (   status)     (tupleLockExtraInfo[TUPLOCK_from_mxstatus((status))].hwlock)

Definition at line 159 of file heapam.c.

◆ LockTupleTuplock

#define LockTupleTuplock (   rel,
  tup,
  mode 
)     LockTuple((rel), (tup), tupleLockExtraInfo[mode].hwlock)

Definition at line 167 of file heapam.c.

◆ TUPLOCK_from_mxstatus

#define TUPLOCK_from_mxstatus (   status)     (MultiXactStatusLock[(status)])

Definition at line 218 of file heapam.c.

◆ UnlockTupleTuplock

#define UnlockTupleTuplock (   rel,
  tup,
  mode 
)     UnlockTuple((rel), (tup), tupleLockExtraInfo[mode].hwlock)

Definition at line 169 of file heapam.c.

Typedef Documentation

◆ IndexDeleteCounts

Function Documentation

◆ AssertHasSnapshotForToast()

static void AssertHasSnapshotForToast ( Relation  rel)
inlinestatic

Definition at line 225 of file heapam.c.

226{
227#ifdef USE_ASSERT_CHECKING
228
229 /* bootstrap mode in particular breaks this rule */
231 return;
232
233 /* if the relation doesn't have a TOAST table, we are good */
234 if (!OidIsValid(rel->rd_rel->reltoastrelid))
235 return;
236
238
239#endif /* USE_ASSERT_CHECKING */
240}

References Assert, HaveRegisteredOrActiveSnapshot(), IsNormalProcessingMode, OidIsValid, and RelationData::rd_rel.

Referenced by heap_delete(), heap_insert(), heap_multi_insert(), and heap_update().

◆ bitmapheap_stream_read_next()

static BlockNumber bitmapheap_stream_read_next ( ReadStream pgsr,
void private_data,
void per_buffer_data 
)
static

Definition at line 317 of file heapam.c.

319{
320 TBMIterateResult *tbmres = per_buffer_data;
323 TableScanDesc sscan = &hscan->rs_base;
324
325 for (;;)
326 {
328
329 /* no more entries in the bitmap */
330 if (!tbm_iterate(&sscan->st.rs_tbmiterator, tbmres))
331 return InvalidBlockNumber;
332
333 /*
334 * Ignore any claimed entries past what we think is the end of the
335 * relation. It may have been extended after the start of our scan (we
336 * only hold an AccessShareLock, and it could be inserts from this
337 * backend). We don't take this optimization in SERIALIZABLE
338 * isolation though, as we need to examine all invisible tuples
339 * reachable by the index.
340 */
342 tbmres->blockno >= hscan->rs_nblocks)
343 continue;
344
345 return tbmres->blockno;
346 }
347
348 /* not reachable */
349 Assert(false);
350}

References Assert, CHECK_FOR_INTERRUPTS, fb(), InvalidBlockNumber, IsolationIsSerializable, and tbm_iterate().

Referenced by heap_beginscan().

◆ bottomup_nblocksfavorable()

static int bottomup_nblocksfavorable ( IndexDeleteCounts blockgroups,
int  nblockgroups,
TM_IndexDelete deltids 
)
static

Definition at line 8659 of file heapam.c.

8661{
8662 int64 lastblock = -1;
8663 int nblocksfavorable = 0;
8664
8665 Assert(nblockgroups >= 1);
8667
8668 /*
8669 * We tolerate heap blocks that will be accessed only slightly out of
8670 * physical order. Small blips occur when a pair of almost-contiguous
8671 * blocks happen to fall into different buckets (perhaps due only to a
8672 * small difference in npromisingtids that the bucketing scheme didn't
8673 * quite manage to ignore). We effectively ignore these blips by applying
8674 * a small tolerance. The precise tolerance we use is a little arbitrary,
8675 * but it works well enough in practice.
8676 */
8677 for (int b = 0; b < nblockgroups; b++)
8678 {
8679 IndexDeleteCounts *group = blockgroups + b;
8680 TM_IndexDelete *firstdtid = deltids + group->ifirsttid;
8682
8683 if (lastblock != -1 &&
8686 break;
8687
8689 lastblock = block;
8690 }
8691
8692 /* Always indicate that there is at least 1 favorable block */
8694
8695 return nblocksfavorable;
8696}

References Assert, b, BOTTOMUP_MAX_NBLOCKS, BOTTOMUP_TOLERANCE_NBLOCKS, fb(), IndexDeleteCounts::ifirsttid, and ItemPointerGetBlockNumber().

Referenced by bottomup_sort_and_shrink().

◆ bottomup_sort_and_shrink()

static int bottomup_sort_and_shrink ( TM_IndexDeleteOp delstate)
static

Definition at line 8775 of file heapam.c.

8776{
8780 int nblockgroups = 0;
8781 int ncopied = 0;
8782 int nblocksfavorable = 0;
8783
8784 Assert(delstate->bottomup);
8785 Assert(delstate->ndeltids > 0);
8786
8787 /* Calculate per-heap-block count of TIDs */
8789 for (int i = 0; i < delstate->ndeltids; i++)
8790 {
8791 TM_IndexDelete *ideltid = &delstate->deltids[i];
8792 TM_IndexStatus *istatus = delstate->status + ideltid->id;
8793 ItemPointer htid = &ideltid->tid;
8794 bool promising = istatus->promising;
8795
8797 {
8798 /* New block group */
8799 nblockgroups++;
8800
8803
8805 blockgroups[nblockgroups - 1].ifirsttid = i;
8806 blockgroups[nblockgroups - 1].ntids = 1;
8807 blockgroups[nblockgroups - 1].npromisingtids = 0;
8808 }
8809 else
8810 {
8811 blockgroups[nblockgroups - 1].ntids++;
8812 }
8813
8814 if (promising)
8815 blockgroups[nblockgroups - 1].npromisingtids++;
8816 }
8817
8818 /*
8819 * We're about ready to sort block groups to determine the optimal order
8820 * for visiting heap blocks. But before we do, round the number of
8821 * promising tuples for each block group up to the next power-of-two,
8822 * unless it is very low (less than 4), in which case we round up to 4.
8823 * npromisingtids is far too noisy to trust when choosing between a pair
8824 * of block groups that both have very low values.
8825 *
8826 * This scheme divides heap blocks/block groups into buckets. Each bucket
8827 * contains blocks that have _approximately_ the same number of promising
8828 * TIDs as each other. The goal is to ignore relatively small differences
8829 * in the total number of promising entries, so that the whole process can
8830 * give a little weight to heapam factors (like heap block locality)
8831 * instead. This isn't a trade-off, really -- we have nothing to lose. It
8832 * would be foolish to interpret small differences in npromisingtids
8833 * values as anything more than noise.
8834 *
8835 * We tiebreak on nhtids when sorting block group subsets that have the
8836 * same npromisingtids, but this has the same issues as npromisingtids,
8837 * and so nhtids is subject to the same power-of-two bucketing scheme. The
8838 * only reason that we don't fix nhtids in the same way here too is that
8839 * we'll need accurate nhtids values after the sort. We handle nhtids
8840 * bucketization dynamically instead (in the sort comparator).
8841 *
8842 * See bottomup_nblocksfavorable() for a full explanation of when and how
8843 * heap locality/favorable blocks can significantly influence when and how
8844 * heap blocks are accessed.
8845 */
8846 for (int b = 0; b < nblockgroups; b++)
8847 {
8848 IndexDeleteCounts *group = blockgroups + b;
8849
8850 /* Better off falling back on nhtids with low npromisingtids */
8851 if (group->npromisingtids <= 4)
8852 group->npromisingtids = 4;
8853 else
8854 group->npromisingtids =
8856 }
8857
8858 /* Sort groups and rearrange caller's deltids array */
8861 reordereddeltids = palloc(delstate->ndeltids * sizeof(TM_IndexDelete));
8862
8864 /* Determine number of favorable blocks at the start of final deltids */
8866 delstate->deltids);
8867
8868 for (int b = 0; b < nblockgroups; b++)
8869 {
8870 IndexDeleteCounts *group = blockgroups + b;
8871 TM_IndexDelete *firstdtid = delstate->deltids + group->ifirsttid;
8872
8874 sizeof(TM_IndexDelete) * group->ntids);
8875 ncopied += group->ntids;
8876 }
8877
8878 /* Copy final grouped and sorted TIDs back into start of caller's array */
8880 sizeof(TM_IndexDelete) * ncopied);
8881 delstate->ndeltids = ncopied;
8882
8885
8886 return nblocksfavorable;
8887}

References Assert, b, BlockNumberIsValid(), BOTTOMUP_MAX_NBLOCKS, bottomup_nblocksfavorable(), bottomup_sort_and_shrink_cmp(), fb(), i, IndexDeleteCounts::ifirsttid, InvalidBlockNumber, ItemPointerGetBlockNumber(), Min, IndexDeleteCounts::npromisingtids, IndexDeleteCounts::ntids, palloc(), palloc_array, pfree(), pg_nextpower2_32(), and qsort.

Referenced by heap_index_delete_tuples().

◆ bottomup_sort_and_shrink_cmp()

static int bottomup_sort_and_shrink_cmp ( const void arg1,
const void arg2 
)
static

Definition at line 8702 of file heapam.c.

8703{
8706
8707 /*
8708 * Most significant field is npromisingtids (which we invert the order of
8709 * so as to sort in desc order).
8710 *
8711 * Caller should have already normalized npromisingtids fields into
8712 * power-of-two values (buckets).
8713 */
8714 if (group1->npromisingtids > group2->npromisingtids)
8715 return -1;
8716 if (group1->npromisingtids < group2->npromisingtids)
8717 return 1;
8718
8719 /*
8720 * Tiebreak: desc ntids sort order.
8721 *
8722 * We cannot expect power-of-two values for ntids fields. We should
8723 * behave as if they were already rounded up for us instead.
8724 */
8725 if (group1->ntids != group2->ntids)
8726 {
8729
8730 if (ntids1 > ntids2)
8731 return -1;
8732 if (ntids1 < ntids2)
8733 return 1;
8734 }
8735
8736 /*
8737 * Tiebreak: asc offset-into-deltids-for-block (offset to first TID for
8738 * block in deltids array) order.
8739 *
8740 * This is equivalent to sorting in ascending heap block number order
8741 * (among otherwise equal subsets of the array). This approach allows us
8742 * to avoid accessing the out-of-line TID. (We rely on the assumption
8743 * that the deltids array was sorted in ascending heap TID order when
8744 * these offsets to the first TID from each heap block group were formed.)
8745 */
8746 if (group1->ifirsttid > group2->ifirsttid)
8747 return 1;
8748 if (group1->ifirsttid < group2->ifirsttid)
8749 return -1;
8750
8752
8753 return 0;
8754}

References fb(), pg_nextpower2_32(), and pg_unreachable.

Referenced by bottomup_sort_and_shrink().

◆ compute_infobits()

◆ compute_new_xmax_infomask()

static void compute_new_xmax_infomask ( TransactionId  xmax,
uint16  old_infomask,
uint16  old_infomask2,
TransactionId  add_to_xmax,
LockTupleMode  mode,
bool  is_update,
TransactionId result_xmax,
uint16 result_infomask,
uint16 result_infomask2 
)
static

Definition at line 5409 of file heapam.c.

5414{
5415 TransactionId new_xmax;
5418
5420
5421l5:
5422 new_infomask = 0;
5423 new_infomask2 = 0;
5425 {
5426 /*
5427 * No previous locker; we just insert our own TransactionId.
5428 *
5429 * Note that it's critical that this case be the first one checked,
5430 * because there are several blocks below that come back to this one
5431 * to implement certain optimizations; old_infomask might contain
5432 * other dirty bits in those cases, but we don't really care.
5433 */
5434 if (is_update)
5435 {
5436 new_xmax = add_to_xmax;
5437 if (mode == LockTupleExclusive)
5439 }
5440 else
5441 {
5443 switch (mode)
5444 {
5445 case LockTupleKeyShare:
5446 new_xmax = add_to_xmax;
5448 break;
5449 case LockTupleShare:
5450 new_xmax = add_to_xmax;
5452 break;
5454 new_xmax = add_to_xmax;
5456 break;
5457 case LockTupleExclusive:
5458 new_xmax = add_to_xmax;
5461 break;
5462 default:
5463 new_xmax = InvalidTransactionId; /* silence compiler */
5464 elog(ERROR, "invalid lock mode");
5465 }
5466 }
5467 }
5469 {
5471
5472 /*
5473 * Currently we don't allow XMAX_COMMITTED to be set for multis, so
5474 * cross-check.
5475 */
5477
5478 /*
5479 * A multixact together with LOCK_ONLY set but neither lock bit set
5480 * (i.e. a pg_upgraded share locked tuple) cannot possibly be running
5481 * anymore. This check is critical for databases upgraded by
5482 * pg_upgrade; both MultiXactIdIsRunning and MultiXactIdExpand assume
5483 * that such multis are never passed.
5484 */
5486 {
5489 goto l5;
5490 }
5491
5492 /*
5493 * If the XMAX is already a MultiXactId, then we need to expand it to
5494 * include add_to_xmax; but if all the members were lockers and are
5495 * all gone, we can do away with the IS_MULTI bit and just set
5496 * add_to_xmax as the only locker/updater. If all lockers are gone
5497 * and we have an updater that aborted, we can also do without a
5498 * multi.
5499 *
5500 * The cost of doing GetMultiXactIdMembers would be paid by
5501 * MultiXactIdExpand if we weren't to do this, so this check is not
5502 * incurring extra work anyhow.
5503 */
5505 {
5508 old_infomask)))
5509 {
5510 /*
5511 * Reset these bits and restart; otherwise fall through to
5512 * create a new multi below.
5513 */
5516 goto l5;
5517 }
5518 }
5519
5521
5522 new_xmax = MultiXactIdExpand((MultiXactId) xmax, add_to_xmax,
5523 new_status);
5525 }
5527 {
5528 /*
 5529 * It's a committed update, so we need to preserve it as updater of
5530 * the tuple.
5531 */
5532 MultiXactStatus status;
5534
5536 status = MultiXactStatusUpdate;
5537 else
5539
5541
5542 /*
5543 * since it's not running, it's obviously impossible for the old
5544 * updater to be identical to the current one, so we need not check
5545 * for that case as we do in the block above.
5546 */
5547 new_xmax = MultiXactIdCreate(xmax, status, add_to_xmax, new_status);
5549 }
5550 else if (TransactionIdIsInProgress(xmax))
5551 {
5552 /*
5553 * If the XMAX is a valid, in-progress TransactionId, then we need to
5554 * create a new MultiXactId that includes both the old locker or
5555 * updater and our own TransactionId.
5556 */
5560
5562 {
5568 {
5571 else
5573 }
5574 else
5575 {
5576 /*
5577 * LOCK_ONLY can be present alone only when a page has been
5578 * upgraded by pg_upgrade. But in that case,
5579 * TransactionIdIsInProgress() should have returned false. We
5580 * assume it's no longer locked in this case.
5581 */
5582 elog(WARNING, "LOCK_ONLY found for Xid in progress %u", xmax);
5585 goto l5;
5586 }
5587 }
5588 else
5589 {
5590 /* it's an update, but which kind? */
5593 else
5595 }
5596
5598
5599 /*
5600 * If the lock to be acquired is for the same TransactionId as the
5601 * existing lock, there's an optimization possible: consider only the
5602 * strongest of both locks as the only one present, and restart.
5603 */
5604 if (xmax == add_to_xmax)
5605 {
5606 /*
5607 * Note that it's not possible for the original tuple to be
5608 * updated: we wouldn't be here because the tuple would have been
5609 * invisible and we wouldn't try to update it. As a subtlety,
5610 * this code can also run when traversing an update chain to lock
5611 * future versions of a tuple. But we wouldn't be here either,
5612 * because the add_to_xmax would be different from the original
5613 * updater.
5614 */
5616
5617 /* acquire the strongest of both */
5618 if (mode < old_mode)
5619 mode = old_mode;
5620 /* mustn't touch is_update */
5621
5623 goto l5;
5624 }
5625
5626 /* otherwise, just fall back to creating a new multixact */
5628 new_xmax = MultiXactIdCreate(xmax, old_status,
5631 }
5634 {
5635 /*
 5636 * It's a committed update, so we must preserve it as updater of the
5637 * tuple.
5638 */
5639 MultiXactStatus status;
5641
5643 status = MultiXactStatusUpdate;
5644 else
5646
5648
5649 /*
5650 * since it's not running, it's obviously impossible for the old
5651 * updater to be identical to the current one, so we need not check
5652 * for that case as we do in the block above.
5653 */
5654 new_xmax = MultiXactIdCreate(xmax, status, add_to_xmax, new_status);
5656 }
5657 else
5658 {
5659 /*
5660 * Can get here iff the locking/updating transaction was running when
5661 * the infomask was extracted from the tuple, but finished before
5662 * TransactionIdIsInProgress got to run. Deal with it as if there was
5663 * no locker at all in the first place.
5664 */
5666 goto l5;
5667 }
5668
5671 *result_xmax = new_xmax;
5672}

References Assert, elog, ERROR, fb(), get_mxact_status_for_lock(), GetMultiXactIdHintBits(), HEAP_KEYS_UPDATED, HEAP_LOCKED_UPGRADED(), HEAP_XMAX_COMMITTED, HEAP_XMAX_EXCL_LOCK, HEAP_XMAX_INVALID, HEAP_XMAX_IS_EXCL_LOCKED(), HEAP_XMAX_IS_KEYSHR_LOCKED(), HEAP_XMAX_IS_LOCKED_ONLY(), HEAP_XMAX_IS_MULTI, HEAP_XMAX_IS_SHR_LOCKED(), HEAP_XMAX_KEYSHR_LOCK, HEAP_XMAX_LOCK_ONLY, HEAP_XMAX_SHR_LOCK, InvalidTransactionId, LockTupleExclusive, LockTupleKeyShare, LockTupleNoKeyExclusive, LockTupleShare, mode, MultiXactIdCreate(), MultiXactIdExpand(), MultiXactIdGetUpdateXid(), MultiXactIdIsRunning(), MultiXactStatusForKeyShare, MultiXactStatusForNoKeyUpdate, MultiXactStatusForShare, MultiXactStatusForUpdate, MultiXactStatusNoKeyUpdate, MultiXactStatusUpdate, TransactionIdDidCommit(), TransactionIdIsCurrentTransactionId(), TransactionIdIsInProgress(), TUPLOCK_from_mxstatus, and WARNING.

Referenced by heap_delete(), heap_lock_tuple(), heap_lock_updated_tuple_rec(), and heap_update().

◆ ConditionalMultiXactIdWait()

static bool ConditionalMultiXactIdWait ( MultiXactId  multi,
MultiXactStatus  status,
uint16  infomask,
Relation  rel,
int remaining,
bool  logLockFailure 
)
static

Definition at line 7895 of file heapam.c.

7898{
7899 return Do_MultiXactIdWait(multi, status, infomask, true,
7901}

References Do_MultiXactIdWait(), fb(), remaining, and XLTW_None.

Referenced by heap_lock_tuple().

◆ Do_MultiXactIdWait()

static bool Do_MultiXactIdWait ( MultiXactId  multi,
MultiXactStatus  status,
uint16  infomask,
bool  nowait,
Relation  rel,
const ItemPointerData ctid,
XLTW_Oper  oper,
int remaining,
bool  logLockFailure 
)
static

Definition at line 7795 of file heapam.c.

7799{
7800 bool result = true;
7801 MultiXactMember *members;
7802 int nmembers;
7803 int remain = 0;
7804
7805 /* for pre-pg_upgrade tuples, no need to sleep at all */
7806 nmembers = HEAP_LOCKED_UPGRADED(infomask) ? -1 :
7807 GetMultiXactIdMembers(multi, &members, false,
7809
7810 if (nmembers >= 0)
7811 {
7812 int i;
7813
7814 for (i = 0; i < nmembers; i++)
7815 {
7816 TransactionId memxid = members[i].xid;
7817 MultiXactStatus memstatus = members[i].status;
7818
7820 {
7821 remain++;
7822 continue;
7823 }
7824
7826 LOCKMODE_from_mxstatus(status)))
7827 {
7829 remain++;
7830 continue;
7831 }
7832
7833 /*
7834 * This member conflicts with our multi, so we have to sleep (or
7835 * return failure, if asked to avoid waiting.)
7836 *
7837 * Note that we don't set up an error context callback ourselves,
7838 * but instead we pass the info down to XactLockTableWait. This
7839 * might seem a bit wasteful because the context is set up and
 7840 * torn down for each member of the multixact, but in reality it
7841 * should be barely noticeable, and it avoids duplicate code.
7842 */
7843 if (nowait)
7844 {
7846 if (!result)
7847 break;
7848 }
7849 else
7850 XactLockTableWait(memxid, rel, ctid, oper);
7851 }
7852
7853 pfree(members);
7854 }
7855
7856 if (remaining)
7857 *remaining = remain;
7858
7859 return result;
7860}

References ConditionalXactLockTableWait(), DoLockModesConflict(), fb(), GetMultiXactIdMembers(), HEAP_LOCKED_UPGRADED(), HEAP_XMAX_IS_LOCKED_ONLY(), i, LOCKMODE_from_mxstatus, oper(), pfree(), remaining, MultiXactMember::status, TransactionIdIsCurrentTransactionId(), TransactionIdIsInProgress(), XactLockTableWait(), and MultiXactMember::xid.

Referenced by ConditionalMultiXactIdWait(), and MultiXactIdWait().

◆ DoesMultiXactIdConflict()

static bool DoesMultiXactIdConflict ( MultiXactId  multi,
uint16  infomask,
LockTupleMode  lockmode,
bool current_is_member 
)
static

Definition at line 7695 of file heapam.c.

7697{
7698 int nmembers;
7699 MultiXactMember *members;
7700 bool result = false;
7701 LOCKMODE wanted = tupleLockExtraInfo[lockmode].hwlock;
7702
7704 return false;
7705
7706 nmembers = GetMultiXactIdMembers(multi, &members, false,
7708 if (nmembers >= 0)
7709 {
7710 int i;
7711
7712 for (i = 0; i < nmembers; i++)
7713 {
7716
7717 if (result && (current_is_member == NULL || *current_is_member))
7718 break;
7719
7720 memlockmode = LOCKMODE_from_mxstatus(members[i].status);
7721
7722 /* ignore members from current xact (but track their presence) */
7723 memxid = members[i].xid;
7725 {
7726 if (current_is_member != NULL)
7727 *current_is_member = true;
7728 continue;
7729 }
7730 else if (result)
7731 continue;
7732
7733 /* ignore members that don't conflict with the lock we want */
7735 continue;
7736
7737 if (ISUPDATE_from_mxstatus(members[i].status))
7738 {
7739 /* ignore aborted updaters */
7741 continue;
7742 }
7743 else
7744 {
7745 /* ignore lockers-only that are no longer in progress */
7747 continue;
7748 }
7749
7750 /*
7751 * Whatever remains are either live lockers that conflict with our
7752 * wanted lock, and updaters that are not aborted. Those conflict
7753 * with what we want. Set up to return true, but keep going to
7754 * look for the current transaction among the multixact members,
7755 * if needed.
7756 */
7757 result = true;
7758 }
7759 pfree(members);
7760 }
7761
7762 return result;
7763}

References DoLockModesConflict(), fb(), GetMultiXactIdMembers(), HEAP_LOCKED_UPGRADED(), HEAP_XMAX_IS_LOCKED_ONLY(), i, ISUPDATE_from_mxstatus, LOCKMODE_from_mxstatus, pfree(), TransactionIdDidAbort(), TransactionIdIsCurrentTransactionId(), TransactionIdIsInProgress(), tupleLockExtraInfo, and MultiXactMember::xid.

Referenced by heap_delete(), heap_inplace_lock(), heap_lock_tuple(), and heap_update().

◆ ExtractReplicaIdentity()

static HeapTuple ExtractReplicaIdentity ( Relation  relation,
HeapTuple  tp,
bool  key_required,
bool copy 
)
static

Definition at line 9241 of file heapam.c.

9243{
9244 TupleDesc desc = RelationGetDescr(relation);
9245 char replident = relation->rd_rel->relreplident;
9248 bool nulls[MaxHeapAttributeNumber];
9250
9251 *copy = false;
9252
9253 if (!RelationIsLogicallyLogged(relation))
9254 return NULL;
9255
9256 if (replident == REPLICA_IDENTITY_NOTHING)
9257 return NULL;
9258
9259 if (replident == REPLICA_IDENTITY_FULL)
9260 {
9261 /*
9262 * When logging the entire old tuple, it very well could contain
9263 * toasted columns. If so, force them to be inlined.
9264 */
9265 if (HeapTupleHasExternal(tp))
9266 {
9267 *copy = true;
9268 tp = toast_flatten_tuple(tp, desc);
9269 }
9270 return tp;
9271 }
9272
9273 /* if the key isn't required and we're only logging the key, we're done */
9274 if (!key_required)
9275 return NULL;
9276
9277 /* find out the replica identity columns */
9280
9281 /*
9282 * If there's no defined replica identity columns, treat as !key_required.
9283 * (This case should not be reachable from heap_update, since that should
9284 * calculate key_required accurately. But heap_delete just passes
9285 * constant true for key_required, so we can hit this case in deletes.)
9286 */
9287 if (bms_is_empty(idattrs))
9288 return NULL;
9289
9290 /*
9291 * Construct a new tuple containing only the replica identity columns,
9292 * with nulls elsewhere. While we're at it, assert that the replica
9293 * identity columns aren't null.
9294 */
9295 heap_deform_tuple(tp, desc, values, nulls);
9296
9297 for (int i = 0; i < desc->natts; i++)
9298 {
9300 idattrs))
9301 Assert(!nulls[i]);
9302 else
9303 nulls[i] = true;
9304 }
9305
9306 key_tuple = heap_form_tuple(desc, values, nulls);
9307 *copy = true;
9308
9310
9311 /*
9312 * If the tuple, which by here only contains indexed columns, still has
9313 * toasted columns, force them to be inlined. This is somewhat unlikely
9314 * since there's limits on the size of indexed columns, so we don't
9315 * duplicate toast_flatten_tuple()s functionality in the above loop over
9316 * the indexed columns, even if it would be more efficient.
9317 */
9319 {
9321
9324 }
9325
9326 return key_tuple;
9327}

References Assert, bms_free(), bms_is_empty, bms_is_member(), fb(), FirstLowInvalidHeapAttributeNumber, heap_deform_tuple(), heap_form_tuple(), heap_freetuple(), HeapTupleHasExternal(), i, INDEX_ATTR_BITMAP_IDENTITY_KEY, MaxHeapAttributeNumber, TupleDescData::natts, RelationData::rd_rel, RelationGetDescr, RelationGetIndexAttrBitmap(), RelationIsLogicallyLogged, toast_flatten_tuple(), and values.

Referenced by heap_delete(), and heap_update().

◆ FreeBulkInsertState()

◆ FreezeMultiXactId()

static TransactionId FreezeMultiXactId ( MultiXactId  multi,
uint16  t_infomask,
const struct VacuumCutoffs cutoffs,
uint16 flags,
HeapPageFreeze pagefrz 
)
static

Definition at line 6790 of file heapam.c.

6793{
6795 MultiXactMember *members;
6796 int nmembers;
6797 bool need_replace;
6798 int nnewmembers;
6800 bool has_lockers;
6802 bool update_committed;
6803 TransactionId FreezePageRelfrozenXid;
6804
6805 *flags = 0;
6806
6807 /* We should only be called in Multis */
6808 Assert(t_infomask & HEAP_XMAX_IS_MULTI);
6809
6810 if (!MultiXactIdIsValid(multi) ||
6811 HEAP_LOCKED_UPGRADED(t_infomask))
6812 {
6813 *flags |= FRM_INVALIDATE_XMAX;
6814 pagefrz->freeze_required = true;
6815 return InvalidTransactionId;
6816 }
6817 else if (MultiXactIdPrecedes(multi, cutoffs->relminmxid))
6818 ereport(ERROR,
6820 errmsg_internal("found multixact %u from before relminmxid %u",
6821 multi, cutoffs->relminmxid)));
6822 else if (MultiXactIdPrecedes(multi, cutoffs->OldestMxact))
6823 {
6825
6826 /*
6827 * This old multi cannot possibly have members still running, but
6828 * verify just in case. If it was a locker only, it can be removed
6829 * without any further consideration; but if it contained an update,
6830 * we might need to preserve it.
6831 */
6832 if (MultiXactIdIsRunning(multi,
6833 HEAP_XMAX_IS_LOCKED_ONLY(t_infomask)))
6834 ereport(ERROR,
6836 errmsg_internal("multixact %u from before multi freeze cutoff %u found to be still running",
6837 multi, cutoffs->OldestMxact)));
6838
6839 if (HEAP_XMAX_IS_LOCKED_ONLY(t_infomask))
6840 {
6841 *flags |= FRM_INVALIDATE_XMAX;
6842 pagefrz->freeze_required = true;
6843 return InvalidTransactionId;
6844 }
6845
6846 /* replace multi with single XID for its updater? */
6847 update_xact = MultiXactIdGetUpdateXid(multi, t_infomask);
6849 ereport(ERROR,
6851 errmsg_internal("multixact %u contains update XID %u from before relfrozenxid %u",
6852 multi, update_xact,
6853 cutoffs->relfrozenxid)));
6854 else if (TransactionIdPrecedes(update_xact, cutoffs->OldestXmin))
6855 {
6856 /*
6857 * Updater XID has to have aborted (otherwise the tuple would have
6858 * been pruned away instead, since updater XID is < OldestXmin).
6859 * Just remove xmax.
6860 */
6862 ereport(ERROR,
6864 errmsg_internal("multixact %u contains committed update XID %u from before removable cutoff %u",
6865 multi, update_xact,
6866 cutoffs->OldestXmin)));
6867 *flags |= FRM_INVALIDATE_XMAX;
6868 pagefrz->freeze_required = true;
6869 return InvalidTransactionId;
6870 }
6871
6872 /* Have to keep updater XID as new xmax */
6873 *flags |= FRM_RETURN_IS_XID;
6874 pagefrz->freeze_required = true;
6875 return update_xact;
6876 }
6877
6878 /*
6879 * Some member(s) of this Multi may be below FreezeLimit xid cutoff, so we
6880 * need to walk the whole members array to figure out what to do, if
6881 * anything.
6882 */
6883 nmembers =
6884 GetMultiXactIdMembers(multi, &members, false,
6885 HEAP_XMAX_IS_LOCKED_ONLY(t_infomask));
6886 if (nmembers <= 0)
6887 {
6888 /* Nothing worth keeping */
6889 *flags |= FRM_INVALIDATE_XMAX;
6890 pagefrz->freeze_required = true;
6891 return InvalidTransactionId;
6892 }
6893
6894 /*
6895 * The FRM_NOOP case is the only case where we might need to ratchet back
6896 * FreezePageRelfrozenXid or FreezePageRelminMxid. It is also the only
6897 * case where our caller might ratchet back its NoFreezePageRelfrozenXid
6898 * or NoFreezePageRelminMxid "no freeze" trackers to deal with a multi.
6899 * FRM_NOOP handling should result in the NewRelfrozenXid/NewRelminMxid
 6900 * trackers managed by VACUUM being ratcheted back by xmax to the degree
6901 * required to make it safe to leave xmax undisturbed, independent of
6902 * whether or not page freezing is triggered somewhere else.
6903 *
6904 * Our policy is to force freezing in every case other than FRM_NOOP,
6905 * which obviates the need to maintain either set of trackers, anywhere.
6906 * Every other case will reliably execute a freeze plan for xmax that
6907 * either replaces xmax with an XID/MXID >= OldestXmin/OldestMxact, or
6908 * sets xmax to an InvalidTransactionId XID, rendering xmax fully frozen.
6909 * (VACUUM's NewRelfrozenXid/NewRelminMxid trackers are initialized with
6910 * OldestXmin/OldestMxact, so later values never need to be tracked here.)
6911 */
6912 need_replace = false;
6913 FreezePageRelfrozenXid = pagefrz->FreezePageRelfrozenXid;
6914 for (int i = 0; i < nmembers; i++)
6915 {
6916 TransactionId xid = members[i].xid;
6917
6918 Assert(!TransactionIdPrecedes(xid, cutoffs->relfrozenxid));
6919
6920 if (TransactionIdPrecedes(xid, cutoffs->FreezeLimit))
6921 {
6922 /* Can't violate the FreezeLimit postcondition */
6923 need_replace = true;
6924 break;
6925 }
6926 if (TransactionIdPrecedes(xid, FreezePageRelfrozenXid))
6927 FreezePageRelfrozenXid = xid;
6928 }
6929
6930 /* Can't violate the MultiXactCutoff postcondition, either */
6931 if (!need_replace)
6933
6934 if (!need_replace)
6935 {
6936 /*
6937 * vacuumlazy.c might ratchet back NewRelminMxid, NewRelfrozenXid, or
6938 * both together to make it safe to retain this particular multi after
6939 * freezing its page
6940 */
6941 *flags |= FRM_NOOP;
6942 pagefrz->FreezePageRelfrozenXid = FreezePageRelfrozenXid;
6943 if (MultiXactIdPrecedes(multi, pagefrz->FreezePageRelminMxid))
6944 pagefrz->FreezePageRelminMxid = multi;
6945 pfree(members);
6946 return multi;
6947 }
6948
6949 /*
6950 * Do a more thorough second pass over the multi to figure out which
6951 * member XIDs actually need to be kept. Checking the precise status of
6952 * individual members might even show that we don't need to keep anything.
6953 * That is quite possible even though the Multi must be >= OldestMxact,
6954 * since our second pass only keeps member XIDs when it's truly necessary;
6955 * even member XIDs >= OldestXmin often won't be kept by second pass.
6956 */
6957 nnewmembers = 0;
6959 has_lockers = false;
6961 update_committed = false;
6962
6963 /*
6964 * Determine whether to keep each member xid, or to ignore it instead
6965 */
6966 for (int i = 0; i < nmembers; i++)
6967 {
6968 TransactionId xid = members[i].xid;
6969 MultiXactStatus mstatus = members[i].status;
6970
6971 Assert(!TransactionIdPrecedes(xid, cutoffs->relfrozenxid));
6972
6973 if (!ISUPDATE_from_mxstatus(mstatus))
6974 {
6975 /*
6976 * Locker XID (not updater XID). We only keep lockers that are
6977 * still running.
6978 */
6981 {
6982 if (TransactionIdPrecedes(xid, cutoffs->OldestXmin))
6983 ereport(ERROR,
6985 errmsg_internal("multixact %u contains running locker XID %u from before removable cutoff %u",
6986 multi, xid,
6987 cutoffs->OldestXmin)));
6988 newmembers[nnewmembers++] = members[i];
6989 has_lockers = true;
6990 }
6991
6992 continue;
6993 }
6994
6995 /*
6996 * Updater XID (not locker XID). Should we keep it?
6997 *
6998 * Since the tuple wasn't totally removed when vacuum pruned, the
6999 * update Xid cannot possibly be older than OldestXmin cutoff unless
7000 * the updater XID aborted. If the updater transaction is known
7001 * aborted or crashed then it's okay to ignore it, otherwise not.
7002 *
7003 * In any case the Multi should never contain two updaters, whatever
7004 * their individual commit status. Check for that first, in passing.
7005 */
7007 ereport(ERROR,
7009 errmsg_internal("multixact %u has two or more updating members",
7010 multi),
7011 errdetail_internal("First updater XID=%u second updater XID=%u.",
7012 update_xid, xid)));
7013
7014 /*
7015 * As with all tuple visibility routines, it's critical to test
7016 * TransactionIdIsInProgress before TransactionIdDidCommit, because of
7017 * race conditions explained in detail in heapam_visibility.c.
7018 */
7021 update_xid = xid;
7022 else if (TransactionIdDidCommit(xid))
7023 {
7024 /*
7025 * The transaction committed, so we can tell caller to set
7026 * HEAP_XMAX_COMMITTED. (We can only do this because we know the
7027 * transaction is not running.)
7028 */
7029 update_committed = true;
7030 update_xid = xid;
7031 }
7032 else
7033 {
7034 /*
7035 * Not in progress, not committed -- must be aborted or crashed;
7036 * we can ignore it.
7037 */
7038 continue;
7039 }
7040
7041 /*
7042 * We determined that updater must be kept -- add it to pending new
7043 * members list
7044 */
7045 if (TransactionIdPrecedes(xid, cutoffs->OldestXmin))
7046 ereport(ERROR,
7048 errmsg_internal("multixact %u contains committed update XID %u from before removable cutoff %u",
7049 multi, xid, cutoffs->OldestXmin)));
7050 newmembers[nnewmembers++] = members[i];
7051 }
7052
7053 pfree(members);
7054
7055 /*
7056 * Determine what to do with caller's multi based on information gathered
7057 * during our second pass
7058 */
7059 if (nnewmembers == 0)
7060 {
7061 /* Nothing worth keeping */
7062 *flags |= FRM_INVALIDATE_XMAX;
7064 }
7066 {
7067 /*
7068 * If there's a single member and it's an update, pass it back alone
7069 * without creating a new Multi. (XXX we could do this when there's a
7070 * single remaining locker, too, but that would complicate the API too
7071 * much; moreover, the case with the single updater is more
7072 * interesting, because those are longer-lived.)
7073 */
7074 Assert(nnewmembers == 1);
7075 *flags |= FRM_RETURN_IS_XID;
7076 if (update_committed)
7077 *flags |= FRM_MARK_COMMITTED;
7079 }
7080 else
7081 {
7082 /*
7083 * Create a new multixact with the surviving members of the previous
7084 * one, to set as new Xmax in the tuple
7085 */
7087 *flags |= FRM_RETURN_IS_MULTI;
7088 }
7089
7091
7092 pagefrz->freeze_required = true;
7093 return newxmax;
7094}

References Assert, ereport, errcode(), ERRCODE_DATA_CORRUPTED, errdetail_internal(), errmsg_internal(), ERROR, fb(), HeapPageFreeze::freeze_required, VacuumCutoffs::FreezeLimit, HeapPageFreeze::FreezePageRelfrozenXid, HeapPageFreeze::FreezePageRelminMxid, FRM_INVALIDATE_XMAX, FRM_MARK_COMMITTED, FRM_NOOP, FRM_RETURN_IS_MULTI, FRM_RETURN_IS_XID, GetMultiXactIdMembers(), HEAP_LOCKED_UPGRADED(), HEAP_XMAX_IS_LOCKED_ONLY(), HEAP_XMAX_IS_MULTI, i, InvalidTransactionId, ISUPDATE_from_mxstatus, VacuumCutoffs::MultiXactCutoff, MultiXactIdCreateFromMembers(), MultiXactIdGetUpdateXid(), MultiXactIdIsRunning(), MultiXactIdIsValid, MultiXactIdPrecedes(), VacuumCutoffs::OldestMxact, VacuumCutoffs::OldestXmin, palloc_array, pfree(), VacuumCutoffs::relfrozenxid, VacuumCutoffs::relminmxid, MultiXactMember::status, TransactionIdDidCommit(), TransactionIdIsCurrentTransactionId(), TransactionIdIsInProgress(), TransactionIdIsValid, TransactionIdPrecedes(), and MultiXactMember::xid.

Referenced by heap_prepare_freeze_tuple().

◆ get_mxact_status_for_lock()

static MultiXactStatus get_mxact_status_for_lock ( LockTupleMode  mode,
bool  is_update 
)
static

Definition at line 4611 of file heapam.c.

4612{
4613 int retval;
4614
4615 if (is_update)
4616 retval = tupleLockExtraInfo[mode].updstatus;
4617 else
4618 retval = tupleLockExtraInfo[mode].lockstatus;
4619
4620 if (retval == -1)
4621 elog(ERROR, "invalid lock tuple mode %d/%s", mode,
4622 is_update ? "true" : "false");
4623
4624 return (MultiXactStatus) retval;
4625}

References elog, ERROR, fb(), mode, and tupleLockExtraInfo.

Referenced by compute_new_xmax_infomask(), heap_lock_tuple(), and test_lockmode_for_conflict().

◆ GetBulkInsertState()

◆ GetMultiXactIdHintBits()

static void GetMultiXactIdHintBits ( MultiXactId  multi,
uint16 new_infomask,
uint16 new_infomask2 
)
static

Definition at line 7546 of file heapam.c.

7548{
7549 int nmembers;
7550 MultiXactMember *members;
7551 int i;
7553 uint16 bits2 = 0;
7554 bool has_update = false;
7556
7557 /*
7558 * We only use this in multis we just created, so they cannot be values
7559 * pre-pg_upgrade.
7560 */
7561 nmembers = GetMultiXactIdMembers(multi, &members, false, false);
7562
7563 for (i = 0; i < nmembers; i++)
7564 {
7566
7567 /*
7568 * Remember the strongest lock mode held by any member of the
7569 * multixact.
7570 */
7571 mode = TUPLOCK_from_mxstatus(members[i].status);
7572 if (mode > strongest)
7573 strongest = mode;
7574
7575 /* See what other bits we need */
7576 switch (members[i].status)
7577 {
7581 break;
7582
7585 break;
7586
7588 has_update = true;
7589 break;
7590
7593 has_update = true;
7594 break;
7595 }
7596 }
7597
7600 bits |= HEAP_XMAX_EXCL_LOCK;
7601 else if (strongest == LockTupleShare)
7602 bits |= HEAP_XMAX_SHR_LOCK;
7603 else if (strongest == LockTupleKeyShare)
7604 bits |= HEAP_XMAX_KEYSHR_LOCK;
7605
7606 if (!has_update)
7607 bits |= HEAP_XMAX_LOCK_ONLY;
7608
7609 if (nmembers > 0)
7610 pfree(members);
7611
7612 *new_infomask = bits;
7614}

References fb(), GetMultiXactIdMembers(), HEAP_KEYS_UPDATED, HEAP_XMAX_EXCL_LOCK, HEAP_XMAX_IS_MULTI, HEAP_XMAX_KEYSHR_LOCK, HEAP_XMAX_LOCK_ONLY, HEAP_XMAX_SHR_LOCK, i, LockTupleExclusive, LockTupleKeyShare, LockTupleNoKeyExclusive, LockTupleShare, mode, MultiXactStatusForKeyShare, MultiXactStatusForNoKeyUpdate, MultiXactStatusForShare, MultiXactStatusForUpdate, MultiXactStatusNoKeyUpdate, MultiXactStatusUpdate, pfree(), and TUPLOCK_from_mxstatus.

Referenced by compute_new_xmax_infomask(), heap_prepare_freeze_tuple(), and heap_update().

◆ heap_abort_speculative()

void heap_abort_speculative ( Relation  relation,
const ItemPointerData tid 
)

Definition at line 6269 of file heapam.c.

6270{
6272 ItemId lp;
6273 HeapTupleData tp;
6274 Page page;
6275 BlockNumber block;
6276 Buffer buffer;
6277
6279
6280 block = ItemPointerGetBlockNumber(tid);
6281 buffer = ReadBuffer(relation, block);
6282 page = BufferGetPage(buffer);
6283
6285
6286 /*
6287 * Page can't be all visible, we just inserted into it, and are still
6288 * running.
6289 */
6290 Assert(!PageIsAllVisible(page));
6291
6294
6295 tp.t_tableOid = RelationGetRelid(relation);
6296 tp.t_data = (HeapTupleHeader) PageGetItem(page, lp);
6297 tp.t_len = ItemIdGetLength(lp);
6298 tp.t_self = *tid;
6299
6300 /*
6301 * Sanity check that the tuple really is a speculatively inserted tuple,
6302 * inserted by us.
6303 */
6304 if (tp.t_data->t_choice.t_heap.t_xmin != xid)
6305 elog(ERROR, "attempted to kill a tuple inserted by another transaction");
6306 if (!(IsToastRelation(relation) || HeapTupleHeaderIsSpeculative(tp.t_data)))
6307 elog(ERROR, "attempted to kill a non-speculative tuple");
6309
6310 /*
6311 * No need to check for serializable conflicts here. There is never a
6312 * need for a combo CID, either. No need to extract replica identity, or
6313 * do anything special with infomask bits.
6314 */
6315
6317
6318 /*
6319 * The tuple will become DEAD immediately. Flag that this page is a
6320 * candidate for pruning by setting xmin to TransactionXmin. While not
6321 * immediately prunable, it is the oldest xid we can cheaply determine
6322 * that's safe against wraparound / being older than the table's
6323 * relfrozenxid. To defend against the unlikely case of a new relation
6324 * having a newer relfrozenxid than our TransactionXmin, use relfrozenxid
6325 * if so (vacuum can't subsequently move relfrozenxid to beyond
6326 * TransactionXmin, so there's no race here).
6327 */
6329 {
6330 TransactionId relfrozenxid = relation->rd_rel->relfrozenxid;
6332
6333 if (TransactionIdPrecedes(TransactionXmin, relfrozenxid))
6334 prune_xid = relfrozenxid;
6335 else
6338 }
6339
6340 /* store transaction information of xact deleting the tuple */
6343
 6344 /*
 6345 * Set the tuple header xmin to InvalidTransactionId. This makes the
 6346 * tuple immediately invisible to everyone. (In particular, to any
 6347 * transactions waiting on the speculative token, woken up later.)
 6348 */
6350
6351 /* Clear the speculative insertion token too */
6352 tp.t_data->t_ctid = tp.t_self;
6353
6354 MarkBufferDirty(buffer);
6355
6356 /*
6357 * XLOG stuff
6358 *
6359 * The WAL records generated here match heap_delete(). The same recovery
6360 * routines are used.
6361 */
6362 if (RelationNeedsWAL(relation))
6363 {
6366
6368 xlrec.infobits_set = compute_infobits(tp.t_data->t_infomask,
6369 tp.t_data->t_infomask2);
6371 xlrec.xmax = xid;
6372
6376
6377 /* No replica identity & replication origin logged */
6378
6380
6381 PageSetLSN(page, recptr);
6382 }
6383
6385
6387
6388 if (HeapTupleHasExternal(&tp))
6389 {
6390 Assert(!IsToastRelation(relation));
6391 heap_toast_delete(relation, &tp, true);
6392 }
6393
6394 /*
6395 * Never need to mark tuple for invalidation, since catalogs don't support
6396 * speculative insertion
6397 */
6398
6399 /* Now we can release the buffer */
6400 ReleaseBuffer(buffer);
6401
6402 /* count deletion, as we counted the insertion too */
6403 pgstat_count_heap_delete(relation);
6404}

References Assert, BUFFER_LOCK_EXCLUSIVE, BUFFER_LOCK_UNLOCK, BufferGetPage(), compute_infobits(), elog, END_CRIT_SECTION, ERROR, fb(), xl_heap_delete::flags, GetCurrentTransactionId(), HEAP_MOVED, heap_toast_delete(), HEAP_XMAX_BITS, HeapTupleHasExternal(), HeapTupleHeaderIsHeapOnly(), HeapTupleHeaderIsSpeculative(), HeapTupleHeaderSetXmin(), InvalidTransactionId, IsToastRelation(), ItemIdGetLength, ItemIdIsNormal, ItemPointerGetBlockNumber(), ItemPointerGetOffsetNumber(), ItemPointerIsValid(), LockBuffer(), MarkBufferDirty(), PageGetItem(), PageGetItemId(), PageIsAllVisible(), PageSetLSN(), PageSetPrunable, pgstat_count_heap_delete(), RelationData::rd_rel, ReadBuffer(), REGBUF_STANDARD, RelationGetRelid, RelationNeedsWAL, ReleaseBuffer(), SizeOfHeapDelete, START_CRIT_SECTION, HeapTupleHeaderData::t_choice, HeapTupleHeaderData::t_ctid, HeapTupleData::t_data, HeapTupleHeaderData::t_heap, HeapTupleHeaderData::t_infomask, HeapTupleHeaderData::t_infomask2, HeapTupleData::t_len, HeapTupleData::t_self, HeapTupleData::t_tableOid, HeapTupleFields::t_xmin, TransactionIdIsValid, TransactionIdPrecedes(), TransactionXmin, XLH_DELETE_IS_SUPER, XLOG_HEAP_DELETE, XLogBeginInsert(), XLogInsert(), XLogRegisterBuffer(), and XLogRegisterData().

Referenced by heapam_tuple_complete_speculative(), and toast_delete_datum().

◆ heap_acquire_tuplock()

static bool heap_acquire_tuplock ( Relation  relation,
const ItemPointerData tid,
LockTupleMode  mode,
LockWaitPolicy  wait_policy,
bool have_tuple_lock 
)
static

Definition at line 5360 of file heapam.c.

5362{
5363 if (*have_tuple_lock)
5364 return true;
5365
5366 switch (wait_policy)
5367 {
5368 case LockWaitBlock:
5369 LockTupleTuplock(relation, tid, mode);
5370 break;
5371
5372 case LockWaitSkip:
5373 if (!ConditionalLockTupleTuplock(relation, tid, mode, false))
5374 return false;
5375 break;
5376
5377 case LockWaitError:
5379 ereport(ERROR,
5381 errmsg("could not obtain lock on row in relation \"%s\"",
5382 RelationGetRelationName(relation))));
5383 break;
5384 }
5385 *have_tuple_lock = true;
5386
5387 return true;
5388}

References ConditionalLockTupleTuplock, ereport, errcode(), errmsg, ERROR, fb(), LockTupleTuplock, LockWaitBlock, LockWaitError, LockWaitSkip, log_lock_failures, mode, and RelationGetRelationName.

Referenced by heap_delete(), heap_lock_tuple(), and heap_update().

◆ heap_attr_equals()

static bool heap_attr_equals ( TupleDesc  tupdesc,
int  attrnum,
Datum  value1,
Datum  value2,
bool  isnull1,
bool  isnull2 
)
static

Definition at line 4429 of file heapam.c.

4431{
4432 /*
4433 * If one value is NULL and other is not, then they are certainly not
4434 * equal
4435 */
4436 if (isnull1 != isnull2)
4437 return false;
4438
4439 /*
4440 * If both are NULL, they can be considered equal.
4441 */
4442 if (isnull1)
4443 return true;
4444
4445 /*
4446 * We do simple binary comparison of the two datums. This may be overly
4447 * strict because there can be multiple binary representations for the
4448 * same logical value. But we should be OK as long as there are no false
4449 * positives. Using a type-specific equality operator is messy because
4450 * there could be multiple notions of equality in different operator
4451 * classes; furthermore, we cannot safely invoke user-defined functions
4452 * while holding exclusive buffer lock.
4453 */
4454 if (attrnum <= 0)
4455 {
4456 /* The only allowed system columns are OIDs, so do this */
4458 }
4459 else
4460 {
4462
4464 att = TupleDescCompactAttr(tupdesc, attrnum - 1);
4465 return datumIsEqual(value1, value2, att->attbyval, att->attlen);
4466 }
4467}

References Assert, DatumGetObjectId(), datumIsEqual(), fb(), and TupleDescCompactAttr().

Referenced by HeapDetermineColumnsInfo().

◆ heap_beginscan()

TableScanDesc heap_beginscan ( Relation  relation,
Snapshot  snapshot,
int  nkeys,
ScanKey  key,
ParallelTableScanDesc  parallel_scan,
uint32  flags 
)

Definition at line 1164 of file heapam.c.

1168{
1169 HeapScanDesc scan;
1170
1171 /*
1172 * increment relation ref count while scanning relation
1173 *
1174 * This is just to make really sure the relcache entry won't go away while
1175 * the scan has a pointer to it. Caller should be holding the rel open
1176 * anyway, so this is redundant in all normal scenarios...
1177 */
1179
1180 /*
1181 * allocate and initialize scan descriptor
1182 */
1183 if (flags & SO_TYPE_BITMAPSCAN)
1184 {
1186
1187 /*
1188 * Bitmap Heap scans do not have any fields that a normal Heap Scan
1189 * does not have, so no special initializations required here.
1190 */
1191 scan = (HeapScanDesc) bscan;
1192 }
1193 else
1195
1196 scan->rs_base.rs_rd = relation;
1197 scan->rs_base.rs_snapshot = snapshot;
1198 scan->rs_base.rs_nkeys = nkeys;
1199 scan->rs_base.rs_flags = flags;
1200 scan->rs_base.rs_parallel = parallel_scan;
1201 scan->rs_strategy = NULL; /* set in initscan */
1202 scan->rs_cbuf = InvalidBuffer;
1203
 1204 /*
 1205 * Disable page-at-a-time mode if it's not an MVCC-safe snapshot.
 1206 */
1207 if (!(snapshot && IsMVCCSnapshot(snapshot)))
1209
1210 /* Check that a historic snapshot is not used for non-catalog tables */
1211 if (snapshot &&
1212 IsHistoricMVCCSnapshot(snapshot) &&
1214 {
1215 ereport(ERROR,
1217 errmsg("cannot query non-catalog table \"%s\" during logical decoding",
1218 RelationGetRelationName(relation))));
1219 }
1220
1221 /*
1222 * For seqscan and sample scans in a serializable transaction, acquire a
1223 * predicate lock on the entire relation. This is required not only to
1224 * lock all the matching tuples, but also to conflict with new insertions
1225 * into the table. In an indexscan, we take page locks on the index pages
1226 * covering the range specified in the scan qual, but in a heap scan there
1227 * is nothing more fine-grained to lock. A bitmap scan is a different
1228 * story, there we have already scanned the index and locked the index
1229 * pages covering the predicate. But in that case we still have to lock
1230 * any matching heap tuples. For sample scan we could optimize the locking
1231 * to be at least page-level granularity, but we'd need to add per-tuple
1232 * locking for that.
1233 */
1235 {
1236 /*
1237 * Ensure a missing snapshot is noticed reliably, even if the
1238 * isolation mode means predicate locking isn't performed (and
1239 * therefore the snapshot isn't used here).
1240 */
1241 Assert(snapshot);
1242 PredicateLockRelation(relation, snapshot);
1243 }
1244
1245 /* we only need to set this up once */
1246 scan->rs_ctup.t_tableOid = RelationGetRelid(relation);
1247
1248 /*
1249 * Allocate memory to keep track of page allocation for parallel workers
1250 * when doing a parallel scan.
1251 */
1252 if (parallel_scan != NULL)
1254 else
1256
1257 /*
1258 * we do this here instead of in initscan() because heap_rescan also calls
1259 * initscan() and we don't want to allocate memory again
1260 */
1261 if (nkeys > 0)
1262 scan->rs_base.rs_key = palloc_array(ScanKeyData, nkeys);
1263 else
1264 scan->rs_base.rs_key = NULL;
1265
1266 initscan(scan, key, false);
1267
1268 scan->rs_read_stream = NULL;
1269
1270 /*
1271 * Set up a read stream for sequential scans and TID range scans. This
1272 * should be done after initscan() because initscan() allocates the
1273 * BufferAccessStrategy object passed to the read stream API.
1274 */
1275 if (scan->rs_base.rs_flags & SO_TYPE_SEQSCAN ||
1277 {
1279
1280 if (scan->rs_base.rs_parallel)
1282 else
1284
1285 /* ---
1286 * It is safe to use batchmode as the only locks taken by `cb`
1287 * are never taken while waiting for IO:
1288 * - SyncScanLock is used in the non-parallel case
1289 * - in the parallel case, only spinlocks and atomics are used
1290 * ---
1291 */
1294 scan->rs_strategy,
1295 scan->rs_base.rs_rd,
1297 cb,
1298 scan,
1299 0);
1300 }
1301 else if (scan->rs_base.rs_flags & SO_TYPE_BITMAPSCAN)
1302 {
1305 scan->rs_strategy,
1306 scan->rs_base.rs_rd,
1309 scan,
1310 sizeof(TBMIterateResult));
1311 }
1312
1313 scan->rs_vmbuffer = InvalidBuffer;
1314
1315 return (TableScanDesc) scan;
1316}

References Assert, bitmapheap_stream_read_next(), ereport, errcode(), errmsg, ERROR, fb(), heap_scan_stream_read_next_parallel(), heap_scan_stream_read_next_serial(), initscan(), InvalidBuffer, IsHistoricMVCCSnapshot, IsMVCCSnapshot, MAIN_FORKNUM, palloc_array, palloc_object, PredicateLockRelation(), read_stream_begin_relation(), READ_STREAM_DEFAULT, READ_STREAM_SEQUENTIAL, READ_STREAM_USE_BATCHING, RelationGetRelationName, RelationGetRelid, RelationIncrementReferenceCount(), RelationIsAccessibleInLogicalDecoding, HeapScanDescData::rs_base, HeapScanDescData::rs_cbuf, HeapScanDescData::rs_ctup, TableScanDescData::rs_flags, TableScanDescData::rs_key, TableScanDescData::rs_nkeys, TableScanDescData::rs_parallel, HeapScanDescData::rs_parallelworkerdata, TableScanDescData::rs_rd, HeapScanDescData::rs_read_stream, TableScanDescData::rs_snapshot, HeapScanDescData::rs_strategy, HeapScanDescData::rs_vmbuffer, SO_TYPE_BITMAPSCAN, SO_TYPE_SAMPLESCAN, SO_TYPE_SEQSCAN, SO_TYPE_TIDRANGESCAN, and HeapTupleData::t_tableOid.

◆ heap_delete()

TM_Result heap_delete ( Relation  relation,
const ItemPointerData tid,
CommandId  cid,
Snapshot  crosscheck,
bool  wait,
TM_FailureData tmfd,
bool  changingPart 
)

Definition at line 2854 of file heapam.c.

2857{
2858 TM_Result result;
2860 ItemId lp;
2861 HeapTupleData tp;
2862 Page page;
2863 BlockNumber block;
2864 Buffer buffer;
2865 Buffer vmbuffer = InvalidBuffer;
2866 TransactionId new_xmax;
2869 bool have_tuple_lock = false;
2870 bool iscombo;
2871 bool all_visible_cleared = false;
2872 HeapTuple old_key_tuple = NULL; /* replica identity of the tuple */
2873 bool old_key_copied = false;
2874
2876
2877 AssertHasSnapshotForToast(relation);
2878
2879 /*
2880 * Forbid this during a parallel operation, lest it allocate a combo CID.
2881 * Other workers might need that combo CID for visibility checks, and we
2882 * have no provision for broadcasting it to them.
2883 */
2884 if (IsInParallelMode())
2885 ereport(ERROR,
2887 errmsg("cannot delete tuples during a parallel operation")));
2888
2889 block = ItemPointerGetBlockNumber(tid);
2890 buffer = ReadBuffer(relation, block);
2891 page = BufferGetPage(buffer);
2892
2893 /*
2894 * Before locking the buffer, pin the visibility map page if it appears to
2895 * be necessary. Since we haven't got the lock yet, someone else might be
2896 * in the middle of changing this, so we'll need to recheck after we have
2897 * the lock.
2898 */
2899 if (PageIsAllVisible(page))
2900 visibilitymap_pin(relation, block, &vmbuffer);
2901
2903
2906
2907 tp.t_tableOid = RelationGetRelid(relation);
2908 tp.t_data = (HeapTupleHeader) PageGetItem(page, lp);
2909 tp.t_len = ItemIdGetLength(lp);
2910 tp.t_self = *tid;
2911
2912l1:
2913
2914 /*
2915 * If we didn't pin the visibility map page and the page has become all
2916 * visible while we were busy locking the buffer, we'll have to unlock and
2917 * re-lock, to avoid holding the buffer lock across an I/O. That's a bit
2918 * unfortunate, but hopefully shouldn't happen often.
2919 */
2920 if (vmbuffer == InvalidBuffer && PageIsAllVisible(page))
2921 {
2923 visibilitymap_pin(relation, block, &vmbuffer);
2925 }
2926
2927 result = HeapTupleSatisfiesUpdate(&tp, cid, buffer);
2928
2929 if (result == TM_Invisible)
2930 {
2931 UnlockReleaseBuffer(buffer);
2932 ereport(ERROR,
2934 errmsg("attempted to delete invisible tuple")));
2935 }
2936 else if (result == TM_BeingModified && wait)
2937 {
2940
2941 /* must copy state data before unlocking buffer */
2944
2945 /*
2946 * Sleep until concurrent transaction ends -- except when there's a
2947 * single locker and it's our own transaction. Note we don't care
2948 * which lock mode the locker has, because we need the strongest one.
2949 *
2950 * Before sleeping, we need to acquire tuple lock to establish our
2951 * priority for the tuple (see heap_lock_tuple). LockTuple will
2952 * release us when we are next-in-line for the tuple.
2953 *
2954 * If we are forced to "start over" below, we keep the tuple lock;
2955 * this arranges that we stay at the head of the line while rechecking
2956 * tuple state.
2957 */
2959 {
2960 bool current_is_member = false;
2961
2964 {
2966
2967 /*
2968 * Acquire the lock, if necessary (but skip it when we're
2969 * requesting a lock and already have one; avoids deadlock).
2970 */
2971 if (!current_is_member)
2974
2975 /* wait for multixact */
2977 relation, &(tp.t_self), XLTW_Delete,
2978 NULL);
2980
2981 /*
2982 * If xwait had just locked the tuple then some other xact
2983 * could update this tuple before we get to this point. Check
2984 * for xmax change, and start over if so.
2985 *
2986 * We also must start over if we didn't pin the VM page, and
2987 * the page has become all visible.
2988 */
2989 if ((vmbuffer == InvalidBuffer && PageIsAllVisible(page)) ||
2992 xwait))
2993 goto l1;
2994 }
2995
2996 /*
2997 * You might think the multixact is necessarily done here, but not
2998 * so: it could have surviving members, namely our own xact or
2999 * other subxacts of this backend. It is legal for us to delete
3000 * the tuple in either case, however (the latter case is
3001 * essentially a situation of upgrading our former shared lock to
3002 * exclusive). We don't bother changing the on-disk hint bits
3003 * since we are about to overwrite the xmax altogether.
3004 */
3005 }
3007 {
3008 /*
3009 * Wait for regular transaction to end; but first, acquire tuple
3010 * lock.
3011 */
3015 XactLockTableWait(xwait, relation, &(tp.t_self), XLTW_Delete);
3017
3018 /*
3019 * xwait is done, but if xwait had just locked the tuple then some
3020 * other xact could update this tuple before we get to this point.
3021 * Check for xmax change, and start over if so.
3022 *
3023 * We also must start over if we didn't pin the VM page, and the
3024 * page has become all visible.
3025 */
3026 if ((vmbuffer == InvalidBuffer && PageIsAllVisible(page)) ||
3029 xwait))
3030 goto l1;
3031
3032 /* Otherwise check if it committed or aborted */
3033 UpdateXmaxHintBits(tp.t_data, buffer, xwait);
3034 }
3035
3036 /*
3037 * We may overwrite if previous xmax aborted, or if it committed but
3038 * only locked the tuple without updating it.
3039 */
3040 if ((tp.t_data->t_infomask & HEAP_XMAX_INVALID) ||
3043 result = TM_Ok;
3044 else if (!ItemPointerEquals(&tp.t_self, &tp.t_data->t_ctid))
3045 result = TM_Updated;
3046 else
3047 result = TM_Deleted;
3048 }
3049
3050 /* sanity check the result HeapTupleSatisfiesUpdate() and the logic above */
3051 if (result != TM_Ok)
3052 {
3053 Assert(result == TM_SelfModified ||
3054 result == TM_Updated ||
3055 result == TM_Deleted ||
3056 result == TM_BeingModified);
3058 Assert(result != TM_Updated ||
3060 }
3061
3062 if (crosscheck != InvalidSnapshot && result == TM_Ok)
3063 {
3064 /* Perform additional check for transaction-snapshot mode RI updates */
3065 if (!HeapTupleSatisfiesVisibility(&tp, crosscheck, buffer))
3066 result = TM_Updated;
3067 }
3068
3069 if (result != TM_Ok)
3070 {
3071 tmfd->ctid = tp.t_data->t_ctid;
3073 if (result == TM_SelfModified)
3075 else
3076 tmfd->cmax = InvalidCommandId;
3077 UnlockReleaseBuffer(buffer);
3078 if (have_tuple_lock)
3080 if (vmbuffer != InvalidBuffer)
3081 ReleaseBuffer(vmbuffer);
3082 return result;
3083 }
3084
3085 /*
3086 * We're about to do the actual delete -- check for conflict first, to
3087 * avoid possibly having to roll back work we've just done.
3088 *
3089 * This is safe without a recheck as long as there is no possibility of
3090 * another process scanning the page between this check and the delete
3091 * being visible to the scan (i.e., an exclusive buffer content lock is
3092 * continuously held from this point until the tuple delete is visible).
3093 */
3095
3096 /* replace cid with a combo CID if necessary */
3098
3099 /*
3100 * Compute replica identity tuple before entering the critical section so
3101 * we don't PANIC upon a memory allocation failure.
3102 */
3103 old_key_tuple = ExtractReplicaIdentity(relation, &tp, true, &old_key_copied);
3104
3105 /*
3106 * If this is the first possibly-multixact-able operation in the current
3107 * transaction, set my per-backend OldestMemberMXactId setting. We can be
3108 * certain that the transaction will never become a member of any older
3109 * MultiXactIds than that. (We have to do this even if we end up just
3110 * using our own TransactionId below, since some other backend could
3111 * incorporate our XID into a MultiXact immediately afterwards.)
3112 */
3114
3117 xid, LockTupleExclusive, true,
3118 &new_xmax, &new_infomask, &new_infomask2);
3119
3121
3122 /*
3123 * If this transaction commits, the tuple will become DEAD sooner or
3124 * later. Set flag that this page is a candidate for pruning once our xid
3125 * falls below the OldestXmin horizon. If the transaction finally aborts,
3126 * the subsequent page pruning will be a no-op and the hint will be
3127 * cleared.
3128 */
3129 PageSetPrunable(page, xid);
3130
3131 if (PageIsAllVisible(page))
3132 {
3133 all_visible_cleared = true;
3134 PageClearAllVisible(page);
3135 visibilitymap_clear(relation, BufferGetBlockNumber(buffer),
3136 vmbuffer, VISIBILITYMAP_VALID_BITS);
3137 }
3138
3139 /* store transaction information of xact deleting the tuple */
3145 HeapTupleHeaderSetXmax(tp.t_data, new_xmax);
3147 /* Make sure there is no forward chain link in t_ctid */
3148 tp.t_data->t_ctid = tp.t_self;
3149
3150 /* Signal that this is actually a move into another partition */
3151 if (changingPart)
3153
3154 MarkBufferDirty(buffer);
3155
3156 /*
3157 * XLOG stuff
3158 *
3159 * NB: heap_abort_speculative() uses the same xlog record and replay
3160 * routines.
3161 */
3162 if (RelationNeedsWAL(relation))
3163 {
3167
3168 /*
3169 * For logical decode we need combo CIDs to properly decode the
3170 * catalog
3171 */
3173 log_heap_new_cid(relation, &tp);
3174
3175 xlrec.flags = 0;
3178 if (changingPart)
3180 xlrec.infobits_set = compute_infobits(tp.t_data->t_infomask,
3181 tp.t_data->t_infomask2);
3183 xlrec.xmax = new_xmax;
3184
3185 if (old_key_tuple != NULL)
3186 {
3187 if (relation->rd_rel->relreplident == REPLICA_IDENTITY_FULL)
3189 else
3191 }
3192
3195
3197
3198 /*
3199 * Log replica identity of the deleted tuple if there is one
3200 */
3201 if (old_key_tuple != NULL)
3202 {
3203 xlhdr.t_infomask2 = old_key_tuple->t_data->t_infomask2;
3204 xlhdr.t_infomask = old_key_tuple->t_data->t_infomask;
3205 xlhdr.t_hoff = old_key_tuple->t_data->t_hoff;
3206
3208 XLogRegisterData((char *) old_key_tuple->t_data
3210 old_key_tuple->t_len
3212 }
3213
3214 /* filtering by origin on a row level is much more efficient */
3216
3218
3219 PageSetLSN(page, recptr);
3220 }
3221
3223
3225
3226 if (vmbuffer != InvalidBuffer)
3227 ReleaseBuffer(vmbuffer);
3228
3229 /*
3230 * If the tuple has toasted out-of-line attributes, we need to delete
3231 * those items too. We have to do this before releasing the buffer
3232 * because we need to look at the contents of the tuple, but it's OK to
3233 * release the content lock on the buffer first.
3234 */
3235 if (relation->rd_rel->relkind != RELKIND_RELATION &&
3236 relation->rd_rel->relkind != RELKIND_MATVIEW)
3237 {
3238 /* toast table entries should never be recursively toasted */
3240 }
3241 else if (HeapTupleHasExternal(&tp))
3242 heap_toast_delete(relation, &tp, false);
3243
3244 /*
3245 * Mark tuple for invalidation from system caches at next command
3246 * boundary. We have to do this before releasing the buffer because we
3247 * need to look at the contents of the tuple.
3248 */
3249 CacheInvalidateHeapTuple(relation, &tp, NULL);
3250
3251 /* Now we can release the buffer */
3252 ReleaseBuffer(buffer);
3253
3254 /*
3255 * Release the lmgr tuple lock, if we had it.
3256 */
3257 if (have_tuple_lock)
3259
3260 pgstat_count_heap_delete(relation);
3261
3264
3265 return TM_Ok;
3266}

References Assert, AssertHasSnapshotForToast(), BUFFER_LOCK_EXCLUSIVE, BUFFER_LOCK_UNLOCK, BufferGetBlockNumber(), BufferGetPage(), CacheInvalidateHeapTuple(), CheckForSerializableConflictIn(), TM_FailureData::cmax, compute_infobits(), compute_new_xmax_infomask(), TM_FailureData::ctid, DoesMultiXactIdConflict(), END_CRIT_SECTION, ereport, errcode(), errmsg, ERROR, ExtractReplicaIdentity(), fb(), GetCurrentTransactionId(), heap_acquire_tuplock(), heap_freetuple(), HEAP_MOVED, heap_toast_delete(), HEAP_XMAX_BITS, HEAP_XMAX_INVALID, HEAP_XMAX_IS_LOCKED_ONLY(), HEAP_XMAX_IS_MULTI, HeapTupleHasExternal(), HeapTupleHeaderAdjustCmax(), HeapTupleHeaderClearHotUpdated(), HeapTupleHeaderGetCmax(), HeapTupleHeaderGetRawXmax(), HeapTupleHeaderGetUpdateXid(), HeapTupleHeaderIsOnlyLocked(), HeapTupleHeaderSetCmax(), HeapTupleHeaderSetMovedPartitions(), HeapTupleHeaderSetXmax(), HeapTupleSatisfiesUpdate(), HeapTupleSatisfiesVisibility(), InvalidBuffer, InvalidCommandId, InvalidSnapshot, IsInParallelMode(), ItemIdGetLength, ItemIdIsNormal, ItemPointerEquals(), ItemPointerGetBlockNumber(), ItemPointerGetOffsetNumber(), ItemPointerIsValid(), LockBuffer(), LockTupleExclusive, LockWaitBlock, log_heap_new_cid(), MarkBufferDirty(), MultiXactIdSetOldestMember(), MultiXactIdWait(), MultiXactStatusUpdate, PageClearAllVisible(), PageGetItem(), PageGetItemId(), PageIsAllVisible(), PageSetLSN(), PageSetPrunable, pgstat_count_heap_delete(), RelationData::rd_rel, ReadBuffer(), REGBUF_STANDARD, RelationGetRelid, RelationIsAccessibleInLogicalDecoding, RelationNeedsWAL, ReleaseBuffer(), SizeOfHeapDelete, SizeOfHeapHeader, SizeofHeapTupleHeader, START_CRIT_SECTION, HeapTupleHeaderData::t_ctid, HeapTupleData::t_data, HeapTupleHeaderData::t_infomask, HeapTupleHeaderData::t_infomask2, HeapTupleData::t_len, HeapTupleData::t_self, HeapTupleData::t_tableOid, TM_BeingModified, TM_Deleted, TM_Invisible, TM_Ok, TM_SelfModified, TM_Updated, TransactionIdEquals, TransactionIdIsCurrentTransactionId(), 
UnlockReleaseBuffer(), UnlockTupleTuplock, UpdateXmaxHintBits(), visibilitymap_clear(), visibilitymap_pin(), VISIBILITYMAP_VALID_BITS, XactLockTableWait(), XLH_DELETE_ALL_VISIBLE_CLEARED, XLH_DELETE_CONTAINS_OLD_KEY, XLH_DELETE_CONTAINS_OLD_TUPLE, XLH_DELETE_IS_PARTITION_MOVE, XLOG_HEAP_DELETE, XLOG_INCLUDE_ORIGIN, XLogBeginInsert(), XLogInsert(), XLogRegisterBuffer(), XLogRegisterData(), XLogSetRecordFlags(), XLTW_Delete, TM_FailureData::xmax, and xmax_infomask_changed().

Referenced by heapam_tuple_delete(), and simple_heap_delete().

◆ heap_endscan()

void heap_endscan ( TableScanDesc  sscan)

Definition at line 1378 of file heapam.c.

1379{
1381
1382 /* Note: no locking manipulations needed */
1383
1384 /*
1385 * unpin scan buffers
1386 */
1387 if (BufferIsValid(scan->rs_cbuf))
1388 ReleaseBuffer(scan->rs_cbuf);
1389
1390 if (BufferIsValid(scan->rs_vmbuffer))
1392
1393 /*
1394 * Must free the read stream before freeing the BufferAccessStrategy.
1395 */
1396 if (scan->rs_read_stream)
1398
1399 /*
1400 * decrement relation reference count and free scan descriptor storage
1401 */
1403
1404 if (scan->rs_base.rs_key)
1405 pfree(scan->rs_base.rs_key);
1406
1407 if (scan->rs_strategy != NULL)
1409
1410 if (scan->rs_parallelworkerdata != NULL)
1412
1413 if (scan->rs_base.rs_flags & SO_TEMP_SNAPSHOT)
1415
1416 pfree(scan);
1417}

References BufferIsValid(), fb(), FreeAccessStrategy(), pfree(), read_stream_end(), RelationDecrementReferenceCount(), ReleaseBuffer(), HeapScanDescData::rs_base, HeapScanDescData::rs_cbuf, TableScanDescData::rs_flags, TableScanDescData::rs_key, HeapScanDescData::rs_parallelworkerdata, TableScanDescData::rs_rd, HeapScanDescData::rs_read_stream, TableScanDescData::rs_snapshot, HeapScanDescData::rs_strategy, HeapScanDescData::rs_vmbuffer, SO_TEMP_SNAPSHOT, and UnregisterSnapshot().

◆ heap_fetch()

bool heap_fetch ( Relation  relation,
Snapshot  snapshot,
HeapTuple  tuple,
Buffer userbuf,
bool  keep_buf 
)

Definition at line 1669 of file heapam.c.

1674{
1675 ItemPointer tid = &(tuple->t_self);
1676 ItemId lp;
1677 Buffer buffer;
1678 Page page;
1679 OffsetNumber offnum;
1680 bool valid;
1681
1682 /*
1683 * Fetch and pin the appropriate page of the relation.
1684 */
1685 buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid));
1686
1687 /*
1688 * Need share lock on buffer to examine tuple commit status.
1689 */
1691 page = BufferGetPage(buffer);
1692
1693 /*
1694 * We'd better check for out-of-range offnum in case of VACUUM since the
1695 * TID was obtained.
1696 */
1697 offnum = ItemPointerGetOffsetNumber(tid);
1699 {
1701 ReleaseBuffer(buffer);
1703 tuple->t_data = NULL;
1704 return false;
1705 }
1706
1707 /*
1708 * get the item line pointer corresponding to the requested tid
1709 */
1710 lp = PageGetItemId(page, offnum);
1711
1712 /*
1713 * Must check for deleted tuple.
1714 */
1715 if (!ItemIdIsNormal(lp))
1716 {
1718 ReleaseBuffer(buffer);
1720 tuple->t_data = NULL;
1721 return false;
1722 }
1723
1724 /*
1725 * fill in *tuple fields
1726 */
1727 tuple->t_data = (HeapTupleHeader) PageGetItem(page, lp);
1728 tuple->t_len = ItemIdGetLength(lp);
1729 tuple->t_tableOid = RelationGetRelid(relation);
1730
1731 /*
1732 * check tuple visibility, then release lock
1733 */
1734 valid = HeapTupleSatisfiesVisibility(tuple, snapshot, buffer);
1735
1736 if (valid)
1737 PredicateLockTID(relation, &(tuple->t_self), snapshot,
1739
1740 HeapCheckForSerializableConflictOut(valid, relation, tuple, buffer, snapshot);
1741
1743
1744 if (valid)
1745 {
1746 /*
1747 * All checks passed, so return the tuple as valid. Caller is now
1748 * responsible for releasing the buffer.
1749 */
1750 *userbuf = buffer;
1751
1752 return true;
1753 }
1754
1755 /* Tuple failed time qual, but maybe caller wants to see it anyway. */
1756 if (keep_buf)
1757 *userbuf = buffer;
1758 else
1759 {
1760 ReleaseBuffer(buffer);
1762 tuple->t_data = NULL;
1763 }
1764
1765 return false;
1766}

References BUFFER_LOCK_SHARE, BUFFER_LOCK_UNLOCK, BufferGetPage(), fb(), HeapCheckForSerializableConflictOut(), HeapTupleHeaderGetXmin(), HeapTupleSatisfiesVisibility(), InvalidBuffer, ItemIdGetLength, ItemIdIsNormal, ItemPointerGetBlockNumber(), ItemPointerGetOffsetNumber(), LockBuffer(), PageGetItem(), PageGetItemId(), PageGetMaxOffsetNumber(), PredicateLockTID(), ReadBuffer(), RelationGetRelid, ReleaseBuffer(), HeapTupleData::t_data, HeapTupleData::t_len, HeapTupleData::t_self, and HeapTupleData::t_tableOid.

Referenced by heap_lock_updated_tuple_rec(), heapam_fetch_row_version(), and heapam_tuple_lock().

◆ heap_fetch_next_buffer()

static void heap_fetch_next_buffer ( HeapScanDesc  scan,
ScanDirection  dir 
)
inlinestatic

Definition at line 707 of file heapam.c.

708{
709 Assert(scan->rs_read_stream);
710
711 /* release previous scan buffer, if any */
712 if (BufferIsValid(scan->rs_cbuf))
713 {
714 ReleaseBuffer(scan->rs_cbuf);
715 scan->rs_cbuf = InvalidBuffer;
716 }
717
718 /*
719 * Be sure to check for interrupts at least once per page. Checks at
720 * higher code levels won't be able to stop a seqscan that encounters many
721 * pages' worth of consecutive dead tuples.
722 */
724
725 /*
726 * If the scan direction is changing, reset the prefetch block to the
727 * current block. Otherwise, we will incorrectly prefetch the blocks
728 * between the prefetch block and the current block again before
729 * prefetching blocks in the new, correct scan direction.
730 */
731 if (unlikely(scan->rs_dir != dir))
732 {
733 scan->rs_prefetch_block = scan->rs_cblock;
735 }
736
737 scan->rs_dir = dir;
738
740 if (BufferIsValid(scan->rs_cbuf))
742}

References Assert, BufferGetBlockNumber(), BufferIsValid(), CHECK_FOR_INTERRUPTS, fb(), InvalidBuffer, read_stream_next_buffer(), read_stream_reset(), ReleaseBuffer(), HeapScanDescData::rs_cblock, HeapScanDescData::rs_cbuf, HeapScanDescData::rs_dir, HeapScanDescData::rs_prefetch_block, HeapScanDescData::rs_read_stream, and unlikely.

Referenced by heapgettup(), and heapgettup_pagemode().

◆ heap_finish_speculative()

void heap_finish_speculative ( Relation  relation,
const ItemPointerData *  tid 
)

Definition at line 6182 of file heapam.c.

6183{
6184 Buffer buffer;
6185 Page page;
6186 OffsetNumber offnum;
6187 ItemId lp;
6188 HeapTupleHeader htup;
6189
6190 buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid));
6192 page = BufferGetPage(buffer);
6193
6194 offnum = ItemPointerGetOffsetNumber(tid);
6196 elog(ERROR, "offnum out of range");
6197 lp = PageGetItemId(page, offnum);
6198 if (!ItemIdIsNormal(lp))
6199 elog(ERROR, "invalid lp");
6200
6201 htup = (HeapTupleHeader) PageGetItem(page, lp);
6202
6203 /* NO EREPORT(ERROR) from here till changes are logged */
6205
6207
6208 MarkBufferDirty(buffer);
6209
6210 /*
6211 * Replace the speculative insertion token with a real t_ctid, pointing to
6212 * itself like it does on regular tuples.
6213 */
6214 htup->t_ctid = *tid;
6215
6216 /* XLOG stuff */
6217 if (RelationNeedsWAL(relation))
6218 {
6221
6223
6225
6226 /* We want the same filtering on this as on a plain insert */
6228
6231
6233
6234 PageSetLSN(page, recptr);
6235 }
6236
6238
6239 UnlockReleaseBuffer(buffer);
6240}

References Assert, BUFFER_LOCK_EXCLUSIVE, BufferGetPage(), elog, END_CRIT_SECTION, ERROR, fb(), HeapTupleHeaderIsSpeculative(), ItemIdIsNormal, ItemPointerGetBlockNumber(), ItemPointerGetOffsetNumber(), LockBuffer(), MarkBufferDirty(), xl_heap_confirm::offnum, PageGetItem(), PageGetItemId(), PageGetMaxOffsetNumber(), PageSetLSN(), ReadBuffer(), REGBUF_STANDARD, RelationNeedsWAL, SizeOfHeapConfirm, START_CRIT_SECTION, HeapTupleHeaderData::t_ctid, UnlockReleaseBuffer(), XLOG_HEAP_CONFIRM, XLOG_INCLUDE_ORIGIN, XLogBeginInsert(), XLogInsert(), XLogRegisterBuffer(), XLogRegisterData(), and XLogSetRecordFlags().

Referenced by heapam_tuple_complete_speculative().

◆ heap_freeze_prepared_tuples()

void heap_freeze_prepared_tuples ( Buffer  buffer,
HeapTupleFreeze *  tuples,
int  ntuples 
)

Definition at line 7479 of file heapam.c.

7480{
7481 Page page = BufferGetPage(buffer);
7482
7483 for (int i = 0; i < ntuples; i++)
7484 {
7485 HeapTupleFreeze *frz = tuples + i;
7486 ItemId itemid = PageGetItemId(page, frz->offset);
7487 HeapTupleHeader htup;
7488
7489 htup = (HeapTupleHeader) PageGetItem(page, itemid);
7491 }
7492}

References BufferGetPage(), fb(), heap_execute_freeze_tuple(), i, PageGetItem(), and PageGetItemId().

Referenced by heap_page_prune_and_freeze().

◆ heap_freeze_tuple()

bool heap_freeze_tuple ( HeapTupleHeader  tuple,
TransactionId  relfrozenxid,
TransactionId  relminmxid,
TransactionId  FreezeLimit,
TransactionId  MultiXactCutoff 
)

Definition at line 7501 of file heapam.c.

7504{
7506 bool do_freeze;
7507 bool totally_frozen;
7508 struct VacuumCutoffs cutoffs;
7509 HeapPageFreeze pagefrz;
7510
7511 cutoffs.relfrozenxid = relfrozenxid;
7512 cutoffs.relminmxid = relminmxid;
7513 cutoffs.OldestXmin = FreezeLimit;
7514 cutoffs.OldestMxact = MultiXactCutoff;
7515 cutoffs.FreezeLimit = FreezeLimit;
7516 cutoffs.MultiXactCutoff = MultiXactCutoff;
7517
7518 pagefrz.freeze_required = true;
7519 pagefrz.FreezePageRelfrozenXid = FreezeLimit;
7520 pagefrz.FreezePageRelminMxid = MultiXactCutoff;
7521 pagefrz.FreezePageConflictXid = InvalidTransactionId;
7522 pagefrz.NoFreezePageRelfrozenXid = FreezeLimit;
7523 pagefrz.NoFreezePageRelminMxid = MultiXactCutoff;
7524
7525 do_freeze = heap_prepare_freeze_tuple(tuple, &cutoffs,
7526 &pagefrz, &frz, &totally_frozen);
7527
7528 /*
7529 * Note that because this is not a WAL-logged operation, we don't need to
7530 * fill in the offset in the freeze record.
7531 */
7532
7533 if (do_freeze)
7535 return do_freeze;
7536}

References fb(), VacuumCutoffs::FreezeLimit, heap_execute_freeze_tuple(), heap_prepare_freeze_tuple(), InvalidTransactionId, VacuumCutoffs::MultiXactCutoff, VacuumCutoffs::OldestMxact, VacuumCutoffs::OldestXmin, VacuumCutoffs::relfrozenxid, and VacuumCutoffs::relminmxid.

Referenced by rewrite_heap_tuple().

◆ heap_get_latest_tid()

void heap_get_latest_tid ( TableScanDesc  sscan,
ItemPointer  tid 
)

Definition at line 1941 of file heapam.c.

1943{
1944 Relation relation = sscan->rs_rd;
1945 Snapshot snapshot = sscan->rs_snapshot;
1946 ItemPointerData ctid;
1948
1949 /*
1950 * table_tuple_get_latest_tid() verified that the passed in tid is valid.
1951 * Assume that t_ctid links are valid however - there shouldn't be invalid
1952 * ones in the table.
1953 */
1955
1956 /*
1957 * Loop to chase down t_ctid links. At top of loop, ctid is the tuple we
1958 * need to examine, and *tid is the TID we will return if ctid turns out
1959 * to be bogus.
1960 *
1961 * Note that we will loop until we reach the end of the t_ctid chain.
1962 * Depending on the snapshot passed, there might be at most one visible
1963 * version of the row, but we don't try to optimize for that.
1964 */
1965 ctid = *tid;
1966 priorXmax = InvalidTransactionId; /* cannot check first XMIN */
1967 for (;;)
1968 {
1969 Buffer buffer;
1970 Page page;
1971 OffsetNumber offnum;
1972 ItemId lp;
1973 HeapTupleData tp;
1974 bool valid;
1975
1976 /*
1977 * Read, pin, and lock the page.
1978 */
1979 buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(&ctid));
1981 page = BufferGetPage(buffer);
1982
1983 /*
1984 * Check for bogus item number. This is not treated as an error
1985 * condition because it can happen while following a t_ctid link. We
1986 * just assume that the prior tid is OK and return it unchanged.
1987 */
1988 offnum = ItemPointerGetOffsetNumber(&ctid);
1990 {
1991 UnlockReleaseBuffer(buffer);
1992 break;
1993 }
1994 lp = PageGetItemId(page, offnum);
1995 if (!ItemIdIsNormal(lp))
1996 {
1997 UnlockReleaseBuffer(buffer);
1998 break;
1999 }
2000
2001 /* OK to access the tuple */
2002 tp.t_self = ctid;
2003 tp.t_data = (HeapTupleHeader) PageGetItem(page, lp);
2004 tp.t_len = ItemIdGetLength(lp);
2005 tp.t_tableOid = RelationGetRelid(relation);
2006
2007 /*
2008 * After following a t_ctid link, we might arrive at an unrelated
2009 * tuple. Check for XMIN match.
2010 */
2013 {
2014 UnlockReleaseBuffer(buffer);
2015 break;
2016 }
2017
2018 /*
2019 * Check tuple visibility; if visible, set it as the new result
2020 * candidate.
2021 */
2022 valid = HeapTupleSatisfiesVisibility(&tp, snapshot, buffer);
2023 HeapCheckForSerializableConflictOut(valid, relation, &tp, buffer, snapshot);
2024 if (valid)
2025 *tid = ctid;
2026
2027 /*
2028 * If there's a valid t_ctid link, follow it, else we're done.
2029 */
2030 if ((tp.t_data->t_infomask & HEAP_XMAX_INVALID) ||
2034 {
2035 UnlockReleaseBuffer(buffer);
2036 break;
2037 }
2038
2039 ctid = tp.t_data->t_ctid;
2041 UnlockReleaseBuffer(buffer);
2042 } /* end of loop */
2043}

References Assert, BUFFER_LOCK_SHARE, BufferGetPage(), fb(), HEAP_XMAX_INVALID, HeapCheckForSerializableConflictOut(), HeapTupleHeaderGetUpdateXid(), HeapTupleHeaderGetXmin(), HeapTupleHeaderIndicatesMovedPartitions(), HeapTupleHeaderIsOnlyLocked(), HeapTupleSatisfiesVisibility(), InvalidTransactionId, ItemIdGetLength, ItemIdIsNormal, ItemPointerEquals(), ItemPointerGetBlockNumber(), ItemPointerGetOffsetNumber(), ItemPointerIsValid(), LockBuffer(), PageGetItem(), PageGetItemId(), PageGetMaxOffsetNumber(), ReadBuffer(), RelationGetRelid, HeapTupleHeaderData::t_ctid, HeapTupleData::t_data, HeapTupleHeaderData::t_infomask, HeapTupleData::t_len, HeapTupleData::t_self, HeapTupleData::t_tableOid, TransactionIdEquals, TransactionIdIsValid, and UnlockReleaseBuffer().

◆ heap_getnext()

HeapTuple heap_getnext ( TableScanDesc  sscan,
ScanDirection  direction 
)

Definition at line 1420 of file heapam.c.

1421{
1423
1424 /*
1425 * This is still widely used directly, without going through table AM, so
1426 * add a safety check. It's possible we should, at a later point,
1427 * downgrade this to an assert. The reason for checking the AM routine,
1428 * rather than the AM oid, is that this allows to write regression tests
1429 * that create another AM reusing the heap handler.
1430 */
1431 if (unlikely(sscan->rs_rd->rd_tableam != GetHeapamTableAmRoutine()))
1432 ereport(ERROR,
1434 errmsg_internal("only heap AM is supported")));
1435
1436 /* Note: no locking manipulations needed */
1437
1439 heapgettup_pagemode(scan, direction,
1440 scan->rs_base.rs_nkeys, scan->rs_base.rs_key);
1441 else
1442 heapgettup(scan, direction,
1443 scan->rs_base.rs_nkeys, scan->rs_base.rs_key);
1444
1445 if (scan->rs_ctup.t_data == NULL)
1446 return NULL;
1447
1448 /*
1449 * if we get here it means we have a new current scan tuple, so point to
1450 * the proper return buffer and return the tuple.
1451 */
1452
1454
1455 return &scan->rs_ctup;
1456}

References ereport, errcode(), errmsg_internal(), ERROR, fb(), GetHeapamTableAmRoutine(), heapgettup(), heapgettup_pagemode(), pgstat_count_heap_getnext, HeapScanDescData::rs_base, HeapScanDescData::rs_ctup, TableScanDescData::rs_flags, TableScanDescData::rs_key, TableScanDescData::rs_nkeys, TableScanDescData::rs_rd, SO_ALLOW_PAGEMODE, HeapTupleData::t_data, and unlikely.

Referenced by AlterTableMoveAll(), AlterTableSpaceOptions(), check_db_file_conflict(), CreateDatabaseUsingFileCopy(), do_autovacuum(), DropSetting(), DropTableSpace(), find_typed_table_dependencies(), get_all_vacuum_rels(), get_database_list(), get_subscription_list(), get_tables_to_repack(), get_tablespace_name(), get_tablespace_oid(), GetAllPublicationRelations(), getRelationsInNamespace(), GetSchemaPublicationRelations(), heapam_index_build_range_scan(), heapam_index_validate_scan(), objectsInSchemaToOids(), pgrowlocks(), pgstat_heap(), populate_typ_list(), ReindexMultipleTables(), remove_dbtablespaces(), RemoveSubscriptionRel(), RenameTableSpace(), ThereIsAtLeastOneRole(), and vac_truncate_clog().

◆ heap_getnextslot()

bool heap_getnextslot ( TableScanDesc  sscan,
ScanDirection  direction,
TupleTableSlot *  slot 
)

Definition at line 1459 of file heapam.c.

1460{
1462
1463 /* Note: no locking manipulations needed */
1464
1465 if (sscan->rs_flags & SO_ALLOW_PAGEMODE)
1466 heapgettup_pagemode(scan, direction, sscan->rs_nkeys, sscan->rs_key);
1467 else
1468 heapgettup(scan, direction, sscan->rs_nkeys, sscan->rs_key);
1469
1470 if (scan->rs_ctup.t_data == NULL)
1471 {
1472 ExecClearTuple(slot);
1473 return false;
1474 }
1475
1476 /*
1477 * if we get here it means we have a new current scan tuple, so point to
1478 * the proper return buffer and return the tuple.
1479 */
1480
1482
1483 ExecStoreBufferHeapTuple(&scan->rs_ctup, slot,
1484 scan->rs_cbuf);
1485 return true;
1486}

References ExecClearTuple(), ExecStoreBufferHeapTuple(), fb(), heapgettup(), heapgettup_pagemode(), pgstat_count_heap_getnext, HeapScanDescData::rs_base, HeapScanDescData::rs_cbuf, HeapScanDescData::rs_ctup, TableScanDescData::rs_rd, SO_ALLOW_PAGEMODE, and HeapTupleData::t_data.

◆ heap_getnextslot_tidrange()

bool heap_getnextslot_tidrange ( TableScanDesc  sscan,
ScanDirection  direction,
TupleTableSlot *  slot 
)

Definition at line 1562 of file heapam.c.

1564{
1566 ItemPointer mintid = &sscan->st.tidrange.rs_mintid;
1567 ItemPointer maxtid = &sscan->st.tidrange.rs_maxtid;
1568
1569 /* Note: no locking manipulations needed */
1570 for (;;)
1571 {
1572 if (sscan->rs_flags & SO_ALLOW_PAGEMODE)
1573 heapgettup_pagemode(scan, direction, sscan->rs_nkeys, sscan->rs_key);
1574 else
1575 heapgettup(scan, direction, sscan->rs_nkeys, sscan->rs_key);
1576
1577 if (scan->rs_ctup.t_data == NULL)
1578 {
1579 ExecClearTuple(slot);
1580 return false;
1581 }
1582
1583 /*
1584 * heap_set_tidrange will have used heap_setscanlimits to limit the
1585 * range of pages we scan to only ones that can contain the TID range
1586 * we're scanning for. Here we must filter out any tuples from these
1587 * pages that are outside of that range.
1588 */
1589 if (ItemPointerCompare(&scan->rs_ctup.t_self, mintid) < 0)
1590 {
1591 ExecClearTuple(slot);
1592
1593 /*
1594 * When scanning backwards, the TIDs will be in descending order.
1595 * Future tuples in this direction will be lower still, so we can
1596 * just return false to indicate there will be no more tuples.
1597 */
1598 if (ScanDirectionIsBackward(direction))
1599 return false;
1600
1601 continue;
1602 }
1603
1604 /*
1605 * Likewise for the final page, we must filter out TIDs greater than
1606 * maxtid.
1607 */
1608 if (ItemPointerCompare(&scan->rs_ctup.t_self, maxtid) > 0)
1609 {
1610 ExecClearTuple(slot);
1611
1612 /*
1613 * When scanning forward, the TIDs will be in ascending order.
1614 * Future tuples in this direction will be higher still, so we can
1615 * just return false to indicate there will be no more tuples.
1616 */
1617 if (ScanDirectionIsForward(direction))
1618 return false;
1619 continue;
1620 }
1621
1622 break;
1623 }
1624
1625 /*
1626 * if we get here it means we have a new current scan tuple, so point to
1627 * the proper return buffer and return the tuple.
1628 */
1630
1631 ExecStoreBufferHeapTuple(&scan->rs_ctup, slot, scan->rs_cbuf);
1632 return true;
1633}

References ExecClearTuple(), ExecStoreBufferHeapTuple(), fb(), heapgettup(), heapgettup_pagemode(), ItemPointerCompare(), pgstat_count_heap_getnext, HeapScanDescData::rs_base, HeapScanDescData::rs_cbuf, HeapScanDescData::rs_ctup, TableScanDescData::rs_rd, ScanDirectionIsBackward, ScanDirectionIsForward, SO_ALLOW_PAGEMODE, HeapTupleData::t_data, and HeapTupleData::t_self.

◆ heap_hot_search_buffer()

bool heap_hot_search_buffer ( ItemPointer  tid,
Relation  relation,
Buffer  buffer,
Snapshot  snapshot,
HeapTuple  heapTuple,
bool *  all_dead,
bool  first_call 
)

Definition at line 1789 of file heapam.c.

1792{
1793 Page page = BufferGetPage(buffer);
1795 BlockNumber blkno;
1796 OffsetNumber offnum;
1797 bool at_chain_start;
1798 bool valid;
1799 bool skip;
1800 GlobalVisState *vistest = NULL;
1801
1802 /* If this is not the first call, previous call returned a (live!) tuple */
1803 if (all_dead)
1805
1806 blkno = ItemPointerGetBlockNumber(tid);
1807 offnum = ItemPointerGetOffsetNumber(tid);
1809 skip = !first_call;
1810
1811 /* XXX: we should assert that a snapshot is pushed or registered */
1813 Assert(BufferGetBlockNumber(buffer) == blkno);
1814
1815 /* Scan through possible multiple members of HOT-chain */
1816 for (;;)
1817 {
1818 ItemId lp;
1819
1820 /* check for bogus TID */
1822 break;
1823
1824 lp = PageGetItemId(page, offnum);
1825
1826 /* check for unused, dead, or redirected items */
1827 if (!ItemIdIsNormal(lp))
1828 {
1829 /* We should only see a redirect at start of chain */
1831 {
1832 /* Follow the redirect */
1833 offnum = ItemIdGetRedirect(lp);
1834 at_chain_start = false;
1835 continue;
1836 }
1837 /* else must be end of chain */
1838 break;
1839 }
1840
1841 /*
1842 * Update heapTuple to point to the element of the HOT chain we're
1843 * currently investigating. Having t_self set correctly is important
1844 * because the SSI checks and the *Satisfies routine for historical
1845 * MVCC snapshots need the correct tid to decide about the visibility.
1846 */
1847 heapTuple->t_data = (HeapTupleHeader) PageGetItem(page, lp);
1848 heapTuple->t_len = ItemIdGetLength(lp);
1849 heapTuple->t_tableOid = RelationGetRelid(relation);
1850 ItemPointerSet(&heapTuple->t_self, blkno, offnum);
1851
1852 /*
1853 * Shouldn't see a HEAP_ONLY tuple at chain start.
1854 */
1856 break;
1857
1858 /*
1859 * The xmin should match the previous xmax value, else chain is
1860 * broken.
1861 */
1865 break;
1866
1867 /*
1868 * When first_call is true (and thus, skip is initially false) we'll
1869 * return the first tuple we find. But on later passes, heapTuple
1870 * will initially be pointing to the tuple we returned last time.
1871 * Returning it again would be incorrect (and would loop forever), so
1872 * we skip it and return the next match we find.
1873 */
1874 if (!skip)
1875 {
1876 /* If it's visible per the snapshot, we must return it */
1877 valid = HeapTupleSatisfiesVisibility(heapTuple, snapshot, buffer);
1879 buffer, snapshot);
1880
1881 if (valid)
1882 {
1883 ItemPointerSetOffsetNumber(tid, offnum);
1884 PredicateLockTID(relation, &heapTuple->t_self, snapshot,
1886 if (all_dead)
1887 *all_dead = false;
1888 return true;
1889 }
1890 }
1891 skip = false;
1892
1893 /*
1894 * If we can't see it, maybe no one else can either. At caller
1895 * request, check whether all chain members are dead to all
1896 * transactions.
1897 *
1898 * Note: if you change the criterion here for what is "dead", fix the
1899 * planner's get_actual_variable_range() function to match.
1900 */
1901 if (all_dead && *all_dead)
1902 {
1903 if (!vistest)
1904 vistest = GlobalVisTestFor(relation);
1905
1906 if (!HeapTupleIsSurelyDead(heapTuple, vistest))
1907 *all_dead = false;
1908 }
1909
1910 /*
1911 * Check to see if HOT chain continues past this tuple; if so fetch
1912 * the next offnum and loop around.
1913 */
1915 {
1916 Assert(ItemPointerGetBlockNumber(&heapTuple->t_data->t_ctid) ==
1917 blkno);
1918 offnum = ItemPointerGetOffsetNumber(&heapTuple->t_data->t_ctid);
1919 at_chain_start = false;
1921 }
1922 else
1923 break; /* end of chain */
1924 }
1925
1926 return false;
1927}

References Assert, BufferGetBlockNumber(), BufferGetPage(), fb(), GlobalVisTestFor(), HeapCheckForSerializableConflictOut(), HeapTupleHeaderGetUpdateXid(), HeapTupleHeaderGetXmin(), HeapTupleIsHeapOnly(), HeapTupleIsHotUpdated(), HeapTupleIsSurelyDead(), HeapTupleSatisfiesVisibility(), InvalidTransactionId, ItemIdGetLength, ItemIdGetRedirect, ItemIdIsNormal, ItemIdIsRedirected, ItemPointerGetBlockNumber(), ItemPointerGetOffsetNumber(), ItemPointerSet(), ItemPointerSetOffsetNumber(), PageGetItem(), PageGetItemId(), PageGetMaxOffsetNumber(), PredicateLockTID(), RecentXmin, RelationGetRelid, skip, TransactionIdEquals, and TransactionIdIsValid.

Referenced by BitmapHeapScanNextBlock(), heap_index_delete_tuples(), and heapam_index_fetch_tuple().

◆ heap_index_delete_tuples()

TransactionId heap_index_delete_tuples ( Relation  rel,
TM_IndexDeleteOp *  delstate 
)

Definition at line 8218 of file heapam.c.

8219{
8220 /* Initial assumption is that earlier pruning took care of conflict */
8221 TransactionId snapshotConflictHorizon = InvalidTransactionId;
8224 Page page = NULL;
8227#ifdef USE_PREFETCH
8230#endif
8232 int finalndeltids = 0,
8233 nblocksaccessed = 0;
8234
8235 /* State that's only used in bottom-up index deletion case */
8236 int nblocksfavorable = 0;
8237 int curtargetfreespace = delstate->bottomupfreespace,
8238 lastfreespace = 0,
8239 actualfreespace = 0;
8240 bool bottomup_final_block = false;
8241
8243
8244 /* Sort caller's deltids array by TID for further processing */
8246
8247 /*
8248 * Bottom-up case: resort deltids array in an order attuned to where the
8249 * greatest number of promising TIDs are to be found, and determine how
8250 * many blocks from the start of sorted array should be considered
8251 * favorable. This will also shrink the deltids array in order to
8252 * eliminate completely unfavorable blocks up front.
8253 */
8254 if (delstate->bottomup)
8256
8257#ifdef USE_PREFETCH
8258 /* Initialize prefetch state. */
8260 prefetch_state.next_item = 0;
8261 prefetch_state.ndeltids = delstate->ndeltids;
8262 prefetch_state.deltids = delstate->deltids;
8263
8264 /*
8265 * Determine the prefetch distance that we will attempt to maintain.
8266 *
8267 * Since the caller holds a buffer lock somewhere in rel, we'd better make
8268 * sure that isn't a catalog relation before we call code that does
8269 * syscache lookups, to avoid risk of deadlock.
8270 */
8271 if (IsCatalogRelation(rel))
8273 else
8276
8277 /* Cap initial prefetch distance for bottom-up deletion caller */
8278 if (delstate->bottomup)
8279 {
8283 }
8284
8285 /* Start prefetching. */
8287#endif
8288
8289 /* Iterate over deltids, determine which to delete, check their horizon */
8290 Assert(delstate->ndeltids > 0);
8291 for (int i = 0; i < delstate->ndeltids; i++)
8292 {
8293 TM_IndexDelete *ideltid = &delstate->deltids[i];
8294 TM_IndexStatus *istatus = delstate->status + ideltid->id;
8295 ItemPointer htid = &ideltid->tid;
8296 OffsetNumber offnum;
8297
8298 /*
8299 * Read buffer, and perform required extra steps each time a new block
8300 * is encountered. Avoid refetching if it's the same block as the one
8301 * from the last htid.
8302 */
8303 if (blkno == InvalidBlockNumber ||
8305 {
8306 /*
8307 * Consider giving up early for bottom-up index deletion caller
8308 * first. (Only prefetch next-next block afterwards, when it
8309 * becomes clear that we're at least going to access the next
8310 * block in line.)
8311 *
8312 * Sometimes the first block frees so much space for bottom-up
8313 * caller that the deletion process can end without accessing any
8314 * more blocks. It is usually necessary to access 2 or 3 blocks
8315 * per bottom-up deletion operation, though.
8316 */
8317 if (delstate->bottomup)
8318 {
8319 /*
8320 * We often allow caller to delete a few additional items
8321 * whose entries we reached after the point that space target
8322 * from caller was satisfied. The cost of accessing the page
8323 * was already paid at that point, so it made sense to finish
8324 * it off. When that happened, we finalize everything here
8325 * (by finishing off the whole bottom-up deletion operation
8326 * without needlessly paying the cost of accessing any more
8327 * blocks).
8328 */
8330 break;
8331
8332 /*
8333 * Give up when we didn't enable our caller to free any
8334 * additional space as a result of processing the page that we
8335 * just finished up with. This rule is the main way in which
8336 * we keep the cost of bottom-up deletion under control.
8337 */
8339 break;
8340 lastfreespace = actualfreespace; /* for next time */
8341
8342 /*
8343 * Deletion operation (which is bottom-up) will definitely
8344 * access the next block in line. Prepare for that now.
8345 *
8346 * Decay target free space so that we don't hang on for too
8347 * long with a marginal case. (Space target is only truly
8348 * helpful when it allows us to recognize that we don't need
8349 * to access more than 1 or 2 blocks to satisfy caller due to
8350 * agreeable workload characteristics.)
8351 *
8352 * We are a bit more patient when we encounter contiguous
8353 * blocks, though: these are treated as favorable blocks. The
8354 * decay process is only applied when the next block in line
8355 * is not a favorable/contiguous block. This is not an
8356 * exception to the general rule; we still insist on finding
8357 * at least one deletable item per block accessed. See
8358 * bottomup_nblocksfavorable() for full details of the theory
8359 * behind favorable blocks and heap block locality in general.
8360 *
8361 * Note: The first block in line is always treated as a
8362 * favorable block, so the earliest possible point that the
8363 * decay can be applied is just before we access the second
8364 * block in line. The Assert() verifies this for us.
8365 */
8367 if (nblocksfavorable > 0)
8369 else
8370 curtargetfreespace /= 2;
8371 }
8372
8373 /* release old buffer */
8374 if (BufferIsValid(buf))
8376
8378 buf = ReadBuffer(rel, blkno);
8380 Assert(!delstate->bottomup ||
8382
8383#ifdef USE_PREFETCH
8384
8385 /*
8386 * To maintain the prefetch distance, prefetch one more page for
8387 * each page we read.
8388 */
8390#endif
8391
8393
8394 page = BufferGetPage(buf);
8395 maxoff = PageGetMaxOffsetNumber(page);
8396 }
8397
8398 /*
8399 * In passing, detect index corruption involving an index page with a
8400 * TID that points to a location in the heap that couldn't possibly be
8401 * correct. We only do this with actual TIDs from caller's index page
8402 * (not items reached by traversing through a HOT chain).
8403 */
8405
8406 if (istatus->knowndeletable)
8407 Assert(!delstate->bottomup && !istatus->promising);
8408 else
8409 {
8410 ItemPointerData tmp = *htid;
8412
8413 /* Are any tuples from this HOT chain non-vacuumable? */
8415 &heapTuple, NULL, true))
8416 continue; /* can't delete entry */
8417
8418 /* Caller will delete, since whole HOT chain is vacuumable */
8419 istatus->knowndeletable = true;
8420
8421 /* Maintain index free space info for bottom-up deletion case */
8422 if (delstate->bottomup)
8423 {
8424 Assert(istatus->freespace > 0);
8425 actualfreespace += istatus->freespace;
8427 bottomup_final_block = true;
8428 }
8429 }
8430
8431 /*
8432 * Maintain snapshotConflictHorizon value for deletion operation as a
8433 * whole by advancing current value using heap tuple headers. This is
8434 * loosely based on the logic for pruning a HOT chain.
8435 */
8437 priorXmax = InvalidTransactionId; /* cannot check first XMIN */
8438 for (;;)
8439 {
8440 ItemId lp;
8441 HeapTupleHeader htup;
8442
8443 /* Sanity check (pure paranoia) */
8444 if (offnum < FirstOffsetNumber)
8445 break;
8446
8447 /*
8448 * An offset past the end of page's line pointer array is possible
8449 * when the array was truncated
8450 */
8451 if (offnum > maxoff)
8452 break;
8453
8454 lp = PageGetItemId(page, offnum);
8456 {
8457 offnum = ItemIdGetRedirect(lp);
8458 continue;
8459 }
8460
8461 /*
8462 * We'll often encounter LP_DEAD line pointers (especially with an
8463 * entry marked knowndeletable by our caller up front). No heap
8464 * tuple headers get examined for an htid that leads us to an
8465 * LP_DEAD item. This is okay because the earlier pruning
8466 * operation that made the line pointer LP_DEAD in the first place
8467 * must have considered the original tuple header as part of
8468 * generating its own snapshotConflictHorizon value.
8469 *
8470 * Relying on XLOG_HEAP2_PRUNE_VACUUM_SCAN records like this is
8471 * the same strategy that index vacuuming uses in all cases. Index
8472 * VACUUM WAL records don't even have a snapshotConflictHorizon
8473 * field of their own for this reason.
8474 */
8475 if (!ItemIdIsNormal(lp))
8476 break;
8477
8478 htup = (HeapTupleHeader) PageGetItem(page, lp);
8479
8480 /*
8481 * Check the tuple XMIN against prior XMAX, if any
8482 */
8485 break;
8486
8488 &snapshotConflictHorizon);
8489
8490 /*
8491 * If the tuple is not HOT-updated, then we are at the end of this
8492 * HOT-chain. No need to visit later tuples from the same update
8493 * chain (they get their own index entries) -- just move on to
8494 * next htid from index AM caller.
8495 */
8496 if (!HeapTupleHeaderIsHotUpdated(htup))
8497 break;
8498
8499 /* Advance to next HOT chain member */
8500 Assert(ItemPointerGetBlockNumber(&htup->t_ctid) == blkno);
8501 offnum = ItemPointerGetOffsetNumber(&htup->t_ctid);
8503 }
8504
8505 /* Enable further/final shrinking of deltids for caller */
8506 finalndeltids = i + 1;
8507 }
8508
8510
8511 /*
8512 * Shrink deltids array to exclude non-deletable entries at the end. This
8513 * is not just a minor optimization. Final deltids array size might be
8514 * zero for a bottom-up caller. Index AM is explicitly allowed to rely on
8515 * ndeltids being zero in all cases with zero total deletable entries.
8516 */
8517 Assert(finalndeltids > 0 || delstate->bottomup);
8518 delstate->ndeltids = finalndeltids;
8519
8520 return snapshotConflictHorizon;
8521}

References Assert, BOTTOMUP_MAX_NBLOCKS, bottomup_sort_and_shrink(), buf, BUFFER_LOCK_SHARE, BufferGetPage(), BufferIsValid(), fb(), FirstOffsetNumber, get_tablespace_maintenance_io_concurrency(), GlobalVisTestFor(), heap_hot_search_buffer(), HeapTupleHeaderAdvanceConflictHorizon(), HeapTupleHeaderGetUpdateXid(), HeapTupleHeaderGetXmin(), HeapTupleHeaderIsHotUpdated(), i, index_delete_check_htid(), index_delete_sort(), InitNonVacuumableSnapshot, InvalidBlockNumber, InvalidBuffer, InvalidOffsetNumber, InvalidTransactionId, IsCatalogRelation(), ItemIdGetRedirect, ItemIdIsNormal, ItemIdIsRedirected, ItemPointerGetBlockNumber(), ItemPointerGetOffsetNumber(), LockBuffer(), maintenance_io_concurrency, Min, PageGetItem(), PageGetItemId(), PageGetMaxOffsetNumber(), RelationData::rd_rel, ReadBuffer(), HeapTupleHeaderData::t_ctid, TransactionIdEquals, TransactionIdIsValid, and UnlockReleaseBuffer().

◆ heap_inplace_lock()

bool heap_inplace_lock ( Relation  relation,
HeapTuple  oldtup_ptr,
Buffer  buffer,
void(*)(void *)  release_callback,
void *  arg 
)

Definition at line 6451 of file heapam.c.

6454{
6455 HeapTupleData oldtup = *oldtup_ptr; /* minimize diff vs. heap_update() */
6456 TM_Result result;
6457 bool ret;
6458
6459#ifdef USE_ASSERT_CHECKING
6460 if (RelationGetRelid(relation) == RelationRelationId)
6462#endif
6463
6464 Assert(BufferIsValid(buffer));
6465
6466 /*
6467 * Register shared cache invals if necessary. Other sessions may finish
6468 * inplace updates of this tuple between this step and LockTuple(). Since
6469 * inplace updates don't change cache keys, that's harmless.
6470 *
6471 * While it's tempting to register invals only after confirming we can
6472 * return true, the following obstacle precludes reordering steps that
6473 * way. Registering invals might reach a CatalogCacheInitializeCache()
6474 * that locks "buffer". That would hang indefinitely if running after our
6475 * own LockBuffer(). Hence, we must register invals before LockBuffer().
6476 */
6478
6479 LockTuple(relation, &oldtup.t_self, InplaceUpdateTupleLock);
6481
6482 /*----------
6483 * Interpret HeapTupleSatisfiesUpdate() like heap_update() does, except:
6484 *
6485 * - wait unconditionally
6486 * - already locked tuple above, since inplace needs that unconditionally
6487 * - don't recheck header after wait: simpler to defer to next iteration
6488 * - don't try to continue even if the updater aborts: likewise
6489 * - no crosscheck
6490 */
6492 buffer);
6493
6494 if (result == TM_Invisible)
6495 {
6496 /* no known way this can happen */
6497 ereport(ERROR,
6499 errmsg_internal("attempted to overwrite invisible tuple")));
6500 }
6501 else if (result == TM_SelfModified)
6502 {
6503 /*
6504 * CREATE INDEX might reach this if an expression is silly enough to
6505 * call e.g. SELECT ... FROM pg_class FOR SHARE. C code of other SQL
6506 * statements might get here after a heap_update() of the same row, in
6507 * the absence of an intervening CommandCounterIncrement().
6508 */
6509 ereport(ERROR,
6511 errmsg("tuple to be updated was already modified by an operation triggered by the current command")));
6512 }
6513 else if (result == TM_BeingModified)
6514 {
6517
6519 infomask = oldtup.t_data->t_infomask;
6520
6522 {
6525 int remain;
6526
6528 lockmode, NULL))
6529 {
6532 ret = false;
6534 relation, &oldtup.t_self, XLTW_Update,
6535 &remain);
6536 }
6537 else
6538 ret = true;
6539 }
6541 ret = true;
6543 ret = true;
6544 else
6545 {
6548 ret = false;
6549 XactLockTableWait(xwait, relation, &oldtup.t_self,
6550 XLTW_Update);
6551 }
6552 }
6553 else
6554 {
6555 ret = (result == TM_Ok);
6556 if (!ret)
6557 {
6560 }
6561 }
6562
6563 /*
6564 * GetCatalogSnapshot() relies on invalidation messages to know when to
6565 * take a new snapshot. COMMIT of xwait is responsible for sending the
6566 * invalidation. We're not acquiring heavyweight locks sufficient to
6567 * block if not yet sent, so we must take a new snapshot to ensure a later
6568 * attempt has a fair chance. While we don't need this if xwait aborted,
6569 * don't bother optimizing that.
6570 */
6571 if (!ret)
6572 {
6573 UnlockTuple(relation, &oldtup.t_self, InplaceUpdateTupleLock);
6576 }
6577 return ret;
6578}

References arg, Assert, BUFFER_LOCK_EXCLUSIVE, BUFFER_LOCK_UNLOCK, BufferIsValid(), CacheInvalidateHeapTupleInplace(), DoesMultiXactIdConflict(), ereport, errcode(), errmsg, errmsg_internal(), ERROR, fb(), ForgetInplace_Inval(), GetCurrentCommandId(), HEAP_XMAX_IS_KEYSHR_LOCKED(), HEAP_XMAX_IS_MULTI, HeapTupleHeaderGetRawXmax(), HeapTupleSatisfiesUpdate(), InplaceUpdateTupleLock, InvalidateCatalogSnapshot(), LockBuffer(), LockTuple(), LockTupleNoKeyExclusive, MultiXactIdWait(), MultiXactStatusNoKeyUpdate, RelationGetRelid, TM_BeingModified, TM_Invisible, TM_Ok, TM_SelfModified, TransactionIdIsCurrentTransactionId(), UnlockTuple(), XactLockTableWait(), and XLTW_Update.

Referenced by systable_inplace_update_begin().

◆ heap_inplace_unlock()

void heap_inplace_unlock ( Relation  relation,
HeapTuple  oldtup,
Buffer  buffer 
)

◆ heap_inplace_update_and_unlock()

void heap_inplace_update_and_unlock ( Relation  relation,
HeapTuple  oldtup,
HeapTuple  tuple,
Buffer  buffer 
)

Definition at line 6589 of file heapam.c.

6592{
6593 HeapTupleHeader htup = oldtup->t_data;
6594 uint32 oldlen;
6595 uint32 newlen;
6596 char *dst;
6597 char *src;
6598 int nmsgs = 0;
6600 bool RelcacheInitFileInval = false;
6601
6602 Assert(ItemPointerEquals(&oldtup->t_self, &tuple->t_self));
6603 oldlen = oldtup->t_len - htup->t_hoff;
6604 newlen = tuple->t_len - tuple->t_data->t_hoff;
6605 if (oldlen != newlen || htup->t_hoff != tuple->t_data->t_hoff)
6606 elog(ERROR, "wrong tuple length");
6607
6608 dst = (char *) htup + htup->t_hoff;
6609 src = (char *) tuple->t_data + tuple->t_data->t_hoff;
6610
6611 /* Like RecordTransactionCommit(), log only if needed */
6614 &RelcacheInitFileInval);
6615
6616 /*
6617 * Unlink relcache init files as needed. If unlinking, acquire
6618 * RelCacheInitLock until after associated invalidations. By doing this
6619 * in advance, if we checkpoint and then crash between inplace
6620 * XLogInsert() and inval, we don't rely on StartupXLOG() ->
6621 * RelationCacheInitFileRemove(). That uses elevel==LOG, so replay would
6622 * neglect to PANIC on EIO.
6623 */
6625
6626 /*----------
6627 * NO EREPORT(ERROR) from here till changes are complete
6628 *
6629 * Our exclusive buffer lock won't stop a reader having already pinned and
6630 * checked visibility for this tuple. With the usual order of changes
6631 * (i.e. updating the buffer contents before WAL logging), a reader could
6632 * observe our not-yet-persistent update to relfrozenxid and update
6633 * datfrozenxid based on that. A crash in that moment could allow
6634 * datfrozenxid to overtake relfrozenxid:
6635 *
6636 * ["D" is a VACUUM (ONLY_DATABASE_STATS)]
6637 * ["R" is a VACUUM tbl]
6638 * D: vac_update_datfrozenxid() -> systable_beginscan(pg_class)
6639 * D: systable_getnext() returns pg_class tuple of tbl
6640 * R: memcpy() into pg_class tuple of tbl
6641 * D: raise pg_database.datfrozenxid, XLogInsert(), finish
6642 * [crash]
6643 * [recovery restores datfrozenxid w/o relfrozenxid]
6644 *
6645 * We avoid that by using a temporary copy of the buffer to hide our
6646 * change from other backends until the change has been WAL-logged. We
6647 * apply our change to the temporary copy and WAL-log it, before modifying
6648 * the real page. That way any action a reader of the in-place-updated
6649 * value takes will be WAL logged after this change.
6650 */
6652
6653 MarkBufferDirty(buffer);
6654
6655 /* XLOG stuff */
6656 if (RelationNeedsWAL(relation))
6657 {
6660 char *origdata = (char *) BufferGetBlock(buffer);
6661 Page page = BufferGetPage(buffer);
6662 uint16 lower = ((PageHeader) page)->pd_lower;
6663 uint16 upper = ((PageHeader) page)->pd_upper;
6665 RelFileLocator rlocator;
6666 ForkNumber forkno;
6667 BlockNumber blkno;
6669
6670 xlrec.offnum = ItemPointerGetOffsetNumber(&tuple->t_self);
6671 xlrec.dbId = MyDatabaseId;
6673 xlrec.relcacheInitFileInval = RelcacheInitFileInval;
6674 xlrec.nmsgs = nmsgs;
6675
6678 if (nmsgs != 0)
6680 nmsgs * sizeof(SharedInvalidationMessage));
6681
6682 /* register block matching what buffer will look like after changes */
6687 BufferGetTag(buffer, &rlocator, &forkno, &blkno);
6688 Assert(forkno == MAIN_FORKNUM);
6689 XLogRegisterBlock(0, &rlocator, forkno, blkno, copied_buffer.data,
6691 XLogRegisterBufData(0, src, newlen);
6692
6693 /* inplace updates aren't decoded atm, don't log the origin */
6694
6696
6697 PageSetLSN(page, recptr);
6698 }
6699
6700 memcpy(dst, src, newlen);
6701
6703
6704 /*
6705 * Send invalidations to shared queue. SearchSysCacheLocked1() assumes we
6706 * do this before UnlockTuple().
6707 */
6709
6711 UnlockTuple(relation, &tuple->t_self, InplaceUpdateTupleLock);
6712
6713 AcceptInvalidationMessages(); /* local processing of just-sent inval */
6714
6715 /*
6716 * Queue a transactional inval, for logical decoding and for third-party
6717 * code that might have been relying on it since long before inplace
6718 * update adopted immediate invalidation. See README.tuplock section
6719 * "Reading inplace-updated columns" for logical decoding details.
6720 */
6722 CacheInvalidateHeapTuple(relation, tuple, NULL);
6723}

References AcceptInvalidationMessages(), Assert, AtInplace_Inval(), BUFFER_LOCK_UNLOCK, BufferGetBlock(), BufferGetPage(), BufferGetTag(), CacheInvalidateHeapTuple(), elog, END_CRIT_SECTION, ERROR, fb(), inplaceGetInvalidationMessages(), InplaceUpdateTupleLock, IsBootstrapProcessingMode, ItemPointerEquals(), ItemPointerGetOffsetNumber(), LockBuffer(), lower(), MAIN_FORKNUM, MarkBufferDirty(), MinSizeOfHeapInplace, MyDatabaseId, MyDatabaseTableSpace, PageSetLSN(), PreInplace_Inval(), REGBUF_STANDARD, RelationNeedsWAL, START_CRIT_SECTION, HeapTupleData::t_data, HeapTupleHeaderData::t_hoff, HeapTupleData::t_len, HeapTupleData::t_self, UnlockTuple(), upper(), XLOG_HEAP_INPLACE, XLogBeginInsert(), XLogInsert(), XLogRegisterBlock(), XLogRegisterBufData(), XLogRegisterData(), and XLogStandbyInfoActive.

Referenced by systable_inplace_update_finish().

◆ heap_insert()

void heap_insert ( Relation  relation,
HeapTuple  tup,
CommandId  cid,
int  options,
BulkInsertState  bistate 
)

Definition at line 2152 of file heapam.c.

2154{
2157 Buffer buffer;
2158 Buffer vmbuffer = InvalidBuffer;
2159 bool all_visible_cleared = false;
2160
2161 /* Cheap, simplistic check that the tuple matches the rel's rowtype. */
2164
2165 AssertHasSnapshotForToast(relation);
2166
2167 /*
2168 * Fill in tuple header fields and toast the tuple if necessary.
2169 *
2170 * Note: below this point, heaptup is the data we actually intend to store
2171 * into the relation; tup is the caller's original untoasted data.
2172 */
2173 heaptup = heap_prepare_insert(relation, tup, xid, cid, options);
2174
2175 /*
2176 * Find buffer to insert this tuple into. If the page is all visible,
2177 * this will also pin the requisite visibility map page.
2178 */
2179 buffer = RelationGetBufferForTuple(relation, heaptup->t_len,
2180 InvalidBuffer, options, bistate,
2181 &vmbuffer, NULL,
2182 0);
2183
2184 /*
2185 * We're about to do the actual insert -- but check for conflict first, to
2186 * avoid possibly having to roll back work we've just done.
2187 *
2188 * This is safe without a recheck as long as there is no possibility of
2189 * another process scanning the page between this check and the insert
2190 * being visible to the scan (i.e., an exclusive buffer content lock is
2191 * continuously held from this point until the tuple insert is visible).
2192 *
2193 * For a heap insert, we only need to check for table-level SSI locks. Our
2194 * new tuple can't possibly conflict with existing tuple locks, and heap
2195 * page locks are only consolidated versions of tuple locks; they do not
2196 * lock "gaps" as index page locks do. So we don't need to specify a
2197 * buffer when making the call, which makes for a faster check.
2198 */
2200
2201 /* NO EREPORT(ERROR) from here till changes are logged */
2203
2204 RelationPutHeapTuple(relation, buffer, heaptup,
2206
2207 if (PageIsAllVisible(BufferGetPage(buffer)))
2208 {
2209 all_visible_cleared = true;
2211 visibilitymap_clear(relation,
2213 vmbuffer, VISIBILITYMAP_VALID_BITS);
2214 }
2215
2216 /*
2217 * XXX Should we set PageSetPrunable on this page ?
2218 *
2219 * The inserting transaction may eventually abort thus making this tuple
2220 * DEAD and hence available for pruning. Though we don't want to optimize
2221 * for aborts, if no other tuple in this page is UPDATEd/DELETEd, the
2222 * aborted tuple will never be pruned until next vacuum is triggered.
2223 *
2224 * If you do add PageSetPrunable here, add it in heap_xlog_insert too.
2225 */
2226
2227 MarkBufferDirty(buffer);
2228
2229 /* XLOG stuff */
2230 if (RelationNeedsWAL(relation))
2231 {
2235 Page page = BufferGetPage(buffer);
2236 uint8 info = XLOG_HEAP_INSERT;
2237 int bufflags = 0;
2238
2239 /*
2240 * If this is a catalog, we need to transmit combo CIDs to properly
2241 * decode, so log that as well.
2242 */
2244 log_heap_new_cid(relation, heaptup);
2245
2246 /*
2247 * If this is the single and first tuple on page, we can reinit the
2248 * page instead of restoring the whole thing. Set flag, and hide
2249 * buffer references from XLogInsert.
2250 */
2253 {
2254 info |= XLOG_HEAP_INIT_PAGE;
2256 }
2257
2258 xlrec.offnum = ItemPointerGetOffsetNumber(&heaptup->t_self);
2259 xlrec.flags = 0;
2265
2266 /*
2267 * For logical decoding, we need the tuple even if we're doing a full
2268 * page write, so make sure it's included even if we take a full-page
2269 * image. (XXX We could alternatively store a pointer into the FPW).
2270 */
2271 if (RelationIsLogicallyLogged(relation) &&
2273 {
2276
2277 if (IsToastRelation(relation))
2279 }
2280
2283
2284 xlhdr.t_infomask2 = heaptup->t_data->t_infomask2;
2285 xlhdr.t_infomask = heaptup->t_data->t_infomask;
2286 xlhdr.t_hoff = heaptup->t_data->t_hoff;
2287
2288 /*
2289 * note we mark xlhdr as belonging to buffer; if XLogInsert decides to
2290 * write the whole page to the xlog, we don't need to store
2291 * xl_heap_header in the xlog.
2292 */
2295 /* PG73FORMAT: write bitmap [+ padding] [+ oid] + data */
2297 (char *) heaptup->t_data + SizeofHeapTupleHeader,
2299
2300 /* filtering by origin on a row level is much more efficient */
2302
2303 recptr = XLogInsert(RM_HEAP_ID, info);
2304
2305 PageSetLSN(page, recptr);
2306 }
2307
2309
2310 UnlockReleaseBuffer(buffer);
2311 if (vmbuffer != InvalidBuffer)
2312 ReleaseBuffer(vmbuffer);
2313
2314 /*
2315 * If tuple is cacheable, mark it for invalidation from the caches in case
2316 * we abort. Note it is OK to do this after releasing the buffer, because
2317 * the heaptup data structure is all in local memory, not in the shared
2318 * buffer.
2319 */
2321
2322 /* Note: speculative insertions are counted too, even if aborted later */
2323 pgstat_count_heap_insert(relation, 1);
2324
2325 /*
2326 * If heaptup is a private copy, release it. Don't forget to copy t_self
2327 * back to the caller's image, too.
2328 */
2329 if (heaptup != tup)
2330 {
2331 tup->t_self = heaptup->t_self;
2333 }
2334}

References Assert, AssertHasSnapshotForToast(), BufferGetBlockNumber(), BufferGetPage(), CacheInvalidateHeapTuple(), CheckForSerializableConflictIn(), END_CRIT_SECTION, fb(), FirstOffsetNumber, GetCurrentTransactionId(), heap_freetuple(), HEAP_INSERT_NO_LOGICAL, HEAP_INSERT_SPECULATIVE, heap_prepare_insert(), HeapTupleHeaderGetNatts, InvalidBlockNumber, InvalidBuffer, IsToastRelation(), ItemPointerGetBlockNumber(), ItemPointerGetOffsetNumber(), log_heap_new_cid(), MarkBufferDirty(), PageClearAllVisible(), PageGetMaxOffsetNumber(), PageIsAllVisible(), PageSetLSN(), pgstat_count_heap_insert(), REGBUF_KEEP_DATA, REGBUF_STANDARD, REGBUF_WILL_INIT, RelationGetBufferForTuple(), RelationGetNumberOfAttributes, RelationIsAccessibleInLogicalDecoding, RelationIsLogicallyLogged, RelationNeedsWAL, RelationPutHeapTuple(), ReleaseBuffer(), SizeOfHeapHeader, SizeOfHeapInsert, SizeofHeapTupleHeader, START_CRIT_SECTION, UnlockReleaseBuffer(), visibilitymap_clear(), VISIBILITYMAP_VALID_BITS, XLH_INSERT_ALL_VISIBLE_CLEARED, XLH_INSERT_CONTAINS_NEW_TUPLE, XLH_INSERT_IS_SPECULATIVE, XLH_INSERT_ON_TOAST_RELATION, XLOG_HEAP_INIT_PAGE, XLOG_HEAP_INSERT, XLOG_INCLUDE_ORIGIN, XLogBeginInsert(), XLogInsert(), XLogRegisterBufData(), XLogRegisterBuffer(), XLogRegisterData(), and XLogSetRecordFlags().

Referenced by heapam_tuple_insert(), heapam_tuple_insert_speculative(), simple_heap_insert(), and toast_save_datum().

◆ heap_lock_tuple()

TM_Result heap_lock_tuple ( Relation  relation,
HeapTuple  tuple,
CommandId  cid,
LockTupleMode  mode,
LockWaitPolicy  wait_policy,
bool  follow_updates,
Buffer buffer,
TM_FailureData tmfd 
)

Definition at line 4658 of file heapam.c.

4662{
4663 TM_Result result;
4664 ItemPointer tid = &(tuple->t_self);
4665 ItemId lp;
4666 Page page;
4667 Buffer vmbuffer = InvalidBuffer;
4668 BlockNumber block;
4669 TransactionId xid,
4670 xmax;
4674 bool first_time = true;
4675 bool skip_tuple_lock = false;
4676 bool have_tuple_lock = false;
4677 bool cleared_all_frozen = false;
4678
4679 *buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid));
4680 block = ItemPointerGetBlockNumber(tid);
4681
4682 /*
4683 * Before locking the buffer, pin the visibility map page if it appears to
4684 * be necessary. Since we haven't got the lock yet, someone else might be
4685 * in the middle of changing this, so we'll need to recheck after we have
4686 * the lock.
4687 */
4688 if (PageIsAllVisible(BufferGetPage(*buffer)))
4689 visibilitymap_pin(relation, block, &vmbuffer);
4690
4692
4693 page = BufferGetPage(*buffer);
4696
4697 tuple->t_data = (HeapTupleHeader) PageGetItem(page, lp);
4698 tuple->t_len = ItemIdGetLength(lp);
4699 tuple->t_tableOid = RelationGetRelid(relation);
4700
4701l3:
4702 result = HeapTupleSatisfiesUpdate(tuple, cid, *buffer);
4703
4704 if (result == TM_Invisible)
4705 {
4706 /*
4707 * This is possible, but only when locking a tuple for ON CONFLICT DO
4708 * SELECT/UPDATE. We return this value here rather than throwing an
4709 * error in order to give that case the opportunity to throw a more
4710 * specific error.
4711 */
4712 result = TM_Invisible;
4713 goto out_locked;
4714 }
4715 else if (result == TM_BeingModified ||
4716 result == TM_Updated ||
4717 result == TM_Deleted)
4718 {
4722 bool require_sleep;
4723 ItemPointerData t_ctid;
4724
4725 /* must copy state data before unlocking buffer */
4727 infomask = tuple->t_data->t_infomask;
4728 infomask2 = tuple->t_data->t_infomask2;
4729 ItemPointerCopy(&tuple->t_data->t_ctid, &t_ctid);
4730
4732
4733 /*
4734 * If any subtransaction of the current top transaction already holds
4735 * a lock as strong as or stronger than what we're requesting, we
4736 * effectively hold the desired lock already. We *must* succeed
4737 * without trying to take the tuple lock, else we will deadlock
4738 * against anyone wanting to acquire a stronger lock.
4739 *
4740 * Note we only do this the first time we loop on the HTSU result;
4741 * there is no point in testing in subsequent passes, because
4742 * evidently our own transaction cannot have acquired a new lock after
4743 * the first time we checked.
4744 */
4745 if (first_time)
4746 {
4747 first_time = false;
4748
4750 {
4751 int i;
4752 int nmembers;
4753 MultiXactMember *members;
4754
4755 /*
4756 * We don't need to allow old multixacts here; if that had
4757 * been the case, HeapTupleSatisfiesUpdate would have returned
4758 * MayBeUpdated and we wouldn't be here.
4759 */
4760 nmembers =
4761 GetMultiXactIdMembers(xwait, &members, false,
4763
4764 for (i = 0; i < nmembers; i++)
4765 {
4766 /* only consider members of our own transaction */
4767 if (!TransactionIdIsCurrentTransactionId(members[i].xid))
4768 continue;
4769
4770 if (TUPLOCK_from_mxstatus(members[i].status) >= mode)
4771 {
4772 pfree(members);
4773 result = TM_Ok;
4774 goto out_unlocked;
4775 }
4776 else
4777 {
4778 /*
4779 * Disable acquisition of the heavyweight tuple lock.
4780 * Otherwise, when promoting a weaker lock, we might
4781 * deadlock with another locker that has acquired the
4782 * heavyweight tuple lock and is waiting for our
4783 * transaction to finish.
4784 *
4785 * Note that in this case we still need to wait for
4786 * the multixact if required, to avoid acquiring
4787 * conflicting locks.
4788 */
4789 skip_tuple_lock = true;
4790 }
4791 }
4792
4793 if (members)
4794 pfree(members);
4795 }
4797 {
4798 switch (mode)
4799 {
4800 case LockTupleKeyShare:
4804 result = TM_Ok;
4805 goto out_unlocked;
4806 case LockTupleShare:
4809 {
4810 result = TM_Ok;
4811 goto out_unlocked;
4812 }
4813 break;
4816 {
4817 result = TM_Ok;
4818 goto out_unlocked;
4819 }
4820 break;
4821 case LockTupleExclusive:
4824 {
4825 result = TM_Ok;
4826 goto out_unlocked;
4827 }
4828 break;
4829 }
4830 }
4831 }
4832
4833 /*
4834 * Initially assume that we will have to wait for the locking
4835 * transaction(s) to finish. We check various cases below in which
4836 * this can be turned off.
4837 */
4838 require_sleep = true;
4839 if (mode == LockTupleKeyShare)
4840 {
4841 /*
4842 * If we're requesting KeyShare, and there's no update present, we
4843 * don't need to wait. Even if there is an update, we can still
4844 * continue if the key hasn't been modified.
4845 *
4846 * However, if there are updates, we need to walk the update chain
4847 * to mark future versions of the row as locked, too. That way,
4848 * if somebody deletes that future version, we're protected
4849 * against the key going away. This locking of future versions
4850 * could block momentarily, if a concurrent transaction is
4851 * deleting a key; or it could return a value to the effect that
4852 * the transaction deleting the key has already committed. So we
4853 * do this before re-locking the buffer; otherwise this would be
4854 * prone to deadlocks.
4855 *
4856 * Note that the TID we're locking was grabbed before we unlocked
4857 * the buffer. For it to change while we're not looking, the
4858 * other properties we're testing for below after re-locking the
4859 * buffer would also change, in which case we would restart this
4860 * loop above.
4861 */
4863 {
4864 bool updated;
4865
4867
4868 /*
4869 * If there are updates, follow the update chain; bail out if
4870 * that cannot be done.
4871 */
4872 if (follow_updates && updated &&
4873 !ItemPointerEquals(&tuple->t_self, &t_ctid))
4874 {
4875 TM_Result res;
4876
4877 res = heap_lock_updated_tuple(relation,
4878 infomask, xwait, &t_ctid,
4880 mode);
4881 if (res != TM_Ok)
4882 {
4883 result = res;
4884 /* recovery code expects to have buffer lock held */
4886 goto failed;
4887 }
4888 }
4889
4891
4892 /*
4893 * Make sure it's still an appropriate lock, else start over.
4894 * Also, if it wasn't updated before we released the lock, but
4895 * is updated now, we start over too; the reason is that we
4896 * now need to follow the update chain to lock the new
4897 * versions.
4898 */
4899 if (!HeapTupleHeaderIsOnlyLocked(tuple->t_data) &&
4900 ((tuple->t_data->t_infomask2 & HEAP_KEYS_UPDATED) ||
4901 !updated))
4902 goto l3;
4903
4904 /* Things look okay, so we can skip sleeping */
4905 require_sleep = false;
4906
4907 /*
4908 * Note we allow Xmax to change here; other updaters/lockers
4909 * could have modified it before we grabbed the buffer lock.
4910 * However, this is not a problem, because with the recheck we
4911 * just did we ensure that they still don't conflict with the
4912 * lock we want.
4913 */
4914 }
4915 }
4916 else if (mode == LockTupleShare)
4917 {
4918 /*
4919 * If we're requesting Share, we can similarly avoid sleeping if
4920 * there's no update and no exclusive lock present.
4921 */
4924 {
4926
4927 /*
4928 * Make sure it's still an appropriate lock, else start over.
4929 * See above about allowing xmax to change.
4930 */
4933 goto l3;
4934 require_sleep = false;
4935 }
4936 }
4937 else if (mode == LockTupleNoKeyExclusive)
4938 {
4939 /*
4940 * If we're requesting NoKeyExclusive, we might also be able to
4941 * avoid sleeping; just ensure that there no conflicting lock
4942 * already acquired.
4943 */
4945 {
4947 mode, NULL))
4948 {
4949 /*
4950 * No conflict, but if the xmax changed under us in the
4951 * meantime, start over.
4952 */
4956 xwait))
4957 goto l3;
4958
4959 /* otherwise, we're good */
4960 require_sleep = false;
4961 }
4962 }
4964 {
4966
4967 /* if the xmax changed in the meantime, start over */
4970 xwait))
4971 goto l3;
4972 /* otherwise, we're good */
4973 require_sleep = false;
4974 }
4975 }
4976
4977 /*
4978 * As a check independent from those above, we can also avoid sleeping
4979 * if the current transaction is the sole locker of the tuple. Note
4980 * that the strength of the lock already held is irrelevant; this is
4981 * not about recording the lock in Xmax (which will be done regardless
4982 * of this optimization, below). Also, note that the cases where we
4983 * hold a lock stronger than we are requesting are already handled
4984 * above by not doing anything.
4985 *
4986 * Note we only deal with the non-multixact case here; MultiXactIdWait
4987 * is well equipped to deal with this situation on its own.
4988 */
4991 {
4992 /* ... but if the xmax changed in the meantime, start over */
4996 xwait))
4997 goto l3;
4999 require_sleep = false;
5000 }
5001
5002 /*
5003 * Time to sleep on the other transaction/multixact, if necessary.
5004 *
5005 * If the other transaction is an update/delete that's already
5006 * committed, then sleeping cannot possibly do any good: if we're
5007 * required to sleep, get out to raise an error instead.
5008 *
5009 * By here, we either have already acquired the buffer exclusive lock,
5010 * or we must wait for the locking transaction or multixact; so below
5011 * we ensure that we grab buffer lock after the sleep.
5012 */
5013 if (require_sleep && (result == TM_Updated || result == TM_Deleted))
5014 {
5016 goto failed;
5017 }
5018 else if (require_sleep)
5019 {
5020 /*
5021 * Acquire tuple lock to establish our priority for the tuple, or
5022 * die trying. LockTuple will release us when we are next-in-line
5023 * for the tuple. We must do this even if we are share-locking,
5024 * but not if we already have a weaker lock on the tuple.
5025 *
5026 * If we are forced to "start over" below, we keep the tuple lock;
5027 * this arranges that we stay at the head of the line while
5028 * rechecking tuple state.
5029 */
5030 if (!skip_tuple_lock &&
5031 !heap_acquire_tuplock(relation, tid, mode, wait_policy,
5033 {
5034 /*
5035 * This can only happen if wait_policy is Skip and the lock
5036 * couldn't be obtained.
5037 */
5038 result = TM_WouldBlock;
5039 /* recovery code expects to have buffer lock held */
5041 goto failed;
5042 }
5043
5045 {
5047
5048 /* We only ever lock tuples, never update them */
5049 if (status >= MultiXactStatusNoKeyUpdate)
5050 elog(ERROR, "invalid lock mode in heap_lock_tuple");
5051
5052 /* wait for multixact to end, or die trying */
5053 switch (wait_policy)
5054 {
5055 case LockWaitBlock:
5057 relation, &tuple->t_self, XLTW_Lock, NULL);
5058 break;
5059 case LockWaitSkip:
5061 status, infomask, relation,
5062 NULL, false))
5063 {
5064 result = TM_WouldBlock;
5065 /* recovery code expects to have buffer lock held */
5067 goto failed;
5068 }
5069 break;
5070 case LockWaitError:
5072 status, infomask, relation,
5074 ereport(ERROR,
5076 errmsg("could not obtain lock on row in relation \"%s\"",
5077 RelationGetRelationName(relation))));
5078
5079 break;
5080 }
5081
5082 /*
5083 * Of course, the multixact might not be done here: if we're
5084 * requesting a light lock mode, other transactions with light
5085 * locks could still be alive, as well as locks owned by our
5086 * own xact or other subxacts of this backend. We need to
5087 * preserve the surviving MultiXact members. Note that it
5088 * isn't absolutely necessary in the latter case, but doing so
5089 * is simpler.
5090 */
5091 }
5092 else
5093 {
5094 /* wait for regular transaction to end, or die trying */
5095 switch (wait_policy)
5096 {
5097 case LockWaitBlock:
5098 XactLockTableWait(xwait, relation, &tuple->t_self,
5099 XLTW_Lock);
5100 break;
5101 case LockWaitSkip:
5103 {
5104 result = TM_WouldBlock;
5105 /* recovery code expects to have buffer lock held */
5107 goto failed;
5108 }
5109 break;
5110 case LockWaitError:
5112 ereport(ERROR,
5114 errmsg("could not obtain lock on row in relation \"%s\"",
5115 RelationGetRelationName(relation))));
5116 break;
5117 }
5118 }
5119
5120 /* if there are updates, follow the update chain */
5122 !ItemPointerEquals(&tuple->t_self, &t_ctid))
5123 {
5124 TM_Result res;
5125
5126 res = heap_lock_updated_tuple(relation,
5127 infomask, xwait, &t_ctid,
5129 mode);
5130 if (res != TM_Ok)
5131 {
5132 result = res;
5133 /* recovery code expects to have buffer lock held */
5135 goto failed;
5136 }
5137 }
5138
5140
5141 /*
5142 * xwait is done, but if xwait had just locked the tuple then some
5143 * other xact could update this tuple before we get to this point.
5144 * Check for xmax change, and start over if so.
5145 */
5148 xwait))
5149 goto l3;
5150
5152 {
5153 /*
5154 * Otherwise check if it committed or aborted. Note we cannot
5155 * be here if the tuple was only locked by somebody who didn't
5156 * conflict with us; that would have been handled above. So
5157 * that transaction must necessarily be gone by now. But
5158 * don't check for this in the multixact case, because some
5159 * locker transactions might still be running.
5160 */
5161 UpdateXmaxHintBits(tuple->t_data, *buffer, xwait);
5162 }
5163 }
5164
5165 /* By here, we're certain that we hold buffer exclusive lock again */
5166
5167 /*
5168 * We may lock if previous xmax aborted, or if it committed but only
5169 * locked the tuple without updating it; or if we didn't have to wait
5170 * at all for whatever reason.
5171 */
5172 if (!require_sleep ||
5173 (tuple->t_data->t_infomask & HEAP_XMAX_INVALID) ||
5176 result = TM_Ok;
5177 else if (!ItemPointerEquals(&tuple->t_self, &tuple->t_data->t_ctid))
5178 result = TM_Updated;
5179 else
5180 result = TM_Deleted;
5181 }
5182
5183failed:
5184 if (result != TM_Ok)
5185 {
5186 Assert(result == TM_SelfModified || result == TM_Updated ||
5187 result == TM_Deleted || result == TM_WouldBlock);
5188
5189 /*
5190 * When locking a tuple under LockWaitSkip semantics and we fail with
5191 * TM_WouldBlock above, it's possible for concurrent transactions to
5192 * release the lock and set HEAP_XMAX_INVALID in the meantime. So
5193 * this assert is slightly different from the equivalent one in
5194 * heap_delete and heap_update.
5195 */
5196 Assert((result == TM_WouldBlock) ||
5197 !(tuple->t_data->t_infomask & HEAP_XMAX_INVALID));
5198 Assert(result != TM_Updated ||
5199 !ItemPointerEquals(&tuple->t_self, &tuple->t_data->t_ctid));
5200 tmfd->ctid = tuple->t_data->t_ctid;
5201 tmfd->xmax = HeapTupleHeaderGetUpdateXid(tuple->t_data);
5202 if (result == TM_SelfModified)
5203 tmfd->cmax = HeapTupleHeaderGetCmax(tuple->t_data);
5204 else
5205 tmfd->cmax = InvalidCommandId;
5206 goto out_locked;
5207 }
5208
5209 /*
5210 * If we didn't pin the visibility map page and the page has become all
5211 * visible while we were busy locking the buffer, or during some
5212 * subsequent window during which we had it unlocked, we'll have to unlock
5213 * and re-lock, to avoid holding the buffer lock across I/O. That's a bit
5214 * unfortunate, especially since we'll now have to recheck whether the
5215 * tuple has been locked or updated under us, but hopefully it won't
5216 * happen very often.
5217 */
5218 if (vmbuffer == InvalidBuffer && PageIsAllVisible(page))
5219 {
5221 visibilitymap_pin(relation, block, &vmbuffer);
5223 goto l3;
5224 }
5225
5226 xmax = HeapTupleHeaderGetRawXmax(tuple->t_data);
5227 old_infomask = tuple->t_data->t_infomask;
5228
5229 /*
5230 * If this is the first possibly-multixact-able operation in the current
5231 * transaction, set my per-backend OldestMemberMXactId setting. We can be
5232 * certain that the transaction will never become a member of any older
5233 * MultiXactIds than that. (We have to do this even if we end up just
5234 * using our own TransactionId below, since some other backend could
5235 * incorporate our XID into a MultiXact immediately afterwards.)
5236 */
5238
5239 /*
5240 * Compute the new xmax and infomask to store into the tuple. Note we do
5241 * not modify the tuple just yet, because that would leave it in the wrong
5242 * state if multixact.c elogs.
5243 */
5245 GetCurrentTransactionId(), mode, false,
5246 &xid, &new_infomask, &new_infomask2);
5247
5249
5250 /*
5251 * Store transaction information of xact locking the tuple.
5252 *
5253 * Note: Cmax is meaningless in this context, so don't set it; this avoids
5254 * possibly generating a useless combo CID. Moreover, if we're locking a
5255 * previously updated tuple, it's important to preserve the Cmax.
5256 *
5257 * Also reset the HOT UPDATE bit, but only if there's no update; otherwise
5258 * we would break the HOT chain.
5259 */
5262 tuple->t_data->t_infomask |= new_infomask;
5263 tuple->t_data->t_infomask2 |= new_infomask2;
5266 HeapTupleHeaderSetXmax(tuple->t_data, xid);
5267
5268 /*
5269 * Make sure there is no forward chain link in t_ctid. Note that in the
5270 * cases where the tuple has been updated, we must not overwrite t_ctid,
5271 * because it was set by the updater. Moreover, if the tuple has been
5272 * updated, we need to follow the update chain to lock the new versions of
5273 * the tuple as well.
5274 */
5276 tuple->t_data->t_ctid = *tid;
5277
5278 /* Clear only the all-frozen bit on visibility map if needed */
5279 if (PageIsAllVisible(page) &&
5280 visibilitymap_clear(relation, block, vmbuffer,
5282 cleared_all_frozen = true;
5283
5284
5285 MarkBufferDirty(*buffer);
5286
5287 /*
5288 * XLOG stuff. You might think that we don't need an XLOG record because
5289 * there is no state change worth restoring after a crash. You would be
5290 * wrong however: we have just written either a TransactionId or a
5291 * MultiXactId that may never have been seen on disk before, and we need
5292 * to make sure that there are XLOG entries covering those ID numbers.
5293 * Else the same IDs might be re-used after a crash, which would be
5294 * disastrous if this page made it to disk before the crash. Essentially
5295 * we have to enforce the WAL log-before-data rule even in this case.
5296 * (Also, in a PITR log-shipping or 2PC environment, we have to have XLOG
5297 * entries for everything anyway.)
5298 */
5299 if (RelationNeedsWAL(relation))
5300 {
5303
5306
5307 xlrec.offnum = ItemPointerGetOffsetNumber(&tuple->t_self);
5308 xlrec.xmax = xid;
5309 xlrec.infobits_set = compute_infobits(new_infomask,
5310 tuple->t_data->t_infomask2);
5313
5314 /* we don't decode row locks atm, so no need to log the origin */
5315
5317
5318 PageSetLSN(page, recptr);
5319 }
5320
5322
5323 result = TM_Ok;
5324
5327
5329 if (BufferIsValid(vmbuffer))
5330 ReleaseBuffer(vmbuffer);
5331
5332 /*
5333 * Don't update the visibility map here. Locking a tuple doesn't change
5334 * visibility info.
5335 */
5336
5337 /*
5338 * Now that we have successfully marked the tuple as locked, we can
5339 * release the lmgr tuple lock, if we had it.
5340 */
5341 if (have_tuple_lock)
5342 UnlockTupleTuplock(relation, tid, mode);
5343
5344 return result;
5345}

References Assert, BUFFER_LOCK_EXCLUSIVE, BUFFER_LOCK_UNLOCK, BufferGetPage(), BufferIsValid(), TM_FailureData::cmax, compute_infobits(), compute_new_xmax_infomask(), ConditionalMultiXactIdWait(), ConditionalXactLockTableWait(), TM_FailureData::ctid, DoesMultiXactIdConflict(), elog, END_CRIT_SECTION, ereport, errcode(), errmsg, ERROR, fb(), get_mxact_status_for_lock(), GetCurrentTransactionId(), GetMultiXactIdMembers(), heap_acquire_tuplock(), HEAP_KEYS_UPDATED, heap_lock_updated_tuple(), HEAP_XMAX_INVALID, HEAP_XMAX_IS_EXCL_LOCKED(), HEAP_XMAX_IS_KEYSHR_LOCKED(), HEAP_XMAX_IS_LOCKED_ONLY(), HEAP_XMAX_IS_MULTI, HEAP_XMAX_IS_SHR_LOCKED(), HeapTupleHeaderClearHotUpdated(), HeapTupleHeaderGetCmax(), HeapTupleHeaderGetRawXmax(), HeapTupleHeaderGetUpdateXid(), HeapTupleHeaderIsOnlyLocked(), HeapTupleHeaderSetXmax(), HeapTupleSatisfiesUpdate(), i, InvalidBuffer, InvalidCommandId, ItemIdGetLength, ItemIdIsNormal, ItemPointerCopy(), ItemPointerEquals(), ItemPointerGetBlockNumber(), ItemPointerGetOffsetNumber(), LockBuffer(), LockTupleExclusive, LockTupleKeyShare, LockTupleNoKeyExclusive, LockTupleShare, LockWaitBlock, LockWaitError, LockWaitSkip, log_lock_failures, MarkBufferDirty(), mode, MultiXactIdSetOldestMember(), MultiXactIdWait(), MultiXactStatusNoKeyUpdate, PageGetItem(), PageGetItemId(), PageIsAllVisible(), PageSetLSN(), pfree(), ReadBuffer(), REGBUF_STANDARD, RelationGetRelationName, RelationGetRelid, RelationNeedsWAL, ReleaseBuffer(), SizeOfHeapLock, START_CRIT_SECTION, HeapTupleHeaderData::t_ctid, HeapTupleData::t_data, HeapTupleHeaderData::t_infomask, HeapTupleHeaderData::t_infomask2, HeapTupleData::t_len, HeapTupleData::t_self, HeapTupleData::t_tableOid, TM_BeingModified, TM_Deleted, TM_Invisible, TM_Ok, TM_SelfModified, TM_Updated, TM_WouldBlock, TransactionIdEquals, TransactionIdIsCurrentTransactionId(), TUPLOCK_from_mxstatus, UnlockTupleTuplock, UpdateXmaxHintBits(), VISIBILITYMAP_ALL_FROZEN, visibilitymap_clear(), visibilitymap_pin(), XactLockTableWait(), 
XLH_LOCK_ALL_FROZEN_CLEARED, XLOG_HEAP_LOCK, XLogBeginInsert(), XLogInsert(), XLogRegisterBuffer(), XLogRegisterData(), XLTW_Lock, TM_FailureData::xmax, and xmax_infomask_changed().

Referenced by heapam_tuple_lock().

◆ heap_lock_updated_tuple()

static TM_Result heap_lock_updated_tuple ( Relation  rel,
uint16  prior_infomask,
TransactionId  prior_raw_xmax,
const ItemPointerData prior_ctid,
TransactionId  xid,
LockTupleMode  mode 
)
static

Definition at line 6129 of file heapam.c.

6134{
6135 INJECTION_POINT("heap_lock_updated_tuple", NULL);
6136
6137 /*
 6138 * If the tuple has moved into another partition (effectively a
 6139 * delete), stop here.
6140 */
6142 {
6144
6145 /*
6146 * If this is the first possibly-multixact-able operation in the
6147 * current transaction, set my per-backend OldestMemberMXactId
6148 * setting. We can be certain that the transaction will never become a
6149 * member of any older MultiXactIds than that. (We have to do this
6150 * even if we end up just using our own TransactionId below, since
6151 * some other backend could incorporate our XID into a MultiXact
6152 * immediately afterwards.)
6153 */
6155
6159 }
6160
6161 /* nothing to lock */
6162 return TM_Ok;
6163}

References fb(), heap_lock_updated_tuple_rec(), HEAP_XMAX_IS_MULTI, INJECTION_POINT, ItemPointerIndicatesMovedPartitions(), mode, MultiXactIdGetUpdateXid(), MultiXactIdSetOldestMember(), and TM_Ok.

Referenced by heap_lock_tuple().

◆ heap_lock_updated_tuple_rec()

static TM_Result heap_lock_updated_tuple_rec ( Relation  rel,
TransactionId  priorXmax,
const ItemPointerData tid,
TransactionId  xid,
LockTupleMode  mode 
)
static

Definition at line 5781 of file heapam.c.

5784{
5785 TM_Result result;
5788 Buffer buf;
5793 TransactionId xmax,
5794 new_xmax;
5795 bool cleared_all_frozen = false;
5797 Buffer vmbuffer = InvalidBuffer;
5798 BlockNumber block;
5799
5800 ItemPointerCopy(tid, &tupid);
5801
5802 for (;;)
5803 {
5804 new_infomask = 0;
5805 new_xmax = InvalidTransactionId;
5807 ItemPointerCopy(&tupid, &(mytup.t_self));
5808
5809 if (!heap_fetch(rel, SnapshotAny, &mytup, &buf, false))
5810 {
5811 /*
5812 * if we fail to find the updated version of the tuple, it's
5813 * because it was vacuumed/pruned away after its creator
5814 * transaction aborted. So behave as if we got to the end of the
5815 * chain, and there's no further tuple to lock: return success to
5816 * caller.
5817 */
5818 result = TM_Ok;
5819 goto out_unlocked;
5820 }
5821
5822l4:
5824
5825 /*
5826 * Before locking the buffer, pin the visibility map page if it
5827 * appears to be necessary. Since we haven't got the lock yet,
5828 * someone else might be in the middle of changing this, so we'll need
5829 * to recheck after we have the lock.
5830 */
5832 {
5833 visibilitymap_pin(rel, block, &vmbuffer);
5834 pinned_desired_page = true;
5835 }
5836 else
5837 pinned_desired_page = false;
5838
5840
5841 /*
5842 * If we didn't pin the visibility map page and the page has become
5843 * all visible while we were busy locking the buffer, we'll have to
5844 * unlock and re-lock, to avoid holding the buffer lock across I/O.
5845 * That's a bit unfortunate, but hopefully shouldn't happen often.
5846 *
5847 * Note: in some paths through this function, we will reach here
5848 * holding a pin on a vm page that may or may not be the one matching
5849 * this page. If this page isn't all-visible, we won't use the vm
5850 * page, but we hold onto such a pin till the end of the function.
5851 */
5853 {
5855 visibilitymap_pin(rel, block, &vmbuffer);
5857 }
5858
5859 /*
5860 * Check the tuple XMIN against prior XMAX, if any. If we reached the
5861 * end of the chain, we're done, so return success.
5862 */
5865 priorXmax))
5866 {
5867 result = TM_Ok;
5868 goto out_locked;
5869 }
5870
5871 /*
5872 * Also check Xmin: if this tuple was created by an aborted
5873 * (sub)transaction, then we already locked the last live one in the
5874 * chain, thus we're done, so return success.
5875 */
5877 {
5878 result = TM_Ok;
5879 goto out_locked;
5880 }
5881
5882 old_infomask = mytup.t_data->t_infomask;
5883 old_infomask2 = mytup.t_data->t_infomask2;
5884 xmax = HeapTupleHeaderGetRawXmax(mytup.t_data);
5885
5886 /*
5887 * If this tuple version has been updated or locked by some concurrent
5888 * transaction(s), what we do depends on whether our lock mode
5889 * conflicts with what those other transactions hold, and also on the
5890 * status of them.
5891 */
5893 {
5895 bool needwait;
5896
5899 {
5900 int nmembers;
5901 int i;
5902 MultiXactMember *members;
5903
5904 /*
5905 * We don't need a test for pg_upgrade'd tuples: this is only
5906 * applied to tuples after the first in an update chain. Said
5907 * first tuple in the chain may well be locked-in-9.2-and-
5908 * pg_upgraded, but that one was already locked by our caller,
5909 * not us; and any subsequent ones cannot be because our
5910 * caller must necessarily have obtained a snapshot later than
5911 * the pg_upgrade itself.
5912 */
5913 Assert(!HEAP_LOCKED_UPGRADED(mytup.t_data->t_infomask));
5914
5915 nmembers = GetMultiXactIdMembers(rawxmax, &members, false,
5917 for (i = 0; i < nmembers; i++)
5918 {
5919 result = test_lockmode_for_conflict(members[i].status,
5920 members[i].xid,
5921 mode,
5922 &mytup,
5923 &needwait);
5924
5925 /*
5926 * If the tuple was already locked by ourselves in a
5927 * previous iteration of this (say heap_lock_tuple was
5928 * forced to restart the locking loop because of a change
5929 * in xmax), then we hold the lock already on this tuple
5930 * version and we don't need to do anything; and this is
5931 * not an error condition either. We just need to skip
5932 * this tuple and continue locking the next version in the
5933 * update chain.
5934 */
5935 if (result == TM_SelfModified)
5936 {
5937 pfree(members);
5938 goto next;
5939 }
5940
5941 if (needwait)
5942 {
5944 XactLockTableWait(members[i].xid, rel,
5945 &mytup.t_self,
5947 pfree(members);
5948 goto l4;
5949 }
5950 if (result != TM_Ok)
5951 {
5952 pfree(members);
5953 goto out_locked;
5954 }
5955 }
5956 if (members)
5957 pfree(members);
5958 }
5959 else
5960 {
5961 MultiXactStatus status;
5962
5963 /*
5964 * For a non-multi Xmax, we first need to compute the
5965 * corresponding MultiXactStatus by using the infomask bits.
5966 */
5968 {
5972 status = MultiXactStatusForShare;
5974 {
5976 status = MultiXactStatusForUpdate;
5977 else
5979 }
5980 else
5981 {
5982 /*
5983 * LOCK_ONLY present alone (a pg_upgraded tuple marked
5984 * as share-locked in the old cluster) shouldn't be
5985 * seen in the middle of an update chain.
5986 */
5987 elog(ERROR, "invalid lock status in tuple");
5988 }
5989 }
5990 else
5991 {
5992 /* it's an update, but which kind? */
5994 status = MultiXactStatusUpdate;
5995 else
5997 }
5998
5999 result = test_lockmode_for_conflict(status, rawxmax, mode,
6000 &mytup, &needwait);
6001
6002 /*
6003 * If the tuple was already locked by ourselves in a previous
6004 * iteration of this (say heap_lock_tuple was forced to
6005 * restart the locking loop because of a change in xmax), then
6006 * we hold the lock already on this tuple version and we don't
6007 * need to do anything; and this is not an error condition
6008 * either. We just need to skip this tuple and continue
6009 * locking the next version in the update chain.
6010 */
6011 if (result == TM_SelfModified)
6012 goto next;
6013
6014 if (needwait)
6015 {
6017 XactLockTableWait(rawxmax, rel, &mytup.t_self,
6019 goto l4;
6020 }
6021 if (result != TM_Ok)
6022 {
6023 goto out_locked;
6024 }
6025 }
6026 }
6027
6028 /* compute the new Xmax and infomask values for the tuple ... */
6029 compute_new_xmax_infomask(xmax, old_infomask, mytup.t_data->t_infomask2,
6030 xid, mode, false,
6031 &new_xmax, &new_infomask, &new_infomask2);
6032
6034 visibilitymap_clear(rel, block, vmbuffer,
6036 cleared_all_frozen = true;
6037
6039
6040 /* ... and set them */
6041 HeapTupleHeaderSetXmax(mytup.t_data, new_xmax);
6042 mytup.t_data->t_infomask &= ~HEAP_XMAX_BITS;
6043 mytup.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
6044 mytup.t_data->t_infomask |= new_infomask;
6045 mytup.t_data->t_infomask2 |= new_infomask2;
6046
6048
6049 /* XLOG stuff */
6050 if (RelationNeedsWAL(rel))
6051 {
6054 Page page = BufferGetPage(buf);
6055
6058
6059 xlrec.offnum = ItemPointerGetOffsetNumber(&mytup.t_self);
6060 xlrec.xmax = new_xmax;
6062 xlrec.flags =
6064
6066
6068
6069 PageSetLSN(page, recptr);
6070 }
6071
6073
6074next:
6075 /* if we find the end of update chain, we're done. */
6076 if (mytup.t_data->t_infomask & HEAP_XMAX_INVALID ||
6078 ItemPointerEquals(&mytup.t_self, &mytup.t_data->t_ctid) ||
6080 {
6081 result = TM_Ok;
6082 goto out_locked;
6083 }
6084
6085 /* tail recursion */
6087 ItemPointerCopy(&(mytup.t_data->t_ctid), &tupid);
6089 }
6090
6091 result = TM_Ok;
6092
6095
6097 if (vmbuffer != InvalidBuffer)
6098 ReleaseBuffer(vmbuffer);
6099
6100 return result;
6101}

References Assert, buf, BUFFER_LOCK_EXCLUSIVE, BUFFER_LOCK_UNLOCK, BufferGetPage(), CHECK_FOR_INTERRUPTS, compute_infobits(), compute_new_xmax_infomask(), elog, END_CRIT_SECTION, ERROR, fb(), GetMultiXactIdMembers(), heap_fetch(), HEAP_KEYS_UPDATED, HEAP_LOCKED_UPGRADED(), HEAP_XMAX_INVALID, HEAP_XMAX_IS_EXCL_LOCKED(), HEAP_XMAX_IS_KEYSHR_LOCKED(), HEAP_XMAX_IS_LOCKED_ONLY(), HEAP_XMAX_IS_MULTI, HEAP_XMAX_IS_SHR_LOCKED(), HeapTupleHeaderGetRawXmax(), HeapTupleHeaderGetUpdateXid(), HeapTupleHeaderGetXmin(), HeapTupleHeaderIndicatesMovedPartitions(), HeapTupleHeaderIsOnlyLocked(), HeapTupleHeaderSetXmax(), i, InvalidBuffer, InvalidTransactionId, ItemPointerCopy(), ItemPointerEquals(), ItemPointerGetBlockNumber(), ItemPointerGetOffsetNumber(), LockBuffer(), MarkBufferDirty(), mode, MultiXactStatusForKeyShare, MultiXactStatusForNoKeyUpdate, MultiXactStatusForShare, MultiXactStatusForUpdate, MultiXactStatusNoKeyUpdate, MultiXactStatusUpdate, next, PageIsAllVisible(), PageSetLSN(), pfree(), REGBUF_STANDARD, RelationNeedsWAL, ReleaseBuffer(), SizeOfHeapLockUpdated, SnapshotAny, START_CRIT_SECTION, test_lockmode_for_conflict(), TM_Ok, TM_SelfModified, TransactionIdDidAbort(), TransactionIdEquals, TransactionIdIsValid, UnlockReleaseBuffer(), VISIBILITYMAP_ALL_FROZEN, visibilitymap_clear(), visibilitymap_pin(), XactLockTableWait(), XLH_LOCK_ALL_FROZEN_CLEARED, XLOG_HEAP2_LOCK_UPDATED, XLogBeginInsert(), XLogInsert(), XLogRegisterBuffer(), XLogRegisterData(), and XLTW_LockUpdated.

Referenced by heap_lock_updated_tuple().

◆ heap_multi_insert()

void heap_multi_insert ( Relation  relation,
TupleTableSlot **  slots,
int  ntuples,
CommandId  cid,
int  options,
BulkInsertState  bistate 
)

Definition at line 2423 of file heapam.c.

2425{
2428 int i;
2429 int ndone;
2431 Page page;
2432 Buffer vmbuffer = InvalidBuffer;
2433 bool needwal;
2437 bool starting_with_empty_page = false;
2438 int npages = 0;
2439 int npages_used = 0;
2440
2441 /* currently not needed (thus unsupported) for heap_multi_insert() */
2443
2444 AssertHasSnapshotForToast(relation);
2445
2446 needwal = RelationNeedsWAL(relation);
2449
2450 /* Toast and set header data in all the slots */
2451 heaptuples = palloc(ntuples * sizeof(HeapTuple));
2452 for (i = 0; i < ntuples; i++)
2453 {
2454 HeapTuple tuple;
2455
2456 tuple = ExecFetchSlotHeapTuple(slots[i], true, NULL);
2457 slots[i]->tts_tableOid = RelationGetRelid(relation);
2458 tuple->t_tableOid = slots[i]->tts_tableOid;
2459 heaptuples[i] = heap_prepare_insert(relation, tuple, xid, cid,
2460 options);
2461 }
2462
2463 /*
2464 * We're about to do the actual inserts -- but check for conflict first,
2465 * to minimize the possibility of having to roll back work we've just
2466 * done.
2467 *
2468 * A check here does not definitively prevent a serialization anomaly;
2469 * that check MUST be done at least past the point of acquiring an
2470 * exclusive buffer content lock on every buffer that will be affected,
2471 * and MAY be done after all inserts are reflected in the buffers and
2472 * those locks are released; otherwise there is a race condition. Since
2473 * multiple buffers can be locked and unlocked in the loop below, and it
2474 * would not be feasible to identify and lock all of those buffers before
2475 * the loop, we must do a final check at the end.
2476 *
2477 * The check here could be omitted with no loss of correctness; it is
2478 * present strictly as an optimization.
2479 *
2480 * For heap inserts, we only need to check for table-level SSI locks. Our
2481 * new tuples can't possibly conflict with existing tuple locks, and heap
2482 * page locks are only consolidated versions of tuple locks; they do not
2483 * lock "gaps" as index page locks do. So we don't need to specify a
2484 * buffer when making the call, which makes for a faster check.
2485 */
2487
2488 ndone = 0;
2489 while (ndone < ntuples)
2490 {
2491 Buffer buffer;
2492 bool all_visible_cleared = false;
2493 bool all_frozen_set = false;
2494 int nthispage;
2495
2497
2498 /*
2499 * Compute number of pages needed to fit the to-be-inserted tuples in
2500 * the worst case. This will be used to determine how much to extend
2501 * the relation by in RelationGetBufferForTuple(), if needed. If we
2502 * filled a prior page from scratch, we can just update our last
2503 * computation, but if we started with a partially filled page,
 2504 * recompute from scratch, since the number of potentially required
 2505 * pages can vary due to tuples needing to fit onto the page, page
 2506 * headers, etc.
2507 */
2508 if (ndone == 0 || !starting_with_empty_page)
2509 {
2510 npages = heap_multi_insert_pages(heaptuples, ndone, ntuples,
2512 npages_used = 0;
2513 }
2514 else
2515 npages_used++;
2516
2517 /*
2518 * Find buffer where at least the next tuple will fit. If the page is
2519 * all-visible, this will also pin the requisite visibility map page.
2520 *
2521 * Also pin visibility map page if COPY FREEZE inserts tuples into an
2522 * empty page. See all_frozen_set below.
2523 */
2524 buffer = RelationGetBufferForTuple(relation, heaptuples[ndone]->t_len,
2525 InvalidBuffer, options, bistate,
2526 &vmbuffer, NULL,
2527 npages - npages_used);
2528 page = BufferGetPage(buffer);
2529
2531
2533 {
2534 all_frozen_set = true;
2535 /* Lock the vmbuffer before entering the critical section */
2537 }
2538
2539 /* NO EREPORT(ERROR) from here till changes are logged */
2541
2542 /*
2543 * RelationGetBufferForTuple has ensured that the first tuple fits.
2544 * Put that on the page, and then as many other tuples as fit.
2545 */
2546 RelationPutHeapTuple(relation, buffer, heaptuples[ndone], false);
2547
2548 /*
2549 * For logical decoding we need combo CIDs to properly decode the
2550 * catalog.
2551 */
2552 if (needwal && need_cids)
2553 log_heap_new_cid(relation, heaptuples[ndone]);
2554
2555 for (nthispage = 1; ndone + nthispage < ntuples; nthispage++)
2556 {
2558
2559 if (PageGetHeapFreeSpace(page) < MAXALIGN(heaptup->t_len) + saveFreeSpace)
2560 break;
2561
2562 RelationPutHeapTuple(relation, buffer, heaptup, false);
2563
2564 /*
2565 * For logical decoding we need combo CIDs to properly decode the
2566 * catalog.
2567 */
2568 if (needwal && need_cids)
2569 log_heap_new_cid(relation, heaptup);
2570 }
2571
2572 /*
2573 * If the page is all visible, need to clear that, unless we're only
2574 * going to add further frozen rows to it.
2575 *
2576 * If we're only adding already frozen rows to a previously empty
2577 * page, mark it as all-frozen and update the visibility map. We're
2578 * already holding a pin on the vmbuffer.
2579 */
2581 {
2582 all_visible_cleared = true;
2583 PageClearAllVisible(page);
2584 visibilitymap_clear(relation,
2585 BufferGetBlockNumber(buffer),
2586 vmbuffer, VISIBILITYMAP_VALID_BITS);
2587 }
2588 else if (all_frozen_set)
2589 {
2590 PageSetAllVisible(page);
2591 PageClearPrunable(page);
2593 vmbuffer,
2596 relation->rd_locator);
2597 }
2598
2599 /*
2600 * XXX Should we set PageSetPrunable on this page ? See heap_insert()
2601 */
2602
2603 MarkBufferDirty(buffer);
2604
2605 /* XLOG stuff */
2606 if (needwal)
2607 {
2611 char *tupledata;
2612 int totaldatalen;
2613 char *scratchptr = scratch.data;
2614 bool init;
2615 int bufflags = 0;
2616
2617 /*
2618 * If the page was previously empty, we can reinit the page
2619 * instead of restoring the whole thing.
2620 */
2622
2623 /* allocate xl_heap_multi_insert struct from the scratch area */
2626
2627 /*
2628 * Allocate offsets array. Unless we're reinitializing the page,
2629 * in that case the tuples are stored in order starting at
2630 * FirstOffsetNumber and we don't need to store the offsets
2631 * explicitly.
2632 */
2633 if (!init)
2634 scratchptr += nthispage * sizeof(OffsetNumber);
2635
2636 /* the rest of the scratch space is used for tuple data */
2637 tupledata = scratchptr;
2638
2639 /* check that the mutually exclusive flags are not both set */
2641
2642 xlrec->flags = 0;
2645
2646 /*
2647 * We don't have to worry about including a conflict xid in the
2648 * WAL record, as HEAP_INSERT_FROZEN intentionally violates
2649 * visibility rules.
2650 */
2651 if (all_frozen_set)
2653
2654 xlrec->ntuples = nthispage;
2655
2656 /*
2657 * Write out an xl_multi_insert_tuple and the tuple data itself
2658 * for each tuple.
2659 */
2660 for (i = 0; i < nthispage; i++)
2661 {
2663 xl_multi_insert_tuple *tuphdr;
2664 int datalen;
2665
2666 if (!init)
2667 xlrec->offsets[i] = ItemPointerGetOffsetNumber(&heaptup->t_self);
2668 /* xl_multi_insert_tuple needs two-byte alignment. */
2670 scratchptr = ((char *) tuphdr) + SizeOfMultiInsertTuple;
2671
2672 tuphdr->t_infomask2 = heaptup->t_data->t_infomask2;
2673 tuphdr->t_infomask = heaptup->t_data->t_infomask;
2674 tuphdr->t_hoff = heaptup->t_data->t_hoff;
2675
2676 /* write bitmap [+ padding] [+ oid] + data */
2677 datalen = heaptup->t_len - SizeofHeapTupleHeader;
2679 (char *) heaptup->t_data + SizeofHeapTupleHeader,
2680 datalen);
2681 tuphdr->datalen = datalen;
2682 scratchptr += datalen;
2683 }
2684 totaldatalen = scratchptr - tupledata;
2685 Assert((scratchptr - scratch.data) < BLCKSZ);
2686
2687 if (need_tuple_data)
2689
2690 /*
2691 * Signal that this is the last xl_heap_multi_insert record
2692 * emitted by this call to heap_multi_insert(). Needed for logical
2693 * decoding so it knows when to cleanup temporary data.
2694 */
2695 if (ndone + nthispage == ntuples)
2697
2698 if (init)
2699 {
2700 info |= XLOG_HEAP_INIT_PAGE;
2702 }
2703
2704 /*
2705 * If we're doing logical decoding, include the new tuple data
2706 * even if we take a full-page image of the page.
2707 */
2708 if (need_tuple_data)
2710
2712 XLogRegisterData(xlrec, tupledata - scratch.data);
2714 if (all_frozen_set)
2715 XLogRegisterBuffer(1, vmbuffer, 0);
2716
2717 XLogRegisterBufData(0, tupledata, totaldatalen);
2718
2719 /* filtering by origin on a row level is much more efficient */
2721
2722 recptr = XLogInsert(RM_HEAP2_ID, info);
2723
2724 PageSetLSN(page, recptr);
2725 if (all_frozen_set)
2726 {
2727 Assert(BufferIsDirty(vmbuffer));
2728 PageSetLSN(BufferGetPage(vmbuffer), recptr);
2729 }
2730 }
2731
2733
2734 if (all_frozen_set)
2735 LockBuffer(vmbuffer, BUFFER_LOCK_UNLOCK);
2736
2737 UnlockReleaseBuffer(buffer);
2738 ndone += nthispage;
2739
2740 /*
2741 * NB: Only release vmbuffer after inserting all tuples - it's fairly
2742 * likely that we'll insert into subsequent heap pages that are likely
2743 * to use the same vm page.
2744 */
2745 }
2746
2747 /* We're done with inserting all tuples, so release the last vmbuffer. */
2748 if (vmbuffer != InvalidBuffer)
2749 ReleaseBuffer(vmbuffer);
2750
2751 /*
2752 * We're done with the actual inserts. Check for conflicts again, to
2753 * ensure that all rw-conflicts in to these inserts are detected. Without
2754 * this final check, a sequential scan of the heap may have locked the
2755 * table after the "before" check, missing one opportunity to detect the
2756 * conflict, and then scanned the table before the new tuples were there,
2757 * missing the other chance to detect the conflict.
2758 *
2759 * For heap inserts, we only need to check for table-level SSI locks. Our
2760 * new tuples can't possibly conflict with existing tuple locks, and heap
2761 * page locks are only consolidated versions of tuple locks; they do not
2762 * lock "gaps" as index page locks do. So we don't need to specify a
2763 * buffer when making the call.
2764 */
2766
2767 /*
2768 * If tuples are cacheable, mark them for invalidation from the caches in
2769 * case we abort. Note it is OK to do this after releasing the buffer,
2770 * because the heaptuples data structure is all in local memory, not in
2771 * the shared buffer.
2772 */
2773 if (IsCatalogRelation(relation))
2774 {
2775 for (i = 0; i < ntuples; i++)
2777 }
2778
2779 /* copy t_self fields back to the caller's slots */
2780 for (i = 0; i < ntuples; i++)
2781 slots[i]->tts_tid = heaptuples[i]->t_self;
2782
2783 pgstat_count_heap_insert(relation, ntuples);
2784}

References Assert, AssertHasSnapshotForToast(), BUFFER_LOCK_EXCLUSIVE, BUFFER_LOCK_UNLOCK, BufferGetBlockNumber(), BufferGetPage(), BufferIsDirty(), CacheInvalidateHeapTuple(), CHECK_FOR_INTERRUPTS, CheckForSerializableConflictIn(), xl_multi_insert_tuple::datalen, END_CRIT_SECTION, ExecFetchSlotHeapTuple(), fb(), GetCurrentTransactionId(), HEAP_DEFAULT_FILLFACTOR, HEAP_INSERT_FROZEN, HEAP_INSERT_NO_LOGICAL, heap_multi_insert_pages(), heap_prepare_insert(), i, init, InvalidBlockNumber, InvalidBuffer, IsCatalogRelation(), ItemPointerGetOffsetNumber(), LockBuffer(), log_heap_new_cid(), MarkBufferDirty(), MAXALIGN, PageClearAllVisible(), PageClearPrunable, PageGetHeapFreeSpace(), PageGetMaxOffsetNumber(), PageIsAllVisible(), PageSetAllVisible(), PageSetLSN(), palloc(), pgstat_count_heap_insert(), RelationData::rd_locator, REGBUF_KEEP_DATA, REGBUF_STANDARD, REGBUF_WILL_INIT, RelationGetBufferForTuple(), RelationGetRelid, RelationGetTargetPageFreeSpace, RelationIsAccessibleInLogicalDecoding, RelationIsLogicallyLogged, RelationNeedsWAL, RelationPutHeapTuple(), ReleaseBuffer(), SHORTALIGN, SizeOfHeapMultiInsert, SizeofHeapTupleHeader, SizeOfMultiInsertTuple, START_CRIT_SECTION, xl_multi_insert_tuple::t_hoff, xl_multi_insert_tuple::t_infomask, xl_multi_insert_tuple::t_infomask2, HeapTupleData::t_tableOid, TupleTableSlot::tts_tableOid, UnlockReleaseBuffer(), VISIBILITYMAP_ALL_FROZEN, VISIBILITYMAP_ALL_VISIBLE, visibilitymap_clear(), visibilitymap_set_vmbits(), VISIBILITYMAP_VALID_BITS, XLH_INSERT_ALL_FROZEN_SET, XLH_INSERT_ALL_VISIBLE_CLEARED, XLH_INSERT_CONTAINS_NEW_TUPLE, XLH_INSERT_LAST_IN_MULTI, XLOG_HEAP2_MULTI_INSERT, XLOG_HEAP_INIT_PAGE, XLOG_INCLUDE_ORIGIN, XLogBeginInsert(), XLogInsert(), XLogRegisterBufData(), XLogRegisterBuffer(), XLogRegisterData(), and XLogSetRecordFlags().

Referenced by CatalogTuplesMultiInsertWithInfo().

◆ heap_multi_insert_pages()

static int heap_multi_insert_pages ( HeapTuple heaptuples,
int  done,
int  ntuples,
Size  saveFreeSpace 
)
static

Definition at line 2391 of file heapam.c.

2392{
2394 int npages = 1;
2395
2396 for (int i = done; i < ntuples; i++)
2397 {
2398 size_t tup_sz = sizeof(ItemIdData) + MAXALIGN(heaptuples[i]->t_len);
2399
2400 if (page_avail < tup_sz)
2401 {
2402 npages++;
2404 }
2405 page_avail -= tup_sz;
2406 }
2407
2408 return npages;
2409}

References fb(), i, MAXALIGN, and SizeOfPageHeaderData.

Referenced by heap_multi_insert().

◆ heap_pre_freeze_checks()

void heap_pre_freeze_checks ( Buffer  buffer,
HeapTupleFreeze tuples,
int  ntuples 
)

Definition at line 7426 of file heapam.c.

7428{
7429 Page page = BufferGetPage(buffer);
7430
7431 for (int i = 0; i < ntuples; i++)
7432 {
7433 HeapTupleFreeze *frz = tuples + i;
7434 ItemId itemid = PageGetItemId(page, frz->offset);
7435 HeapTupleHeader htup;
7436
7437 htup = (HeapTupleHeader) PageGetItem(page, itemid);
7438
7439 /* Deliberately avoid relying on tuple hint bits here */
7440 if (frz->checkflags & HEAP_FREEZE_CHECK_XMIN_COMMITTED)
7441 {
7443
7445 if (unlikely(!TransactionIdDidCommit(xmin)))
7446 ereport(ERROR,
7448 errmsg_internal("uncommitted xmin %u needs to be frozen",
7449 xmin)));
7450 }
7451
7452 /*
7453 * TransactionIdDidAbort won't work reliably in the presence of XIDs
7454 * left behind by transactions that were in progress during a crash,
7455 * so we can only check that xmax didn't commit
7456 */
7457 if (frz->checkflags & HEAP_FREEZE_CHECK_XMAX_ABORTED)
7458 {
7460
7463 ereport(ERROR,
7465 errmsg_internal("cannot freeze committed xmax %u",
7466 xmax)));
7467 }
7468 }
7469}

References Assert, BufferGetPage(), ereport, errcode(), ERRCODE_DATA_CORRUPTED, errmsg_internal(), ERROR, fb(), HEAP_FREEZE_CHECK_XMAX_ABORTED, HEAP_FREEZE_CHECK_XMIN_COMMITTED, HeapTupleHeaderGetRawXmax(), HeapTupleHeaderGetRawXmin(), HeapTupleHeaderXminFrozen(), i, PageGetItem(), PageGetItemId(), TransactionIdDidCommit(), TransactionIdIsNormal, and unlikely.

Referenced by heap_page_will_freeze().

◆ heap_prepare_freeze_tuple()

bool heap_prepare_freeze_tuple ( HeapTupleHeader  tuple,
const struct VacuumCutoffs cutoffs,
HeapPageFreeze pagefrz,
HeapTupleFreeze frz,
bool totally_frozen 
)

Definition at line 7146 of file heapam.c.

7150{
7151 bool xmin_already_frozen = false,
7152 xmax_already_frozen = false;
7153 bool freeze_xmin = false,
7154 replace_xvac = false,
7155 replace_xmax = false,
7156 freeze_xmax = false;
7157 TransactionId xid;
7158
7159 frz->xmax = HeapTupleHeaderGetRawXmax(tuple);
7160 frz->t_infomask2 = tuple->t_infomask2;
7161 frz->t_infomask = tuple->t_infomask;
7162 frz->frzflags = 0;
7163 frz->checkflags = 0;
7164
7165 /*
7166 * Process xmin, while keeping track of whether it's already frozen, or
7167 * will become frozen iff our freeze plan is executed by caller (could be
7168 * neither).
7169 */
7170 xid = HeapTupleHeaderGetXmin(tuple);
7171 if (!TransactionIdIsNormal(xid))
7172 xmin_already_frozen = true;
7173 else
7174 {
7175 if (TransactionIdPrecedes(xid, cutoffs->relfrozenxid))
7176 ereport(ERROR,
7178 errmsg_internal("found xmin %u from before relfrozenxid %u",
7179 xid, cutoffs->relfrozenxid)));
7180
7181 /* Will set freeze_xmin flags in freeze plan below */
7183
7184 /* Verify that xmin committed if and when freeze plan is executed */
7185 if (freeze_xmin)
7186 {
7189 pagefrz->FreezePageConflictXid = xid;
7190 }
7191 }
7192
7193 /*
7194 * Old-style VACUUM FULL is gone, but we have to process xvac for as long
7195 * as we support having MOVED_OFF/MOVED_IN tuples in the database
7196 */
7197 xid = HeapTupleHeaderGetXvac(tuple);
7198 if (TransactionIdIsNormal(xid))
7199 {
7201 Assert(TransactionIdPrecedes(xid, cutoffs->OldestXmin));
7202
7203 /*
7204 * For Xvac, we always freeze proactively. This allows totally_frozen
7205 * tracking to ignore xvac.
7206 */
7207 replace_xvac = pagefrz->freeze_required = true;
7208
7210 pagefrz->FreezePageConflictXid = xid;
7211
7212 /* Will set replace_xvac flags in freeze plan below */
7213 }
7214
7215 /* Now process xmax */
7216 xid = frz->xmax;
7217 if (tuple->t_infomask & HEAP_XMAX_IS_MULTI)
7218 {
7219 /* Raw xmax is a MultiXactId */
7221 uint16 flags;
7222
7223 /*
7224 * We will either remove xmax completely (in the "freeze_xmax" path),
7225 * process xmax by replacing it (in the "replace_xmax" path), or
7226 * perform no-op xmax processing. The only constraint is that the
7227 * FreezeLimit/MultiXactCutoff postcondition must never be violated.
7228 */
7229 newxmax = FreezeMultiXactId(xid, tuple->t_infomask, cutoffs,
7230 &flags, pagefrz);
7231
7232 if (flags & FRM_NOOP)
7233 {
7234 /*
7235 * xmax is a MultiXactId, and nothing about it changes for now.
7236 * This is the only case where 'freeze_required' won't have been
7237 * set for us by FreezeMultiXactId, as well as the only case where
7238 * neither freeze_xmax nor replace_xmax are set (given a multi).
7239 *
7240 * This is a no-op, but the call to FreezeMultiXactId might have
7241 * ratcheted back NewRelfrozenXid and/or NewRelminMxid trackers
7242 * for us (the "freeze page" variants, specifically). That'll
7243 * make it safe for our caller to freeze the page later on, while
7244 * leaving this particular xmax undisturbed.
7245 *
7246 * FreezeMultiXactId is _not_ responsible for the "no freeze"
7247 * NewRelfrozenXid/NewRelminMxid trackers, though -- that's our
7248 * job. A call to heap_tuple_should_freeze for this same tuple
7249 * will take place below if 'freeze_required' isn't set already.
7250 * (This repeats work from FreezeMultiXactId, but allows "no
7251 * freeze" tracker maintenance to happen in only one place.)
7252 */
7255 }
7256 else if (flags & FRM_RETURN_IS_XID)
7257 {
7258 /*
7259 * xmax will become an updater Xid (original MultiXact's updater
7260 * member Xid will be carried forward as a simple Xid in Xmax).
7261 */
7263
7264 /*
7265 * NB -- some of these transformations are only valid because we
7266 * know the return Xid is a tuple updater (i.e. not merely a
7267 * locker.) Also note that the only reason we don't explicitly
7268 * worry about HEAP_KEYS_UPDATED is because it lives in
7269 * t_infomask2 rather than t_infomask.
7270 */
7271 frz->t_infomask &= ~HEAP_XMAX_BITS;
7272 frz->xmax = newxmax;
7273 if (flags & FRM_MARK_COMMITTED)
7274 frz->t_infomask |= HEAP_XMAX_COMMITTED;
7275 replace_xmax = true;
7276 }
7277 else if (flags & FRM_RETURN_IS_MULTI)
7278 {
7281
7282 /*
7283 * xmax is an old MultiXactId that we have to replace with a new
7284 * MultiXactId, to carry forward two or more original member XIDs.
7285 */
7287
7288 /*
7289 * We can't use GetMultiXactIdHintBits directly on the new multi
7290 * here; that routine initializes the masks to all zeroes, which
7291 * would lose other bits we need. Doing it this way ensures all
7292 * unrelated bits remain untouched.
7293 */
7294 frz->t_infomask &= ~HEAP_XMAX_BITS;
7295 frz->t_infomask2 &= ~HEAP_KEYS_UPDATED;
7297 frz->t_infomask |= newbits;
7298 frz->t_infomask2 |= newbits2;
7299 frz->xmax = newxmax;
7300 replace_xmax = true;
7301 }
7302 else
7303 {
7304 /*
7305 * Freeze plan for tuple "freezes xmax" in the strictest sense:
7306 * it'll leave nothing in xmax (neither an Xid nor a MultiXactId).
7307 */
7308 Assert(flags & FRM_INVALIDATE_XMAX);
7310
7311 /* Will set freeze_xmax flags in freeze plan below */
7312 freeze_xmax = true;
7313 }
7314
7315 /* MultiXactId processing forces freezing (barring FRM_NOOP case) */
7316 Assert(pagefrz->freeze_required || (!freeze_xmax && !replace_xmax));
7317 }
7318 else if (TransactionIdIsNormal(xid))
7319 {
7320 /* Raw xmax is normal XID */
7321 if (TransactionIdPrecedes(xid, cutoffs->relfrozenxid))
7322 ereport(ERROR,
7324 errmsg_internal("found xmax %u from before relfrozenxid %u",
7325 xid, cutoffs->relfrozenxid)));
7326
7327 /* Will set freeze_xmax flags in freeze plan below */
7329
7330 /*
7331 * Verify that xmax aborted if and when freeze plan is executed,
7332 * provided it's from an update. (A lock-only xmax can be removed
7333 * independent of this, since the lock is released at xact end.)
7334 */
7336 frz->checkflags |= HEAP_FREEZE_CHECK_XMAX_ABORTED;
7337 }
7338 else if (!TransactionIdIsValid(xid))
7339 {
7340 /* Raw xmax is InvalidTransactionId XID */
7341 Assert((tuple->t_infomask & HEAP_XMAX_IS_MULTI) == 0);
7342 xmax_already_frozen = true;
7343 }
7344 else
7345 ereport(ERROR,
7347 errmsg_internal("found raw xmax %u (infomask 0x%04x) not invalid and not multi",
7348 xid, tuple->t_infomask)));
7349
7350 if (freeze_xmin)
7351 {
7353
7354 frz->t_infomask |= HEAP_XMIN_FROZEN;
7355 }
7356 if (replace_xvac)
7357 {
7358 /*
7359 * If a MOVED_OFF tuple is not dead, the xvac transaction must have
7360 * failed; whereas a non-dead MOVED_IN tuple must mean the xvac
7361 * transaction succeeded.
7362 */
7363 Assert(pagefrz->freeze_required);
7364 if (tuple->t_infomask & HEAP_MOVED_OFF)
7365 frz->frzflags |= XLH_INVALID_XVAC;
7366 else
7367 frz->frzflags |= XLH_FREEZE_XVAC;
7368 }
7369 if (replace_xmax)
7370 {
7372 Assert(pagefrz->freeze_required);
7373
7374 /* Already set replace_xmax flags in freeze plan earlier */
7375 }
7376 if (freeze_xmax)
7377 {
7379
7380 frz->xmax = InvalidTransactionId;
7381
7382 /*
7383 * The tuple might be marked either XMAX_INVALID or XMAX_COMMITTED +
7384 * LOCKED. Normalize to INVALID just to be sure no one gets confused.
7385 * Also get rid of the HEAP_KEYS_UPDATED bit.
7386 */
7387 frz->t_infomask &= ~HEAP_XMAX_BITS;
7388 frz->t_infomask |= HEAP_XMAX_INVALID;
7389 frz->t_infomask2 &= ~HEAP_HOT_UPDATED;
7390 frz->t_infomask2 &= ~HEAP_KEYS_UPDATED;
7391 }
7392
7393 /*
7394 * Determine if this tuple is already totally frozen, or will become
7395 * totally frozen (provided caller executes freeze plans for the page)
7396 */
7399
7400 if (!pagefrz->freeze_required && !(xmin_already_frozen &&
7402 {
7403 /*
7404 * So far no previous tuple from the page made freezing mandatory.
7405 * Does this tuple force caller to freeze the entire page?
7406 */
7407 pagefrz->freeze_required =
7408 heap_tuple_should_freeze(tuple, cutoffs,
7409 &pagefrz->NoFreezePageRelfrozenXid,
7410 &pagefrz->NoFreezePageRelminMxid);
7411 }
7412
7413 /* Tell caller if this tuple has a usable freeze plan set in *frz */
7415}

References Assert, ereport, errcode(), ERRCODE_DATA_CORRUPTED, errmsg_internal(), ERROR, fb(), HeapPageFreeze::freeze_required, FreezeMultiXactId(), HeapPageFreeze::FreezePageConflictXid, FRM_INVALIDATE_XMAX, FRM_MARK_COMMITTED, FRM_NOOP, FRM_RETURN_IS_MULTI, FRM_RETURN_IS_XID, GetMultiXactIdHintBits(), HEAP_FREEZE_CHECK_XMAX_ABORTED, HEAP_FREEZE_CHECK_XMIN_COMMITTED, HEAP_MOVED_OFF, heap_tuple_should_freeze(), HEAP_XMAX_COMMITTED, HEAP_XMAX_INVALID, HEAP_XMAX_IS_LOCKED_ONLY(), HEAP_XMAX_IS_MULTI, HEAP_XMIN_FROZEN, HeapTupleHeaderGetRawXmax(), HeapTupleHeaderGetXmin(), HeapTupleHeaderGetXvac(), InvalidTransactionId, VacuumCutoffs::MultiXactCutoff, MultiXactIdIsValid, MultiXactIdPrecedes(), HeapPageFreeze::NoFreezePageRelfrozenXid, HeapPageFreeze::NoFreezePageRelminMxid, VacuumCutoffs::OldestMxact, VacuumCutoffs::OldestXmin, VacuumCutoffs::relfrozenxid, HeapTupleHeaderData::t_infomask, HeapTupleHeaderData::t_infomask2, TransactionIdFollows(), TransactionIdIsNormal, TransactionIdIsValid, TransactionIdPrecedes(), TransactionIdPrecedesOrEquals(), XLH_FREEZE_XVAC, and XLH_INVALID_XVAC.

Referenced by heap_freeze_tuple(), and heap_prune_record_unchanged_lp_normal().

◆ heap_prepare_insert()

static HeapTuple heap_prepare_insert ( Relation  relation,
HeapTuple  tup,
TransactionId  xid,
CommandId  cid,
int  options 
)
static

Definition at line 2343 of file heapam.c.

2345{
2346 /*
2347 * To allow parallel inserts, we need to ensure that they are safe to be
2348 * performed in workers. We have the infrastructure to allow parallel
2349 * inserts in general except for the cases where inserts generate a new
2350 * CommandId (eg. inserts into a table having a foreign key column).
2351 */
2352 if (IsParallelWorker())
2353 ereport(ERROR,
2355 errmsg("cannot insert tuples in a parallel worker")));
2356
2357 tup->t_data->t_infomask &= ~(HEAP_XACT_MASK);
2358 tup->t_data->t_infomask2 &= ~(HEAP2_XACT_MASK);
2359 tup->t_data->t_infomask |= HEAP_XMAX_INVALID;
2360 HeapTupleHeaderSetXmin(tup->t_data, xid);
2363
2364 HeapTupleHeaderSetCmin(tup->t_data, cid);
2365 HeapTupleHeaderSetXmax(tup->t_data, 0); /* for cleanliness */
2366 tup->t_tableOid = RelationGetRelid(relation);
2367
2368 /*
2369 * If the new tuple is too big for storage or contains already toasted
2370 * out-of-line attributes from some other relation, invoke the toaster.
2371 */
2372 if (relation->rd_rel->relkind != RELKIND_RELATION &&
2373 relation->rd_rel->relkind != RELKIND_MATVIEW)
2374 {
2375 /* toast table entries should never be recursively toasted */
2377 return tup;
2378 }
2379 else if (HeapTupleHasExternal(tup) || tup->t_len > TOAST_TUPLE_THRESHOLD)
2380 return heap_toast_insert_or_update(relation, tup, NULL, options);
2381 else
2382 return tup;
2383}

References Assert, ereport, errcode(), errmsg, ERROR, fb(), HEAP2_XACT_MASK, HEAP_INSERT_FROZEN, heap_toast_insert_or_update(), HEAP_XACT_MASK, HEAP_XMAX_INVALID, HeapTupleHasExternal(), HeapTupleHeaderSetCmin(), HeapTupleHeaderSetXmax(), HeapTupleHeaderSetXmin(), HeapTupleHeaderSetXminFrozen(), IsParallelWorker, RelationData::rd_rel, RelationGetRelid, and TOAST_TUPLE_THRESHOLD.

Referenced by heap_insert(), and heap_multi_insert().

◆ heap_prepare_pagescan()

void heap_prepare_pagescan ( TableScanDesc  sscan)

Definition at line 616 of file heapam.c.

617{
619 Buffer buffer = scan->rs_cbuf;
620 BlockNumber block = scan->rs_cblock;
621 Snapshot snapshot;
622 Page page;
623 int lines;
624 bool all_visible;
626
627 Assert(BufferGetBlockNumber(buffer) == block);
628
629 /* ensure we're not accidentally being used when not in pagemode */
631 snapshot = scan->rs_base.rs_snapshot;
632
633 /*
634 * Prune and repair fragmentation for the whole page, if possible.
635 */
636 heap_page_prune_opt(scan->rs_base.rs_rd, buffer, &scan->rs_vmbuffer);
637
638 /*
639 * We must hold share lock on the buffer content while examining tuple
640 * visibility. Afterwards, however, the tuples we have found to be
641 * visible are guaranteed good as long as we hold the buffer pin.
642 */
644
645 page = BufferGetPage(buffer);
646 lines = PageGetMaxOffsetNumber(page);
647
648 /*
649 * If the all-visible flag indicates that all tuples on the page are
650 * visible to everyone, we can skip the per-tuple visibility tests.
651 *
652 * Note: In hot standby, a tuple that's already visible to all
653 * transactions on the primary might still be invisible to a read-only
654 * transaction in the standby. We partly handle this problem by tracking
655 * the minimum xmin of visible tuples as the cut-off XID while marking a
656 * page all-visible on the primary and WAL log that along with the
657 * visibility map SET operation. In hot standby, we wait for (or abort)
 658 * all transactions that potentially may not see one or more tuples on
659 * the page. That's how index-only scans work fine in hot standby. A
660 * crucial difference between index-only scans and heap scans is that the
 661 * index-only scan completely relies on the visibility map whereas heap
662 * scan looks at the page-level PD_ALL_VISIBLE flag. We are not sure if
663 * the page-level flag can be trusted in the same way, because it might
664 * get propagated somehow without being explicitly WAL-logged, e.g. via a
665 * full page write. Until we can prove that beyond doubt, let's check each
666 * tuple for visibility the hard way.
667 */
668 all_visible = PageIsAllVisible(page) && !snapshot->takenDuringRecovery;
671
672 /*
673 * We call page_collect_tuples() with constant arguments, to get the
674 * compiler to constant fold the constant arguments. Separate calls with
675 * constant arguments, rather than variables, are needed on several
676 * compilers to actually perform constant folding.
677 */
678 if (likely(all_visible))
679 {
681 scan->rs_ntuples = page_collect_tuples(scan, snapshot, page, buffer,
682 block, lines, true, false);
683 else
684 scan->rs_ntuples = page_collect_tuples(scan, snapshot, page, buffer,
685 block, lines, true, true);
686 }
687 else
688 {
690 scan->rs_ntuples = page_collect_tuples(scan, snapshot, page, buffer,
691 block, lines, false, false);
692 else
693 scan->rs_ntuples = page_collect_tuples(scan, snapshot, page, buffer,
694 block, lines, false, true);
695 }
696
698}

References Assert, BUFFER_LOCK_SHARE, BUFFER_LOCK_UNLOCK, BufferGetBlockNumber(), BufferGetPage(), CheckForSerializableConflictOutNeeded(), fb(), heap_page_prune_opt(), likely, LockBuffer(), page_collect_tuples(), PageGetMaxOffsetNumber(), PageIsAllVisible(), HeapScanDescData::rs_base, HeapScanDescData::rs_cblock, HeapScanDescData::rs_cbuf, TableScanDescData::rs_flags, HeapScanDescData::rs_ntuples, TableScanDescData::rs_rd, TableScanDescData::rs_snapshot, HeapScanDescData::rs_vmbuffer, SO_ALLOW_PAGEMODE, and SnapshotData::takenDuringRecovery.

Referenced by heapam_scan_sample_next_block(), and heapgettup_pagemode().

◆ heap_rescan()

void heap_rescan ( TableScanDesc  sscan,
ScanKey  key,
bool  set_params,
bool  allow_strat,
bool  allow_sync,
bool  allow_pagemode 
)

Definition at line 1319 of file heapam.c.

1321{
1323
1324 if (set_params)
1325 {
1326 if (allow_strat)
1328 else
1330
1331 if (allow_sync)
1333 else
1335
1336 if (allow_pagemode && scan->rs_base.rs_snapshot &&
1339 else
1341 }
1342
1343 /*
1344 * unpin scan buffers
1345 */
1346 if (BufferIsValid(scan->rs_cbuf))
1347 {
1348 ReleaseBuffer(scan->rs_cbuf);
1349 scan->rs_cbuf = InvalidBuffer;
1350 }
1351
1352 if (BufferIsValid(scan->rs_vmbuffer))
1353 {
1355 scan->rs_vmbuffer = InvalidBuffer;
1356 }
1357
1358 /*
1359 * SO_TYPE_BITMAPSCAN would be cleaned up here, but it does not hold any
1360 * additional data vs a normal HeapScan
1361 */
1362
1363 /*
1364 * The read stream is reset on rescan. This must be done before
1365 * initscan(), as some state referred to by read_stream_reset() is reset
1366 * in initscan().
1367 */
1368 if (scan->rs_read_stream)
1370
1371 /*
1372 * reinitialize scan descriptor
1373 */
1374 initscan(scan, key, true);
1375}

References BufferIsValid(), fb(), initscan(), InvalidBuffer, IsMVCCSnapshot, read_stream_reset(), ReleaseBuffer(), HeapScanDescData::rs_base, HeapScanDescData::rs_cbuf, TableScanDescData::rs_flags, HeapScanDescData::rs_read_stream, TableScanDescData::rs_snapshot, HeapScanDescData::rs_vmbuffer, SO_ALLOW_PAGEMODE, SO_ALLOW_STRAT, and SO_ALLOW_SYNC.

◆ heap_scan_stream_read_next_parallel()

static BlockNumber heap_scan_stream_read_next_parallel ( ReadStream stream,
void callback_private_data,
void per_buffer_data 
)
static

◆ heap_scan_stream_read_next_serial()

static BlockNumber heap_scan_stream_read_next_serial ( ReadStream stream,
void callback_private_data,
void per_buffer_data 
)
static

Definition at line 292 of file heapam.c.

295{
296 HeapScanDesc scan = (HeapScanDesc) callback_private_data;
297
298 if (unlikely(!scan->rs_inited))
299 {
301 scan->rs_inited = true;
302 }
303 else
305 scan->rs_prefetch_block,
306 scan->rs_dir);
307
308 return scan->rs_prefetch_block;
309}

References heapgettup_advance_block(), heapgettup_initial_block(), HeapScanDescData::rs_dir, HeapScanDescData::rs_inited, HeapScanDescData::rs_prefetch_block, and unlikely.

Referenced by heap_beginscan().

◆ heap_set_tidrange()

void heap_set_tidrange ( TableScanDesc  sscan,
ItemPointer  mintid,
ItemPointer  maxtid 
)

Definition at line 1489 of file heapam.c.

1491{
1497
1498 /*
1499 * For relations without any pages, we can simply leave the TID range
1500 * unset. There will be no tuples to scan, therefore no tuples outside
1501 * the given TID range.
1502 */
1503 if (scan->rs_nblocks == 0)
1504 return;
1505
1506 /*
1507 * Set up some ItemPointers which point to the first and last possible
1508 * tuples in the heap.
1509 */
1512
1513 /*
1514 * If the given maximum TID is below the highest possible TID in the
1515 * relation, then restrict the range to that, otherwise we scan to the end
1516 * of the relation.
1517 */
1520
1521 /*
1522 * If the given minimum TID is above the lowest possible TID in the
1523 * relation, then restrict the range to only scan for TIDs above that.
1524 */
1527
1528 /*
1529 * Check for an empty range and protect from would be negative results
1530 * from the numBlks calculation below.
1531 */
1533 {
1534 /* Set an empty range of blocks to scan */
1536 return;
1537 }
1538
1539 /*
1540 * Calculate the first block and the number of blocks we must scan. We
1541 * could be more aggressive here and perform some more validation to try
1542 * and further narrow the scope of blocks to scan by checking if the
1543 * lowestItem has an offset above MaxOffsetNumber. In this case, we could
1544 * advance startBlk by one. Likewise, if highestItem has an offset of 0
1545 * we could scan one fewer blocks. However, such an optimization does not
1546 * seem worth troubling over, currently.
1547 */
1549
1552
1553 /* Set the start block and number of blocks to scan */
1555
1556 /* Finally, set the TID range in sscan */
1557 ItemPointerCopy(&lowestItem, &sscan->st.tidrange.rs_mintid);
1558 ItemPointerCopy(&highestItem, &sscan->st.tidrange.rs_maxtid);
1559}

References fb(), FirstOffsetNumber, heap_setscanlimits(), ItemPointerCompare(), ItemPointerCopy(), ItemPointerGetBlockNumberNoCheck(), ItemPointerSet(), MaxOffsetNumber, and HeapScanDescData::rs_nblocks.

◆ heap_setscanlimits()

void heap_setscanlimits ( TableScanDesc  sscan,
BlockNumber  startBlk,
BlockNumber  numBlks 
)

Definition at line 500 of file heapam.c.

501{
503
504 Assert(!scan->rs_inited); /* else too late to change */
505 /* else rs_startblock is significant */
507
508 /* Check startBlk is valid (but allow case of zero blocks...) */
509 Assert(startBlk == 0 || startBlk < scan->rs_nblocks);
510
511 scan->rs_startblock = startBlk;
512 scan->rs_numblocks = numBlks;
513}

References Assert, fb(), HeapScanDescData::rs_base, TableScanDescData::rs_flags, HeapScanDescData::rs_inited, HeapScanDescData::rs_numblocks, HeapScanDescData::rs_startblock, and SO_ALLOW_SYNC.

Referenced by heap_set_tidrange(), and heapam_index_build_range_scan().

◆ heap_tuple_needs_eventual_freeze()

bool heap_tuple_needs_eventual_freeze ( HeapTupleHeader  tuple)

Definition at line 7910 of file heapam.c.

7911{
7912 TransactionId xid;
7913
7914 /*
7915 * If xmin is a normal transaction ID, this tuple is definitely not
7916 * frozen.
7917 */
7918 xid = HeapTupleHeaderGetXmin(tuple);
7919 if (TransactionIdIsNormal(xid))
7920 return true;
7921
7922 /*
7923 * If xmax is a valid xact or multixact, this tuple is also not frozen.
7924 */
7925 if (tuple->t_infomask & HEAP_XMAX_IS_MULTI)
7926 {
7927 MultiXactId multi;
7928
7929 multi = HeapTupleHeaderGetRawXmax(tuple);
7930 if (MultiXactIdIsValid(multi))
7931 return true; /* raw xmax is a live MultiXactId */
7932 }
7933 else
7934 {
7935 xid = HeapTupleHeaderGetRawXmax(tuple);
7936 if (TransactionIdIsNormal(xid))
7937 return true; /* raw xmax is a normal XID */
7938 }
7939
7940 if (tuple->t_infomask & HEAP_MOVED)
7941 {
7942 xid = HeapTupleHeaderGetXvac(tuple);
7943 if (TransactionIdIsNormal(xid))
7944 return true; /* a normal xvac also needs eventual freezing */
7945 }
7946
7947 return false; /* no normal XIDs found; tuple needs no future freezing */
7948}

References HEAP_MOVED, HEAP_XMAX_IS_MULTI, HeapTupleHeaderGetRawXmax(), HeapTupleHeaderGetXmin(), HeapTupleHeaderGetXvac(), MultiXactIdIsValid, HeapTupleHeaderData::t_infomask, and TransactionIdIsNormal.

Referenced by collect_corrupt_items(), and heap_page_would_be_all_visible().

◆ heap_tuple_should_freeze()

bool heap_tuple_should_freeze ( HeapTupleHeader  tuple,
const struct VacuumCutoffs cutoffs,
TransactionId NoFreezePageRelfrozenXid,
MultiXactId NoFreezePageRelminMxid 
)

Definition at line 7965 of file heapam.c.

7969{
7970 TransactionId xid;
7971 MultiXactId multi;
7972 bool freeze = false;
7973
7974 /* First deal with xmin */
7975 xid = HeapTupleHeaderGetXmin(tuple);
7976 if (TransactionIdIsNormal(xid))
7977 {
7979 if (TransactionIdPrecedes(xid, *NoFreezePageRelfrozenXid))
7980 *NoFreezePageRelfrozenXid = xid;
7981 if (TransactionIdPrecedes(xid, cutoffs->FreezeLimit))
7982 freeze = true;
7983 }
7984
7985 /* Now deal with xmax */
7987 multi = InvalidMultiXactId;
7988 if (tuple->t_infomask & HEAP_XMAX_IS_MULTI)
7989 multi = HeapTupleHeaderGetRawXmax(tuple);
7990 else
7991 xid = HeapTupleHeaderGetRawXmax(tuple);
7992
7993 if (TransactionIdIsNormal(xid))
7994 {
7996 /* xmax is a non-permanent XID */
7997 if (TransactionIdPrecedes(xid, *NoFreezePageRelfrozenXid))
7998 *NoFreezePageRelfrozenXid = xid;
7999 if (TransactionIdPrecedes(xid, cutoffs->FreezeLimit))
8000 freeze = true;
8001 }
8002 else if (!MultiXactIdIsValid(multi))
8003 {
8004 /* xmax is a permanent XID or invalid MultiXactId/XID */
8005 }
8006 else if (HEAP_LOCKED_UPGRADED(tuple->t_infomask))
8007 {
8008 /* xmax is a pg_upgrade'd MultiXact, which can't have updater XID */
8009 if (MultiXactIdPrecedes(multi, *NoFreezePageRelminMxid))
8010 *NoFreezePageRelminMxid = multi;
8011 /* heap_prepare_freeze_tuple always freezes pg_upgrade'd xmax */
8012 freeze = true;
8013 }
8014 else
8015 {
8016 /* xmax is a MultiXactId that may have an updater XID */
8017 MultiXactMember *members;
8018 int nmembers;
8019
8021 if (MultiXactIdPrecedes(multi, *NoFreezePageRelminMxid))
8022 *NoFreezePageRelminMxid = multi;
8023 if (MultiXactIdPrecedes(multi, cutoffs->MultiXactCutoff))
8024 freeze = true;
8025
8026 /* need to check whether any member of the mxact is old */
8027 nmembers = GetMultiXactIdMembers(multi, &members, false,
8029
8030 for (int i = 0; i < nmembers; i++)
8031 {
8032 xid = members[i].xid;
8034 if (TransactionIdPrecedes(xid, *NoFreezePageRelfrozenXid))
8035 *NoFreezePageRelfrozenXid = xid;
8036 if (TransactionIdPrecedes(xid, cutoffs->FreezeLimit))
8037 freeze = true;
8038 }
8039 if (nmembers > 0)
8040 pfree(members);
8041 }
8042
8043 if (tuple->t_infomask & HEAP_MOVED)
8044 {
8045 xid = HeapTupleHeaderGetXvac(tuple);
8046 if (TransactionIdIsNormal(xid))
8047 {
8049 if (TransactionIdPrecedes(xid, *NoFreezePageRelfrozenXid))
8050 *NoFreezePageRelfrozenXid = xid;
8051 /* heap_prepare_freeze_tuple forces xvac freezing */
8052 freeze = true;
8053 }
8054 }
8055
8056 return freeze;
8057}

References Assert, VacuumCutoffs::FreezeLimit, GetMultiXactIdMembers(), HEAP_LOCKED_UPGRADED(), HEAP_MOVED, HEAP_XMAX_IS_LOCKED_ONLY(), HEAP_XMAX_IS_MULTI, HeapTupleHeaderGetRawXmax(), HeapTupleHeaderGetXmin(), HeapTupleHeaderGetXvac(), i, InvalidMultiXactId, InvalidTransactionId, VacuumCutoffs::MultiXactCutoff, MultiXactIdIsValid, MultiXactIdPrecedes(), MultiXactIdPrecedesOrEquals(), pfree(), VacuumCutoffs::relfrozenxid, VacuumCutoffs::relminmxid, HeapTupleHeaderData::t_infomask, TransactionIdIsNormal, TransactionIdPrecedes(), TransactionIdPrecedesOrEquals(), and MultiXactMember::xid.

Referenced by heap_prepare_freeze_tuple(), and lazy_scan_noprune().

◆ heap_update()

TM_Result heap_update ( Relation  relation,
const ItemPointerData otid,
HeapTuple  newtup,
CommandId  cid,
Snapshot  crosscheck,
bool  wait,
TM_FailureData tmfd,
LockTupleMode lockmode,
TU_UpdateIndexes update_indexes 
)

Definition at line 3323 of file heapam.c.

3327{
3328 TM_Result result;
3336 ItemId lp;
3340 bool old_key_copied = false;
3341 Page page,
3342 newpage;
3343 BlockNumber block;
3345 Buffer buffer,
3346 newbuf,
3347 vmbuffer = InvalidBuffer,
3349 bool need_toast;
3351 pagefree;
3352 bool have_tuple_lock = false;
3353 bool iscombo;
3354 bool use_hot_update = false;
3355 bool summarized_update = false;
3356 bool key_intact;
3357 bool all_visible_cleared = false;
3358 bool all_visible_cleared_new = false;
3359 bool checked_lockers;
3360 bool locker_remains;
3361 bool id_has_external = false;
3368
3370
3371 /* Cheap, simplistic check that the tuple matches the rel's rowtype. */
3374
3375 AssertHasSnapshotForToast(relation);
3376
3377 /*
3378 * Forbid this during a parallel operation, lest it allocate a combo CID.
3379 * Other workers might need that combo CID for visibility checks, and we
3380 * have no provision for broadcasting it to them.
3381 */
3382 if (IsInParallelMode())
3383 ereport(ERROR,
3385 errmsg("cannot update tuples during a parallel operation")));
3386
3387#ifdef USE_ASSERT_CHECKING
3389#endif
3390
3391 /*
3392 * Fetch the list of attributes to be checked for various operations.
3393 *
3394 * For HOT considerations, this is wasted effort if we fail to update or
3395 * have to put the new tuple on a different page. But we must compute the
3396 * list before obtaining buffer lock --- in the worst case, if we are
3397 * doing an update on one of the relevant system catalogs, we could
3398 * deadlock if we try to fetch the list later. In any case, the relcache
3399 * caches the data so this is usually pretty cheap.
3400 *
3401 * We also need columns used by the replica identity and columns that are
3402 * considered the "key" of rows in the table.
3403 *
3404 * Note that we get copies of each bitmap, so we need not worry about
3405 * relcache flush happening midway through.
3406 */
3419
3421 INJECTION_POINT("heap_update-before-pin", NULL);
3422 buffer = ReadBuffer(relation, block);
3423 page = BufferGetPage(buffer);
3424
3425 /*
3426 * Before locking the buffer, pin the visibility map page if it appears to
3427 * be necessary. Since we haven't got the lock yet, someone else might be
3428 * in the middle of changing this, so we'll need to recheck after we have
3429 * the lock.
3430 */
3431 if (PageIsAllVisible(page))
3432 visibilitymap_pin(relation, block, &vmbuffer);
3433
3435
3437
3438 /*
3439 * Usually, a buffer pin and/or snapshot blocks pruning of otid, ensuring
3440 * we see LP_NORMAL here. When the otid origin is a syscache, we may have
3441 * neither a pin nor a snapshot. Hence, we may see other LP_ states, each
3442 * of which indicates concurrent pruning.
3443 *
3444 * Failing with TM_Updated would be most accurate. However, unlike other
3445 * TM_Updated scenarios, we don't know the successor ctid in LP_UNUSED and
3446 * LP_DEAD cases. While the distinction between TM_Updated and TM_Deleted
3447 * does matter to SQL statements UPDATE and MERGE, those SQL statements
3448 * hold a snapshot that ensures LP_NORMAL. Hence, the choice between
3449 * TM_Updated and TM_Deleted affects only the wording of error messages.
3450 * Settle on TM_Deleted, for two reasons. First, it avoids complicating
3451 * the specification of when tmfd->ctid is valid. Second, it creates
3452 * error log evidence that we took this branch.
3453 *
3454 * Since it's possible to see LP_UNUSED at otid, it's also possible to see
3455 * LP_NORMAL for a tuple that replaced LP_UNUSED. If it's a tuple for an
3456 * unrelated row, we'll fail with "duplicate key value violates unique".
3457 * XXX if otid is the live, newer version of the newtup row, we'll discard
3458 * changes originating in versions of this catalog row after the version
3459 * the caller got from syscache. See syscache-update-pruned.spec.
3460 */
3461 if (!ItemIdIsNormal(lp))
3462 {
3464
3465 UnlockReleaseBuffer(buffer);
3467 if (vmbuffer != InvalidBuffer)
3468 ReleaseBuffer(vmbuffer);
3469 tmfd->ctid = *otid;
3470 tmfd->xmax = InvalidTransactionId;
3471 tmfd->cmax = InvalidCommandId;
3473
3478 /* modified_attrs not yet initialized */
3480 return TM_Deleted;
3481 }
3482
3483 /*
3484 * Fill in enough data in oldtup for HeapDetermineColumnsInfo to work
3485 * properly.
3486 */
3487 oldtup.t_tableOid = RelationGetRelid(relation);
3488 oldtup.t_data = (HeapTupleHeader) PageGetItem(page, lp);
3489 oldtup.t_len = ItemIdGetLength(lp);
3490 oldtup.t_self = *otid;
3491
3492 /* the new tuple is ready, except for this: */
3493 newtup->t_tableOid = RelationGetRelid(relation);
3494
3495 /*
3496 * Determine columns modified by the update. Additionally, identify
3497 * whether any of the unmodified replica identity key attributes in the
3498 * old tuple is externally stored or not. This is required because for
3499 * such attributes the flattened value won't be WAL logged as part of the
3500 * new tuple so we must include it as part of the old_key_tuple. See
3501 * ExtractReplicaIdentity.
3502 */
3504 id_attrs, &oldtup,
3506
3507 /*
3508 * If we're not updating any "key" column, we can grab a weaker lock type.
3509 * This allows for more concurrency when we are running simultaneously
3510 * with foreign key checks.
3511 *
3512 * Note that if a column gets detoasted while executing the update, but
3513 * the value ends up being the same, this test will fail and we will use
3514 * the stronger lock. This is acceptable; the important case to optimize
3515 * is updates that don't manipulate key columns, not those that
3516 * serendipitously arrive at the same key values.
3517 */
3519 {
3520 *lockmode = LockTupleNoKeyExclusive;
3522 key_intact = true;
3523
3524 /*
3525 * If this is the first possibly-multixact-able operation in the
3526 * current transaction, set my per-backend OldestMemberMXactId
3527 * setting. We can be certain that the transaction will never become a
3528 * member of any older MultiXactIds than that. (We have to do this
3529 * even if we end up just using our own TransactionId below, since
3530 * some other backend could incorporate our XID into a MultiXact
3531 * immediately afterwards.)
3532 */
3534 }
3535 else
3536 {
3537 *lockmode = LockTupleExclusive;
3539 key_intact = false;
3540 }
3541
3542 /*
3543 * Note: beyond this point, use oldtup not otid to refer to old tuple.
3544 * otid may very well point at newtup->t_self, which we will overwrite
3545 * with the new tuple's location, so there's great risk of confusion if we
3546 * use otid anymore.
3547 */
3548
3549l2:
3550 checked_lockers = false;
3551 locker_remains = false;
3552 result = HeapTupleSatisfiesUpdate(&oldtup, cid, buffer);
3553
3554 /* see below about the "no wait" case */
3555 Assert(result != TM_BeingModified || wait);
3556
3557 if (result == TM_Invisible)
3558 {
3559 UnlockReleaseBuffer(buffer);
3560 ereport(ERROR,
3562 errmsg("attempted to update invisible tuple")));
3563 }
3564 else if (result == TM_BeingModified && wait)
3565 {
3568 bool can_continue = false;
3569
3570 /*
3571 * XXX note that we don't consider the "no wait" case here. This
3572 * isn't a problem currently because no caller uses that case, but it
3573 * should be fixed if such a caller is introduced. It wasn't a
3574 * problem previously because this code would always wait, but now
3575 * that some tuple locks do not conflict with one of the lock modes we
3576 * use, it is possible that this case is interesting to handle
3577 * specially.
3578 *
3579 * This may cause failures with third-party code that calls
3580 * heap_update directly.
3581 */
3582
3583 /* must copy state data before unlocking buffer */
3585 infomask = oldtup.t_data->t_infomask;
3586
3587 /*
3588 * Now we have to do something about the existing locker. If it's a
3589 * multi, sleep on it; we might be awakened before it is completely
3590 * gone (or even not sleep at all in some cases); we need to preserve
3591 * it as locker, unless it is gone completely.
3592 *
3593 * If it's not a multi, we need to check for sleeping conditions
3594 * before actually going to sleep. If the update doesn't conflict
3595 * with the locks, we just continue without sleeping (but making sure
3596 * it is preserved).
3597 *
3598 * Before sleeping, we need to acquire tuple lock to establish our
3599 * priority for the tuple (see heap_lock_tuple). LockTuple will
3600 * release us when we are next-in-line for the tuple. Note we must
3601 * not acquire the tuple lock until we're sure we're going to sleep;
3602 * otherwise we're open for race conditions with other transactions
3603 * holding the tuple lock which sleep on us.
3604 *
3605 * If we are forced to "start over" below, we keep the tuple lock;
3606 * this arranges that we stay at the head of the line while rechecking
3607 * tuple state.
3608 */
3610 {
3612 int remain;
3613 bool current_is_member = false;
3614
3616 *lockmode, &current_is_member))
3617 {
3619
3620 /*
3621 * Acquire the lock, if necessary (but skip it when we're
3622 * requesting a lock and already have one; avoids deadlock).
3623 */
3624 if (!current_is_member)
3625 heap_acquire_tuplock(relation, &(oldtup.t_self), *lockmode,
3627
3628 /* wait for multixact */
3630 relation, &oldtup.t_self, XLTW_Update,
3631 &remain);
3632 checked_lockers = true;
3633 locker_remains = remain != 0;
3635
3636 /*
3637 * If xwait had just locked the tuple then some other xact
3638 * could update this tuple before we get to this point. Check
3639 * for xmax change, and start over if so.
3640 */
3641 if (xmax_infomask_changed(oldtup.t_data->t_infomask,
3642 infomask) ||
3644 xwait))
3645 goto l2;
3646 }
3647
3648 /*
3649 * Note that the multixact may not be done by now. It could have
3650 * surviving members; our own xact or other subxacts of this
3651 * backend, and also any other concurrent transaction that locked
3652 * the tuple with LockTupleKeyShare if we only got
3653 * LockTupleNoKeyExclusive. If this is the case, we have to be
3654 * careful to mark the updated tuple with the surviving members in
3655 * Xmax.
3656 *
3657 * Note that there could have been another update in the
3658 * MultiXact. In that case, we need to check whether it committed
3659 * or aborted. If it aborted we are safe to update it again;
3660 * otherwise there is an update conflict, and we have to return
3661 * TableTuple{Deleted, Updated} below.
3662 *
3663 * In the LockTupleExclusive case, we still need to preserve the
3664 * surviving members: those would include the tuple locks we had
3665 * before this one, which are important to keep in case this
3666 * subxact aborts.
3667 */
3668 if (!HEAP_XMAX_IS_LOCKED_ONLY(oldtup.t_data->t_infomask))
3670 else
3672
3673 /*
3674 * There was no UPDATE in the MultiXact; or it aborted. No
3675 * TransactionIdIsInProgress() call needed here, since we called
3676 * MultiXactIdWait() above.
3677 */
3680 can_continue = true;
3681 }
3683 {
3684 /*
3685 * The only locker is ourselves; we can avoid grabbing the tuple
3686 * lock here, but must preserve our locking information.
3687 */
3688 checked_lockers = true;
3689 locker_remains = true;
3690 can_continue = true;
3691 }
3693 {
3694 /*
3695 * If it's just a key-share locker, and we're not changing the key
3696 * columns, we don't need to wait for it to end; but we need to
3697 * preserve it as locker.
3698 */
3699 checked_lockers = true;
3700 locker_remains = true;
3701 can_continue = true;
3702 }
3703 else
3704 {
3705 /*
3706 * Wait for regular transaction to end; but first, acquire tuple
3707 * lock.
3708 */
3710 heap_acquire_tuplock(relation, &(oldtup.t_self), *lockmode,
3712 XactLockTableWait(xwait, relation, &oldtup.t_self,
3713 XLTW_Update);
3714 checked_lockers = true;
3716
3717 /*
3718 * xwait is done, but if xwait had just locked the tuple then some
3719 * other xact could update this tuple before we get to this point.
3720 * Check for xmax change, and start over if so.
3721 */
3722 if (xmax_infomask_changed(oldtup.t_data->t_infomask, infomask) ||
3725 goto l2;
3726
3727 /* Otherwise check if it committed or aborted */
3728 UpdateXmaxHintBits(oldtup.t_data, buffer, xwait);
3729 if (oldtup.t_data->t_infomask & HEAP_XMAX_INVALID)
3730 can_continue = true;
3731 }
3732
3733 if (can_continue)
3734 result = TM_Ok;
3735 else if (!ItemPointerEquals(&oldtup.t_self, &oldtup.t_data->t_ctid))
3736 result = TM_Updated;
3737 else
3738 result = TM_Deleted;
3739 }
3740
3741 /* Sanity check the result HeapTupleSatisfiesUpdate() and the logic above */
3742 if (result != TM_Ok)
3743 {
3744 Assert(result == TM_SelfModified ||
3745 result == TM_Updated ||
3746 result == TM_Deleted ||
3747 result == TM_BeingModified);
3748 Assert(!(oldtup.t_data->t_infomask & HEAP_XMAX_INVALID));
3749 Assert(result != TM_Updated ||
3750 !ItemPointerEquals(&oldtup.t_self, &oldtup.t_data->t_ctid));
3751 }
3752
3753 if (crosscheck != InvalidSnapshot && result == TM_Ok)
3754 {
3755 /* Perform additional check for transaction-snapshot mode RI updates */
3757 result = TM_Updated;
3758 }
3759
3760 if (result != TM_Ok)
3761 {
3762 tmfd->ctid = oldtup.t_data->t_ctid;
3763 tmfd->xmax = HeapTupleHeaderGetUpdateXid(oldtup.t_data);
3764 if (result == TM_SelfModified)
3765 tmfd->cmax = HeapTupleHeaderGetCmax(oldtup.t_data);
3766 else
3767 tmfd->cmax = InvalidCommandId;
3768 UnlockReleaseBuffer(buffer);
3769 if (have_tuple_lock)
3770 UnlockTupleTuplock(relation, &(oldtup.t_self), *lockmode);
3771 if (vmbuffer != InvalidBuffer)
3772 ReleaseBuffer(vmbuffer);
3774
3781 return result;
3782 }
3783
3784 /*
3785 * If we didn't pin the visibility map page and the page has become all
3786 * visible while we were busy locking the buffer, or during some
3787 * subsequent window during which we had it unlocked, we'll have to unlock
3788 * and re-lock, to avoid holding the buffer lock across an I/O. That's a
3789 * bit unfortunate, especially since we'll now have to recheck whether the
3790 * tuple has been locked or updated under us, but hopefully it won't
3791 * happen very often.
3792 */
3793 if (vmbuffer == InvalidBuffer && PageIsAllVisible(page))
3794 {
3796 visibilitymap_pin(relation, block, &vmbuffer);
3798 goto l2;
3799 }
3800
3801 /* Fill in transaction status data */
3802
3803 /*
3804 * If the tuple we're updating is locked, we need to preserve the locking
3805 * info in the old tuple's Xmax. Prepare a new Xmax value for this.
3806 */
3808 oldtup.t_data->t_infomask,
3809 oldtup.t_data->t_infomask2,
3810 xid, *lockmode, true,
3813
3814 /*
3815 * And also prepare an Xmax value for the new copy of the tuple. If there
3816 * was no xmax previously, or there was one but all lockers are now gone,
3817 * then use InvalidTransactionId; otherwise, get the xmax from the old
3818 * tuple. (In rare cases that might also be InvalidTransactionId and yet
3819 * not have the HEAP_XMAX_INVALID bit set; that's fine.)
3820 */
3821 if ((oldtup.t_data->t_infomask & HEAP_XMAX_INVALID) ||
3822 HEAP_LOCKED_UPGRADED(oldtup.t_data->t_infomask) ||
3825 else
3827
3829 {
3832 }
3833 else
3834 {
3835 /*
3836 * If we found a valid Xmax for the new tuple, then the infomask bits
3837 * to use on the new tuple depend on what was there on the old one.
3838 * Note that since we're doing an update, the only possibility is that
3839 * the lockers had FOR KEY SHARE lock.
3840 */
3841 if (oldtup.t_data->t_infomask & HEAP_XMAX_IS_MULTI)
3842 {
3845 }
3846 else
3847 {
3850 }
3851 }
3852
3853 /*
3854 * Prepare the new tuple with the appropriate initial values of Xmin and
3855 * Xmax, as well as initial infomask bits as computed above.
3856 */
3857 newtup->t_data->t_infomask &= ~(HEAP_XACT_MASK);
3858 newtup->t_data->t_infomask2 &= ~(HEAP2_XACT_MASK);
3859 HeapTupleHeaderSetXmin(newtup->t_data, xid);
3861 newtup->t_data->t_infomask |= HEAP_UPDATED | infomask_new_tuple;
3862 newtup->t_data->t_infomask2 |= infomask2_new_tuple;
3864
3865 /*
3866 * Replace cid with a combo CID if necessary. Note that we already put
3867 * the plain cid into the new tuple.
3868 */
3870
3871 /*
3872 * If the toaster needs to be activated, OR if the new tuple will not fit
3873 * on the same page as the old, then we need to release the content lock
3874 * (but not the pin!) on the old tuple's buffer while we are off doing
3875 * TOAST and/or table-file-extension work. We must mark the old tuple to
3876 * show that it's locked, else other processes may try to update it
3877 * themselves.
3878 *
3879 * We need to invoke the toaster if there are already any out-of-line
3880 * toasted values present, or if the new tuple is over-threshold.
3881 */
3882 if (relation->rd_rel->relkind != RELKIND_RELATION &&
3883 relation->rd_rel->relkind != RELKIND_MATVIEW)
3884 {
3885 /* toast table entries should never be recursively toasted */
3888 need_toast = false;
3889 }
3890 else
3893 newtup->t_len > TOAST_TUPLE_THRESHOLD);
3894
3896
3897 newtupsize = MAXALIGN(newtup->t_len);
3898
3900 {
3904 bool cleared_all_frozen = false;
3905
3906 /*
3907 * To prevent concurrent sessions from updating the tuple, we have to
3908 * temporarily mark it locked, while we release the page-level lock.
3909 *
3910 * To satisfy the rule that any xid potentially appearing in a buffer
3911 * written out to disk, we unfortunately have to WAL log this
3912 * temporary modification. We can reuse xl_heap_lock for this
3913 * purpose. If we crash/error before following through with the
3914 * actual update, xmax will be of an aborted transaction, allowing
3915 * other sessions to proceed.
3916 */
3917
3918 /*
3919 * Compute xmax / infomask appropriate for locking the tuple. This has
3920 * to be done separately from the combo that's going to be used for
3921 * updating, because the potentially created multixact would otherwise
3922 * be wrong.
3923 */
3925 oldtup.t_data->t_infomask,
3926 oldtup.t_data->t_infomask2,
3927 xid, *lockmode, false,
3930
3932
3934
3935 /* Clear obsolete visibility flags ... */
3936 oldtup.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
3937 oldtup.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
3939 /* ... and store info about transaction updating this tuple */
3942 oldtup.t_data->t_infomask |= infomask_lock_old_tuple;
3943 oldtup.t_data->t_infomask2 |= infomask2_lock_old_tuple;
3945
3946 /* temporarily make it look not-updated, but locked */
3947 oldtup.t_data->t_ctid = oldtup.t_self;
3948
3949 /*
3950 * Clear all-frozen bit on visibility map if needed. We could
3951 * immediately reset ALL_VISIBLE, but given that the WAL logging
3952 * overhead would be unchanged, that doesn't seem necessarily
3953 * worthwhile.
3954 */
3955 if (PageIsAllVisible(page) &&
3956 visibilitymap_clear(relation, block, vmbuffer,
3958 cleared_all_frozen = true;
3959
3960 MarkBufferDirty(buffer);
3961
3962 if (RelationNeedsWAL(relation))
3963 {
3966
3969
3970 xlrec.offnum = ItemPointerGetOffsetNumber(&oldtup.t_self);
3972 xlrec.infobits_set = compute_infobits(oldtup.t_data->t_infomask,
3973 oldtup.t_data->t_infomask2);
3974 xlrec.flags =
3978 PageSetLSN(page, recptr);
3979 }
3980
3982
3984
3985 /*
3986 * Let the toaster do its thing, if needed.
3987 *
3988 * Note: below this point, heaptup is the data we actually intend to
3989 * store into the relation; newtup is the caller's original untoasted
3990 * data.
3991 */
3992 if (need_toast)
3993 {
3994 /* Note we always use WAL and FSM during updates */
3996 newtupsize = MAXALIGN(heaptup->t_len);
3997 }
3998 else
3999 heaptup = newtup;
4000
4001 /*
4002 * Now, do we need a new page for the tuple, or not? This is a bit
4003 * tricky since someone else could have added tuples to the page while
4004 * we weren't looking. We have to recheck the available space after
4005 * reacquiring the buffer lock. But don't bother to do that if the
4006 * former amount of free space is still not enough; it's unlikely
4007 * there's more free now than before.
4008 *
4009 * What's more, if we need to get a new page, we will need to acquire
4010 * buffer locks on both old and new pages. To avoid deadlock against
4011 * some other backend trying to get the same two locks in the other
4012 * order, we must be consistent about the order we get the locks in.
4013 * We use the rule "lock the lower-numbered page of the relation
4014 * first". To implement this, we must do RelationGetBufferForTuple
4015 * while not holding the lock on the old page, and we must rely on it
4016 * to get the locks on both pages in the correct order.
4017 *
4018 * Another consideration is that we need visibility map page pin(s) if
4019 * we will have to clear the all-visible flag on either page. If we
4020 * call RelationGetBufferForTuple, we rely on it to acquire any such
4021 * pins; but if we don't, we have to handle that here. Hence we need
4022 * a loop.
4023 */
4024 for (;;)
4025 {
4026 if (newtupsize > pagefree)
4027 {
4028 /* It doesn't fit, must use RelationGetBufferForTuple. */
4029 newbuf = RelationGetBufferForTuple(relation, heaptup->t_len,
4030 buffer, 0, NULL,
4031 &vmbuffer_new, &vmbuffer,
4032 0);
4033 /* We're all done. */
4034 break;
4035 }
4036 /* Acquire VM page pin if needed and we don't have it. */
4037 if (vmbuffer == InvalidBuffer && PageIsAllVisible(page))
4038 visibilitymap_pin(relation, block, &vmbuffer);
4039 /* Re-acquire the lock on the old tuple's page. */
4041 /* Re-check using the up-to-date free space */
4043 if (newtupsize > pagefree ||
4044 (vmbuffer == InvalidBuffer && PageIsAllVisible(page)))
4045 {
4046 /*
4047 * Rats, it doesn't fit anymore, or somebody just now set the
4048 * all-visible flag. We must now unlock and loop to avoid
4049 * deadlock. Fortunately, this path should seldom be taken.
4050 */
4052 }
4053 else
4054 {
4055 /* We're all done. */
4056 newbuf = buffer;
4057 break;
4058 }
4059 }
4060 }
4061 else
4062 {
4063 /* No TOAST work needed, and it'll fit on same page */
4064 newbuf = buffer;
4065 heaptup = newtup;
4066 }
4067
4069
4070 /*
4071 * We're about to do the actual update -- check for conflict first, to
4072 * avoid possibly having to roll back work we've just done.
4073 *
4074 * This is safe without a recheck as long as there is no possibility of
4075 * another process scanning the pages between this check and the update
4076 * being visible to the scan (i.e., exclusive buffer content lock(s) are
4077 * continuously held from this point until the tuple update is visible).
4078 *
4079 * For the new tuple the only check needed is at the relation level, but
4080 * since both tuples are in the same relation and the check for oldtup
4081 * will include checking the relation level, there is no benefit to a
4082 * separate check for the new tuple.
4083 */
4084 CheckForSerializableConflictIn(relation, &oldtup.t_self,
4085 BufferGetBlockNumber(buffer));
4086
4087 /*
4088 * At this point newbuf and buffer are both pinned and locked, and newbuf
4089 * has enough space for the new tuple. If they are the same buffer, only
4090 * one pin is held.
4091 */
4092
4093 if (newbuf == buffer)
4094 {
4095 /*
4096 * Since the new tuple is going into the same page, we might be able
4097 * to do a HOT update. Check if any of the index columns have been
4098 * changed.
4099 */
4101 {
4102 use_hot_update = true;
4103
4104 /*
4105 * If none of the columns that are used in hot-blocking indexes
4106 * were updated, we can apply HOT, but we do still need to check
4107 * if we need to update the summarizing indexes, and update those
4108 * indexes if the columns were updated, or we may fail to detect
4109 * e.g. value bound changes in BRIN minmax indexes.
4110 */
4112 summarized_update = true;
4113 }
4114 }
4115 else
4116 {
4117 /* Set a hint that the old page could use prune/defrag */
4118 PageSetFull(page);
4119 }
4120
4121 /*
4122 * Compute replica identity tuple before entering the critical section so
4123 * we don't PANIC upon a memory allocation failure.
4124 * ExtractReplicaIdentity() will return NULL if nothing needs to be
4125 * logged. Pass old key required as true only if the replica identity key
4126 * columns are modified or it has external data.
4127 */
4132
4133 /* NO EREPORT(ERROR) from here till changes are logged */
4135
4136 /*
4137 * If this transaction commits, the old tuple will become DEAD sooner or
4138 * later. Set flag that this page is a candidate for pruning once our xid
4139 * falls below the OldestXmin horizon. If the transaction finally aborts,
4140 * the subsequent page pruning will be a no-op and the hint will be
4141 * cleared.
4142 *
4143 * XXX Should we set hint on newbuf as well? If the transaction aborts,
4144 * there would be a prunable tuple in the newbuf; but for now we choose
4145 * not to optimize for aborts. Note that heap_xlog_update must be kept in
4146 * sync if this decision changes.
4147 */
4148 PageSetPrunable(page, xid);
4149
4150 if (use_hot_update)
4151 {
4152 /* Mark the old tuple as HOT-updated */
4154 /* And mark the new tuple as heap-only */
4156 /* Mark the caller's copy too, in case different from heaptup */
4158 }
4159 else
4160 {
4161 /* Make sure tuples are correctly marked as not-HOT */
4165 }
4166
4167 RelationPutHeapTuple(relation, newbuf, heaptup, false); /* insert new tuple */
4168
4169
4170 /* Clear obsolete visibility flags, possibly set by ourselves above... */
4171 oldtup.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
4172 oldtup.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
4173 /* ... and store info about transaction updating this tuple */
4176 oldtup.t_data->t_infomask |= infomask_old_tuple;
4177 oldtup.t_data->t_infomask2 |= infomask2_old_tuple;
4179
4180 /* record address of new tuple in t_ctid of old one */
4181 oldtup.t_data->t_ctid = heaptup->t_self;
4182
4183 /* clear PD_ALL_VISIBLE flags, reset all visibilitymap bits */
4184 if (PageIsAllVisible(page))
4185 {
4186 all_visible_cleared = true;
4187 PageClearAllVisible(page);
4188 visibilitymap_clear(relation, BufferGetBlockNumber(buffer),
4189 vmbuffer, VISIBILITYMAP_VALID_BITS);
4190 }
4191 if (newbuf != buffer && PageIsAllVisible(newpage))
4192 {
4197 }
4198
4199 if (newbuf != buffer)
4201 MarkBufferDirty(buffer);
4202
4203 /* XLOG stuff */
4204 if (RelationNeedsWAL(relation))
4205 {
4207
4208 /*
4209 * For logical decoding we need combo CIDs to properly decode the
4210 * catalog.
4211 */
4213 {
4214 log_heap_new_cid(relation, &oldtup);
4215 log_heap_new_cid(relation, heaptup);
4216 }
4217
4218 recptr = log_heap_update(relation, buffer,
4223 if (newbuf != buffer)
4224 {
4226 }
4227 PageSetLSN(page, recptr);
4228 }
4229
4231
4232 if (newbuf != buffer)
4235
4236 /*
4237 * Mark old tuple for invalidation from system caches at next command
4238 * boundary, and mark the new tuple for invalidation in case we abort. We
4239 * have to do this before releasing the buffer because oldtup is in the
4240 * buffer. (heaptup is all in local memory, but it's necessary to process
4241 * both tuple versions in one call to inval.c so we can avoid redundant
4242 * sinval messages.)
4243 */
4245
4246 /* Now we can release the buffer(s) */
4247 if (newbuf != buffer)
4249 ReleaseBuffer(buffer);
4252 if (BufferIsValid(vmbuffer))
4253 ReleaseBuffer(vmbuffer);
4254
4255 /*
4256 * Release the lmgr tuple lock, if we had it.
4257 */
4258 if (have_tuple_lock)
4259 UnlockTupleTuplock(relation, &(oldtup.t_self), *lockmode);
4260
4261 pgstat_count_heap_update(relation, use_hot_update, newbuf != buffer);
4262
4263 /*
4264 * If heaptup is a private copy, release it. Don't forget to copy t_self
4265 * back to the caller's image, too.
4266 */
4267 if (heaptup != newtup)
4268 {
4269 newtup->t_self = heaptup->t_self;
4271 }
4272
4273 /*
4274 * If it is a HOT update, the update may still need to update summarized
4275 * indexes, lest we fail to update those summaries and get incorrect
4276 * results (for example, minmax bounds of the block may change with this
4277 * update).
4278 */
4279 if (use_hot_update)
4280 {
4283 else
4285 }
4286 else
4288
4291
4298
4299 return TM_Ok;
4300}

References Assert, AssertHasSnapshotForToast(), bms_add_members(), bms_free(), bms_overlap(), BUFFER_LOCK_EXCLUSIVE, BUFFER_LOCK_UNLOCK, BufferGetBlockNumber(), BufferGetPage(), BufferIsValid(), CacheInvalidateHeapTuple(), CheckForSerializableConflictIn(), TM_FailureData::cmax, compute_infobits(), compute_new_xmax_infomask(), TM_FailureData::ctid, DoesMultiXactIdConflict(), END_CRIT_SECTION, ereport, errcode(), errmsg, ERROR, ExtractReplicaIdentity(), fb(), GetCurrentTransactionId(), GetMultiXactIdHintBits(), HEAP2_XACT_MASK, heap_acquire_tuplock(), heap_freetuple(), HEAP_LOCKED_UPGRADED(), HEAP_MOVED, heap_toast_insert_or_update(), HEAP_UPDATED, HEAP_XACT_MASK, HEAP_XMAX_BITS, HEAP_XMAX_INVALID, HEAP_XMAX_IS_KEYSHR_LOCKED(), HEAP_XMAX_IS_LOCKED_ONLY(), HEAP_XMAX_IS_MULTI, HEAP_XMAX_KEYSHR_LOCK, HEAP_XMAX_LOCK_ONLY, HeapDetermineColumnsInfo(), HeapTupleClearHeapOnly(), HeapTupleClearHotUpdated(), HeapTupleGetUpdateXid(), HeapTupleHasExternal(), HeapTupleHeaderAdjustCmax(), HeapTupleHeaderGetCmax(), HeapTupleHeaderGetNatts, HeapTupleHeaderGetRawXmax(), HeapTupleHeaderGetUpdateXid(), HeapTupleHeaderSetCmax(), HeapTupleHeaderSetCmin(), HeapTupleHeaderSetXmax(), HeapTupleHeaderSetXmin(), HeapTupleSatisfiesUpdate(), HeapTupleSatisfiesVisibility(), HeapTupleSetHeapOnly(), HeapTupleSetHotUpdated(), INDEX_ATTR_BITMAP_HOT_BLOCKING, INDEX_ATTR_BITMAP_IDENTITY_KEY, INDEX_ATTR_BITMAP_KEY, INDEX_ATTR_BITMAP_SUMMARIZED, INJECTION_POINT, InvalidBuffer, InvalidCommandId, InvalidSnapshot, InvalidTransactionId, IsInParallelMode(), ItemIdGetLength, ItemIdIsNormal, ItemPointerEquals(), ItemPointerGetBlockNumber(), ItemPointerGetOffsetNumber(), ItemPointerIsValid(), LockBuffer(), LockTupleExclusive, LockTupleNoKeyExclusive, LockWaitBlock, log_heap_new_cid(), log_heap_update(), MarkBufferDirty(), MAXALIGN, MultiXactIdSetOldestMember(), MultiXactIdWait(), MultiXactStatusNoKeyUpdate, MultiXactStatusUpdate, PageClearAllVisible(), PageGetHeapFreeSpace(), PageGetItem(), PageGetItemId(), 
PageIsAllVisible(), PageSetFull(), PageSetLSN(), PageSetPrunable, pgstat_count_heap_update(), RelationData::rd_rel, ReadBuffer(), REGBUF_STANDARD, RelationGetBufferForTuple(), RelationGetIndexAttrBitmap(), RelationGetNumberOfAttributes, RelationGetRelid, RelationIsAccessibleInLogicalDecoding, RelationNeedsWAL, RelationPutHeapTuple(), RelationSupportsSysCache(), ReleaseBuffer(), SizeOfHeapLock, START_CRIT_SECTION, TM_BeingModified, TM_Deleted, TM_Invisible, TM_Ok, TM_SelfModified, TM_Updated, TOAST_TUPLE_THRESHOLD, TransactionIdDidAbort(), TransactionIdEquals, TransactionIdIsCurrentTransactionId(), TransactionIdIsValid, TU_All, TU_None, TU_Summarizing, UnlockReleaseBuffer(), UnlockTupleTuplock, UpdateXmaxHintBits(), VISIBILITYMAP_ALL_FROZEN, visibilitymap_clear(), visibilitymap_pin(), VISIBILITYMAP_VALID_BITS, XactLockTableWait(), XLH_LOCK_ALL_FROZEN_CLEARED, XLOG_HEAP_LOCK, XLogBeginInsert(), XLogInsert(), XLogRegisterBuffer(), XLogRegisterData(), XLTW_Update, TM_FailureData::xmax, and xmax_infomask_changed().

Referenced by heapam_tuple_update(), and simple_heap_update().

◆ HeapCheckForSerializableConflictOut()

void HeapCheckForSerializableConflictOut ( bool  visible,
Relation  relation,
HeapTuple  tuple,
Buffer  buffer,
Snapshot  snapshot 
)

Definition at line 9345 of file heapam.c.

9348{
9349 TransactionId xid;
9351
9352 if (!CheckForSerializableConflictOutNeeded(relation, snapshot))
9353 return;
9354
9355 /*
9356 * Check to see whether the tuple has been written to by a concurrent
9357 * transaction, either to create it not visible to us, or to delete it
9358 * while it is visible to us. The "visible" bool indicates whether the
9359 * tuple is visible to us, while HeapTupleSatisfiesVacuum checks what else
9360 * is going on with it.
9361 *
9362 * In the event of a concurrently inserted tuple that also happens to have
9363 * been concurrently updated (by a separate transaction), the xmin of the
9364 * tuple will be used -- not the updater's xid.
9365 */
9367 switch (htsvResult)
9368 {
9369 case HEAPTUPLE_LIVE:
9370 if (visible)
9371 return;
9372 xid = HeapTupleHeaderGetXmin(tuple->t_data);
9373 break;
9376 if (visible)
9377 xid = HeapTupleHeaderGetUpdateXid(tuple->t_data);
9378 else
9379 xid = HeapTupleHeaderGetXmin(tuple->t_data);
9380
9382 {
9383 /* This is like the HEAPTUPLE_DEAD case */
9384 Assert(!visible);
9385 return;
9386 }
9387 break;
9389 xid = HeapTupleHeaderGetXmin(tuple->t_data);
9390 break;
9391 case HEAPTUPLE_DEAD:
9392 Assert(!visible);
9393 return;
9394 default:
9395
9396 /*
9397 * The only way to get to this default clause is if a new value is
9398 * added to the enum type without adding it to this switch
9399 * statement. That's a bug, so elog.
9400 */
9401 elog(ERROR, "unrecognized return value from HeapTupleSatisfiesVacuum: %u", htsvResult);
9402
9403 /*
9404 * In spite of having all enum values covered and calling elog on
9405 * this default, some compilers think this is a code path which
9406 * allows xid to be used below without initialization. Silence
9407 * that warning.
9408 */
9410 }
9411
9414
9415 /*
9416 * Find top level xid. Bail out if xid is too early to be a conflict, or
9417 * if it's our own xid.
9418 */
9420 return;
9423 return;
9424
9425 CheckForSerializableConflictOut(relation, xid, snapshot);
9426}

References Assert, CheckForSerializableConflictOut(), CheckForSerializableConflictOutNeeded(), elog, ERROR, fb(), GetTopTransactionIdIfAny(), HEAPTUPLE_DEAD, HEAPTUPLE_DELETE_IN_PROGRESS, HEAPTUPLE_INSERT_IN_PROGRESS, HEAPTUPLE_LIVE, HEAPTUPLE_RECENTLY_DEAD, HeapTupleHeaderGetUpdateXid(), HeapTupleHeaderGetXmin(), HeapTupleSatisfiesVacuum(), InvalidTransactionId, SubTransGetTopmostTransaction(), HeapTupleData::t_data, TransactionIdEquals, TransactionIdFollowsOrEquals(), TransactionIdIsValid, TransactionIdPrecedes(), and TransactionXmin.

Referenced by BitmapHeapScanNextBlock(), heap_fetch(), heap_get_latest_tid(), heap_hot_search_buffer(), heapam_scan_sample_next_tuple(), heapgettup(), and page_collect_tuples().

◆ HeapDetermineColumnsInfo()

static Bitmapset * HeapDetermineColumnsInfo ( Relation  relation,
Bitmapset interesting_cols,
Bitmapset external_cols,
HeapTuple  oldtup,
HeapTuple  newtup,
bool has_external 
)
static

Definition at line 4480 of file heapam.c.

4485{
4486 int attidx;
4488 TupleDesc tupdesc = RelationGetDescr(relation);
4489
4490 attidx = -1;
4491 while ((attidx = bms_next_member(interesting_cols, attidx)) >= 0)
4492 {
4493 /* attidx is zero-based, attrnum is the normal attribute number */
4495 Datum value1,
4496 value2;
4497 bool isnull1,
4498 isnull2;
4499
4500 /*
4501 * If it's a whole-tuple reference, say "not equal". It's not really
4502 * worth supporting this case, since it could only succeed after a
4503 * no-op update, which is hardly a case worth optimizing for.
4504 */
4505 if (attrnum == 0)
4506 {
4507 modified = bms_add_member(modified, attidx);
4508 continue;
4509 }
4510
4511 /*
4512 * Likewise, automatically say "not equal" for any system attribute
4513 * other than tableOID; we cannot expect these to be consistent in a
4514 * HOT chain, or even to be set correctly yet in the new tuple.
4515 */
4516 if (attrnum < 0)
4517 {
4518 if (attrnum != TableOidAttributeNumber)
4519 {
4520 modified = bms_add_member(modified, attidx);
4521 continue;
4522 }
4523 }
4524
4525 /*
4526 * Extract the corresponding values. XXX this is pretty inefficient
4527 * if there are many indexed columns. Should we do a single
4528 * heap_deform_tuple call on each tuple, instead? But that doesn't
4529 * work for system columns ...
4530 */
4531 value1 = heap_getattr(oldtup, attrnum, tupdesc, &isnull1);
4532 value2 = heap_getattr(newtup, attrnum, tupdesc, &isnull2);
4533
4534 if (!heap_attr_equals(tupdesc, attrnum, value1,
4535 value2, isnull1, isnull2))
4536 {
4537 modified = bms_add_member(modified, attidx);
4538 continue;
4539 }
4540
4541 /*
4542 * No need to check attributes that can't be stored externally. Note
4543 * that system attributes can't be stored externally.
4544 */
4545 if (attrnum < 0 || isnull1 ||
4546 TupleDescCompactAttr(tupdesc, attrnum - 1)->attlen != -1)
4547 continue;
4548
4549 /*
4550 * Check if the old tuple's attribute is stored externally and is a
4551 * member of external_cols.
4552 */
4555 *has_external = true;
4556 }
4557
4558 return modified;
4559}

References attlen, bms_add_member(), bms_is_member(), bms_next_member(), DatumGetPointer(), fb(), FirstLowInvalidHeapAttributeNumber, heap_attr_equals(), heap_getattr(), RelationGetDescr, TableOidAttributeNumber, TupleDescCompactAttr(), and VARATT_IS_EXTERNAL().

Referenced by heap_update().

◆ heapgettup()

static void heapgettup ( HeapScanDesc  scan,
ScanDirection  dir,
int  nkeys,
ScanKey  key 
)
static

Definition at line 960 of file heapam.c.

964{
965 HeapTuple tuple = &(scan->rs_ctup);
966 Page page;
968 int linesleft;
969
970 if (likely(scan->rs_inited))
971 {
972 /* continue from previously returned page/tuple */
974 page = heapgettup_continue_page(scan, dir, &linesleft, &lineoff);
975 goto continue_page;
976 }
977
978 /*
979 * advance the scan until we find a qualifying tuple or run out of stuff
980 * to scan
981 */
982 while (true)
983 {
984 heap_fetch_next_buffer(scan, dir);
985
986 /* did we run out of blocks to scan? */
987 if (!BufferIsValid(scan->rs_cbuf))
988 break;
989
991
993 page = heapgettup_start_page(scan, dir, &linesleft, &lineoff);
995
996 /*
997 * Only continue scanning the page while we have lines left.
998 *
999 * Note that this protects us from accessing line pointers past
1000 * PageGetMaxOffsetNumber(); both for forward scans when we resume the
1001 * table scan, and for when we start scanning a new page.
1002 */
1003 for (; linesleft > 0; linesleft--, lineoff += dir)
1004 {
1005 bool visible;
1007
1008 if (!ItemIdIsNormal(lpp))
1009 continue;
1010
1011 tuple->t_data = (HeapTupleHeader) PageGetItem(page, lpp);
1012 tuple->t_len = ItemIdGetLength(lpp);
1013 ItemPointerSet(&(tuple->t_self), scan->rs_cblock, lineoff);
1014
1015 visible = HeapTupleSatisfiesVisibility(tuple,
1016 scan->rs_base.rs_snapshot,
1017 scan->rs_cbuf);
1018
1020 tuple, scan->rs_cbuf,
1021 scan->rs_base.rs_snapshot);
1022
1023 /* skip tuples not visible to this snapshot */
1024 if (!visible)
1025 continue;
1026
1027 /* skip any tuples that don't match the scan key */
1028 if (key != NULL &&
1030 nkeys, key))
1031 continue;
1032
1034 scan->rs_coffset = lineoff;
1035 return;
1036 }
1037
1038 /*
1039 * if we get here, it means we've exhausted the items on this page and
1040 * it's time to move to the next.
1041 */
1043 }
1044
1045 /* end of scan */
1046 if (BufferIsValid(scan->rs_cbuf))
1047 ReleaseBuffer(scan->rs_cbuf);
1048
1049 scan->rs_cbuf = InvalidBuffer;
1052 tuple->t_data = NULL;
1053 scan->rs_inited = false;
1054}

References Assert, BUFFER_LOCK_SHARE, BUFFER_LOCK_UNLOCK, BufferGetBlockNumber(), BufferIsValid(), fb(), heap_fetch_next_buffer(), HeapCheckForSerializableConflictOut(), heapgettup_continue_page(), heapgettup_start_page(), HeapKeyTest(), HeapTupleSatisfiesVisibility(), InvalidBlockNumber, InvalidBuffer, ItemIdGetLength, ItemIdIsNormal, ItemPointerSet(), likely, LockBuffer(), PageGetItem(), PageGetItemId(), RelationGetDescr, ReleaseBuffer(), HeapScanDescData::rs_base, HeapScanDescData::rs_cblock, HeapScanDescData::rs_cbuf, HeapScanDescData::rs_coffset, HeapScanDescData::rs_ctup, HeapScanDescData::rs_inited, HeapScanDescData::rs_prefetch_block, TableScanDescData::rs_rd, TableScanDescData::rs_snapshot, HeapTupleData::t_data, HeapTupleData::t_len, and HeapTupleData::t_self.

Referenced by heap_getnext(), heap_getnextslot(), and heap_getnextslot_tidrange().

◆ heapgettup_advance_block()

static BlockNumber heapgettup_advance_block ( HeapScanDesc  scan,
BlockNumber  block,
ScanDirection  dir 
)
inlinestatic

Definition at line 876 of file heapam.c.

877{
878 Assert(scan->rs_base.rs_parallel == NULL);
879
881 {
882 block++;
883
884 /* wrap back to the start of the heap */
885 if (block >= scan->rs_nblocks)
886 block = 0;
887
888 /*
889 * Report our new scan position for synchronization purposes. We don't
890 * do that when moving backwards, however. That would just mess up any
891 * other forward-moving scanners.
892 *
893 * Note: we do this before checking for end of scan so that the final
894 * state of the position hint is back at the start of the rel. That's
895 * not strictly necessary, but otherwise when you run the same query
896 * multiple times the starting position would shift a little bit
897 * backwards on every invocation, which is confusing. We don't
898 * guarantee any specific ordering in general, though.
899 */
900 if (scan->rs_base.rs_flags & SO_ALLOW_SYNC)
901 ss_report_location(scan->rs_base.rs_rd, block);
902
903 /* we're done if we're back at where we started */
904 if (block == scan->rs_startblock)
905 return InvalidBlockNumber;
906
907 /* check if the limit imposed by heap_setscanlimits() is met */
908 if (scan->rs_numblocks != InvalidBlockNumber)
909 {
910 if (--scan->rs_numblocks == 0)
911 return InvalidBlockNumber;
912 }
913
914 return block;
915 }
916 else
917 {
918 /* we're done if the last block is the start position */
919 if (block == scan->rs_startblock)
920 return InvalidBlockNumber;
921
922 /* check if the limit imposed by heap_setscanlimits() is met */
923 if (scan->rs_numblocks != InvalidBlockNumber)
924 {
925 if (--scan->rs_numblocks == 0)
926 return InvalidBlockNumber;
927 }
928
929 /* wrap to the end of the heap when the last page was page 0 */
930 if (block == 0)
931 block = scan->rs_nblocks;
932
933 block--;
934
935 return block;
936 }
937}

References Assert, fb(), InvalidBlockNumber, likely, HeapScanDescData::rs_base, TableScanDescData::rs_flags, HeapScanDescData::rs_nblocks, HeapScanDescData::rs_numblocks, TableScanDescData::rs_parallel, TableScanDescData::rs_rd, HeapScanDescData::rs_startblock, ScanDirectionIsForward, SO_ALLOW_SYNC, and ss_report_location().

Referenced by heap_scan_stream_read_next_serial().

◆ heapgettup_continue_page()

static Page heapgettup_continue_page ( HeapScanDesc  scan,
ScanDirection  dir,
int linesleft,
OffsetNumber lineoff 
)
inlinestatic

Definition at line 830 of file heapam.c.

832{
833 Page page;
834
835 Assert(scan->rs_inited);
837
838 /* Caller is responsible for ensuring buffer is locked if needed */
839 page = BufferGetPage(scan->rs_cbuf);
840
841 if (ScanDirectionIsForward(dir))
842 {
844 *linesleft = PageGetMaxOffsetNumber(page) - (*lineoff) + 1;
845 }
846 else
847 {
848 /*
849 * The previous returned tuple may have been vacuumed since the
850 * previous scan when we use a non-MVCC snapshot, so we must
851 * re-establish the lineoff <= PageGetMaxOffsetNumber(page) invariant
852 */
854 *linesleft = *lineoff;
855 }
856
857 /* lineoff now references the physically previous or next tid */
858 return page;
859}

References Assert, BufferGetPage(), BufferIsValid(), fb(), Min, OffsetNumberNext, OffsetNumberPrev, PageGetMaxOffsetNumber(), HeapScanDescData::rs_cbuf, HeapScanDescData::rs_coffset, HeapScanDescData::rs_inited, and ScanDirectionIsForward.

Referenced by heapgettup().

◆ heapgettup_initial_block()

static pg_noinline BlockNumber heapgettup_initial_block ( HeapScanDesc  scan,
ScanDirection  dir 
)
static

Definition at line 752 of file heapam.c.

753{
754 Assert(!scan->rs_inited);
755 Assert(scan->rs_base.rs_parallel == NULL);
756
757 /* When there are no pages to scan, return InvalidBlockNumber */
758 if (scan->rs_nblocks == 0 || scan->rs_numblocks == 0)
759 return InvalidBlockNumber;
760
761 if (ScanDirectionIsForward(dir))
762 {
763 return scan->rs_startblock;
764 }
765 else
766 {
767 /*
768 * Disable reporting to syncscan logic in a backwards scan; it's not
769 * very likely anyone else is doing the same thing at the same time,
770 * and much more likely that we'll just bollix things for forward
771 * scanners.
772 */
774
775 /*
776 * Start from last page of the scan. Ensure we take into account
777 * rs_numblocks if it's been adjusted by heap_setscanlimits().
778 */
779 if (scan->rs_numblocks != InvalidBlockNumber)
780 return (scan->rs_startblock + scan->rs_numblocks - 1) % scan->rs_nblocks;
781
782 if (scan->rs_startblock > 0)
783 return scan->rs_startblock - 1;
784
785 return scan->rs_nblocks - 1;
786 }
787}

References Assert, fb(), InvalidBlockNumber, HeapScanDescData::rs_base, TableScanDescData::rs_flags, HeapScanDescData::rs_inited, HeapScanDescData::rs_nblocks, HeapScanDescData::rs_numblocks, TableScanDescData::rs_parallel, HeapScanDescData::rs_startblock, and ScanDirectionIsForward.

Referenced by heap_scan_stream_read_next_serial().

◆ heapgettup_pagemode()

static void heapgettup_pagemode ( HeapScanDesc  scan,
ScanDirection  dir,
int  nkeys,
ScanKey  key 
)
static

Definition at line 1070 of file heapam.c.

1074{
1075 HeapTuple tuple = &(scan->rs_ctup);
1076 Page page;
1079
1080 if (likely(scan->rs_inited))
1081 {
1082 /* continue from previously returned page/tuple */
1083 page = BufferGetPage(scan->rs_cbuf);
1084
1085 lineindex = scan->rs_cindex + dir;
1086 if (ScanDirectionIsForward(dir))
1087 linesleft = scan->rs_ntuples - lineindex;
1088 else
1089 linesleft = scan->rs_cindex;
1090 /* lineindex now references the next or previous visible tid */
1091
1092 goto continue_page;
1093 }
1094
1095 /*
1096 * advance the scan until we find a qualifying tuple or run out of stuff
1097 * to scan
1098 */
1099 while (true)
1100 {
1101 heap_fetch_next_buffer(scan, dir);
1102
1103 /* did we run out of blocks to scan? */
1104 if (!BufferIsValid(scan->rs_cbuf))
1105 break;
1106
1108
1109 /* prune the page and determine visible tuple offsets */
1111 page = BufferGetPage(scan->rs_cbuf);
1112 linesleft = scan->rs_ntuples;
1114
1115 /* block is the same for all tuples, set it once outside the loop */
1117
1118 /* lineindex now references the next or previous visible tid */
1120
1121 for (; linesleft > 0; linesleft--, lineindex += dir)
1122 {
1123 ItemId lpp;
1125
1126 Assert(lineindex < scan->rs_ntuples);
1128 lpp = PageGetItemId(page, lineoff);
1130
1131 tuple->t_data = (HeapTupleHeader) PageGetItem(page, lpp);
1132 tuple->t_len = ItemIdGetLength(lpp);
1134
1135 /* skip any tuples that don't match the scan key */
1136 if (key != NULL &&
1138 nkeys, key))
1139 continue;
1140
1141 scan->rs_cindex = lineindex;
1142 return;
1143 }
1144 }
1145
1146 /* end of scan */
1147 if (BufferIsValid(scan->rs_cbuf))
1148 ReleaseBuffer(scan->rs_cbuf);
1149 scan->rs_cbuf = InvalidBuffer;
1152 tuple->t_data = NULL;
1153 scan->rs_inited = false;
1154}

References Assert, BufferGetBlockNumber(), BufferGetPage(), BufferIsValid(), fb(), heap_fetch_next_buffer(), heap_prepare_pagescan(), HeapKeyTest(), InvalidBlockNumber, InvalidBuffer, ItemIdGetLength, ItemIdIsNormal, ItemPointerSetBlockNumber(), ItemPointerSetOffsetNumber(), likely, PageGetItem(), PageGetItemId(), RelationGetDescr, ReleaseBuffer(), HeapScanDescData::rs_cblock, HeapScanDescData::rs_cbuf, HeapScanDescData::rs_cindex, HeapScanDescData::rs_ctup, HeapScanDescData::rs_inited, HeapScanDescData::rs_ntuples, HeapScanDescData::rs_prefetch_block, TableScanDescData::rs_rd, ScanDirectionIsForward, HeapTupleData::t_data, HeapTupleData::t_len, and HeapTupleData::t_self.

Referenced by heap_getnext(), heap_getnextslot(), and heap_getnextslot_tidrange().

◆ heapgettup_start_page()

static Page heapgettup_start_page ( HeapScanDesc  scan,
ScanDirection  dir,
int linesleft,
OffsetNumber lineoff 
)
static

Definition at line 799 of file heapam.c.

801{
802 Page page;
803
804 Assert(scan->rs_inited);
806
807 /* Caller is responsible for ensuring buffer is locked if needed */
808 page = BufferGetPage(scan->rs_cbuf);
809
811
812 if (ScanDirectionIsForward(dir))
814 else
816
817 /* lineoff now references the physically previous or next tid */
818 return page;
819}

References Assert, BufferGetPage(), BufferIsValid(), fb(), FirstOffsetNumber, PageGetMaxOffsetNumber(), HeapScanDescData::rs_cbuf, HeapScanDescData::rs_inited, and ScanDirectionIsForward.

Referenced by heapgettup().

◆ HeapTupleGetUpdateXid()

◆ HeapTupleHeaderAdvanceConflictHorizon()

void HeapTupleHeaderAdvanceConflictHorizon ( HeapTupleHeader  tuple,
TransactionId snapshotConflictHorizon 
)

Definition at line 8073 of file heapam.c.

8075{
8079
8080 if (tuple->t_infomask & HEAP_MOVED)
8081 {
8082 if (TransactionIdPrecedes(*snapshotConflictHorizon, xvac))
8083 *snapshotConflictHorizon = xvac;
8084 }
8085
8086 /*
8087 * Ignore tuples inserted by an aborted transaction or if the tuple was
8088 * updated/deleted by the inserting transaction.
8089 *
8090 * Look for a committed hint bit, or if no xmin bit is set, check clog.
8091 */
8092 if (HeapTupleHeaderXminCommitted(tuple) ||
8094 {
8095 if (xmax != xmin &&
8096 TransactionIdFollows(xmax, *snapshotConflictHorizon))
8097 *snapshotConflictHorizon = xmax;
8098 }
8099}

References fb(), HEAP_MOVED, HeapTupleHeaderGetUpdateXid(), HeapTupleHeaderGetXmin(), HeapTupleHeaderGetXvac(), HeapTupleHeaderXminCommitted(), HeapTupleHeaderXminInvalid(), HeapTupleHeaderData::t_infomask, TransactionIdDidCommit(), TransactionIdFollows(), and TransactionIdPrecedes().

Referenced by heap_index_delete_tuples(), heap_prune_chain(), and prune_freeze_plan().

◆ index_delete_check_htid()

static void index_delete_check_htid ( TM_IndexDeleteOp delstate,
Page  page,
OffsetNumber  maxoff,
const ItemPointerData htid,
TM_IndexStatus istatus 
)
inlinestatic

Definition at line 8158 of file heapam.c.

8161{
8163 ItemId iid;
8164
8165 Assert(OffsetNumberIsValid(istatus->idxoffnum));
8166
8167 if (unlikely(indexpagehoffnum > maxoff))
8168 ereport(ERROR,
8170 errmsg_internal("heap tid from index tuple (%u,%u) points past end of heap page line pointer array at offset %u of block %u in index \"%s\"",
8173 istatus->idxoffnum, delstate->iblknum,
8175
8177 if (unlikely(!ItemIdIsUsed(iid)))
8178 ereport(ERROR,
8180 errmsg_internal("heap tid from index tuple (%u,%u) points to unused heap page item at offset %u of block %u in index \"%s\"",
8183 istatus->idxoffnum, delstate->iblknum,
8185
8186 if (ItemIdHasStorage(iid))
8187 {
8188 HeapTupleHeader htup;
8189
8191 htup = (HeapTupleHeader) PageGetItem(page, iid);
8192
8194 ereport(ERROR,
8196 errmsg_internal("heap tid from index tuple (%u,%u) points to heap-only tuple at offset %u of block %u in index \"%s\"",
8199 istatus->idxoffnum, delstate->iblknum,
8201 }
8202}

References Assert, ereport, errcode(), errmsg_internal(), ERROR, fb(), HeapTupleHeaderIsHeapOnly(), ItemIdHasStorage, ItemIdIsNormal, ItemIdIsUsed, ItemPointerGetBlockNumber(), ItemPointerGetOffsetNumber(), OffsetNumberIsValid, PageGetItem(), PageGetItemId(), RelationGetRelationName, and unlikely.

Referenced by heap_index_delete_tuples().

◆ index_delete_sort()

static void index_delete_sort ( TM_IndexDeleteOp delstate)
static

Definition at line 8563 of file heapam.c.

/*
 * index_delete_sort - sort the deltids[] array of a TM_IndexDeleteOp in
 * ascending heap TID order (block number first, then offset number; the
 * ordering is defined by index_delete_sort_cmp()).  Uses an in-place
 * shellsort rather than qsort() -- presumably for low overhead on the
 * modest array sizes involved; TODO confirm rationale.
 */
8564{
8565 TM_IndexDelete *deltids = delstate->deltids;
8566 int ndeltids = delstate->ndeltids;
8567
8568 /*
8569 * Shellsort gap sequence (taken from Sedgewick-Incerpi paper).
8570 *
8571 * This implementation is fast with array sizes up to ~4500. This covers
8572 * all supported BLCKSZ values.
8573 */
8574 const int gaps[9] = {1968, 861, 336, 112, 48, 21, 7, 3, 1};
8575
8576 /* Think carefully before changing anything here -- keep swaps cheap */
8577 StaticAssertDecl(sizeof(TM_IndexDelete) <= 8,
8578 "element size exceeds 8 bytes");
8579
 /* One gapped insertion-sort pass per gap, finishing with gap 1 */
8580 for (int g = 0; g < lengthof(gaps); g++)
8581 {
8582 for (int hi = gaps[g], i = hi; i < ndeltids; i++)
8583 {
8584 TM_IndexDelete d = deltids[i];
8585 int j = i;
8586
 /* Slide greater-or-equal elements up by 'hi' slots to make room for d */
8587 while (j >= hi && index_delete_sort_cmp(&deltids[j - hi], &d) >= 0)
8588 {
8589 deltids[j] = deltids[j - hi];
8590 j -= hi;
8591 }
8592 deltids[j] = d;
8593 }
8594 }
8595}

References fb(), i, index_delete_sort_cmp(), j, lengthof, and StaticAssertDecl.

Referenced by heap_index_delete_tuples().

◆ index_delete_sort_cmp()

static int index_delete_sort_cmp ( TM_IndexDelete deltid1,
TM_IndexDelete deltid2 
)
inlinestatic

Definition at line 8527 of file heapam.c.

8528{
8529 ItemPointer tid1 = &deltid1->tid;
8530 ItemPointer tid2 = &deltid2->tid;
8531
8532 {
8535
8536 if (blk1 != blk2)
8537 return (blk1 < blk2) ? -1 : 1;
8538 }
8539 {
8542
8543 if (pos1 != pos2)
8544 return (pos1 < pos2) ? -1 : 1;
8545 }
8546
8547 Assert(false);
8548
8549 return 0;
8550}

References Assert, fb(), ItemPointerGetBlockNumber(), and ItemPointerGetOffsetNumber().

Referenced by index_delete_sort().

◆ initscan()

static void initscan ( HeapScanDesc  scan,
ScanKey  key,
bool  keep_startblock 
)
static

Definition at line 357 of file heapam.c.

358{
360 bool allow_strat;
361 bool allow_sync;
362
363 /*
364 * Determine the number of blocks we have to scan.
365 *
366 * It is sufficient to do this once at scan start, since any tuples added
367 * while the scan is in progress will be invisible to my snapshot anyway.
368 * (That is not true when using a non-MVCC snapshot. However, we couldn't
369 * guarantee to return tuples added after scan start anyway, since they
370 * might go into pages we already scanned. To guarantee consistent
371 * results for a non-MVCC snapshot, the caller must hold some higher-level
372 * lock that ensures the interesting tuple(s) won't change.)
373 */
374 if (scan->rs_base.rs_parallel != NULL)
375 {
377 scan->rs_nblocks = bpscan->phs_nblocks;
378 }
379 else
381
382 /*
383 * If the table is large relative to NBuffers, use a bulk-read access
384 * strategy and enable synchronized scanning (see syncscan.c). Although
385 * the thresholds for these features could be different, we make them the
386 * same so that there are only two behaviors to tune rather than four.
387 * (However, some callers need to be able to disable one or both of these
388 * behaviors, independently of the size of the table; also there is a GUC
389 * variable that can disable synchronized scanning.)
390 *
391 * Note that table_block_parallelscan_initialize has a very similar test;
392 * if you change this, consider changing that one, too.
393 */
395 scan->rs_nblocks > NBuffers / 4)
396 {
398 allow_sync = (scan->rs_base.rs_flags & SO_ALLOW_SYNC) != 0;
399 }
400 else
401 allow_strat = allow_sync = false;
402
403 if (allow_strat)
404 {
405 /* During a rescan, keep the previous strategy object. */
406 if (scan->rs_strategy == NULL)
408 }
409 else
410 {
411 if (scan->rs_strategy != NULL)
413 scan->rs_strategy = NULL;
414 }
415
416 if (scan->rs_base.rs_parallel != NULL)
417 {
418 /* For parallel scan, believe whatever ParallelTableScanDesc says. */
421 else
423
424 /*
425 * If not rescanning, initialize the startblock. Finding the actual
426 * start location is done in table_block_parallelscan_startblock_init,
427 * based on whether an alternative start location has been set with
428 * heap_setscanlimits, or using the syncscan location, when syncscan
429 * is enabled.
430 */
431 if (!keep_startblock)
433 }
434 else
435 {
436 if (keep_startblock)
437 {
438 /*
439 * When rescanning, we want to keep the previous startblock
440 * setting, so that rewinding a cursor doesn't generate surprising
441 * results. Reset the active syncscan setting, though.
442 */
445 else
447 }
449 {
452 }
453 else
454 {
456 scan->rs_startblock = 0;
457 }
458 }
459
461 scan->rs_inited = false;
462 scan->rs_ctup.t_data = NULL;
464 scan->rs_cbuf = InvalidBuffer;
466 scan->rs_ntuples = 0;
467 scan->rs_cindex = 0;
468
469 /*
470 * Initialize to ForwardScanDirection because it is most common and
471 * because heap scans go forward before going backward (e.g. CURSORs).
472 */
475
476 /* page-at-a-time fields are always invalid when not rs_inited */
477
478 /*
479 * copy the scan key, if appropriate
480 */
481 if (key != NULL && scan->rs_base.rs_nkeys > 0)
482 memcpy(scan->rs_base.rs_key, key, scan->rs_base.rs_nkeys * sizeof(ScanKeyData));
483
484 /*
485 * Currently, we only have a stats counter for sequential heap scans (but
486 * e.g for bitmap scans the underlying bitmap index scans will be counted,
487 * and for sample scans we update stats for tuple fetches).
488 */
489 if (scan->rs_base.rs_flags & SO_TYPE_SEQSCAN)
491}

References BAS_BULKREAD, fb(), ForwardScanDirection, FreeAccessStrategy(), GetAccessStrategy(), InvalidBlockNumber, InvalidBuffer, ItemPointerSetInvalid(), NBuffers, pgstat_count_heap_scan, ParallelTableScanDescData::phs_syncscan, RelationGetNumberOfBlocks, RelationUsesLocalBuffers, HeapScanDescData::rs_base, HeapScanDescData::rs_cblock, HeapScanDescData::rs_cbuf, HeapScanDescData::rs_cindex, HeapScanDescData::rs_ctup, HeapScanDescData::rs_dir, TableScanDescData::rs_flags, HeapScanDescData::rs_inited, TableScanDescData::rs_key, HeapScanDescData::rs_nblocks, TableScanDescData::rs_nkeys, HeapScanDescData::rs_ntuples, HeapScanDescData::rs_numblocks, TableScanDescData::rs_parallel, HeapScanDescData::rs_prefetch_block, TableScanDescData::rs_rd, HeapScanDescData::rs_startblock, HeapScanDescData::rs_strategy, SO_ALLOW_STRAT, SO_ALLOW_SYNC, SO_TYPE_SEQSCAN, ss_get_location(), synchronize_seqscans, HeapTupleData::t_data, and HeapTupleData::t_self.

Referenced by heap_beginscan(), and heap_rescan().

◆ log_heap_new_cid()

static XLogRecPtr log_heap_new_cid ( Relation  relation,
HeapTuple  tup 
)
static

Definition at line 9160 of file heapam.c.

9161{
9163
9165 HeapTupleHeader hdr = tup->t_data;
9166
9167 Assert(ItemPointerIsValid(&tup->t_self));
9168 Assert(tup->t_tableOid != InvalidOid);
9169
9170 xlrec.top_xid = GetTopTransactionId();
9171 xlrec.target_locator = relation->rd_locator;
9172 xlrec.target_tid = tup->t_self;
9173
9174 /*
9175 * If the tuple got inserted & deleted in the same TX we definitely have a
9176 * combo CID, set cmin and cmax.
9177 */
9178 if (hdr->t_infomask & HEAP_COMBOCID)
9179 {
9182 xlrec.cmin = HeapTupleHeaderGetCmin(hdr);
9183 xlrec.cmax = HeapTupleHeaderGetCmax(hdr);
9184 xlrec.combocid = HeapTupleHeaderGetRawCommandId(hdr);
9185 }
9186 /* No combo CID, so only cmin or cmax can be set by this TX */
9187 else
9188 {
9189 /*
9190 * Tuple inserted.
9191 *
9192 * We need to check for LOCK ONLY because multixacts might be
9193 * transferred to the new tuple in case of FOR KEY SHARE updates in
9194 * which case there will be an xmax, although the tuple just got
9195 * inserted.
9196 */
9197 if (hdr->t_infomask & HEAP_XMAX_INVALID ||
9199 {
9201 xlrec.cmax = InvalidCommandId;
9202 }
9203 /* Tuple from a different tx updated or deleted. */
9204 else
9205 {
9206 xlrec.cmin = InvalidCommandId;
9208 }
9209 xlrec.combocid = InvalidCommandId;
9210 }
9211
9212 /*
9213 * Note that we don't need to register the buffer here, because this
9214 * operation does not modify the page. The insert/update/delete that
9215 * called us certainly did, but that's WAL-logged separately.
9216 */
9219
9220 /* will be looked at irrespective of origin */
9221
9223
9224 return recptr;
9225}

References Assert, fb(), GetTopTransactionId(), HEAP_COMBOCID, HEAP_XMAX_INVALID, HEAP_XMAX_IS_LOCKED_ONLY(), HeapTupleHeaderGetCmax(), HeapTupleHeaderGetCmin(), HeapTupleHeaderGetRawCommandId(), HeapTupleHeaderXminInvalid(), InvalidCommandId, InvalidOid, ItemPointerIsValid(), RelationData::rd_locator, SizeOfHeapNewCid, HeapTupleHeaderData::t_infomask, XLOG_HEAP2_NEW_CID, XLogBeginInsert(), XLogInsert(), and XLogRegisterData().

Referenced by heap_delete(), heap_insert(), heap_multi_insert(), and heap_update().

◆ log_heap_update()

static XLogRecPtr log_heap_update ( Relation  reln,
Buffer  oldbuf,
Buffer  newbuf,
HeapTuple  oldtup,
HeapTuple  newtup,
HeapTuple  old_key_tuple,
bool  all_visible_cleared,
bool  new_all_visible_cleared 
)
static

Definition at line 8938 of file heapam.c.

8942{
8946 uint8 info;
8948 uint16 prefixlen = 0,
8949 suffixlen = 0;
8951 Page page = BufferGetPage(newbuf);
8953 bool init;
8954 int bufflags;
8955
8956 /* Caller should not call me on a non-WAL-logged relation */
8958
8960
8962 info = XLOG_HEAP_HOT_UPDATE;
8963 else
8964 info = XLOG_HEAP_UPDATE;
8965
8966 /*
8967 * If the old and new tuple are on the same page, we only need to log the
8968 * parts of the new tuple that were changed. That saves on the amount of
8969 * WAL we need to write. Currently, we just count any unchanged bytes in
8970 * the beginning and end of the tuple. That's quick to check, and
8971 * perfectly covers the common case that only one field is updated.
8972 *
8973 * We could do this even if the old and new tuple are on different pages,
8974 * but only if we don't make a full-page image of the old page, which is
8975 * difficult to know in advance. Also, if the old tuple is corrupt for
8976 * some reason, it would allow the corruption to propagate to the new page,
8977 * so it seems best to avoid. Under the general assumption that most
8978 * updates tend to create the new tuple version on the same page, there
8979 * isn't much to be gained by doing this across pages anyway.
8980 *
8981 * Skip this if we're taking a full-page image of the new page, as we
8982 * don't include the new tuple in the WAL record in that case. Also
8983 * disable if effective_wal_level='logical', as logical decoding needs to
8984 * be able to read the new tuple in whole from the WAL record alone.
8985 */
8986 if (oldbuf == newbuf && !need_tuple_data &&
8988 {
8989 char *oldp = (char *) oldtup->t_data + oldtup->t_data->t_hoff;
8990 char *newp = (char *) newtup->t_data + newtup->t_data->t_hoff;
8991 int oldlen = oldtup->t_len - oldtup->t_data->t_hoff;
8992 int newlen = newtup->t_len - newtup->t_data->t_hoff;
8993
8994 /* Check for common prefix between old and new tuple */
8995 for (prefixlen = 0; prefixlen < Min(oldlen, newlen); prefixlen++)
8996 {
8997 if (newp[prefixlen] != oldp[prefixlen])
8998 break;
8999 }
9000
9001 /*
9002 * Storing the length of the prefix takes 2 bytes, so we need to save
9003 * at least 3 bytes or there's no point.
9004 */
9005 if (prefixlen < 3)
9006 prefixlen = 0;
9007
9008 /* Same for suffix */
9010 {
9011 if (newp[newlen - suffixlen - 1] != oldp[oldlen - suffixlen - 1])
9012 break;
9013 }
9014 if (suffixlen < 3)
9015 suffixlen = 0;
9016 }
9017
9018 /* Prepare main WAL data chain */
9019 xlrec.flags = 0;
9024 if (prefixlen > 0)
9026 if (suffixlen > 0)
9028 if (need_tuple_data)
9029 {
9031 if (old_key_tuple)
9032 {
9033 if (reln->rd_rel->relreplident == REPLICA_IDENTITY_FULL)
9035 else
9037 }
9038 }
9039
9040 /* If new tuple is the single and first tuple on page... */
9043 {
9044 info |= XLOG_HEAP_INIT_PAGE;
9045 init = true;
9046 }
9047 else
9048 init = false;
9049
9050 /* Prepare WAL data for the old page */
9051 xlrec.old_offnum = ItemPointerGetOffsetNumber(&oldtup->t_self);
9052 xlrec.old_xmax = HeapTupleHeaderGetRawXmax(oldtup->t_data);
9053 xlrec.old_infobits_set = compute_infobits(oldtup->t_data->t_infomask,
9054 oldtup->t_data->t_infomask2);
9055
9056 /* Prepare WAL data for the new page */
9057 xlrec.new_offnum = ItemPointerGetOffsetNumber(&newtup->t_self);
9058 xlrec.new_xmax = HeapTupleHeaderGetRawXmax(newtup->t_data);
9059
9061 if (init)
9063 if (need_tuple_data)
9065
9067 if (oldbuf != newbuf)
9069
9071
9072 /*
9073 * Prepare WAL data for the new tuple.
9074 */
9075 if (prefixlen > 0 || suffixlen > 0)
9076 {
9077 if (prefixlen > 0 && suffixlen > 0)
9078 {
9081 XLogRegisterBufData(0, &prefix_suffix, sizeof(uint16) * 2);
9082 }
9083 else if (prefixlen > 0)
9084 {
9085 XLogRegisterBufData(0, &prefixlen, sizeof(uint16));
9086 }
9087 else
9088 {
9089 XLogRegisterBufData(0, &suffixlen, sizeof(uint16));
9090 }
9091 }
9092
9093 xlhdr.t_infomask2 = newtup->t_data->t_infomask2;
9094 xlhdr.t_infomask = newtup->t_data->t_infomask;
9095 xlhdr.t_hoff = newtup->t_data->t_hoff;
9097
9098 /*
9099 * PG73FORMAT: write bitmap [+ padding] [+ oid] + data
9100 *
9101 * The 'data' doesn't include the common prefix or suffix.
9102 */
9104 if (prefixlen == 0)
9105 {
9107 (char *) newtup->t_data + SizeofHeapTupleHeader,
9109 }
9110 else
9111 {
9112 /*
9113 * Have to write the null bitmap and data after the common prefix as
9114 * two separate rdata entries.
9115 */
9116 /* bitmap [+ padding] [+ oid] */
9117 if (newtup->t_data->t_hoff - SizeofHeapTupleHeader > 0)
9118 {
9120 (char *) newtup->t_data + SizeofHeapTupleHeader,
9121 newtup->t_data->t_hoff - SizeofHeapTupleHeader);
9122 }
9123
9124 /* data after common prefix */
9126 (char *) newtup->t_data + newtup->t_data->t_hoff + prefixlen,
9127 newtup->t_len - newtup->t_data->t_hoff - prefixlen - suffixlen);
9128 }
9129
9130 /* We need to log a tuple identity */
9132 {
9133 /* don't really need this, but its more comfy to decode */
9134 xlhdr_idx.t_infomask2 = old_key_tuple->t_data->t_infomask2;
9135 xlhdr_idx.t_infomask = old_key_tuple->t_data->t_infomask;
9136 xlhdr_idx.t_hoff = old_key_tuple->t_data->t_hoff;
9137
9139
9140 /* PG73FORMAT: write bitmap [+ padding] [+ oid] + data */
9143 }
9144
9145 /* filtering by origin on a row level is much more efficient */
9147
9148 recptr = XLogInsert(RM_HEAP_ID, info);
9149
9150 return recptr;
9151}

References Assert, BufferGetPage(), compute_infobits(), fb(), FirstOffsetNumber, HeapTupleHeaderGetRawXmax(), HeapTupleIsHeapOnly(), init, ItemPointerGetOffsetNumber(), Min, PageGetMaxOffsetNumber(), REGBUF_KEEP_DATA, REGBUF_STANDARD, REGBUF_WILL_INIT, RelationIsLogicallyLogged, RelationNeedsWAL, SizeOfHeapHeader, SizeofHeapTupleHeader, SizeOfHeapUpdate, XLH_UPDATE_CONTAINS_NEW_TUPLE, XLH_UPDATE_CONTAINS_OLD_KEY, XLH_UPDATE_CONTAINS_OLD_TUPLE, XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED, XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED, XLH_UPDATE_PREFIX_FROM_OLD, XLH_UPDATE_SUFFIX_FROM_OLD, XLOG_HEAP_HOT_UPDATE, XLOG_HEAP_INIT_PAGE, XLOG_HEAP_UPDATE, XLOG_INCLUDE_ORIGIN, XLogBeginInsert(), XLogCheckBufferNeedsBackup(), XLogInsert(), XLogRegisterBufData(), XLogRegisterBuffer(), XLogRegisterData(), and XLogSetRecordFlags().

Referenced by heap_update().

◆ log_heap_visible()

XLogRecPtr log_heap_visible ( Relation  rel,
Buffer  heap_buffer,
Buffer  vm_buffer,
TransactionId  snapshotConflictHorizon,
uint8  vmflags 
)

◆ MultiXactIdGetUpdateXid()

static TransactionId MultiXactIdGetUpdateXid ( TransactionId  xmax,
uint16  t_infomask 
)
static

Definition at line 7627 of file heapam.c.

/*
 * MultiXactIdGetUpdateXid - given a multixact xmax that is known to include
 * an updater (caller asserts HEAP_XMAX_LOCK_ONLY is not set and
 * HEAP_XMAX_IS_MULTI is set), return the XID of its single update member.
 * NOTE(review): the elided line 7629 presumably initializes update_xact to
 * InvalidTransactionId (see References list) -- confirm against upstream.
 */
7628{
7630 MultiXactMember *members;
7631 int nmembers;
7632
7633 Assert(!(t_infomask & HEAP_XMAX_LOCK_ONLY));
7634 Assert(t_infomask & HEAP_XMAX_IS_MULTI);
7635
7636 /*
7637 * Since we know the LOCK_ONLY bit is not set, this cannot be a multi from
7638 * pre-pg_upgrade.
7639 */
7640 nmembers = GetMultiXactIdMembers(xmax, &members, false, false);
7641
7642 if (nmembers > 0)
7643 {
7644 int i;
7645
7646 for (i = 0; i < nmembers; i++)
7647 {
7648 /* Ignore lockers */
7649 if (!ISUPDATE_from_mxstatus(members[i].status))
7650 continue;
7651
7652 /* there can be at most one updater */
7654 update_xact = members[i].xid;
 /* The early break below is compiled out when assertions are enabled,
  * so assert builds scan the whole array and the (elided) Assert at
  * line 7653 can verify there is only one updater. */
7655#ifndef USE_ASSERT_CHECKING
7656
7657 /*
7658 * in an assert-enabled build, walk the whole array to ensure
7659 * there's no other updater.
7660 */
7661 break;
7662#endif
7663 }
7664
7665 pfree(members);
7666 }
7667
7668 return update_xact;
7669}

References Assert, fb(), GetMultiXactIdMembers(), HEAP_XMAX_IS_MULTI, HEAP_XMAX_LOCK_ONLY, i, InvalidTransactionId, ISUPDATE_from_mxstatus, pfree(), and MultiXactMember::xid.

Referenced by compute_new_xmax_infomask(), FreezeMultiXactId(), heap_lock_updated_tuple(), and HeapTupleGetUpdateXid().

◆ MultiXactIdWait()

static void MultiXactIdWait ( MultiXactId  multi,
MultiXactStatus  status,
uint16  infomask,
Relation  rel,
const ItemPointerData ctid,
XLTW_Oper  oper,
int remaining 
)
static

Definition at line 7873 of file heapam.c.

/*
 * MultiXactIdWait - blocking variant: wait for the relevant members of
 * 'multi' to finish, discarding the result of Do_MultiXactIdWait().
 * NOTE(review): the two literal 'false' arguments presumably select
 * nowait=false and suppress lock-failure logging -- confirm against the
 * Do_MultiXactIdWait() signature.
 */
7876{
7877 (void) Do_MultiXactIdWait(multi, status, infomask, false,
7878 rel, ctid, oper, remaining, false);
7879}

References Do_MultiXactIdWait(), fb(), oper(), and remaining.

Referenced by heap_delete(), heap_inplace_lock(), heap_lock_tuple(), and heap_update().

◆ page_collect_tuples()

static pg_attribute_always_inline int page_collect_tuples ( HeapScanDesc  scan,
Snapshot  snapshot,
Page  page,
Buffer  buffer,
BlockNumber  block,
int  lines,
bool  all_visible,
bool  check_serializable 
)
static

Definition at line 522 of file heapam.c.

526{
527 Oid relid = RelationGetRelid(scan->rs_base.rs_rd);
528 int ntup = 0;
529 int nvis = 0;
531
532 /* page at a time should have been disabled otherwise */
533 Assert(IsMVCCSnapshot(snapshot));
534
535 /* first find all tuples on the page */
537 {
540
542 continue;
543
544 /*
545 * If the page is not all-visible or we need to check serializability,
546 * maintain enough state to be able to refind the tuple efficiently,
547 * without again first needing to fetch the item and then via that the
548 * tuple.
549 */
550 if (!all_visible || check_serializable)
551 {
552 tup = &batchmvcc.tuples[ntup];
553
555 tup->t_len = ItemIdGetLength(lpp);
556 tup->t_tableOid = relid;
557 ItemPointerSet(&(tup->t_self), block, lineoff);
558 }
559
560 /*
561 * If the page is all visible, these fields otherwise won't be
562 * populated in loop below.
563 */
564 if (all_visible)
565 {
567 {
568 batchmvcc.visible[ntup] = true;
569 }
570 scan->rs_vistuples[ntup] = lineoff;
571 }
572
573 ntup++;
574 }
575
577
578 /*
579 * Unless the page is all visible, test visibility for all tuples in one go.
580 * That is considerably more efficient than calling
581 * HeapTupleSatisfiesMVCC() one-by-one.
582 */
583 if (all_visible)
584 nvis = ntup;
585 else
586 nvis = HeapTupleSatisfiesMVCCBatch(snapshot, buffer,
587 ntup,
588 &batchmvcc,
589 scan->rs_vistuples);
590
591 /*
592 * So far we don't have a batch API for testing serializability, so do so
593 * one-by-one.
594 */
596 {
597 for (int i = 0; i < ntup; i++)
598 {
600 scan->rs_base.rs_rd,
601 &batchmvcc.tuples[i],
602 buffer, snapshot);
603 }
604 }
605
606 return nvis;
607}

References Assert, fb(), FirstOffsetNumber, HeapCheckForSerializableConflictOut(), HeapTupleSatisfiesMVCCBatch(), i, IsMVCCSnapshot, ItemIdGetLength, ItemIdIsNormal, ItemPointerSet(), MaxHeapTuplesPerPage, PageGetItem(), PageGetItemId(), RelationGetRelid, HeapScanDescData::rs_base, TableScanDescData::rs_rd, HeapScanDescData::rs_vistuples, HeapTupleData::t_data, and unlikely.

Referenced by heap_prepare_pagescan().

◆ ReleaseBulkInsertStatePin()

void ReleaseBulkInsertStatePin ( BulkInsertState  bistate)

Definition at line 2114 of file heapam.c.

/*
 * ReleaseBulkInsertStatePin - release the buffer pin (if any) held by a
 * BulkInsertState, and reset its cached bulk-extension block numbers.
 */
2115{
 /* Only release a real pin; InvalidBuffer means nothing is pinned */
2116 if (bistate->current_buf != InvalidBuffer)
2117 ReleaseBuffer(bistate->current_buf);
2118 bistate->current_buf = InvalidBuffer;
2119
2120 /*
2121 * Despite the name, we also reset bulk relation extension state.
2122 * Otherwise we can end up erroring out due to looking for free space in
2123 * ->next_free of one partition, even though ->next_free was set when
2124 * extending another partition. It could obviously also be bad for
2125 * efficiency to look at existing blocks at offsets from another
2126 * partition, even if we don't error out.
2127 */
2128 bistate->next_free = InvalidBlockNumber;
2129 bistate->last_free = InvalidBlockNumber;
2130}

References BulkInsertStateData::current_buf, InvalidBlockNumber, InvalidBuffer, BulkInsertStateData::last_free, BulkInsertStateData::next_free, and ReleaseBuffer().

Referenced by CopyFrom().

◆ simple_heap_delete()

void simple_heap_delete ( Relation  relation,
const ItemPointerData tid 
)

Definition at line 3277 of file heapam.c.

/*
 * simple_heap_delete - delete a tuple, treating every outcome other than
 * TM_Ok as an internal error (elog ERROR).  Intended for callers -- per the
 * cross-references, catalog and TOAST code -- that know concurrent update
 * failures cannot legitimately occur.
 * NOTE(review): elided line 3283 presumably passes GetCurrentCommandId(true)
 * and InvalidSnapshot (see References list) -- confirm against upstream.
 */
3278{
3279 TM_Result result;
3280 TM_FailureData tmfd;
3281
3282 result = heap_delete(relation, tid,
3284 true /* wait for commit */ ,
3285 &tmfd, false /* changingPart */ );
3286 switch (result)
3287 {
3288 case TM_SelfModified:
3289 /* Tuple was already updated in current command? */
3290 elog(ERROR, "tuple already updated by self");
3291 break;
3292
3293 case TM_Ok:
3294 /* done successfully */
3295 break;
3296
3297 case TM_Updated:
3298 elog(ERROR, "tuple concurrently updated");
3299 break;
3300
3301 case TM_Deleted:
3302 elog(ERROR, "tuple concurrently deleted");
3303 break;
3304
3305 default:
3306 elog(ERROR, "unrecognized heap_delete status: %u", result);
3307 break;
3308 }
3309}

References elog, ERROR, GetCurrentCommandId(), heap_delete(), InvalidSnapshot, TM_Deleted, TM_Ok, TM_SelfModified, and TM_Updated.

Referenced by CatalogTupleDelete(), and toast_delete_datum().

◆ simple_heap_insert()

void simple_heap_insert ( Relation  relation,
HeapTuple  tup 
)

Definition at line 2796 of file heapam.c.

/*
 * simple_heap_insert - convenience wrapper around heap_insert(): inserts
 * 'tup' using the current command ID, default options (0), and no
 * bulk-insert state.  Used by catalog-insertion paths per the
 * cross-references.
 */
2797{
2798 heap_insert(relation, tup, GetCurrentCommandId(true), 0, NULL);
2799}

References fb(), GetCurrentCommandId(), and heap_insert().

Referenced by CatalogTupleInsert(), CatalogTupleInsertWithInfo(), and InsertOneTuple().

◆ simple_heap_update()

void simple_heap_update ( Relation  relation,
const ItemPointerData otid,
HeapTuple  tup,
TU_UpdateIndexes update_indexes 
)

Definition at line 4570 of file heapam.c.

/*
 * simple_heap_update - update the tuple at 'otid' with 'tup', treating every
 * outcome other than TM_Ok as an internal error (elog ERROR).  Mirrors
 * simple_heap_delete(); intended for catalog callers that know concurrent
 * update failures cannot legitimately occur.
 * NOTE(review): elided line 4578 presumably passes GetCurrentCommandId(true)
 * and InvalidSnapshot (see References list) -- confirm against upstream.
 */
4572{
4573 TM_Result result;
4574 TM_FailureData tmfd;
4575 LockTupleMode lockmode;
4576
4577 result = heap_update(relation, otid, tup,
4579 true /* wait for commit */ ,
4580 &tmfd, &lockmode, update_indexes);
4581 switch (result)
4582 {
4583 case TM_SelfModified:
4584 /* Tuple was already updated in current command? */
4585 elog(ERROR, "tuple already updated by self");
4586 break;
4587
4588 case TM_Ok:
4589 /* done successfully */
4590 break;
4591
4592 case TM_Updated:
4593 elog(ERROR, "tuple concurrently updated");
4594 break;
4595
4596 case TM_Deleted:
4597 elog(ERROR, "tuple concurrently deleted");
4598 break;
4599
4600 default:
4601 elog(ERROR, "unrecognized heap_update status: %u", result);
4602 break;
4603 }
4604}

References elog, ERROR, fb(), GetCurrentCommandId(), heap_update(), InvalidSnapshot, TM_Deleted, TM_Ok, TM_SelfModified, and TM_Updated.

Referenced by CatalogTupleUpdate(), and CatalogTupleUpdateWithInfo().

◆ test_lockmode_for_conflict()

static TM_Result test_lockmode_for_conflict ( MultiXactStatus  status,
TransactionId  xid,
LockTupleMode  mode,
HeapTuple  tup,
bool *needwait 
)
static

Definition at line 5690 of file heapam.c.

5693{
5695
5696 *needwait = false;
5698
5699 /*
5700 * Note: we *must* check TransactionIdIsInProgress before
5701 * TransactionIdDidAbort/Commit; see comment at top of heapam_visibility.c
5702 * for an explanation.
5703 */
5705 {
5706 /*
5707 * The tuple has already been locked by our own transaction. This is
5708 * very rare but can happen if multiple transactions are trying to
5709 * lock an ancient version of the same tuple.
5710 */
5711 return TM_SelfModified;
5712 }
5713 else if (TransactionIdIsInProgress(xid))
5714 {
5715 /*
5716 * If the locking transaction is running, what we do depends on
5717 * whether the lock modes conflict: if they do, then we must wait for
5718 * it to finish; otherwise we can fall through to lock this tuple
5719 * version without waiting.
5720 */
5723 {
5724 *needwait = true;
5725 }
5726
5727 /*
5728 * If we set needwait above, then this value doesn't matter;
5729 * otherwise, this value signals to caller that it's okay to proceed.
5730 */
5731 return TM_Ok;
5732 }
5733 else if (TransactionIdDidAbort(xid))
5734 return TM_Ok;
5735 else if (TransactionIdDidCommit(xid))
5736 {
5737 /*
5738 * The other transaction committed. If it was only a locker, then the
5739 * lock is completely gone now and we can return success; but if it
5740 * was an update, then what we do depends on whether the two lock
5741 * modes conflict. If they conflict, then we must report error to
5742 * caller. But if they don't, we can fall through to allow the current
5743 * transaction to lock the tuple.
5744 *
5745 * Note: the reason we worry about ISUPDATE here is because as soon as
5746 * a transaction ends, all its locks are gone and meaningless, and
5747 * thus we can ignore them; whereas its updates persist. In the
5748 * TransactionIdIsInProgress case, above, we don't need to check
5749 * because we know the lock is still "alive" and thus a conflict needs
5750 * always be checked.
5751 */
5752 if (!ISUPDATE_from_mxstatus(status))
5753 return TM_Ok;
5754
5757 {
5758 /* bummer */
5759 if (!ItemPointerEquals(&tup->t_self, &tup->t_data->t_ctid))
5760 return TM_Updated;
5761 else
5762 return TM_Deleted;
5763 }
5764
5765 return TM_Ok;
5766 }
5767
5768 /* Not in progress, not aborted, not committed -- must have crashed */
5769 return TM_Ok;
5770}

References DoLockModesConflict(), fb(), get_mxact_status_for_lock(), ISUPDATE_from_mxstatus, ItemPointerEquals(), LOCKMODE_from_mxstatus, mode, TM_Deleted, TM_Ok, TM_SelfModified, TM_Updated, TransactionIdDidAbort(), TransactionIdDidCommit(), TransactionIdIsCurrentTransactionId(), and TransactionIdIsInProgress().

Referenced by heap_lock_updated_tuple_rec().

◆ UpdateXmaxHintBits()

◆ xmax_infomask_changed()

static bool xmax_infomask_changed ( uint16  new_infomask,
uint16  old_infomask 
)
inlinestatic

Definition at line 2831 of file heapam.c.

/*
 * xmax_infomask_changed - did any xmax-related infomask bits differ between
 * old_infomask and new_infomask?
 * NOTE(review): the elided line 2834 presumably supplies the mask of
 * interesting bits; per the References list these are HEAP_LOCK_MASK,
 * HEAP_XMAX_IS_MULTI and HEAP_XMAX_LOCK_ONLY -- confirm against upstream.
 */
2832{
2833 const uint16 interesting =
2835
2836 if ((new_infomask & interesting) != (old_infomask & interesting))
2837 return true;
2838
2839 return false;
2840}

References fb(), HEAP_LOCK_MASK, HEAP_XMAX_IS_MULTI, and HEAP_XMAX_LOCK_ONLY.

Referenced by heap_delete(), heap_lock_tuple(), and heap_update().

Variable Documentation

◆ hwlock

LOCKMODE hwlock

Definition at line 128 of file heapam.c.

◆ lockstatus

int lockstatus

Definition at line 129 of file heapam.c.

◆ MultiXactStatusLock

const int MultiXactStatusLock[MaxMultiXactStatus+1]
static
Initial value:

Definition at line 207 of file heapam.c.

208{
209 LockTupleKeyShare, /* ForKeyShare */
210 LockTupleShare, /* ForShare */
211 LockTupleNoKeyExclusive, /* ForNoKeyUpdate */
212 LockTupleExclusive, /* ForUpdate */
213 LockTupleNoKeyExclusive, /* NoKeyUpdate */
214 LockTupleExclusive /* Update */
215};

◆ [struct]

const struct { ... } tupleLockExtraInfo[]
Initial value:
=
{
.hwlock = AccessShareLock,
.updstatus = -1
},
.hwlock = RowShareLock,
.lockstatus = MultiXactStatusForShare,
.updstatus = -1
},
.hwlock = ExclusiveLock,
},
.lockstatus = MultiXactStatusForUpdate,
.updstatus = MultiXactStatusUpdate
}
}
#define AccessExclusiveLock
Definition lockdefs.h:43
#define ExclusiveLock
Definition lockdefs.h:42
#define RowShareLock
Definition lockdefs.h:37

Referenced by DoesMultiXactIdConflict(), and get_mxact_status_for_lock().

◆ updstatus

int updstatus

Definition at line 130 of file heapam.c.