PostgreSQL Source Code git master
Loading...
Searching...
No Matches
heapam.c File Reference
#include "postgres.h"
#include "access/heapam.h"
#include "access/heaptoast.h"
#include "access/hio.h"
#include "access/multixact.h"
#include "access/subtrans.h"
#include "access/syncscan.h"
#include "access/valid.h"
#include "access/visibilitymap.h"
#include "access/xloginsert.h"
#include "catalog/pg_database.h"
#include "catalog/pg_database_d.h"
#include "commands/vacuum.h"
#include "pgstat.h"
#include "port/pg_bitutils.h"
#include "storage/lmgr.h"
#include "storage/predicate.h"
#include "storage/proc.h"
#include "storage/procarray.h"
#include "utils/datum.h"
#include "utils/injection_point.h"
#include "utils/inval.h"
#include "utils/spccache.h"
#include "utils/syscache.h"
Include dependency graph for heapam.c:

Go to the source code of this file.

Data Structures

struct  IndexDeleteCounts
 

Macros

#define LOCKMODE_from_mxstatus(status)    (tupleLockExtraInfo[TUPLOCK_from_mxstatus((status))].hwlock)
 
#define LockTupleTuplock(rel, tup, mode)    LockTuple((rel), (tup), tupleLockExtraInfo[mode].hwlock)
 
#define UnlockTupleTuplock(rel, tup, mode)    UnlockTuple((rel), (tup), tupleLockExtraInfo[mode].hwlock)
 
#define ConditionalLockTupleTuplock(rel, tup, mode, log)    ConditionalLockTuple((rel), (tup), tupleLockExtraInfo[mode].hwlock, (log))
 
#define BOTTOMUP_MAX_NBLOCKS   6
 
#define BOTTOMUP_TOLERANCE_NBLOCKS   3
 
#define TUPLOCK_from_mxstatus(status)    (MultiXactStatusLock[(status)])
 
#define FRM_NOOP   0x0001
 
#define FRM_INVALIDATE_XMAX   0x0002
 
#define FRM_RETURN_IS_XID   0x0004
 
#define FRM_RETURN_IS_MULTI   0x0008
 
#define FRM_MARK_COMMITTED   0x0010
 

Typedefs

typedef struct IndexDeleteCounts IndexDeleteCounts
 

Functions

static HeapTuple heap_prepare_insert (Relation relation, HeapTuple tup, TransactionId xid, CommandId cid, int options)
 
static XLogRecPtr log_heap_update (Relation reln, Buffer oldbuf, Buffer newbuf, HeapTuple oldtup, HeapTuple newtup, HeapTuple old_key_tuple, bool all_visible_cleared, bool new_all_visible_cleared)
 
static Bitmapset * HeapDetermineColumnsInfo (Relation relation, Bitmapset *interesting_cols, Bitmapset *external_cols, HeapTuple oldtup, HeapTuple newtup, bool *has_external)
 
static bool heap_acquire_tuplock (Relation relation, const ItemPointerData *tid, LockTupleMode mode, LockWaitPolicy wait_policy, bool *have_tuple_lock)
 
static BlockNumber heapgettup_advance_block (HeapScanDesc scan, BlockNumber block, ScanDirection dir)
 
static pg_noinline BlockNumber heapgettup_initial_block (HeapScanDesc scan, ScanDirection dir)
 
static void compute_new_xmax_infomask (TransactionId xmax, uint16 old_infomask, uint16 old_infomask2, TransactionId add_to_xmax, LockTupleMode mode, bool is_update, TransactionId *result_xmax, uint16 *result_infomask, uint16 *result_infomask2)
 
static TM_Result heap_lock_updated_tuple (Relation rel, uint16 prior_infomask, TransactionId prior_raw_xmax, const ItemPointerData *prior_ctid, TransactionId xid, LockTupleMode mode)
 
static void GetMultiXactIdHintBits (MultiXactId multi, uint16 *new_infomask, uint16 *new_infomask2)
 
static TransactionId MultiXactIdGetUpdateXid (TransactionId xmax, uint16 t_infomask)
 
static bool DoesMultiXactIdConflict (MultiXactId multi, uint16 infomask, LockTupleMode lockmode, bool *current_is_member)
 
static void MultiXactIdWait (MultiXactId multi, MultiXactStatus status, uint16 infomask, Relation rel, const ItemPointerData *ctid, XLTW_Oper oper, int *remaining)
 
static bool ConditionalMultiXactIdWait (MultiXactId multi, MultiXactStatus status, uint16 infomask, Relation rel, int *remaining, bool logLockFailure)
 
static void index_delete_sort (TM_IndexDeleteOp *delstate)
 
static int bottomup_sort_and_shrink (TM_IndexDeleteOp *delstate)
 
static XLogRecPtr log_heap_new_cid (Relation relation, HeapTuple tup)
 
static HeapTuple ExtractReplicaIdentity (Relation relation, HeapTuple tp, bool key_required, bool *copy)
 
static void AssertHasSnapshotForToast (Relation rel)
 
static BlockNumber heap_scan_stream_read_next_parallel (ReadStream *stream, void *callback_private_data, void *per_buffer_data)
 
static BlockNumber heap_scan_stream_read_next_serial (ReadStream *stream, void *callback_private_data, void *per_buffer_data)
 
static BlockNumber bitmapheap_stream_read_next (ReadStream *pgsr, void *private_data, void *per_buffer_data)
 
static void initscan (HeapScanDesc scan, ScanKey key, bool keep_startblock)
 
void heap_setscanlimits (TableScanDesc sscan, BlockNumber startBlk, BlockNumber numBlks)
 
static pg_attribute_always_inline int page_collect_tuples (HeapScanDesc scan, Snapshot snapshot, Page page, Buffer buffer, BlockNumber block, int lines, bool all_visible, bool check_serializable)
 
void heap_prepare_pagescan (TableScanDesc sscan)
 
static void heap_fetch_next_buffer (HeapScanDesc scan, ScanDirection dir)
 
static Page heapgettup_start_page (HeapScanDesc scan, ScanDirection dir, int *linesleft, OffsetNumber *lineoff)
 
static Page heapgettup_continue_page (HeapScanDesc scan, ScanDirection dir, int *linesleft, OffsetNumber *lineoff)
 
static void heapgettup (HeapScanDesc scan, ScanDirection dir, int nkeys, ScanKey key)
 
static void heapgettup_pagemode (HeapScanDesc scan, ScanDirection dir, int nkeys, ScanKey key)
 
TableScanDesc heap_beginscan (Relation relation, Snapshot snapshot, int nkeys, ScanKey key, ParallelTableScanDesc parallel_scan, uint32 flags)
 
void heap_rescan (TableScanDesc sscan, ScanKey key, bool set_params, bool allow_strat, bool allow_sync, bool allow_pagemode)
 
void heap_endscan (TableScanDesc sscan)
 
HeapTuple heap_getnext (TableScanDesc sscan, ScanDirection direction)
 
bool heap_getnextslot (TableScanDesc sscan, ScanDirection direction, TupleTableSlot *slot)
 
void heap_set_tidrange (TableScanDesc sscan, ItemPointer mintid, ItemPointer maxtid)
 
bool heap_getnextslot_tidrange (TableScanDesc sscan, ScanDirection direction, TupleTableSlot *slot)
 
bool heap_fetch (Relation relation, Snapshot snapshot, HeapTuple tuple, Buffer *userbuf, bool keep_buf)
 
bool heap_hot_search_buffer (ItemPointer tid, Relation relation, Buffer buffer, Snapshot snapshot, HeapTuple heapTuple, bool *all_dead, bool first_call)
 
void heap_get_latest_tid (TableScanDesc sscan, ItemPointer tid)
 
static void UpdateXmaxHintBits (HeapTupleHeader tuple, Buffer buffer, TransactionId xid)
 
BulkInsertState GetBulkInsertState (void)
 
void FreeBulkInsertState (BulkInsertState bistate)
 
void ReleaseBulkInsertStatePin (BulkInsertState bistate)
 
void heap_insert (Relation relation, HeapTuple tup, CommandId cid, int options, BulkInsertState bistate)
 
static int heap_multi_insert_pages (HeapTuple *heaptuples, int done, int ntuples, Size saveFreeSpace)
 
void heap_multi_insert (Relation relation, TupleTableSlot **slots, int ntuples, CommandId cid, int options, BulkInsertState bistate)
 
void simple_heap_insert (Relation relation, HeapTuple tup)
 
static uint8 compute_infobits (uint16 infomask, uint16 infomask2)
 
static bool xmax_infomask_changed (uint16 new_infomask, uint16 old_infomask)
 
TM_Result heap_delete (Relation relation, const ItemPointerData *tid, CommandId cid, Snapshot crosscheck, bool wait, TM_FailureData *tmfd, bool changingPart)
 
void simple_heap_delete (Relation relation, const ItemPointerData *tid)
 
TM_Result heap_update (Relation relation, const ItemPointerData *otid, HeapTuple newtup, CommandId cid, Snapshot crosscheck, bool wait, TM_FailureData *tmfd, LockTupleMode *lockmode, TU_UpdateIndexes *update_indexes)
 
static bool heap_attr_equals (TupleDesc tupdesc, int attrnum, Datum value1, Datum value2, bool isnull1, bool isnull2)
 
void simple_heap_update (Relation relation, const ItemPointerData *otid, HeapTuple tup, TU_UpdateIndexes *update_indexes)
 
static MultiXactStatus get_mxact_status_for_lock (LockTupleMode mode, bool is_update)
 
TM_Result heap_lock_tuple (Relation relation, HeapTuple tuple, CommandId cid, LockTupleMode mode, LockWaitPolicy wait_policy, bool follow_updates, Buffer *buffer, TM_FailureData *tmfd)
 
static TM_Result test_lockmode_for_conflict (MultiXactStatus status, TransactionId xid, LockTupleMode mode, HeapTuple tup, bool *needwait)
 
static TM_Result heap_lock_updated_tuple_rec (Relation rel, TransactionId priorXmax, const ItemPointerData *tid, TransactionId xid, LockTupleMode mode)
 
void heap_finish_speculative (Relation relation, const ItemPointerData *tid)
 
void heap_abort_speculative (Relation relation, const ItemPointerData *tid)
 
bool heap_inplace_lock (Relation relation, HeapTuple oldtup_ptr, Buffer buffer, void(*release_callback)(void *), void *arg)
 
void heap_inplace_update_and_unlock (Relation relation, HeapTuple oldtup, HeapTuple tuple, Buffer buffer)
 
void heap_inplace_unlock (Relation relation, HeapTuple oldtup, Buffer buffer)
 
static TransactionId FreezeMultiXactId (MultiXactId multi, uint16 t_infomask, const struct VacuumCutoffs *cutoffs, uint16 *flags, HeapPageFreeze *pagefrz)
 
bool heap_prepare_freeze_tuple (HeapTupleHeader tuple, const struct VacuumCutoffs *cutoffs, HeapPageFreeze *pagefrz, HeapTupleFreeze *frz, bool *totally_frozen)
 
void heap_pre_freeze_checks (Buffer buffer, HeapTupleFreeze *tuples, int ntuples)
 
void heap_freeze_prepared_tuples (Buffer buffer, HeapTupleFreeze *tuples, int ntuples)
 
bool heap_freeze_tuple (HeapTupleHeader tuple, TransactionId relfrozenxid, TransactionId relminmxid, TransactionId FreezeLimit, TransactionId MultiXactCutoff)
 
TransactionId HeapTupleGetUpdateXid (const HeapTupleHeaderData *tup)
 
static bool Do_MultiXactIdWait (MultiXactId multi, MultiXactStatus status, uint16 infomask, bool nowait, Relation rel, const ItemPointerData *ctid, XLTW_Oper oper, int *remaining, bool logLockFailure)
 
bool heap_tuple_needs_eventual_freeze (HeapTupleHeader tuple)
 
bool heap_tuple_should_freeze (HeapTupleHeader tuple, const struct VacuumCutoffs *cutoffs, TransactionId *NoFreezePageRelfrozenXid, MultiXactId *NoFreezePageRelminMxid)
 
void HeapTupleHeaderAdvanceConflictHorizon (HeapTupleHeader tuple, TransactionId *snapshotConflictHorizon)
 
static void index_delete_check_htid (TM_IndexDeleteOp *delstate, Page page, OffsetNumber maxoff, const ItemPointerData *htid, TM_IndexStatus *istatus)
 
TransactionId heap_index_delete_tuples (Relation rel, TM_IndexDeleteOp *delstate)
 
static int index_delete_sort_cmp (TM_IndexDelete *deltid1, TM_IndexDelete *deltid2)
 
static int bottomup_nblocksfavorable (IndexDeleteCounts *blockgroups, int nblockgroups, TM_IndexDelete *deltids)
 
static int bottomup_sort_and_shrink_cmp (const void *arg1, const void *arg2)
 
XLogRecPtr log_heap_visible (Relation rel, Buffer heap_buffer, Buffer vm_buffer, TransactionId snapshotConflictHorizon, uint8 vmflags)
 
void HeapCheckForSerializableConflictOut (bool visible, Relation relation, HeapTuple tuple, Buffer buffer, Snapshot snapshot)
 

Variables

struct { 
 
   LOCKMODE   hwlock 
 
   int   lockstatus 
 
   int   updstatus 
 
} tupleLockExtraInfo [] 
 
static const int MultiXactStatusLock [MaxMultiXactStatus+1]
 

Macro Definition Documentation

◆ BOTTOMUP_MAX_NBLOCKS

#define BOTTOMUP_MAX_NBLOCKS   6

Definition at line 189 of file heapam.c.

◆ BOTTOMUP_TOLERANCE_NBLOCKS

#define BOTTOMUP_TOLERANCE_NBLOCKS   3

Definition at line 190 of file heapam.c.

◆ ConditionalLockTupleTuplock

#define ConditionalLockTupleTuplock (   rel,
  tup,
  mode,
  log 
)     ConditionalLockTuple((rel), (tup), tupleLockExtraInfo[mode].hwlock, (log))

Definition at line 171 of file heapam.c.

179{
181 int next_item;
182 int ndeltids;
183 TM_IndexDelete *deltids;
185#endif
186
187/* heap_index_delete_tuples bottom-up index deletion costing constants */
188#define BOTTOMUP_MAX_NBLOCKS 6
189#define BOTTOMUP_TOLERANCE_NBLOCKS 3
190
191/*
192 * heap_index_delete_tuples uses this when determining which heap blocks it
193 * must visit to help its bottom-up index deletion caller
194 */
195typedef struct IndexDeleteCounts
196{
197 int16 npromisingtids; /* Number of "promising" TIDs in group */
198 int16 ntids; /* Number of TIDs in group */
199 int16 ifirsttid; /* Offset to group's first deltid */
201
202/*
203 * This table maps tuple lock strength values for each particular
204 * MultiXactStatus value.
205 */
206static const int MultiXactStatusLock[MaxMultiXactStatus + 1] =
207{
208 LockTupleKeyShare, /* ForKeyShare */
209 LockTupleShare, /* ForShare */
210 LockTupleNoKeyExclusive, /* ForNoKeyUpdate */
211 LockTupleExclusive, /* ForUpdate */
212 LockTupleNoKeyExclusive, /* NoKeyUpdate */
213 LockTupleExclusive /* Update */
214};
215
216/* Get the LockTupleMode for a given MultiXactStatus */
217#define TUPLOCK_from_mxstatus(status) \
218 (MultiXactStatusLock[(status)])
219
220/*
221 * Check that we have a valid snapshot if we might need TOAST access.
222 */
223static inline void
225{
226#ifdef USE_ASSERT_CHECKING
227
228 /* bootstrap mode in particular breaks this rule */
230 return;
231
232 /* if the relation doesn't have a TOAST table, we are good */
233 if (!OidIsValid(rel->rd_rel->reltoastrelid))
234 return;
235
237
238#endif /* USE_ASSERT_CHECKING */
239}
240
241/* ----------------------------------------------------------------
242 * heap support routines
243 * ----------------------------------------------------------------
244 */
245
246/*
247 * Streaming read API callback for parallel sequential scans. Returns the next
248 * block the caller wants from the read stream or InvalidBlockNumber when done.
249 */
250static BlockNumber
252 void *callback_private_data,
253 void *per_buffer_data)
254{
255 HeapScanDesc scan = (HeapScanDesc) callback_private_data;
256
259
260 if (unlikely(!scan->rs_inited))
261 {
262 /* parallel scan */
266 scan->rs_startblock,
267 scan->rs_numblocks);
268
269 /* may return InvalidBlockNumber if there are no more blocks */
273 scan->rs_inited = true;
274 }
275 else
276 {
279 scan->rs_base.rs_parallel);
280 }
281
282 return scan->rs_prefetch_block;
283}
284
285/*
286 * Streaming read API callback for serial sequential and TID range scans.
287 * Returns the next block the caller wants from the read stream or
288 * InvalidBlockNumber when done.
289 */
290static BlockNumber
292 void *callback_private_data,
293 void *per_buffer_data)
294{
295 HeapScanDesc scan = (HeapScanDesc) callback_private_data;
296
297 if (unlikely(!scan->rs_inited))
298 {
300 scan->rs_inited = true;
301 }
302 else
304 scan->rs_prefetch_block,
305 scan->rs_dir);
306
307 return scan->rs_prefetch_block;
308}
309
310/*
311 * Read stream API callback for bitmap heap scans.
312 * Returns the next block the caller wants from the read stream or
313 * InvalidBlockNumber when done.
314 */
315static BlockNumber
316bitmapheap_stream_read_next(ReadStream *pgsr, void *private_data,
317 void *per_buffer_data)
318{
319 TBMIterateResult *tbmres = per_buffer_data;
322 TableScanDesc sscan = &hscan->rs_base;
323
324 for (;;)
325 {
327
328 /* no more entries in the bitmap */
329 if (!tbm_iterate(&sscan->st.rs_tbmiterator, tbmres))
330 return InvalidBlockNumber;
331
332 /*
333 * Ignore any claimed entries past what we think is the end of the
334 * relation. It may have been extended after the start of our scan (we
335 * only hold an AccessShareLock, and it could be inserts from this
336 * backend). We don't take this optimization in SERIALIZABLE
337 * isolation though, as we need to examine all invisible tuples
338 * reachable by the index.
339 */
341 tbmres->blockno >= hscan->rs_nblocks)
342 continue;
343
344 return tbmres->blockno;
345 }
346
347 /* not reachable */
348 Assert(false);
349}
350
351/* ----------------
352 * initscan - scan code common to heap_beginscan and heap_rescan
353 * ----------------
354 */
355static void
357{
359 bool allow_strat;
360 bool allow_sync;
361
362 /*
363 * Determine the number of blocks we have to scan.
364 *
365 * It is sufficient to do this once at scan start, since any tuples added
366 * while the scan is in progress will be invisible to my snapshot anyway.
367 * (That is not true when using a non-MVCC snapshot. However, we couldn't
368 * guarantee to return tuples added after scan start anyway, since they
369 * might go into pages we already scanned. To guarantee consistent
370 * results for a non-MVCC snapshot, the caller must hold some higher-level
371 * lock that ensures the interesting tuple(s) won't change.)
372 */
373 if (scan->rs_base.rs_parallel != NULL)
374 {
376 scan->rs_nblocks = bpscan->phs_nblocks;
377 }
378 else
380
381 /*
382 * If the table is large relative to NBuffers, use a bulk-read access
383 * strategy and enable synchronized scanning (see syncscan.c). Although
384 * the thresholds for these features could be different, we make them the
385 * same so that there are only two behaviors to tune rather than four.
386 * (However, some callers need to be able to disable one or both of these
387 * behaviors, independently of the size of the table; also there is a GUC
388 * variable that can disable synchronized scanning.)
389 *
390 * Note that table_block_parallelscan_initialize has a very similar test;
391 * if you change this, consider changing that one, too.
392 */
394 scan->rs_nblocks > NBuffers / 4)
395 {
397 allow_sync = (scan->rs_base.rs_flags & SO_ALLOW_SYNC) != 0;
398 }
399 else
400 allow_strat = allow_sync = false;
401
402 if (allow_strat)
403 {
404 /* During a rescan, keep the previous strategy object. */
405 if (scan->rs_strategy == NULL)
407 }
408 else
409 {
410 if (scan->rs_strategy != NULL)
412 scan->rs_strategy = NULL;
413 }
414
415 if (scan->rs_base.rs_parallel != NULL)
416 {
417 /* For parallel scan, believe whatever ParallelTableScanDesc says. */
420 else
422
423 /*
424 * If not rescanning, initialize the startblock. Finding the actual
425 * start location is done in table_block_parallelscan_startblock_init,
426 * based on whether an alternative start location has been set with
427 * heap_setscanlimits, or using the syncscan location, when syncscan
428 * is enabled.
429 */
430 if (!keep_startblock)
432 }
433 else
434 {
435 if (keep_startblock)
436 {
437 /*
438 * When rescanning, we want to keep the previous startblock
439 * setting, so that rewinding a cursor doesn't generate surprising
440 * results. Reset the active syncscan setting, though.
441 */
444 else
446 }
448 {
451 }
452 else
453 {
455 scan->rs_startblock = 0;
456 }
457 }
458
460 scan->rs_inited = false;
461 scan->rs_ctup.t_data = NULL;
463 scan->rs_cbuf = InvalidBuffer;
465 scan->rs_ntuples = 0;
466 scan->rs_cindex = 0;
467
468 /*
469 * Initialize to ForwardScanDirection because it is most common and
470 * because heap scans go forward before going backward (e.g. CURSORs).
471 */
474
475 /* page-at-a-time fields are always invalid when not rs_inited */
476
477 /*
478 * copy the scan key, if appropriate
479 */
480 if (key != NULL && scan->rs_base.rs_nkeys > 0)
481 memcpy(scan->rs_base.rs_key, key, scan->rs_base.rs_nkeys * sizeof(ScanKeyData));
482
483 /*
484 * Currently, we only have a stats counter for sequential heap scans (but
485 * e.g for bitmap scans the underlying bitmap index scans will be counted,
486 * and for sample scans we update stats for tuple fetches).
487 */
488 if (scan->rs_base.rs_flags & SO_TYPE_SEQSCAN)
490}
491
492/*
493 * heap_setscanlimits - restrict range of a heapscan
494 *
495 * startBlk is the page to start at
496 * numBlks is number of pages to scan (InvalidBlockNumber means "all")
497 */
498void
500{
502
503 Assert(!scan->rs_inited); /* else too late to change */
504 /* else rs_startblock is significant */
506
507 /* Check startBlk is valid (but allow case of zero blocks...) */
508 Assert(startBlk == 0 || startBlk < scan->rs_nblocks);
509
510 scan->rs_startblock = startBlk;
511 scan->rs_numblocks = numBlks;
512}
513
514/*
515 * Per-tuple loop for heap_prepare_pagescan(). Pulled out so it can be called
516 * multiple times, with constant arguments for all_visible,
517 * check_serializable.
518 */
520static int
522 Page page, Buffer buffer,
523 BlockNumber block, int lines,
524 bool all_visible, bool check_serializable)
525{
526 Oid relid = RelationGetRelid(scan->rs_base.rs_rd);
527 int ntup = 0;
528 int nvis = 0;
530
531 /* page at a time should have been disabled otherwise */
532 Assert(IsMVCCSnapshot(snapshot));
533
534 /* first find all tuples on the page */
536 {
539
541 continue;
542
543 /*
544 * If the page is not all-visible or we need to check serializability,
545 * maintain enough state to be able to refind the tuple efficiently,
546 * without again first needing to fetch the item and then via that the
547 * tuple.
548 */
549 if (!all_visible || check_serializable)
550 {
551 tup = &batchmvcc.tuples[ntup];
552
554 tup->t_len = ItemIdGetLength(lpp);
555 tup->t_tableOid = relid;
556 ItemPointerSet(&(tup->t_self), block, lineoff);
557 }
558
559 /*
560 * If the page is all visible, these fields otherwise won't be
561 * populated in loop below.
562 */
563 if (all_visible)
564 {
566 {
567 batchmvcc.visible[ntup] = true;
568 }
569 scan->rs_vistuples[ntup] = lineoff;
570 }
571
572 ntup++;
573 }
574
576
577 /*
578 * Unless the page is all visible, test visibility for all tuples one go.
579 * That is considerably more efficient than calling
580 * HeapTupleSatisfiesMVCC() one-by-one.
581 */
582 if (all_visible)
583 nvis = ntup;
584 else
585 nvis = HeapTupleSatisfiesMVCCBatch(snapshot, buffer,
586 ntup,
587 &batchmvcc,
588 scan->rs_vistuples);
589
590 /*
591 * So far we don't have batch API for testing serializabilty, so do so
592 * one-by-one.
593 */
595 {
596 for (int i = 0; i < ntup; i++)
597 {
599 scan->rs_base.rs_rd,
600 &batchmvcc.tuples[i],
601 buffer, snapshot);
602 }
603 }
604
605 return nvis;
606}
607
608/*
609 * heap_prepare_pagescan - Prepare current scan page to be scanned in pagemode
610 *
611 * Preparation currently consists of 1. prune the scan's rs_cbuf page, and 2.
612 * fill the rs_vistuples[] array with the OffsetNumbers of visible tuples.
613 */
614void
616{
618 Buffer buffer = scan->rs_cbuf;
619 BlockNumber block = scan->rs_cblock;
620 Snapshot snapshot;
621 Page page;
622 int lines;
623 bool all_visible;
625
626 Assert(BufferGetBlockNumber(buffer) == block);
627
628 /* ensure we're not accidentally being used when not in pagemode */
630 snapshot = scan->rs_base.rs_snapshot;
631
632 /*
633 * Prune and repair fragmentation for the whole page, if possible.
634 */
635 heap_page_prune_opt(scan->rs_base.rs_rd, buffer);
636
637 /*
638 * We must hold share lock on the buffer content while examining tuple
639 * visibility. Afterwards, however, the tuples we have found to be
640 * visible are guaranteed good as long as we hold the buffer pin.
641 */
643
644 page = BufferGetPage(buffer);
645 lines = PageGetMaxOffsetNumber(page);
646
647 /*
648 * If the all-visible flag indicates that all tuples on the page are
649 * visible to everyone, we can skip the per-tuple visibility tests.
650 *
651 * Note: In hot standby, a tuple that's already visible to all
652 * transactions on the primary might still be invisible to a read-only
653 * transaction in the standby. We partly handle this problem by tracking
654 * the minimum xmin of visible tuples as the cut-off XID while marking a
655 * page all-visible on the primary and WAL log that along with the
656 * visibility map SET operation. In hot standby, we wait for (or abort)
657 * all transactions that can potentially may not see one or more tuples on
658 * the page. That's how index-only scans work fine in hot standby. A
659 * crucial difference between index-only scans and heap scans is that the
660 * index-only scan completely relies on the visibility map where as heap
661 * scan looks at the page-level PD_ALL_VISIBLE flag. We are not sure if
662 * the page-level flag can be trusted in the same way, because it might
663 * get propagated somehow without being explicitly WAL-logged, e.g. via a
664 * full page write. Until we can prove that beyond doubt, let's check each
665 * tuple for visibility the hard way.
666 */
667 all_visible = PageIsAllVisible(page) && !snapshot->takenDuringRecovery;
670
671 /*
672 * We call page_collect_tuples() with constant arguments, to get the
673 * compiler to constant fold the constant arguments. Separate calls with
674 * constant arguments, rather than variables, are needed on several
675 * compilers to actually perform constant folding.
676 */
677 if (likely(all_visible))
678 {
680 scan->rs_ntuples = page_collect_tuples(scan, snapshot, page, buffer,
681 block, lines, true, false);
682 else
683 scan->rs_ntuples = page_collect_tuples(scan, snapshot, page, buffer,
684 block, lines, true, true);
685 }
686 else
687 {
689 scan->rs_ntuples = page_collect_tuples(scan, snapshot, page, buffer,
690 block, lines, false, false);
691 else
692 scan->rs_ntuples = page_collect_tuples(scan, snapshot, page, buffer,
693 block, lines, false, true);
694 }
695
697}
698
699/*
700 * heap_fetch_next_buffer - read and pin the next block from MAIN_FORKNUM.
701 *
702 * Read the next block of the scan relation from the read stream and save it
703 * in the scan descriptor. It is already pinned.
704 */
705static inline void
707{
708 Assert(scan->rs_read_stream);
709
710 /* release previous scan buffer, if any */
711 if (BufferIsValid(scan->rs_cbuf))
712 {
713 ReleaseBuffer(scan->rs_cbuf);
714 scan->rs_cbuf = InvalidBuffer;
715 }
716
717 /*
718 * Be sure to check for interrupts at least once per page. Checks at
719 * higher code levels won't be able to stop a seqscan that encounters many
720 * pages' worth of consecutive dead tuples.
721 */
723
724 /*
725 * If the scan direction is changing, reset the prefetch block to the
726 * current block. Otherwise, we will incorrectly prefetch the blocks
727 * between the prefetch block and the current block again before
728 * prefetching blocks in the new, correct scan direction.
729 */
730 if (unlikely(scan->rs_dir != dir))
731 {
732 scan->rs_prefetch_block = scan->rs_cblock;
734 }
735
736 scan->rs_dir = dir;
737
739 if (BufferIsValid(scan->rs_cbuf))
741}
742
743/*
744 * heapgettup_initial_block - return the first BlockNumber to scan
745 *
746 * Returns InvalidBlockNumber when there are no blocks to scan. This can
747 * occur with empty tables and in parallel scans when parallel workers get all
748 * of the pages before we can get a chance to get our first page.
749 */
752{
753 Assert(!scan->rs_inited);
754 Assert(scan->rs_base.rs_parallel == NULL);
755
756 /* When there are no pages to scan, return InvalidBlockNumber */
757 if (scan->rs_nblocks == 0 || scan->rs_numblocks == 0)
758 return InvalidBlockNumber;
759
760 if (ScanDirectionIsForward(dir))
761 {
762 return scan->rs_startblock;
763 }
764 else
765 {
766 /*
767 * Disable reporting to syncscan logic in a backwards scan; it's not
768 * very likely anyone else is doing the same thing at the same time,
769 * and much more likely that we'll just bollix things for forward
770 * scanners.
771 */
773
774 /*
775 * Start from last page of the scan. Ensure we take into account
776 * rs_numblocks if it's been adjusted by heap_setscanlimits().
777 */
778 if (scan->rs_numblocks != InvalidBlockNumber)
779 return (scan->rs_startblock + scan->rs_numblocks - 1) % scan->rs_nblocks;
780
781 if (scan->rs_startblock > 0)
782 return scan->rs_startblock - 1;
783
784 return scan->rs_nblocks - 1;
785 }
786}
787
788
789/*
790 * heapgettup_start_page - helper function for heapgettup()
791 *
792 * Return the next page to scan based on the scan->rs_cbuf and set *linesleft
793 * to the number of tuples on this page. Also set *lineoff to the first
794 * offset to scan with forward scans getting the first offset and backward
795 * getting the final offset on the page.
796 */
797static Page
800{
801 Page page;
802
803 Assert(scan->rs_inited);
805
806 /* Caller is responsible for ensuring buffer is locked if needed */
807 page = BufferGetPage(scan->rs_cbuf);
808
810
811 if (ScanDirectionIsForward(dir))
813 else
815
816 /* lineoff now references the physically previous or next tid */
817 return page;
818}
819
820
821/*
822 * heapgettup_continue_page - helper function for heapgettup()
823 *
824 * Return the next page to scan based on the scan->rs_cbuf and set *linesleft
825 * to the number of tuples left to scan on this page. Also set *lineoff to
826 * the next offset to scan according to the ScanDirection in 'dir'.
827 */
828static inline Page
831{
832 Page page;
833
834 Assert(scan->rs_inited);
836
837 /* Caller is responsible for ensuring buffer is locked if needed */
838 page = BufferGetPage(scan->rs_cbuf);
839
840 if (ScanDirectionIsForward(dir))
841 {
843 *linesleft = PageGetMaxOffsetNumber(page) - (*lineoff) + 1;
844 }
845 else
846 {
847 /*
848 * The previous returned tuple may have been vacuumed since the
849 * previous scan when we use a non-MVCC snapshot, so we must
850 * re-establish the lineoff <= PageGetMaxOffsetNumber(page) invariant
851 */
853 *linesleft = *lineoff;
854 }
855
856 /* lineoff now references the physically previous or next tid */
857 return page;
858}
859
860/*
861 * heapgettup_advance_block - helper for heap_fetch_next_buffer()
862 *
863 * Given the current block number, the scan direction, and various information
864 * contained in the scan descriptor, calculate the BlockNumber to scan next
865 * and return it. If there are no further blocks to scan, return
866 * InvalidBlockNumber to indicate this fact to the caller.
867 *
868 * This should not be called to determine the initial block number -- only for
869 * subsequent blocks.
870 *
871 * This also adjusts rs_numblocks when a limit has been imposed by
872 * heap_setscanlimits().
873 */
874static inline BlockNumber
876{
877 Assert(scan->rs_base.rs_parallel == NULL);
878
880 {
881 block++;
882
883 /* wrap back to the start of the heap */
884 if (block >= scan->rs_nblocks)
885 block = 0;
886
887 /*
888 * Report our new scan position for synchronization purposes. We don't
889 * do that when moving backwards, however. That would just mess up any
890 * other forward-moving scanners.
891 *
892 * Note: we do this before checking for end of scan so that the final
893 * state of the position hint is back at the start of the rel. That's
894 * not strictly necessary, but otherwise when you run the same query
895 * multiple times the starting position would shift a little bit
896 * backwards on every invocation, which is confusing. We don't
897 * guarantee any specific ordering in general, though.
898 */
899 if (scan->rs_base.rs_flags & SO_ALLOW_SYNC)
900 ss_report_location(scan->rs_base.rs_rd, block);
901
902 /* we're done if we're back at where we started */
903 if (block == scan->rs_startblock)
904 return InvalidBlockNumber;
905
906 /* check if the limit imposed by heap_setscanlimits() is met */
907 if (scan->rs_numblocks != InvalidBlockNumber)
908 {
909 if (--scan->rs_numblocks == 0)
910 return InvalidBlockNumber;
911 }
912
913 return block;
914 }
915 else
916 {
917 /* we're done if the last block is the start position */
918 if (block == scan->rs_startblock)
919 return InvalidBlockNumber;
920
921 /* check if the limit imposed by heap_setscanlimits() is met */
922 if (scan->rs_numblocks != InvalidBlockNumber)
923 {
924 if (--scan->rs_numblocks == 0)
925 return InvalidBlockNumber;
926 }
927
928 /* wrap to the end of the heap when the last page was page 0 */
929 if (block == 0)
930 block = scan->rs_nblocks;
931
932 block--;
933
934 return block;
935 }
936}
937
938/* ----------------
939 * heapgettup - fetch next heap tuple
940 *
941 * Initialize the scan if not already done; then advance to the next
942 * tuple as indicated by "dir"; return the next tuple in scan->rs_ctup,
943 * or set scan->rs_ctup.t_data = NULL if no more tuples.
944 *
945 * Note: the reason nkeys/key are passed separately, even though they are
946 * kept in the scan descriptor, is that the caller may not want us to check
947 * the scankeys.
948 *
949 * Note: when we fall off the end of the scan in either direction, we
950 * reset rs_inited. This means that a further request with the same
951 * scan direction will restart the scan, which is a bit odd, but a
952 * request with the opposite scan direction will start a fresh scan
953 * in the proper direction. The latter is required behavior for cursors,
954 * while the former case is generally undefined behavior in Postgres
955 * so we don't care too much.
956 * ----------------
957 */
958static void
/* NOTE(review): the function-name/first-parameter line is missing from this
 * extraction; per the header comment this is heapgettup(HeapScanDesc scan, ...)
 * -- confirm against upstream heapam.c. */
960 ScanDirection dir,
961 int nkeys,
962 ScanKey key)
963{
964 HeapTuple tuple = &(scan->rs_ctup);
965 Page page;
/* NOTE(review): extraction gap here (original lines 966) -- declarations such
 * as lineoff appear to have been dropped; lineoff is used below. */
967 int linesleft;
968
969 if (likely(scan->rs_inited))
970 {
971 /* continue from previously returned page/tuple */
973 page = heapgettup_continue_page(scan, dir, &linesleft, &lineoff);
974 goto continue_page;
975 }
976
977 /*
978 * advance the scan until we find a qualifying tuple or run out of stuff
979 * to scan
980 */
981 while (true)
982 {
983 heap_fetch_next_buffer(scan, dir);
984
985 /* did we run out of blocks to scan? */
986 if (!BufferIsValid(scan->rs_cbuf))
987 break;
988
990
992 page = heapgettup_start_page(scan, dir, &linesleft, &lineoff);
/* NOTE(review): gaps around here (original lines 989-991, 993-994) -- the
 * buffer content lock acquisition and the continue_page: label targeted by the
 * goto above appear to be missing from this extraction. */
994
995 /*
996 * Only continue scanning the page while we have lines left.
997 *
998 * Note that this protects us from accessing line pointers past
999 * PageGetMaxOffsetNumber(); both for forward scans when we resume the
1000 * table scan, and for when we start scanning a new page.
1001 */
1002 for (; linesleft > 0; linesleft--, lineoff += dir)
1003 {
1004 bool visible;
/* NOTE(review): the assignment of lpp (line pointer for lineoff) is missing
 * here (original line 1005); lpp is tested just below. */
1006
1007 if (!ItemIdIsNormal(lpp))
1008 continue;
1009
1010 tuple->t_data = (HeapTupleHeader) PageGetItem(page, lpp);
1011 tuple->t_len = ItemIdGetLength(lpp);
1012 ItemPointerSet(&(tuple->t_self), scan->rs_cblock, lineoff);
1013
1014 visible = HeapTupleSatisfiesVisibility(tuple,
1015 scan->rs_base.rs_snapshot,
1016 scan->rs_cbuf);
1017
1019 tuple, scan->rs_cbuf,
1020 scan->rs_base.rs_snapshot);
1021
1022 /* skip tuples not visible to this snapshot */
1023 if (!visible)
1024 continue;
1025
1026 /* skip any tuples that don't match the scan key */
1027 if (key != NULL &&
1029 nkeys, key))
1030 continue;
1031
1033 scan->rs_coffset = lineoff;
1034 return;
1035 }
1036
1037 /*
1038 * if we get here, it means we've exhausted the items on this page and
1039 * it's time to move to the next.
1040 */
1042 }
1043
1044 /* end of scan */
1045 if (BufferIsValid(scan->rs_cbuf))
1046 ReleaseBuffer(scan->rs_cbuf);
1047
1048 scan->rs_cbuf = InvalidBuffer;
/* Mark the scan finished: NULL t_data tells callers there are no more tuples,
 * and rs_inited = false makes the next call restart the scan. */
1051 tuple->t_data = NULL;
1052 scan->rs_inited = false;
1053}
1054
1055/* ----------------
1056 * heapgettup_pagemode - fetch next heap tuple in page-at-a-time mode
1057 *
1058 * Same API as heapgettup, but used in page-at-a-time mode
1059 *
1060 * The internal logic is much the same as heapgettup's too, but there are some
1061 * differences: we do not take the buffer content lock (that only needs to
1062 * happen inside heap_prepare_pagescan), and we iterate through just the
1063 * tuples listed in rs_vistuples[] rather than all tuples on the page. Notice
1064 * that lineindex is 0-based, where the corresponding loop variable lineoff in
1065 * heapgettup is 1-based.
1066 * ----------------
1067 */
1068static void
/* NOTE(review): function-name line missing from this extraction; per the
 * header comment this is heapgettup_pagemode(HeapScanDesc scan, ...). */
1070 ScanDirection dir,
1071 int nkeys,
1072 ScanKey key)
1073{
1074 HeapTuple tuple = &(scan->rs_ctup);
1075 Page page;
/* NOTE(review): declarations of lineindex/linesleft (used below) appear to
 * have been dropped here (original lines 1076-1077). */
1078
1079 if (likely(scan->rs_inited))
1080 {
1081 /* continue from previously returned page/tuple */
1082 page = BufferGetPage(scan->rs_cbuf);
1083
1084 lineindex = scan->rs_cindex + dir;
1085 if (ScanDirectionIsForward(dir))
1086 linesleft = scan->rs_ntuples - lineindex;
1087 else
1088 linesleft = scan->rs_cindex;
1089 /* lineindex now references the next or previous visible tid */
1090
1091 goto continue_page;
1092 }
1093
1094 /*
1095 * advance the scan until we find a qualifying tuple or run out of stuff
1096 * to scan
1097 */
1098 while (true)
1099 {
1100 heap_fetch_next_buffer(scan, dir);
1101
1102 /* did we run out of blocks to scan? */
1103 if (!BufferIsValid(scan->rs_cbuf))
1104 break;
1105
1107
1108 /* prune the page and determine visible tuple offsets */
/* NOTE(review): the call that prepares the page scan (filling rs_vistuples[]
 * / rs_ntuples) is missing here (original line 1109) -- confirm upstream. */
1110 page = BufferGetPage(scan->rs_cbuf);
1111 linesleft = scan->rs_ntuples;
/* NOTE(review): initial lineindex assignment missing (original line 1112). */
1113
1114 /* block is the same for all tuples, set it once outside the loop */
1116
1117 /* lineindex now references the next or previous visible tid */
/* NOTE(review): the continue_page: label targeted by the goto above appears
 * to be missing here (original line 1118). */
1119
1120 for (; linesleft > 0; linesleft--, lineindex += dir)
1121 {
1122 ItemId lpp;
/* NOTE(review): declaration/assignment of lineoff (taken from rs_vistuples[])
 * missing here (original lines 1123, 1126); lineoff is used below. */
1124
1125 Assert(lineindex < scan->rs_ntuples);
1127 lpp = PageGetItemId(page, lineoff);
1129
1130 tuple->t_data = (HeapTupleHeader) PageGetItem(page, lpp);
1131 tuple->t_len = ItemIdGetLength(lpp);
1133
1134 /* skip any tuples that don't match the scan key */
1135 if (key != NULL &&
1137 nkeys, key))
1138 continue;
1139
1140 scan->rs_cindex = lineindex;
1141 return;
1142 }
1143 }
1144
1145 /* end of scan */
1146 if (BufferIsValid(scan->rs_cbuf))
1147 ReleaseBuffer(scan->rs_cbuf);
1148 scan->rs_cbuf = InvalidBuffer;
/* Signal end-of-scan to callers and force re-initialization on next call. */
1151 tuple->t_data = NULL;
1152 scan->rs_inited = false;
1153}
1154
1155
1156/* ----------------------------------------------------------------
1157 * heap access method interface
1158 * ----------------------------------------------------------------
1159 */
1160
1161
/*
 * heap_beginscan - begin a heap scan and build its scan descriptor.
 *
 * Allocates and initializes a HeapScanDesc (or BitmapHeapScanDesc) for the
 * given relation/snapshot/flags, takes a relation-level predicate lock for
 * serializable seqscans/sample scans, and sets up a read stream for
 * sequential, TID-range, and bitmap scans.  Returns the descriptor as a
 * TableScanDesc.
 *
 * NOTE(review): the return-type line (TableScanDesc) preceding this one is
 * missing from this extraction, as are several allocation and
 * read_stream_begin_relation() call lines flagged below.
 */
1163heap_beginscan(Relation relation, Snapshot snapshot,
1164 int nkeys, ScanKey key,
1165 ParallelTableScanDesc parallel_scan,
1166 uint32 flags)
1167{
1168 HeapScanDesc scan;
1169
1170 /*
1171 * increment relation ref count while scanning relation
1172 *
1173 * This is just to make really sure the relcache entry won't go away while
1174 * the scan has a pointer to it. Caller should be holding the rel open
1175 * anyway, so this is redundant in all normal scenarios...
1176 */
/* NOTE(review): the RelationIncrementReferenceCount() call described by the
 * comment above appears to be missing here (original line 1177). */
1178
1179 /*
1180 * allocate and initialize scan descriptor
1181 */
1182 if (flags & SO_TYPE_BITMAPSCAN)
1183 {
/* NOTE(review): allocation of the BitmapHeapScanDesc (bscan) is missing here
 * (original line 1184); bscan is cast below. */
1185
1186 /*
1187 * Bitmap Heap scans do not have any fields that a normal Heap Scan
1188 * does not have, so no special initializations required here.
1189 */
1190 scan = (HeapScanDesc) bscan;
1191 }
1192 else
/* NOTE(review): plain HeapScanDesc allocation missing (original line 1193). */
1194
1195 scan->rs_base.rs_rd = relation;
1196 scan->rs_base.rs_snapshot = snapshot;
1197 scan->rs_base.rs_nkeys = nkeys;
1198 scan->rs_base.rs_flags = flags;
1199 scan->rs_base.rs_parallel = parallel_scan;
1200 scan->rs_strategy = NULL; /* set in initscan */
1201 scan->rs_cbuf = InvalidBuffer;
1202
1203 /*
1204 * Disable page-at-a-time mode if it's not a MVCC-safe snapshot.
1205 */
1206 if (!(snapshot && IsMVCCSnapshot(snapshot)))
/* NOTE(review): the line clearing SO_ALLOW_PAGEMODE from rs_flags is missing
 * here (original line 1207). */
1208
1209 /* Check that a historic snapshot is not used for non-catalog tables */
1210 if (snapshot &&
1211 IsHistoricMVCCSnapshot(snapshot) &&
1213 {
1214 ereport(ERROR,
1216 errmsg("cannot query non-catalog table \"%s\" during logical decoding",
1217 RelationGetRelationName(relation))));
1218 }
1219
1220 /*
1221 * For seqscan and sample scans in a serializable transaction, acquire a
1222 * predicate lock on the entire relation. This is required not only to
1223 * lock all the matching tuples, but also to conflict with new insertions
1224 * into the table. In an indexscan, we take page locks on the index pages
1225 * covering the range specified in the scan qual, but in a heap scan there
1226 * is nothing more fine-grained to lock. A bitmap scan is a different
1227 * story, there we have already scanned the index and locked the index
1228 * pages covering the predicate. But in that case we still have to lock
1229 * any matching heap tuples. For sample scan we could optimize the locking
1230 * to be at least page-level granularity, but we'd need to add per-tuple
1231 * locking for that.
1232 */
1234 {
1235 /*
1236 * Ensure a missing snapshot is noticed reliably, even if the
1237 * isolation mode means predicate locking isn't performed (and
1238 * therefore the snapshot isn't used here).
1239 */
1240 Assert(snapshot);
1241 PredicateLockRelation(relation, snapshot);
1242 }
1243
1244 /* we only need to set this up once */
1245 scan->rs_ctup.t_tableOid = RelationGetRelid(relation);
1246
1247 /*
1248 * Allocate memory to keep track of page allocation for parallel workers
1249 * when doing a parallel scan.
1250 */
1251 if (parallel_scan != NULL)
/* NOTE(review): both branches' rs_parallelworkerdata assignments are missing
 * here (original lines 1252, 1254). */
1253 else
1255
1256 /*
1257 * we do this here instead of in initscan() because heap_rescan also calls
1258 * initscan() and we don't want to allocate memory again
1259 */
1260 if (nkeys > 0)
1261 scan->rs_base.rs_key = palloc_array(ScanKeyData, nkeys);
1262 else
1263 scan->rs_base.rs_key = NULL;
1264
1265 initscan(scan, key, false);
1266
1267 scan->rs_read_stream = NULL;
1268
1269 /*
1270 * Set up a read stream for sequential scans and TID range scans. This
1271 * should be done after initscan() because initscan() allocates the
1272 * BufferAccessStrategy object passed to the read stream API.
1273 */
1274 if (scan->rs_base.rs_flags & SO_TYPE_SEQSCAN ||
1276 {
/* NOTE(review): callback-variable declaration and its parallel/serial
 * assignments are missing here (original lines 1277, 1280, 1282). */
1278
1279 if (scan->rs_base.rs_parallel)
1281 else
1283
1284 /* ---
1285 * It is safe to use batchmode as the only locks taken by `cb`
1286 * are never taken while waiting for IO:
1287 * - SyncScanLock is used in the non-parallel case
1288 * - in the parallel case, only spinlocks and atomics are used
1289 * ---
1290 */
/* NOTE(review): the read_stream_begin_relation() call assigning
 * scan->rs_read_stream is partially missing (original lines 1291-1292, 1295). */
1293 scan->rs_strategy,
1294 scan->rs_base.rs_rd,
1296 cb,
1297 scan,
1298 0);
1299 }
1300 else if (scan->rs_base.rs_flags & SO_TYPE_BITMAPSCAN)
1301 {
1304 scan->rs_strategy,
1305 scan->rs_base.rs_rd,
1308 scan,
1309 sizeof(TBMIterateResult));
1310 }
1311
1312
1313 return (TableScanDesc) scan;
1314}
1315
/*
 * NOTE(review): the function-name line is missing from this extraction;
 * the body matches a rescan routine (presumably heap_rescan, per the
 * "heap_rescan also calls initscan()" comment in heap_beginscan above):
 * optionally update the allow_strat/allow_sync/allow_pagemode scan flags,
 * release the current scan buffer, reset the read stream, and re-run
 * initscan() to restart the scan.
 */
1316void
1318 bool allow_strat, bool allow_sync, bool allow_pagemode)
1319{
/* NOTE(review): local "scan" variable initialization (cast of the
 * TableScanDesc parameter) is missing here (original lines 1320-1321). */
1321
1322 if (set_params)
1323 {
/* Each flag below is set or cleared in rs_flags; the assignment lines
 * themselves are missing from this extraction (original 1325, 1327, 1330,
 * 1332, 1335-1336, 1338). */
1324 if (allow_strat)
1326 else
1328
1329 if (allow_sync)
1331 else
1333
1334 if (allow_pagemode && scan->rs_base.rs_snapshot &&
1337 else
1339 }
1340
1341 /*
1342 * unpin scan buffers
1343 */
1344 if (BufferIsValid(scan->rs_cbuf))
1345 {
1346 ReleaseBuffer(scan->rs_cbuf);
1347 scan->rs_cbuf = InvalidBuffer;
1348 }
1349
1350 /*
1351 * SO_TYPE_BITMAPSCAN would be cleaned up here, but it does not hold any
1352 * additional data vs a normal HeapScan
1353 */
1354
1355 /*
1356 * The read stream is reset on rescan. This must be done before
1357 * initscan(), as some state referred to by read_stream_reset() is reset
1358 * in initscan().
1359 */
1360 if (scan->rs_read_stream)
/* NOTE(review): read_stream_reset() call missing here (original line 1361). */
1362
1363 /*
1364 * reinitialize scan descriptor
1365 */
1366 initscan(scan, key, true);
1367}
1368
/*
 * NOTE(review): function-name line missing from this extraction; the body
 * matches an end-of-scan routine (presumably heap_endscan): release the scan
 * buffer, free the read stream, scan keys, strategy, parallel-worker data and
 * temp snapshot, then free the descriptor itself.
 */
1369void
1371{
/* NOTE(review): local "scan" initialization missing (original line 1372). */
1373
1374 /* Note: no locking manipulations needed */
1375
1376 /*
1377 * unpin scan buffers
1378 */
1379 if (BufferIsValid(scan->rs_cbuf))
1380 ReleaseBuffer(scan->rs_cbuf);
1381
1382 /*
1383 * Must free the read stream before freeing the BufferAccessStrategy.
1384 */
1385 if (scan->rs_read_stream)
/* NOTE(review): read_stream_end() call missing (original line 1386). */
1387
1388 /*
1389 * decrement relation reference count and free scan descriptor storage
1390 */
/* NOTE(review): RelationDecrementReferenceCount() call missing
 * (original line 1391). */
1392
1393 if (scan->rs_base.rs_key)
1394 pfree(scan->rs_base.rs_key);
1395
1396 if (scan->rs_strategy != NULL)
/* NOTE(review): FreeAccessStrategy() call missing (original line 1397). */
1398
1399 if (scan->rs_parallelworkerdata != NULL)
/* NOTE(review): pfree() of parallel-worker data missing (original 1400). */
1401
1402 if (scan->rs_base.rs_flags & SO_TEMP_SNAPSHOT)
/* NOTE(review): UnregisterSnapshot() call missing (original line 1403). */
1404
1405 pfree(scan);
1406}
1407
/*
 * NOTE(review): the return-type and function-name lines are missing from this
 * extraction (original lines 1408-1409); the body returns &scan->rs_ctup, so
 * this is the HeapTuple-returning getnext routine (presumably heap_getnext).
 * It rejects non-heap table AMs, advances the scan via heapgettup or
 * heapgettup_pagemode, and returns the next tuple or NULL at end of scan.
 */
1410{
/* NOTE(review): local "scan" initialization missing (original line 1411). */
1412
1413 /*
1414 * This is still widely used directly, without going through table AM, so
1415 * add a safety check. It's possible we should, at a later point,
1416 * downgrade this to an assert. The reason for checking the AM routine,
1417 * rather than the AM oid, is that this allows to write regression tests
1418 * that create another AM reusing the heap handler.
1419 */
1420 if (unlikely(sscan->rs_rd->rd_tableam != GetHeapamTableAmRoutine()))
1421 ereport(ERROR,
1423 errmsg_internal("only heap AM is supported")));
1424
1425 /* Note: no locking manipulations needed */
1426
/* NOTE(review): the SO_ALLOW_PAGEMODE flag test is missing here
 * (original line 1427); it selects between the two calls below. */
1428 heapgettup_pagemode(scan, direction,
1429 scan->rs_base.rs_nkeys, scan->rs_base.rs_key);
1430 else
1431 heapgettup(scan, direction,
1432 scan->rs_base.rs_nkeys, scan->rs_base.rs_key);
1433
1434 if (scan->rs_ctup.t_data == NULL)
1435 return NULL;
1436
1437 /*
1438 * if we get here it means we have a new current scan tuple, so point to
1439 * the proper return buffer and return the tuple.
1440 */
1441
/* NOTE(review): pgstat heap-fetch counter increment missing (original 1442). */
1443
1444 return &scan->rs_ctup;
1445}
1446
/*
 * NOTE(review): function-name line missing from this extraction (original
 * line 1448); the body stores the next tuple into a TupleTableSlot, so this
 * is presumably heap_getnextslot.  Returns true and fills the slot with the
 * next visible tuple, or clears the slot and returns false at end of scan.
 */
1447bool
1449{
/* NOTE(review): local "scan" initialization missing (original line 1450). */
1451
1452 /* Note: no locking manipulations needed */
1453
1454 if (sscan->rs_flags & SO_ALLOW_PAGEMODE)
1455 heapgettup_pagemode(scan, direction, sscan->rs_nkeys, sscan->rs_key);
1456 else
1457 heapgettup(scan, direction, sscan->rs_nkeys, sscan->rs_key);
1458
1459 if (scan->rs_ctup.t_data == NULL)
1460 {
1461 ExecClearTuple(slot);
1462 return false;
1463 }
1464
1465 /*
1466 * if we get here it means we have a new current scan tuple, so point to
1467 * the proper return buffer and return the tuple.
1468 */
1469
/* NOTE(review): pgstat heap-fetch counter increment missing (original 1470). */
1471
1472 ExecStoreBufferHeapTuple(&scan->rs_ctup, slot,
1473 scan->rs_cbuf);
1474 return true;
1475}
1476
/*
 * NOTE(review): function-name/parameter lines missing from this extraction
 * (original lines 1478-1485); the body stores into sscan->st.tidrange, so
 * this is presumably heap_set_tidrange(TableScanDesc sscan, mintid, maxtid).
 * It clamps the caller's TID range to the relation's actual extent, computes
 * the block range via heap_setscanlimits-style limits, and records the final
 * min/max TIDs in the scan descriptor.
 */
1477void
1480{
/* NOTE(review): local declarations (scan, lowestItem, highestItem, startBlk,
 * numBlks) missing here (original lines 1481-1485). */
1486
1487 /*
1488 * For relations without any pages, we can simply leave the TID range
1489 * unset. There will be no tuples to scan, therefore no tuples outside
1490 * the given TID range.
1491 */
1492 if (scan->rs_nblocks == 0)
1493 return;
1494
1495 /*
1496 * Set up some ItemPointers which point to the first and last possible
1497 * tuples in the heap.
1498 */
/* NOTE(review): ItemPointerSet calls for lowestItem/highestItem missing
 * (original lines 1499-1500). */
1501
1502 /*
1503 * If the given maximum TID is below the highest possible TID in the
1504 * relation, then restrict the range to that, otherwise we scan to the end
1505 * of the relation.
1506 */
/* NOTE(review): comparison/copy lines missing (original 1507-1508). */
1509
1510 /*
1511 * If the given minimum TID is above the lowest possible TID in the
1512 * relation, then restrict the range to only scan for TIDs above that.
1513 */
/* NOTE(review): comparison/copy lines missing (original 1514-1515). */
1516
1517 /*
1518 * Check for an empty range and protect from would be negative results
1519 * from the numBlks calculation below.
1520 */
/* NOTE(review): empty-range test missing (original line 1521). */
1522 {
1523 /* Set an empty range of blocks to scan */
/* NOTE(review): heap_setscanlimits(sscan, 0, 0)-style call missing
 * (original line 1524). */
1525 return;
1526 }
1527
1528 /*
1529 * Calculate the first block and the number of blocks we must scan. We
1530 * could be more aggressive here and perform some more validation to try
1531 * and further narrow the scope of blocks to scan by checking if the
1532 * lowestItem has an offset above MaxOffsetNumber. In this case, we could
1533 * advance startBlk by one. Likewise, if highestItem has an offset of 0
1534 * we could scan one fewer blocks. However, such an optimization does not
1535 * seem worth troubling over, currently.
1536 */
/* NOTE(review): startBlk/numBlks computation lines missing
 * (original 1537, 1539-1540). */
1538
1541
1542 /* Set the start block and number of blocks to scan */
/* NOTE(review): heap_setscanlimits(sscan, startBlk, numBlks)-style call
 * missing (original line 1543). */
1544
1545 /* Finally, set the TID range in sscan */
1546 ItemPointerCopy(&lowestItem, &sscan->st.tidrange.rs_mintid);
1547 ItemPointerCopy(&highestItem, &sscan->st.tidrange.rs_maxtid);
1548}
1549
/*
 * NOTE(review): function-name line missing from this extraction (original
 * line 1551); the body reads sscan->st.tidrange and stores into a slot, so
 * this is presumably heap_getnextslot_tidrange.  Advances the scan and
 * filters out tuples whose TIDs fall outside [rs_mintid, rs_maxtid]; returns
 * false (clearing the slot) once the scan is exhausted or the remaining
 * tuples cannot be in range for the current direction.
 */
1550bool
1552 TupleTableSlot *slot)
1553{
/* NOTE(review): local "scan" initialization missing (original line 1554). */
1555 ItemPointer mintid = &sscan->st.tidrange.rs_mintid;
1556 ItemPointer maxtid = &sscan->st.tidrange.rs_maxtid;
1557
1558 /* Note: no locking manipulations needed */
1559 for (;;)
1560 {
1561 if (sscan->rs_flags & SO_ALLOW_PAGEMODE)
1562 heapgettup_pagemode(scan, direction, sscan->rs_nkeys, sscan->rs_key);
1563 else
1564 heapgettup(scan, direction, sscan->rs_nkeys, sscan->rs_key);
1565
1566 if (scan->rs_ctup.t_data == NULL)
1567 {
1568 ExecClearTuple(slot);
1569 return false;
1570 }
1571
1572 /*
1573 * heap_set_tidrange will have used heap_setscanlimits to limit the
1574 * range of pages we scan to only ones that can contain the TID range
1575 * we're scanning for. Here we must filter out any tuples from these
1576 * pages that are outside of that range.
1577 */
1578 if (ItemPointerCompare(&scan->rs_ctup.t_self, mintid) < 0)
1579 {
1580 ExecClearTuple(slot);
1581
1582 /*
1583 * When scanning backwards, the TIDs will be in descending order.
1584 * Future tuples in this direction will be lower still, so we can
1585 * just return false to indicate there will be no more tuples.
1586 */
1587 if (ScanDirectionIsBackward(direction))
1588 return false;
1589
1590 continue;
1591 }
1592
1593 /*
1594 * Likewise for the final page, we must filter out TIDs greater than
1595 * maxtid.
1596 */
1597 if (ItemPointerCompare(&scan->rs_ctup.t_self, maxtid) > 0)
1598 {
1599 ExecClearTuple(slot);
1600
1601 /*
1602 * When scanning forward, the TIDs will be in ascending order.
1603 * Future tuples in this direction will be higher still, so we can
1604 * just return false to indicate there will be no more tuples.
1605 */
1606 if (ScanDirectionIsForward(direction))
1607 return false;
1608 continue;
1609 }
1610
1611 break;
1612 }
1613
1614 /*
1615 * if we get here it means we have a new current scan tuple, so point to
1616 * the proper return buffer and return the tuple.
1617 */
/* NOTE(review): pgstat heap-fetch counter increment missing (original 1618). */
1619
1620 ExecStoreBufferHeapTuple(&scan->rs_ctup, slot, scan->rs_cbuf);
1621 return true;
1622}
1623
1624/*
1625 * heap_fetch - retrieve tuple with given tid
1626 *
1627 * On entry, tuple->t_self is the TID to fetch. We pin the buffer holding
1628 * the tuple, fill in the remaining fields of *tuple, and check the tuple
1629 * against the specified snapshot.
1630 *
1631 * If successful (tuple found and passes snapshot time qual), then *userbuf
1632 * is set to the buffer holding the tuple and true is returned. The caller
1633 * must unpin the buffer when done with the tuple.
1634 *
1635 * If the tuple is not found (ie, item number references a deleted slot),
1636 * then tuple->t_data is set to NULL, *userbuf is set to InvalidBuffer,
1637 * and false is returned.
1638 *
1639 * If the tuple is found but fails the time qual check, then the behavior
1640 * depends on the keep_buf parameter. If keep_buf is false, the results
1641 * are the same as for the tuple-not-found case. If keep_buf is true,
1642 * then tuple->t_data and *userbuf are returned as for the success case,
1643 * and again the caller must unpin the buffer; but false is returned.
1644 *
1645 * heap_fetch does not follow HOT chains: only the exact TID requested will
1646 * be fetched.
1647 *
1648 * It is somewhat inconsistent that we ereport() on invalid block number but
1649 * return false on invalid item number. There are a couple of reasons though.
1650 * One is that the caller can relatively easily check the block number for
1651 * validity, but cannot check the item number without reading the page
1652 * himself. Another is that when we are following a t_ctid link, we can be
1653 * reasonably confident that the page number is valid (since VACUUM shouldn't
1654 * truncate off the destination page without having killed the referencing
1655 * tuple first), but the item number might well not be good.
1656 */
1657bool
1658heap_fetch(Relation relation,
1659 Snapshot snapshot,
1660 HeapTuple tuple,
1661 Buffer *userbuf,
1662 bool keep_buf)
1663{
1664 ItemPointer tid = &(tuple->t_self);
1665 ItemId lp;
1666 Buffer buffer;
1667 Page page;
1668 OffsetNumber offnum;
1669 bool valid;
1670
1671 /*
1672 * Fetch and pin the appropriate page of the relation.
1673 */
1674 buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid));
1675
1676 /*
1677 * Need share lock on buffer to examine tuple commit status.
1678 */
/* NOTE(review): the LockBuffer(buffer, BUFFER_LOCK_SHARE)-style call described
 * by the comment above is missing from this extraction (original line 1679). */
1680 page = BufferGetPage(buffer);
1681
1682 /*
1683 * We'd better check for out-of-range offnum in case of VACUUM since the
1684 * TID was obtained.
1685 */
1686 offnum = ItemPointerGetOffsetNumber(tid);
/* NOTE(review): the bounds test on offnum and the lock release are missing
 * here (original lines 1687, 1689, 1691). */
1688 {
1690 ReleaseBuffer(buffer);
1692 tuple->t_data = NULL;
1693 return false;
1694 }
1695
1696 /*
1697 * get the item line pointer corresponding to the requested tid
1698 */
1699 lp = PageGetItemId(page, offnum);
1700
1701 /*
1702 * Must check for deleted tuple.
1703 */
1704 if (!ItemIdIsNormal(lp))
1705 {
/* NOTE(review): lock release and *userbuf = InvalidBuffer assignment missing
 * here (original lines 1706, 1708). */
1707 ReleaseBuffer(buffer);
1709 tuple->t_data = NULL;
1710 return false;
1711 }
1712
1713 /*
1714 * fill in *tuple fields
1715 */
1716 tuple->t_data = (HeapTupleHeader) PageGetItem(page, lp);
1717 tuple->t_len = ItemIdGetLength(lp);
1718 tuple->t_tableOid = RelationGetRelid(relation);
1719
1720 /*
1721 * check tuple visibility, then release lock
1722 */
1723 valid = HeapTupleSatisfiesVisibility(tuple, snapshot, buffer);
1724
1725 if (valid)
1726 PredicateLockTID(relation, &(tuple->t_self), snapshot,
/* NOTE(review): xmin argument line missing (original line 1727). */
1728
1729 HeapCheckForSerializableConflictOut(valid, relation, tuple, buffer, snapshot);
1730
/* NOTE(review): buffer content lock release missing (original line 1731). */
1732
1733 if (valid)
1734 {
1735 /*
1736 * All checks passed, so return the tuple as valid. Caller is now
1737 * responsible for releasing the buffer.
1738 */
1739 *userbuf = buffer;
1740
1741 return true;
1742 }
1743
1744 /* Tuple failed time qual, but maybe caller wants to see it anyway. */
1745 if (keep_buf)
1746 *userbuf = buffer;
1747 else
1748 {
1749 ReleaseBuffer(buffer);
/* NOTE(review): *userbuf = InvalidBuffer assignment missing (original 1750). */
1751 tuple->t_data = NULL;
1752 }
1753
1754 return false;
1755}
1756
1757/*
1758 * heap_hot_search_buffer - search HOT chain for tuple satisfying snapshot
1759 *
1760 * On entry, *tid is the TID of a tuple (either a simple tuple, or the root
1761 * of a HOT chain), and buffer is the buffer holding this tuple. We search
1762 * for the first chain member satisfying the given snapshot. If one is
1763 * found, we update *tid to reference that tuple's offset number, and
1764 * return true. If no match, return false without modifying *tid.
1765 *
1766 * heapTuple is a caller-supplied buffer. When a match is found, we return
1767 * the tuple here, in addition to updating *tid. If no match is found, the
1768 * contents of this buffer on return are undefined.
1769 *
1770 * If all_dead is not NULL, we check non-visible tuples to see if they are
1771 * globally dead; *all_dead is set true if all members of the HOT chain
1772 * are vacuumable, false if not.
1773 *
1774 * Unlike heap_fetch, the caller must already have pin and (at least) share
1775 * lock on the buffer; it is still pinned/locked at exit.
1776 */
1777bool
/* NOTE(review): function-name/first-parameter line missing from this
 * extraction (original line 1778); per the header comment this is
 * heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer, ...). */
1779 Snapshot snapshot, HeapTuple heapTuple,
1780 bool *all_dead, bool first_call)
1781{
1782 Page page = BufferGetPage(buffer);
/* NOTE(review): declaration of prev_xmax (or similar) missing here
 * (original line 1783). */
1784 BlockNumber blkno;
1785 OffsetNumber offnum;
1786 bool at_chain_start;
1787 bool valid;
1788 bool skip;
1789 GlobalVisState *vistest = NULL;
1790
1791 /* If this is not the first call, previous call returned a (live!) tuple */
1792 if (all_dead)
/* NOTE(review): "*all_dead = first_call"-style assignment missing
 * (original line 1793). */
1794
1795 blkno = ItemPointerGetBlockNumber(tid);
1796 offnum = ItemPointerGetOffsetNumber(tid);
/* NOTE(review): at_chain_start initialization missing (original line 1797). */
1798 skip = !first_call;
1799
1800 /* XXX: we should assert that a snapshot is pushed or registered */
/* NOTE(review): TransactionIdIsValid assertion missing (original 1801). */
1802 Assert(BufferGetBlockNumber(buffer) == blkno);
1803
1804 /* Scan through possible multiple members of HOT-chain */
1805 for (;;)
1806 {
1807 ItemId lp;
1808
1809 /* check for bogus TID */
/* NOTE(review): offnum range check missing (original line 1810). */
1811 break;
1812
1813 lp = PageGetItemId(page, offnum);
1814
1815 /* check for unused, dead, or redirected items */
1816 if (!ItemIdIsNormal(lp))
1817 {
1818 /* We should only see a redirect at start of chain */
/* NOTE(review): ItemIdIsRedirected && at_chain_start test missing
 * (original line 1819). */
1820 {
1821 /* Follow the redirect */
1822 offnum = ItemIdGetRedirect(lp);
1823 at_chain_start = false;
1824 continue;
1825 }
1826 /* else must be end of chain */
1827 break;
1828 }
1829
1830 /*
1831 * Update heapTuple to point to the element of the HOT chain we're
1832 * currently investigating. Having t_self set correctly is important
1833 * because the SSI checks and the *Satisfies routine for historical
1834 * MVCC snapshots need the correct tid to decide about the visibility.
1835 */
1836 heapTuple->t_data = (HeapTupleHeader) PageGetItem(page, lp);
1837 heapTuple->t_len = ItemIdGetLength(lp);
1838 heapTuple->t_tableOid = RelationGetRelid(relation);
1839 ItemPointerSet(&heapTuple->t_self, blkno, offnum);
1840
1841 /*
1842 * Shouldn't see a HEAP_ONLY tuple at chain start.
1843 */
/* NOTE(review): at_chain_start && HeapTupleIsHeapOnly test missing
 * (original line 1844). */
1845 break;
1846
1847 /*
1848 * The xmin should match the previous xmax value, else chain is
1849 * broken.
1850 */
/* NOTE(review): the prev-xmax vs xmin comparison is missing
 * (original lines 1851-1853). */
1854 break;
1855
1856 /*
1857 * When first_call is true (and thus, skip is initially false) we'll
1858 * return the first tuple we find. But on later passes, heapTuple
1859 * will initially be pointing to the tuple we returned last time.
1860 * Returning it again would be incorrect (and would loop forever), so
1861 * we skip it and return the next match we find.
1862 */
1863 if (!skip)
1864 {
1865 /* If it's visible per the snapshot, we must return it */
1866 valid = HeapTupleSatisfiesVisibility(heapTuple, snapshot, buffer);
/* NOTE(review): HeapCheckForSerializableConflictOut call start missing
 * (original line 1867). */
1868 buffer, snapshot);
1869
1870 if (valid)
1871 {
1872 ItemPointerSetOffsetNumber(tid, offnum);
1873 PredicateLockTID(relation, &heapTuple->t_self, snapshot,
/* NOTE(review): xmin argument line missing (original line 1874). */
1875 if (all_dead)
1876 *all_dead = false;
1877 return true;
1878 }
1879 }
1880 skip = false;
1881
1882 /*
1883 * If we can't see it, maybe no one else can either. At caller
1884 * request, check whether all chain members are dead to all
1885 * transactions.
1886 *
1887 * Note: if you change the criterion here for what is "dead", fix the
1888 * planner's get_actual_variable_range() function to match.
1889 */
1890 if (all_dead && *all_dead)
1891 {
1892 if (!vistest)
1893 vistest = GlobalVisTestFor(relation);
1894
1895 if (!HeapTupleIsSurelyDead(heapTuple, vistest))
1896 *all_dead = false;
1897 }
1898
1899 /*
1900 * Check to see if HOT chain continues past this tuple; if so fetch
1901 * the next offnum and loop around.
1902 */
/* NOTE(review): HeapTupleIsHotUpdated test missing (original line 1903). */
1904 {
1905 Assert(ItemPointerGetBlockNumber(&heapTuple->t_data->t_ctid) ==
1906 blkno);
1907 offnum = ItemPointerGetOffsetNumber(&heapTuple->t_data->t_ctid);
1908 at_chain_start = false;
/* NOTE(review): prev-xmax update missing (original line 1909). */
1910 }
1911 else
1912 break; /* end of chain */
1913 }
1914
1915 return false;
1916}
1917
1918/*
1919 * heap_get_latest_tid - get the latest tid of a specified tuple
1920 *
1921 * Actually, this gets the latest version that is visible according to the
1922 * scan's snapshot. Create a scan using SnapshotDirty to get the very latest,
1923 * possibly uncommitted version.
1924 *
1925 * *tid is both an input and an output parameter: it is updated to
1926 * show the latest version of the row. Note that it will not be changed
1927 * if no version of the row passes the snapshot test.
1928 */
1929void
/* NOTE(review): function-name line missing from this extraction (original
 * line 1930); per the header comment this is heap_get_latest_tid(). */
1931 ItemPointer tid)
1932{
1933 Relation relation = sscan->rs_rd;
1934 Snapshot snapshot = sscan->rs_snapshot;
1935 ItemPointerData ctid;
/* NOTE(review): declaration of priorXmax missing here (original line 1936);
 * it is assigned InvalidTransactionId below. */
1937
1938 /*
1939 * table_tuple_get_latest_tid() verified that the passed in tid is valid.
1940 * Assume that t_ctid links are valid however - there shouldn't be invalid
1941 * ones in the table.
1942 */
/* NOTE(review): ItemPointerIsValid assertion missing (original 1943). */
1944
1945 /*
1946 * Loop to chase down t_ctid links. At top of loop, ctid is the tuple we
1947 * need to examine, and *tid is the TID we will return if ctid turns out
1948 * to be bogus.
1949 *
1950 * Note that we will loop until we reach the end of the t_ctid chain.
1951 * Depending on the snapshot passed, there might be at most one visible
1952 * version of the row, but we don't try to optimize for that.
1953 */
1954 ctid = *tid;
1955 priorXmax = InvalidTransactionId; /* cannot check first XMIN */
1956 for (;;)
1957 {
1958 Buffer buffer;
1959 Page page;
1960 OffsetNumber offnum;
1961 ItemId lp;
1962 HeapTupleData tp;
1963 bool valid;
1964
1965 /*
1966 * Read, pin, and lock the page.
1967 */
1968 buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(&ctid));
/* NOTE(review): LockBuffer(buffer, BUFFER_LOCK_SHARE)-style call missing
 * here (original line 1969). */
1970 page = BufferGetPage(buffer);
1971
1972 /*
1973 * Check for bogus item number. This is not treated as an error
1974 * condition because it can happen while following a t_ctid link. We
1975 * just assume that the prior tid is OK and return it unchanged.
1976 */
1977 offnum = ItemPointerGetOffsetNumber(&ctid);
/* NOTE(review): offnum bounds check missing (original line 1978). */
1979 {
1980 UnlockReleaseBuffer(buffer);
1981 break;
1982 }
1983 lp = PageGetItemId(page, offnum);
1984 if (!ItemIdIsNormal(lp))
1985 {
1986 UnlockReleaseBuffer(buffer);
1987 break;
1988 }
1989
1990 /* OK to access the tuple */
1991 tp.t_self = ctid;
1992 tp.t_data = (HeapTupleHeader) PageGetItem(page, lp);
1993 tp.t_len = ItemIdGetLength(lp);
1994 tp.t_tableOid = RelationGetRelid(relation);
1995
1996 /*
1997 * After following a t_ctid link, we might arrive at an unrelated
1998 * tuple. Check for XMIN match.
1999 */
/* NOTE(review): priorXmax-vs-xmin comparison missing
 * (original lines 2000-2001). */
2002 {
2003 UnlockReleaseBuffer(buffer);
2004 break;
2005 }
2006
2007 /*
2008 * Check tuple visibility; if visible, set it as the new result
2009 * candidate.
2010 */
2011 valid = HeapTupleSatisfiesVisibility(&tp, snapshot, buffer);
2012 HeapCheckForSerializableConflictOut(valid, relation, &tp, buffer, snapshot);
2013 if (valid)
2014 *tid = ctid;
2015
2016 /*
2017 * If there's a valid t_ctid link, follow it, else we're done.
2018 */
2019 if ((tp.t_data->t_infomask & HEAP_XMAX_INVALID) ||
/* NOTE(review): remaining conditions of this test missing
 * (original lines 2020-2022). */
2023 {
2024 UnlockReleaseBuffer(buffer);
2025 break;
2026 }
2027
2028 ctid = tp.t_data->t_ctid;
/* NOTE(review): priorXmax update from tuple xmax missing (original 2029). */
2030 UnlockReleaseBuffer(buffer);
2031 } /* end of loop */
2032}
2033
2034
2035/*
2036 * UpdateXmaxHintBits - update tuple hint bits after xmax transaction ends
2037 *
2038 * This is called after we have waited for the XMAX transaction to terminate.
2039 * If the transaction aborted, we guarantee the XMAX_INVALID hint bit will
2040 * be set on exit. If the transaction committed, we set the XMAX_COMMITTED
2041 * hint bit if possible --- but beware that that may not yet be possible,
2042 * if the transaction committed asynchronously.
2043 *
2044 * Note that if the transaction was a locker only, we set HEAP_XMAX_INVALID
2045 * even if it commits.
2046 *
2047 * Hence callers should look only at XMAX_INVALID.
2048 *
2049 * Note this is not allowed for tuples whose xmax is a multixact.
2050 */
2051static void
/* NOTE(review): the parameter line is missing from this extraction (original
 * line 2052); per the header comment this takes the tuple header, buffer, and
 * xid.  Several interior lines (asserts, the commit test, and the hint-bit
 * setting calls) are also missing -- see the gaps flagged below; confirm
 * against upstream heapam.c before relying on details. */
2053{
/* NOTE(review): assertions on xid/xmax missing (original lines 2054-2055). */
2056
/* NOTE(review): the "did xid commit?" test opening this block is missing
 * (original line 2057). */
2058 {
2059 if (!HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask) &&
/* NOTE(review): the HeapTupleSetHintBits(..., HEAP_XMAX_COMMITTED, xid)
 * call is partially missing (original lines 2060-2061). */
2062 xid);
2063 else
/* NOTE(review): the HEAP_XMAX_INVALID hint-bit call is missing
 * (original lines 2064-2065). */
2066 }
2067}
2068
2069
2070/*
2071 * GetBulkInsertState - prepare status object for a bulk insert
2072 */
2075{
2076 BulkInsertState bistate;
2077
2080 bistate->current_buf = InvalidBuffer;
2081 bistate->next_free = InvalidBlockNumber;
2082 bistate->last_free = InvalidBlockNumber;
2083 bistate->already_extended_by = 0;
2084 return bistate;
2085}
2086
2087/*
2088 * FreeBulkInsertState - clean up after finishing a bulk insert
 *
 * Releases the pinned buffer (if any), frees the access strategy, and frees
 * the state object itself.
2089 */
2090void
/* NOTE(review): function-name line missing from this extraction
 * (original line 2091); per the header comment this is
 * FreeBulkInsertState(BulkInsertState bistate). */
2092{
2093 if (bistate->current_buf != InvalidBuffer)
2094 ReleaseBuffer(bistate->current_buf);
2095 FreeAccessStrategy(bistate->strategy);
2096 pfree(bistate);
2097}
2098
2099/*
2100 * ReleaseBulkInsertStatePin - release a buffer currently held in bistate
2101 */
2102void
/* NOTE(review): function-name line missing from this extraction
 * (original line 2103); per the header comment this is
 * ReleaseBulkInsertStatePin(BulkInsertState bistate). */
2104{
2105 if (bistate->current_buf != InvalidBuffer)
2106 ReleaseBuffer(bistate->current_buf);
2107 bistate->current_buf = InvalidBuffer;
2108
2109 /*
2110 * Despite the name, we also reset bulk relation extension state.
2111 * Otherwise we can end up erroring out due to looking for free space in
2112 * ->next_free of one partition, even though ->next_free was set when
2113 * extending another partition. It could obviously also be bad for
2114 * efficiency to look at existing blocks at offsets from another
2115 * partition, even if we don't error out.
2116 */
2117 bistate->next_free = InvalidBlockNumber;
2118 bistate->last_free = InvalidBlockNumber;
2119}
2120
2121
2122/*
2123 * heap_insert - insert tuple into a heap
2124 *
2125 * The new tuple is stamped with current transaction ID and the specified
2126 * command ID.
2127 *
2128 * See table_tuple_insert for comments about most of the input flags, except
2129 * that this routine directly takes a tuple rather than a slot.
2130 *
2131 * There's corresponding HEAP_INSERT_ options to all the TABLE_INSERT_
2132 * options, and there additionally is HEAP_INSERT_SPECULATIVE which is used to
2133 * implement table_tuple_insert_speculative().
2134 *
2135 * On return the header fields of *tup are updated to match the stored tuple;
2136 * in particular tup->t_self receives the actual TID where the tuple was
2137 * stored. But note that any toasting of fields within the tuple data is NOT
2138 * reflected into *tup.
2139 */
2140void
2142 int options, BulkInsertState bistate)
2143{
2146 Buffer buffer;
2147 Buffer vmbuffer = InvalidBuffer;
2148 bool all_visible_cleared = false;
2149
2150 /* Cheap, simplistic check that the tuple matches the rel's rowtype. */
2153
2154 AssertHasSnapshotForToast(relation);
2155
2156 /*
2157 * Fill in tuple header fields and toast the tuple if necessary.
2158 *
2159 * Note: below this point, heaptup is the data we actually intend to store
2160 * into the relation; tup is the caller's original untoasted data.
2161 */
2162 heaptup = heap_prepare_insert(relation, tup, xid, cid, options);
2163
2164 /*
2165 * Find buffer to insert this tuple into. If the page is all visible,
2166 * this will also pin the requisite visibility map page.
2167 */
2168 buffer = RelationGetBufferForTuple(relation, heaptup->t_len,
2169 InvalidBuffer, options, bistate,
2170 &vmbuffer, NULL,
2171 0);
2172
2173 /*
2174 * We're about to do the actual insert -- but check for conflict first, to
2175 * avoid possibly having to roll back work we've just done.
2176 *
2177 * This is safe without a recheck as long as there is no possibility of
2178 * another process scanning the page between this check and the insert
2179 * being visible to the scan (i.e., an exclusive buffer content lock is
2180 * continuously held from this point until the tuple insert is visible).
2181 *
2182 * For a heap insert, we only need to check for table-level SSI locks. Our
2183 * new tuple can't possibly conflict with existing tuple locks, and heap
2184 * page locks are only consolidated versions of tuple locks; they do not
2185 * lock "gaps" as index page locks do. So we don't need to specify a
2186 * buffer when making the call, which makes for a faster check.
2187 */
2189
2190 /* NO EREPORT(ERROR) from here till changes are logged */
2192
2193 RelationPutHeapTuple(relation, buffer, heaptup,
2195
2196 if (PageIsAllVisible(BufferGetPage(buffer)))
2197 {
2198 all_visible_cleared = true;
2200 visibilitymap_clear(relation,
2202 vmbuffer, VISIBILITYMAP_VALID_BITS);
2203 }
2204
2205 /*
2206 * XXX Should we set PageSetPrunable on this page ?
2207 *
2208 * The inserting transaction may eventually abort thus making this tuple
2209 * DEAD and hence available for pruning. Though we don't want to optimize
2210 * for aborts, if no other tuple in this page is UPDATEd/DELETEd, the
2211 * aborted tuple will never be pruned until next vacuum is triggered.
2212 *
2213 * If you do add PageSetPrunable here, add it in heap_xlog_insert too.
2214 */
2215
2216 MarkBufferDirty(buffer);
2217
2218 /* XLOG stuff */
2219 if (RelationNeedsWAL(relation))
2220 {
2224 Page page = BufferGetPage(buffer);
2225 uint8 info = XLOG_HEAP_INSERT;
2226 int bufflags = 0;
2227
2228 /*
2229 * If this is a catalog, we need to transmit combo CIDs to properly
2230 * decode, so log that as well.
2231 */
2233 log_heap_new_cid(relation, heaptup);
2234
2235 /*
2236 * If this is the single and first tuple on page, we can reinit the
2237 * page instead of restoring the whole thing. Set flag, and hide
2238 * buffer references from XLogInsert.
2239 */
2242 {
2243 info |= XLOG_HEAP_INIT_PAGE;
2245 }
2246
2247 xlrec.offnum = ItemPointerGetOffsetNumber(&heaptup->t_self);
2248 xlrec.flags = 0;
2254
2255 /*
2256 * For logical decoding, we need the tuple even if we're doing a full
2257 * page write, so make sure it's included even if we take a full-page
2258 * image. (XXX We could alternatively store a pointer into the FPW).
2259 */
2260 if (RelationIsLogicallyLogged(relation) &&
2262 {
2265
2266 if (IsToastRelation(relation))
2268 }
2269
2272
2273 xlhdr.t_infomask2 = heaptup->t_data->t_infomask2;
2274 xlhdr.t_infomask = heaptup->t_data->t_infomask;
2275 xlhdr.t_hoff = heaptup->t_data->t_hoff;
2276
2277 /*
2278 * note we mark xlhdr as belonging to buffer; if XLogInsert decides to
2279 * write the whole page to the xlog, we don't need to store
2280 * xl_heap_header in the xlog.
2281 */
2284 /* PG73FORMAT: write bitmap [+ padding] [+ oid] + data */
2286 (char *) heaptup->t_data + SizeofHeapTupleHeader,
2288
2289 /* filtering by origin on a row level is much more efficient */
2291
2292 recptr = XLogInsert(RM_HEAP_ID, info);
2293
2294 PageSetLSN(page, recptr);
2295 }
2296
2298
2299 UnlockReleaseBuffer(buffer);
2300 if (vmbuffer != InvalidBuffer)
2301 ReleaseBuffer(vmbuffer);
2302
2303 /*
2304 * If tuple is cacheable, mark it for invalidation from the caches in case
2305 * we abort. Note it is OK to do this after releasing the buffer, because
2306 * the heaptup data structure is all in local memory, not in the shared
2307 * buffer.
2308 */
2310
2311 /* Note: speculative insertions are counted too, even if aborted later */
2312 pgstat_count_heap_insert(relation, 1);
2313
2314 /*
2315 * If heaptup is a private copy, release it. Don't forget to copy t_self
2316 * back to the caller's image, too.
2317 */
2318 if (heaptup != tup)
2319 {
2320 tup->t_self = heaptup->t_self;
2322 }
2323}
2324
2325/*
2326 * Subroutine for heap_insert(). Prepares a tuple for insertion. This sets the
2327 * tuple header fields and toasts the tuple if necessary. Returns a toasted
2328 * version of the tuple if it was toasted, or the original tuple if not. Note
2329 * that in any case, the header fields are also set in the original tuple.
2330 */
2331static HeapTuple
2333 CommandId cid, int options)
2334{
2335 /*
2336 * To allow parallel inserts, we need to ensure that they are safe to be
2337 * performed in workers. We have the infrastructure to allow parallel
2338 * inserts in general except for the cases where inserts generate a new
2339 * CommandId (eg. inserts into a table having a foreign key column).
2340 */
2341 if (IsParallelWorker())
2342 ereport(ERROR,
2344 errmsg("cannot insert tuples in a parallel worker")));
2345
2346 tup->t_data->t_infomask &= ~(HEAP_XACT_MASK);
2347 tup->t_data->t_infomask2 &= ~(HEAP2_XACT_MASK);
2348 tup->t_data->t_infomask |= HEAP_XMAX_INVALID;
2349 HeapTupleHeaderSetXmin(tup->t_data, xid);
2352
2353 HeapTupleHeaderSetCmin(tup->t_data, cid);
2354 HeapTupleHeaderSetXmax(tup->t_data, 0); /* for cleanliness */
2355 tup->t_tableOid = RelationGetRelid(relation);
2356
2357 /*
2358 * If the new tuple is too big for storage or contains already toasted
2359 * out-of-line attributes from some other relation, invoke the toaster.
2360 */
2361 if (relation->rd_rel->relkind != RELKIND_RELATION &&
2362 relation->rd_rel->relkind != RELKIND_MATVIEW)
2363 {
2364 /* toast table entries should never be recursively toasted */
2366 return tup;
2367 }
2368 else if (HeapTupleHasExternal(tup) || tup->t_len > TOAST_TUPLE_THRESHOLD)
2369 return heap_toast_insert_or_update(relation, tup, NULL, options);
2370 else
2371 return tup;
2372}
2373
2374/*
2375 * Helper for heap_multi_insert() that computes the number of entire pages
2376 * that inserting the remaining heaptuples requires. Used to determine how
2377 * much the relation needs to be extended by.
2378 */
2379static int
2381{
2383 int npages = 1;
2384
2385 for (int i = done; i < ntuples; i++)
2386 {
2387 size_t tup_sz = sizeof(ItemIdData) + MAXALIGN(heaptuples[i]->t_len);
2388
2389 if (page_avail < tup_sz)
2390 {
2391 npages++;
2393 }
2394 page_avail -= tup_sz;
2395 }
2396
2397 return npages;
2398}
2399
2400/*
2401 * heap_multi_insert - insert multiple tuples into a heap
2402 *
2403 * This is like heap_insert(), but inserts multiple tuples in one operation.
2404 * That's faster than calling heap_insert() in a loop, because when multiple
2405 * tuples can be inserted on a single page, we can write just a single WAL
2406 * record covering all of them, and only need to lock/unlock the page once.
2407 *
2408 * Note: this leaks memory into the current memory context. You can create a
2409 * temporary context before calling this, if that's a problem.
2410 */
2411void
2412heap_multi_insert(Relation relation, TupleTableSlot **slots, int ntuples,
2413 CommandId cid, int options, BulkInsertState bistate)
2414{
2417 int i;
2418 int ndone;
2420 Page page;
2421 Buffer vmbuffer = InvalidBuffer;
2422 bool needwal;
2426 bool starting_with_empty_page = false;
2427 int npages = 0;
2428 int npages_used = 0;
2429
2430 /* currently not needed (thus unsupported) for heap_multi_insert() */
2432
2433 AssertHasSnapshotForToast(relation);
2434
2435 needwal = RelationNeedsWAL(relation);
2438
2439 /* Toast and set header data in all the slots */
2440 heaptuples = palloc(ntuples * sizeof(HeapTuple));
2441 for (i = 0; i < ntuples; i++)
2442 {
2443 HeapTuple tuple;
2444
2445 tuple = ExecFetchSlotHeapTuple(slots[i], true, NULL);
2446 slots[i]->tts_tableOid = RelationGetRelid(relation);
2447 tuple->t_tableOid = slots[i]->tts_tableOid;
2448 heaptuples[i] = heap_prepare_insert(relation, tuple, xid, cid,
2449 options);
2450 }
2451
2452 /*
2453 * We're about to do the actual inserts -- but check for conflict first,
2454 * to minimize the possibility of having to roll back work we've just
2455 * done.
2456 *
2457 * A check here does not definitively prevent a serialization anomaly;
2458 * that check MUST be done at least past the point of acquiring an
2459 * exclusive buffer content lock on every buffer that will be affected,
2460 * and MAY be done after all inserts are reflected in the buffers and
2461 * those locks are released; otherwise there is a race condition. Since
2462 * multiple buffers can be locked and unlocked in the loop below, and it
2463 * would not be feasible to identify and lock all of those buffers before
2464 * the loop, we must do a final check at the end.
2465 *
2466 * The check here could be omitted with no loss of correctness; it is
2467 * present strictly as an optimization.
2468 *
2469 * For heap inserts, we only need to check for table-level SSI locks. Our
2470 * new tuples can't possibly conflict with existing tuple locks, and heap
2471 * page locks are only consolidated versions of tuple locks; they do not
2472 * lock "gaps" as index page locks do. So we don't need to specify a
2473 * buffer when making the call, which makes for a faster check.
2474 */
2476
2477 ndone = 0;
2478 while (ndone < ntuples)
2479 {
2480 Buffer buffer;
2481 bool all_visible_cleared = false;
2482 bool all_frozen_set = false;
2483 int nthispage;
2484
2486
2487 /*
2488 * Compute number of pages needed to fit the to-be-inserted tuples in
2489 * the worst case. This will be used to determine how much to extend
2490 * the relation by in RelationGetBufferForTuple(), if needed. If we
2491 * filled a prior page from scratch, we can just update our last
2492 * computation, but if we started with a partially filled page,
2493 * recompute from scratch, the number of potentially required pages
2494 * can vary due to tuples needing to fit onto the page, page headers
2495 * etc.
2496 */
2497 if (ndone == 0 || !starting_with_empty_page)
2498 {
2499 npages = heap_multi_insert_pages(heaptuples, ndone, ntuples,
2501 npages_used = 0;
2502 }
2503 else
2504 npages_used++;
2505
2506 /*
2507 * Find buffer where at least the next tuple will fit. If the page is
2508 * all-visible, this will also pin the requisite visibility map page.
2509 *
2510 * Also pin visibility map page if COPY FREEZE inserts tuples into an
2511 * empty page. See all_frozen_set below.
2512 */
2513 buffer = RelationGetBufferForTuple(relation, heaptuples[ndone]->t_len,
2514 InvalidBuffer, options, bistate,
2515 &vmbuffer, NULL,
2516 npages - npages_used);
2517 page = BufferGetPage(buffer);
2518
2520
2522 {
2523 all_frozen_set = true;
2524 /* Lock the vmbuffer before entering the critical section */
2526 }
2527
2528 /* NO EREPORT(ERROR) from here till changes are logged */
2530
2531 /*
2532 * RelationGetBufferForTuple has ensured that the first tuple fits.
2533 * Put that on the page, and then as many other tuples as fit.
2534 */
2535 RelationPutHeapTuple(relation, buffer, heaptuples[ndone], false);
2536
2537 /*
2538 * For logical decoding we need combo CIDs to properly decode the
2539 * catalog.
2540 */
2541 if (needwal && need_cids)
2542 log_heap_new_cid(relation, heaptuples[ndone]);
2543
2544 for (nthispage = 1; ndone + nthispage < ntuples; nthispage++)
2545 {
2547
2548 if (PageGetHeapFreeSpace(page) < MAXALIGN(heaptup->t_len) + saveFreeSpace)
2549 break;
2550
2551 RelationPutHeapTuple(relation, buffer, heaptup, false);
2552
2553 /*
2554 * For logical decoding we need combo CIDs to properly decode the
2555 * catalog.
2556 */
2557 if (needwal && need_cids)
2558 log_heap_new_cid(relation, heaptup);
2559 }
2560
2561 /*
2562 * If the page is all visible, need to clear that, unless we're only
2563 * going to add further frozen rows to it.
2564 *
2565 * If we're only adding already frozen rows to a previously empty
2566 * page, mark it as all-frozen and update the visibility map. We're
2567 * already holding a pin on the vmbuffer.
2568 */
2570 {
2571 all_visible_cleared = true;
2572 PageClearAllVisible(page);
2573 visibilitymap_clear(relation,
2574 BufferGetBlockNumber(buffer),
2575 vmbuffer, VISIBILITYMAP_VALID_BITS);
2576 }
2577 else if (all_frozen_set)
2578 {
2579 PageSetAllVisible(page);
2581 vmbuffer,
2584 relation->rd_locator);
2585 }
2586
2587 /*
2588 * XXX Should we set PageSetPrunable on this page ? See heap_insert()
2589 */
2590
2591 MarkBufferDirty(buffer);
2592
2593 /* XLOG stuff */
2594 if (needwal)
2595 {
2599 char *tupledata;
2600 int totaldatalen;
2601 char *scratchptr = scratch.data;
2602 bool init;
2603 int bufflags = 0;
2604
2605 /*
2606 * If the page was previously empty, we can reinit the page
2607 * instead of restoring the whole thing.
2608 */
2610
2611 /* allocate xl_heap_multi_insert struct from the scratch area */
2614
2615 /*
2616 * Allocate offsets array. Unless we're reinitializing the page,
2617 * in that case the tuples are stored in order starting at
2618 * FirstOffsetNumber and we don't need to store the offsets
2619 * explicitly.
2620 */
2621 if (!init)
2622 scratchptr += nthispage * sizeof(OffsetNumber);
2623
2624 /* the rest of the scratch space is used for tuple data */
2625 tupledata = scratchptr;
2626
2627 /* check that the mutually exclusive flags are not both set */
2629
2630 xlrec->flags = 0;
2633
2634 /*
2635 * We don't have to worry about including a conflict xid in the
2636 * WAL record, as HEAP_INSERT_FROZEN intentionally violates
2637 * visibility rules.
2638 */
2639 if (all_frozen_set)
2641
2642 xlrec->ntuples = nthispage;
2643
2644 /*
2645 * Write out an xl_multi_insert_tuple and the tuple data itself
2646 * for each tuple.
2647 */
2648 for (i = 0; i < nthispage; i++)
2649 {
2651 xl_multi_insert_tuple *tuphdr;
2652 int datalen;
2653
2654 if (!init)
2655 xlrec->offsets[i] = ItemPointerGetOffsetNumber(&heaptup->t_self);
2656 /* xl_multi_insert_tuple needs two-byte alignment. */
2658 scratchptr = ((char *) tuphdr) + SizeOfMultiInsertTuple;
2659
2660 tuphdr->t_infomask2 = heaptup->t_data->t_infomask2;
2661 tuphdr->t_infomask = heaptup->t_data->t_infomask;
2662 tuphdr->t_hoff = heaptup->t_data->t_hoff;
2663
2664 /* write bitmap [+ padding] [+ oid] + data */
2665 datalen = heaptup->t_len - SizeofHeapTupleHeader;
2667 (char *) heaptup->t_data + SizeofHeapTupleHeader,
2668 datalen);
2669 tuphdr->datalen = datalen;
2670 scratchptr += datalen;
2671 }
2672 totaldatalen = scratchptr - tupledata;
2673 Assert((scratchptr - scratch.data) < BLCKSZ);
2674
2675 if (need_tuple_data)
2677
2678 /*
2679 * Signal that this is the last xl_heap_multi_insert record
2680 * emitted by this call to heap_multi_insert(). Needed for logical
2681 * decoding so it knows when to cleanup temporary data.
2682 */
2683 if (ndone + nthispage == ntuples)
2685
2686 if (init)
2687 {
2688 info |= XLOG_HEAP_INIT_PAGE;
2690 }
2691
2692 /*
2693 * If we're doing logical decoding, include the new tuple data
2694 * even if we take a full-page image of the page.
2695 */
2696 if (need_tuple_data)
2698
2700 XLogRegisterData(xlrec, tupledata - scratch.data);
2702 if (all_frozen_set)
2703 XLogRegisterBuffer(1, vmbuffer, 0);
2704
2705 XLogRegisterBufData(0, tupledata, totaldatalen);
2706
2707 /* filtering by origin on a row level is much more efficient */
2709
2710 recptr = XLogInsert(RM_HEAP2_ID, info);
2711
2712 PageSetLSN(page, recptr);
2713 if (all_frozen_set)
2714 {
2715 Assert(BufferIsDirty(vmbuffer));
2716 PageSetLSN(BufferGetPage(vmbuffer), recptr);
2717 }
2718 }
2719
2721
2722 if (all_frozen_set)
2723 LockBuffer(vmbuffer, BUFFER_LOCK_UNLOCK);
2724
2725 UnlockReleaseBuffer(buffer);
2726 ndone += nthispage;
2727
2728 /*
2729 * NB: Only release vmbuffer after inserting all tuples - it's fairly
2730 * likely that we'll insert into subsequent heap pages that are likely
2731 * to use the same vm page.
2732 */
2733 }
2734
2735 /* We're done with inserting all tuples, so release the last vmbuffer. */
2736 if (vmbuffer != InvalidBuffer)
2737 ReleaseBuffer(vmbuffer);
2738
2739 /*
2740 * We're done with the actual inserts. Check for conflicts again, to
2741 * ensure that all rw-conflicts in to these inserts are detected. Without
2742 * this final check, a sequential scan of the heap may have locked the
2743 * table after the "before" check, missing one opportunity to detect the
2744 * conflict, and then scanned the table before the new tuples were there,
2745 * missing the other chance to detect the conflict.
2746 *
2747 * For heap inserts, we only need to check for table-level SSI locks. Our
2748 * new tuples can't possibly conflict with existing tuple locks, and heap
2749 * page locks are only consolidated versions of tuple locks; they do not
2750 * lock "gaps" as index page locks do. So we don't need to specify a
2751 * buffer when making the call.
2752 */
2754
2755 /*
2756 * If tuples are cacheable, mark them for invalidation from the caches in
2757 * case we abort. Note it is OK to do this after releasing the buffer,
2758 * because the heaptuples data structure is all in local memory, not in
2759 * the shared buffer.
2760 */
2761 if (IsCatalogRelation(relation))
2762 {
2763 for (i = 0; i < ntuples; i++)
2765 }
2766
2767 /* copy t_self fields back to the caller's slots */
2768 for (i = 0; i < ntuples; i++)
2769 slots[i]->tts_tid = heaptuples[i]->t_self;
2770
2771 pgstat_count_heap_insert(relation, ntuples);
2772}
2773
2774/*
2775 * simple_heap_insert - insert a tuple
2776 *
2777 * Currently, this routine differs from heap_insert only in supplying
2778 * a default command ID and not allowing access to the speedup options.
2779 *
2780 * This should be used rather than using heap_insert directly in most places
2781 * where we are modifying system catalogs.
2782 */
2783void
2785{
2786 heap_insert(relation, tup, GetCurrentCommandId(true), 0, NULL);
2787}
2788
2789/*
2790 * Given infomask/infomask2, compute the bits that must be saved in the
2791 * "infobits" field of xl_heap_delete, xl_heap_update, xl_heap_lock,
2792 * xl_heap_lock_updated WAL records.
2793 *
2794 * See fix_infomask_from_infobits.
2795 */
2796static uint8
2798{
2799 return
2803 /* note we ignore HEAP_XMAX_SHR_LOCK here */
2805 ((infomask2 & HEAP_KEYS_UPDATED) != 0 ?
2806 XLHL_KEYS_UPDATED : 0);
2807}
2808
2809/*
2810 * Given two versions of the same t_infomask for a tuple, compare them and
2811 * return whether the relevant status for a tuple Xmax has changed. This is
2812 * used after a buffer lock has been released and reacquired: we want to ensure
2813 * that the tuple state continues to be the same it was when we previously
2814 * examined it.
2815 *
2816 * Note the Xmax field itself must be compared separately.
2817 */
2818static inline bool
2820{
2821 const uint16 interesting =
2823
2824 if ((new_infomask & interesting) != (old_infomask & interesting))
2825 return true;
2826
2827 return false;
2828}
2829
2830/*
2831 * heap_delete - delete a tuple
2832 *
2833 * See table_tuple_delete() for an explanation of the parameters, except that
2834 * this routine directly takes a tuple rather than a slot.
2835 *
2836 * In the failure cases, the routine fills *tmfd with the tuple's t_ctid,
2837 * t_xmax (resolving a possible MultiXact, if necessary), and t_cmax (the last
2838 * only for TM_SelfModified, since we cannot obtain cmax from a combo CID
2839 * generated by another transaction).
2840 */
2842heap_delete(Relation relation, const ItemPointerData *tid,
2843 CommandId cid, Snapshot crosscheck, bool wait,
2844 TM_FailureData *tmfd, bool changingPart)
2845{
2846 TM_Result result;
2848 ItemId lp;
2849 HeapTupleData tp;
2850 Page page;
2851 BlockNumber block;
2852 Buffer buffer;
2853 Buffer vmbuffer = InvalidBuffer;
2854 TransactionId new_xmax;
2857 bool have_tuple_lock = false;
2858 bool iscombo;
2859 bool all_visible_cleared = false;
2860 HeapTuple old_key_tuple = NULL; /* replica identity of the tuple */
2861 bool old_key_copied = false;
2862
2864
2865 AssertHasSnapshotForToast(relation);
2866
2867 /*
2868 * Forbid this during a parallel operation, lest it allocate a combo CID.
2869 * Other workers might need that combo CID for visibility checks, and we
2870 * have no provision for broadcasting it to them.
2871 */
2872 if (IsInParallelMode())
2873 ereport(ERROR,
2875 errmsg("cannot delete tuples during a parallel operation")));
2876
2877 block = ItemPointerGetBlockNumber(tid);
2878 buffer = ReadBuffer(relation, block);
2879 page = BufferGetPage(buffer);
2880
2881 /*
2882 * Before locking the buffer, pin the visibility map page if it appears to
2883 * be necessary. Since we haven't got the lock yet, someone else might be
2884 * in the middle of changing this, so we'll need to recheck after we have
2885 * the lock.
2886 */
2887 if (PageIsAllVisible(page))
2888 visibilitymap_pin(relation, block, &vmbuffer);
2889
2891
2894
2895 tp.t_tableOid = RelationGetRelid(relation);
2896 tp.t_data = (HeapTupleHeader) PageGetItem(page, lp);
2897 tp.t_len = ItemIdGetLength(lp);
2898 tp.t_self = *tid;
2899
2900l1:
2901
2902 /*
2903 * If we didn't pin the visibility map page and the page has become all
2904 * visible while we were busy locking the buffer, we'll have to unlock and
2905 * re-lock, to avoid holding the buffer lock across an I/O. That's a bit
2906 * unfortunate, but hopefully shouldn't happen often.
2907 */
2908 if (vmbuffer == InvalidBuffer && PageIsAllVisible(page))
2909 {
2911 visibilitymap_pin(relation, block, &vmbuffer);
2913 }
2914
2915 result = HeapTupleSatisfiesUpdate(&tp, cid, buffer);
2916
2917 if (result == TM_Invisible)
2918 {
2919 UnlockReleaseBuffer(buffer);
2920 ereport(ERROR,
2922 errmsg("attempted to delete invisible tuple")));
2923 }
2924 else if (result == TM_BeingModified && wait)
2925 {
2928
2929 /* must copy state data before unlocking buffer */
2932
2933 /*
2934 * Sleep until concurrent transaction ends -- except when there's a
2935 * single locker and it's our own transaction. Note we don't care
2936 * which lock mode the locker has, because we need the strongest one.
2937 *
2938 * Before sleeping, we need to acquire tuple lock to establish our
2939 * priority for the tuple (see heap_lock_tuple). LockTuple will
2940 * release us when we are next-in-line for the tuple.
2941 *
2942 * If we are forced to "start over" below, we keep the tuple lock;
2943 * this arranges that we stay at the head of the line while rechecking
2944 * tuple state.
2945 */
2947 {
2948 bool current_is_member = false;
2949
2952 {
2954
2955 /*
2956 * Acquire the lock, if necessary (but skip it when we're
2957 * requesting a lock and already have one; avoids deadlock).
2958 */
2959 if (!current_is_member)
2962
2963 /* wait for multixact */
2965 relation, &(tp.t_self), XLTW_Delete,
2966 NULL);
2968
2969 /*
2970 * If xwait had just locked the tuple then some other xact
2971 * could update this tuple before we get to this point. Check
2972 * for xmax change, and start over if so.
2973 *
2974 * We also must start over if we didn't pin the VM page, and
2975 * the page has become all visible.
2976 */
2977 if ((vmbuffer == InvalidBuffer && PageIsAllVisible(page)) ||
2980 xwait))
2981 goto l1;
2982 }
2983
2984 /*
2985 * You might think the multixact is necessarily done here, but not
2986 * so: it could have surviving members, namely our own xact or
2987 * other subxacts of this backend. It is legal for us to delete
2988 * the tuple in either case, however (the latter case is
2989 * essentially a situation of upgrading our former shared lock to
2990 * exclusive). We don't bother changing the on-disk hint bits
2991 * since we are about to overwrite the xmax altogether.
2992 */
2993 }
2995 {
2996 /*
2997 * Wait for regular transaction to end; but first, acquire tuple
2998 * lock.
2999 */
3003 XactLockTableWait(xwait, relation, &(tp.t_self), XLTW_Delete);
3005
3006 /*
3007 * xwait is done, but if xwait had just locked the tuple then some
3008 * other xact could update this tuple before we get to this point.
3009 * Check for xmax change, and start over if so.
3010 *
3011 * We also must start over if we didn't pin the VM page, and the
3012 * page has become all visible.
3013 */
3014 if ((vmbuffer == InvalidBuffer && PageIsAllVisible(page)) ||
3017 xwait))
3018 goto l1;
3019
3020 /* Otherwise check if it committed or aborted */
3021 UpdateXmaxHintBits(tp.t_data, buffer, xwait);
3022 }
3023
3024 /*
3025 * We may overwrite if previous xmax aborted, or if it committed but
3026 * only locked the tuple without updating it.
3027 */
3028 if ((tp.t_data->t_infomask & HEAP_XMAX_INVALID) ||
3031 result = TM_Ok;
3032 else if (!ItemPointerEquals(&tp.t_self, &tp.t_data->t_ctid))
3033 result = TM_Updated;
3034 else
3035 result = TM_Deleted;
3036 }
3037
3038 /* sanity check the result HeapTupleSatisfiesUpdate() and the logic above */
3039 if (result != TM_Ok)
3040 {
3041 Assert(result == TM_SelfModified ||
3042 result == TM_Updated ||
3043 result == TM_Deleted ||
3044 result == TM_BeingModified);
3046 Assert(result != TM_Updated ||
3048 }
3049
3050 if (crosscheck != InvalidSnapshot && result == TM_Ok)
3051 {
3052 /* Perform additional check for transaction-snapshot mode RI updates */
3053 if (!HeapTupleSatisfiesVisibility(&tp, crosscheck, buffer))
3054 result = TM_Updated;
3055 }
3056
3057 if (result != TM_Ok)
3058 {
3059 tmfd->ctid = tp.t_data->t_ctid;
3061 if (result == TM_SelfModified)
3063 else
3064 tmfd->cmax = InvalidCommandId;
3065 UnlockReleaseBuffer(buffer);
3066 if (have_tuple_lock)
3068 if (vmbuffer != InvalidBuffer)
3069 ReleaseBuffer(vmbuffer);
3070 return result;
3071 }
3072
3073 /*
3074 * We're about to do the actual delete -- check for conflict first, to
3075 * avoid possibly having to roll back work we've just done.
3076 *
3077 * This is safe without a recheck as long as there is no possibility of
3078 * another process scanning the page between this check and the delete
3079 * being visible to the scan (i.e., an exclusive buffer content lock is
3080 * continuously held from this point until the tuple delete is visible).
3081 */
3083
3084 /* replace cid with a combo CID if necessary */
3086
3087 /*
3088 * Compute replica identity tuple before entering the critical section so
3089 * we don't PANIC upon a memory allocation failure.
3090 */
3091 old_key_tuple = ExtractReplicaIdentity(relation, &tp, true, &old_key_copied);
3092
3093 /*
3094 * If this is the first possibly-multixact-able operation in the current
3095 * transaction, set my per-backend OldestMemberMXactId setting. We can be
3096 * certain that the transaction will never become a member of any older
3097 * MultiXactIds than that. (We have to do this even if we end up just
3098 * using our own TransactionId below, since some other backend could
3099 * incorporate our XID into a MultiXact immediately afterwards.)
3100 */
3102
3105 xid, LockTupleExclusive, true,
3106 &new_xmax, &new_infomask, &new_infomask2);
3107
3109
3110 /*
3111 * If this transaction commits, the tuple will become DEAD sooner or
3112 * later. Set flag that this page is a candidate for pruning once our xid
3113 * falls below the OldestXmin horizon. If the transaction finally aborts,
3114 * the subsequent page pruning will be a no-op and the hint will be
3115 * cleared.
3116 */
3117 PageSetPrunable(page, xid);
3118
3119 if (PageIsAllVisible(page))
3120 {
3121 all_visible_cleared = true;
3122 PageClearAllVisible(page);
3123 visibilitymap_clear(relation, BufferGetBlockNumber(buffer),
3124 vmbuffer, VISIBILITYMAP_VALID_BITS);
3125 }
3126
3127 /* store transaction information of xact deleting the tuple */
3133 HeapTupleHeaderSetXmax(tp.t_data, new_xmax);
3135 /* Make sure there is no forward chain link in t_ctid */
3136 tp.t_data->t_ctid = tp.t_self;
3137
3138 /* Signal that this is actually a move into another partition */
3139 if (changingPart)
3141
3142 MarkBufferDirty(buffer);
3143
3144 /*
3145 * XLOG stuff
3146 *
3147 * NB: heap_abort_speculative() uses the same xlog record and replay
3148 * routines.
3149 */
3150 if (RelationNeedsWAL(relation))
3151 {
3155
3156 /*
3157 * For logical decode we need combo CIDs to properly decode the
3158 * catalog
3159 */
3161 log_heap_new_cid(relation, &tp);
3162
3163 xlrec.flags = 0;
3166 if (changingPart)
3168 xlrec.infobits_set = compute_infobits(tp.t_data->t_infomask,
3169 tp.t_data->t_infomask2);
3171 xlrec.xmax = new_xmax;
3172
3173 if (old_key_tuple != NULL)
3174 {
3175 if (relation->rd_rel->relreplident == REPLICA_IDENTITY_FULL)
3177 else
3179 }
3180
3183
3185
3186 /*
3187 * Log replica identity of the deleted tuple if there is one
3188 */
3189 if (old_key_tuple != NULL)
3190 {
3191 xlhdr.t_infomask2 = old_key_tuple->t_data->t_infomask2;
3192 xlhdr.t_infomask = old_key_tuple->t_data->t_infomask;
3193 xlhdr.t_hoff = old_key_tuple->t_data->t_hoff;
3194
3196 XLogRegisterData((char *) old_key_tuple->t_data
3198 old_key_tuple->t_len
3200 }
3201
3202 /* filtering by origin on a row level is much more efficient */
3204
3206
3207 PageSetLSN(page, recptr);
3208 }
3209
3211
3213
3214 if (vmbuffer != InvalidBuffer)
3215 ReleaseBuffer(vmbuffer);
3216
3217 /*
3218 * If the tuple has toasted out-of-line attributes, we need to delete
3219 * those items too. We have to do this before releasing the buffer
3220 * because we need to look at the contents of the tuple, but it's OK to
3221 * release the content lock on the buffer first.
3222 */
3223 if (relation->rd_rel->relkind != RELKIND_RELATION &&
3224 relation->rd_rel->relkind != RELKIND_MATVIEW)
3225 {
3226 /* toast table entries should never be recursively toasted */
3228 }
3229 else if (HeapTupleHasExternal(&tp))
3230 heap_toast_delete(relation, &tp, false);
3231
3232 /*
3233 * Mark tuple for invalidation from system caches at next command
3234 * boundary. We have to do this before releasing the buffer because we
3235 * need to look at the contents of the tuple.
3236 */
3237 CacheInvalidateHeapTuple(relation, &tp, NULL);
3238
3239 /* Now we can release the buffer */
3240 ReleaseBuffer(buffer);
3241
3242 /*
3243 * Release the lmgr tuple lock, if we had it.
3244 */
3245 if (have_tuple_lock)
3247
3248 pgstat_count_heap_delete(relation);
3249
3252
3253 return TM_Ok;
3254}
3255
3256/*
3257 * simple_heap_delete - delete a tuple
3258 *
3259 * This routine may be used to delete a tuple when concurrent updates of
3260 * the target tuple are not expected (for example, because we have a lock
3261 * on the relation associated with the tuple). Any failure is reported
3262 * via ereport().
3263 */
3264void
3265simple_heap_delete(Relation relation, const ItemPointerData *tid)
3266{
3267 TM_Result result;
3268 TM_FailureData tmfd;
3269
3270 result = heap_delete(relation, tid,
3272 true /* wait for commit */ ,
3273 &tmfd, false /* changingPart */ );
3274 switch (result)
3275 {
3276 case TM_SelfModified:
3277 /* Tuple was already updated in current command? */
3278 elog(ERROR, "tuple already updated by self");
3279 break;
3280
3281 case TM_Ok:
3282 /* done successfully */
3283 break;
3284
3285 case TM_Updated:
3286 elog(ERROR, "tuple concurrently updated");
3287 break;
3288
3289 case TM_Deleted:
3290 elog(ERROR, "tuple concurrently deleted");
3291 break;
3292
3293 default:
3294 elog(ERROR, "unrecognized heap_delete status: %u", result);
3295 break;
3296 }
3297}
3298
3299/*
3300 * heap_update - replace a tuple
3301 *
3302 * See table_tuple_update() for an explanation of the parameters, except that
3303 * this routine directly takes a tuple rather than a slot.
3304 *
3305 * In the failure cases, the routine fills *tmfd with the tuple's t_ctid,
3306 * t_xmax (resolving a possible MultiXact, if necessary), and t_cmax (the last
3307 * only for TM_SelfModified, since we cannot obtain cmax from a combo CID
3308 * generated by another transaction).
3309 */
3312 CommandId cid, Snapshot crosscheck, bool wait,
3313 TM_FailureData *tmfd, LockTupleMode *lockmode,
3315{
3316 TM_Result result;
3324 ItemId lp;
3328 bool old_key_copied = false;
3329 Page page;
3330 BlockNumber block;
3332 Buffer buffer,
3333 newbuf,
3334 vmbuffer = InvalidBuffer,
3336 bool need_toast;
3338 pagefree;
3339 bool have_tuple_lock = false;
3340 bool iscombo;
3341 bool use_hot_update = false;
3342 bool summarized_update = false;
3343 bool key_intact;
3344 bool all_visible_cleared = false;
3345 bool all_visible_cleared_new = false;
3346 bool checked_lockers;
3347 bool locker_remains;
3348 bool id_has_external = false;
3355
3357
3358 /* Cheap, simplistic check that the tuple matches the rel's rowtype. */
3361
3362 AssertHasSnapshotForToast(relation);
3363
3364 /*
3365 * Forbid this during a parallel operation, lest it allocate a combo CID.
3366 * Other workers might need that combo CID for visibility checks, and we
3367 * have no provision for broadcasting it to them.
3368 */
3369 if (IsInParallelMode())
3370 ereport(ERROR,
3372 errmsg("cannot update tuples during a parallel operation")));
3373
3374#ifdef USE_ASSERT_CHECKING
3376#endif
3377
3378 /*
3379 * Fetch the list of attributes to be checked for various operations.
3380 *
3381 * For HOT considerations, this is wasted effort if we fail to update or
3382 * have to put the new tuple on a different page. But we must compute the
3383 * list before obtaining buffer lock --- in the worst case, if we are
3384 * doing an update on one of the relevant system catalogs, we could
3385 * deadlock if we try to fetch the list later. In any case, the relcache
3386 * caches the data so this is usually pretty cheap.
3387 *
3388 * We also need columns used by the replica identity and columns that are
3389 * considered the "key" of rows in the table.
3390 *
3391 * Note that we get copies of each bitmap, so we need not worry about
3392 * relcache flush happening midway through.
3393 */
3406
3408 INJECTION_POINT("heap_update-before-pin", NULL);
3409 buffer = ReadBuffer(relation, block);
3410 page = BufferGetPage(buffer);
3411
3412 /*
3413 * Before locking the buffer, pin the visibility map page if it appears to
3414 * be necessary. Since we haven't got the lock yet, someone else might be
3415 * in the middle of changing this, so we'll need to recheck after we have
3416 * the lock.
3417 */
3418 if (PageIsAllVisible(page))
3419 visibilitymap_pin(relation, block, &vmbuffer);
3420
3422
3424
3425 /*
3426 * Usually, a buffer pin and/or snapshot blocks pruning of otid, ensuring
3427 * we see LP_NORMAL here. When the otid origin is a syscache, we may have
3428 * neither a pin nor a snapshot. Hence, we may see other LP_ states, each
3429 * of which indicates concurrent pruning.
3430 *
3431 * Failing with TM_Updated would be most accurate. However, unlike other
3432 * TM_Updated scenarios, we don't know the successor ctid in LP_UNUSED and
3433 * LP_DEAD cases. While the distinction between TM_Updated and TM_Deleted
3434 * does matter to SQL statements UPDATE and MERGE, those SQL statements
3435 * hold a snapshot that ensures LP_NORMAL. Hence, the choice between
3436 * TM_Updated and TM_Deleted affects only the wording of error messages.
3437 * Settle on TM_Deleted, for two reasons. First, it avoids complicating
3438 * the specification of when tmfd->ctid is valid. Second, it creates
3439 * error log evidence that we took this branch.
3440 *
3441 * Since it's possible to see LP_UNUSED at otid, it's also possible to see
3442 * LP_NORMAL for a tuple that replaced LP_UNUSED. If it's a tuple for an
3443 * unrelated row, we'll fail with "duplicate key value violates unique".
3444 * XXX if otid is the live, newer version of the newtup row, we'll discard
3445 * changes originating in versions of this catalog row after the version
3446 * the caller got from syscache. See syscache-update-pruned.spec.
3447 */
3448 if (!ItemIdIsNormal(lp))
3449 {
3451
3452 UnlockReleaseBuffer(buffer);
3454 if (vmbuffer != InvalidBuffer)
3455 ReleaseBuffer(vmbuffer);
3456 tmfd->ctid = *otid;
3457 tmfd->xmax = InvalidTransactionId;
3458 tmfd->cmax = InvalidCommandId;
3460
3465 /* modified_attrs not yet initialized */
3467 return TM_Deleted;
3468 }
3469
3470 /*
3471 * Fill in enough data in oldtup for HeapDetermineColumnsInfo to work
3472 * properly.
3473 */
3474 oldtup.t_tableOid = RelationGetRelid(relation);
3475 oldtup.t_data = (HeapTupleHeader) PageGetItem(page, lp);
3476 oldtup.t_len = ItemIdGetLength(lp);
3477 oldtup.t_self = *otid;
3478
3479 /* the new tuple is ready, except for this: */
3480 newtup->t_tableOid = RelationGetRelid(relation);
3481
3482 /*
3483 * Determine columns modified by the update. Additionally, identify
3484 * whether any of the unmodified replica identity key attributes in the
3485 * old tuple is externally stored or not. This is required because for
3486 * such attributes the flattened value won't be WAL logged as part of the
3487 * new tuple so we must include it as part of the old_key_tuple. See
3488 * ExtractReplicaIdentity.
3489 */
3491 id_attrs, &oldtup,
3493
3494 /*
3495 * If we're not updating any "key" column, we can grab a weaker lock type.
3496 * This allows for more concurrency when we are running simultaneously
3497 * with foreign key checks.
3498 *
3499 * Note that if a column gets detoasted while executing the update, but
3500 * the value ends up being the same, this test will fail and we will use
3501 * the stronger lock. This is acceptable; the important case to optimize
3502 * is updates that don't manipulate key columns, not those that
3503 * serendipitously arrive at the same key values.
3504 */
3506 {
3507 *lockmode = LockTupleNoKeyExclusive;
3509 key_intact = true;
3510
3511 /*
3512 * If this is the first possibly-multixact-able operation in the
3513 * current transaction, set my per-backend OldestMemberMXactId
3514 * setting. We can be certain that the transaction will never become a
3515 * member of any older MultiXactIds than that. (We have to do this
3516 * even if we end up just using our own TransactionId below, since
3517 * some other backend could incorporate our XID into a MultiXact
3518 * immediately afterwards.)
3519 */
3521 }
3522 else
3523 {
3524 *lockmode = LockTupleExclusive;
3526 key_intact = false;
3527 }
3528
3529 /*
3530 * Note: beyond this point, use oldtup not otid to refer to old tuple.
3531 * otid may very well point at newtup->t_self, which we will overwrite
3532 * with the new tuple's location, so there's great risk of confusion if we
3533 * use otid anymore.
3534 */
3535
3536l2:
3537 checked_lockers = false;
3538 locker_remains = false;
3539 result = HeapTupleSatisfiesUpdate(&oldtup, cid, buffer);
3540
3541 /* see below about the "no wait" case */
3542 Assert(result != TM_BeingModified || wait);
3543
3544 if (result == TM_Invisible)
3545 {
3546 UnlockReleaseBuffer(buffer);
3547 ereport(ERROR,
3549 errmsg("attempted to update invisible tuple")));
3550 }
3551 else if (result == TM_BeingModified && wait)
3552 {
3555 bool can_continue = false;
3556
3557 /*
3558 * XXX note that we don't consider the "no wait" case here. This
3559 * isn't a problem currently because no caller uses that case, but it
3560 * should be fixed if such a caller is introduced. It wasn't a
3561 * problem previously because this code would always wait, but now
3562 * that some tuple locks do not conflict with one of the lock modes we
3563 * use, it is possible that this case is interesting to handle
3564 * specially.
3565 *
3566 * This may cause failures with third-party code that calls
3567 * heap_update directly.
3568 */
3569
3570 /* must copy state data before unlocking buffer */
3572 infomask = oldtup.t_data->t_infomask;
3573
3574 /*
3575 * Now we have to do something about the existing locker. If it's a
3576 * multi, sleep on it; we might be awakened before it is completely
3577 * gone (or even not sleep at all in some cases); we need to preserve
3578 * it as locker, unless it is gone completely.
3579 *
3580 * If it's not a multi, we need to check for sleeping conditions
3581 * before actually going to sleep. If the update doesn't conflict
3582 * with the locks, we just continue without sleeping (but making sure
3583 * it is preserved).
3584 *
3585 * Before sleeping, we need to acquire tuple lock to establish our
3586 * priority for the tuple (see heap_lock_tuple). LockTuple will
3587 * release us when we are next-in-line for the tuple. Note we must
3588 * not acquire the tuple lock until we're sure we're going to sleep;
3589 * otherwise we're open for race conditions with other transactions
3590 * holding the tuple lock which sleep on us.
3591 *
3592 * If we are forced to "start over" below, we keep the tuple lock;
3593 * this arranges that we stay at the head of the line while rechecking
3594 * tuple state.
3595 */
3597 {
3599 int remain;
3600 bool current_is_member = false;
3601
3603 *lockmode, &current_is_member))
3604 {
3606
3607 /*
3608 * Acquire the lock, if necessary (but skip it when we're
3609 * requesting a lock and already have one; avoids deadlock).
3610 */
3611 if (!current_is_member)
3612 heap_acquire_tuplock(relation, &(oldtup.t_self), *lockmode,
3614
3615 /* wait for multixact */
3617 relation, &oldtup.t_self, XLTW_Update,
3618 &remain);
3619 checked_lockers = true;
3620 locker_remains = remain != 0;
3622
3623 /*
3624 * If xwait had just locked the tuple then some other xact
3625 * could update this tuple before we get to this point. Check
3626 * for xmax change, and start over if so.
3627 */
3628 if (xmax_infomask_changed(oldtup.t_data->t_infomask,
3629 infomask) ||
3631 xwait))
3632 goto l2;
3633 }
3634
3635 /*
3636 * Note that the multixact may not be done by now. It could have
3637 * surviving members; our own xact or other subxacts of this
3638 * backend, and also any other concurrent transaction that locked
3639 * the tuple with LockTupleKeyShare if we only got
3640 * LockTupleNoKeyExclusive. If this is the case, we have to be
3641 * careful to mark the updated tuple with the surviving members in
3642 * Xmax.
3643 *
3644 * Note that there could have been another update in the
3645 * MultiXact. In that case, we need to check whether it committed
3646 * or aborted. If it aborted we are safe to update it again;
3647 * otherwise there is an update conflict, and we have to return
3648 * TableTuple{Deleted, Updated} below.
3649 *
3650 * In the LockTupleExclusive case, we still need to preserve the
3651 * surviving members: those would include the tuple locks we had
3652 * before this one, which are important to keep in case this
3653 * subxact aborts.
3654 */
3655 if (!HEAP_XMAX_IS_LOCKED_ONLY(oldtup.t_data->t_infomask))
3657 else
3659
3660 /*
3661 * There was no UPDATE in the MultiXact; or it aborted. No
3662 * TransactionIdIsInProgress() call needed here, since we called
3663 * MultiXactIdWait() above.
3664 */
3667 can_continue = true;
3668 }
3670 {
3671 /*
3672 * The only locker is ourselves; we can avoid grabbing the tuple
3673 * lock here, but must preserve our locking information.
3674 */
3675 checked_lockers = true;
3676 locker_remains = true;
3677 can_continue = true;
3678 }
3680 {
3681 /*
3682 * If it's just a key-share locker, and we're not changing the key
3683 * columns, we don't need to wait for it to end; but we need to
3684 * preserve it as locker.
3685 */
3686 checked_lockers = true;
3687 locker_remains = true;
3688 can_continue = true;
3689 }
3690 else
3691 {
3692 /*
3693 * Wait for regular transaction to end; but first, acquire tuple
3694 * lock.
3695 */
3697 heap_acquire_tuplock(relation, &(oldtup.t_self), *lockmode,
3699 XactLockTableWait(xwait, relation, &oldtup.t_self,
3700 XLTW_Update);
3701 checked_lockers = true;
3703
3704 /*
3705 * xwait is done, but if xwait had just locked the tuple then some
3706 * other xact could update this tuple before we get to this point.
3707 * Check for xmax change, and start over if so.
3708 */
3709 if (xmax_infomask_changed(oldtup.t_data->t_infomask, infomask) ||
3712 goto l2;
3713
3714 /* Otherwise check if it committed or aborted */
3715 UpdateXmaxHintBits(oldtup.t_data, buffer, xwait);
3716 if (oldtup.t_data->t_infomask & HEAP_XMAX_INVALID)
3717 can_continue = true;
3718 }
3719
3720 if (can_continue)
3721 result = TM_Ok;
3722 else if (!ItemPointerEquals(&oldtup.t_self, &oldtup.t_data->t_ctid))
3723 result = TM_Updated;
3724 else
3725 result = TM_Deleted;
3726 }
3727
3728 /* Sanity check the result HeapTupleSatisfiesUpdate() and the logic above */
3729 if (result != TM_Ok)
3730 {
3731 Assert(result == TM_SelfModified ||
3732 result == TM_Updated ||
3733 result == TM_Deleted ||
3734 result == TM_BeingModified);
3735 Assert(!(oldtup.t_data->t_infomask & HEAP_XMAX_INVALID));
3736 Assert(result != TM_Updated ||
3737 !ItemPointerEquals(&oldtup.t_self, &oldtup.t_data->t_ctid));
3738 }
3739
3740 if (crosscheck != InvalidSnapshot && result == TM_Ok)
3741 {
3742 /* Perform additional check for transaction-snapshot mode RI updates */
3744 result = TM_Updated;
3745 }
3746
3747 if (result != TM_Ok)
3748 {
3749 tmfd->ctid = oldtup.t_data->t_ctid;
3750 tmfd->xmax = HeapTupleHeaderGetUpdateXid(oldtup.t_data);
3751 if (result == TM_SelfModified)
3752 tmfd->cmax = HeapTupleHeaderGetCmax(oldtup.t_data);
3753 else
3754 tmfd->cmax = InvalidCommandId;
3755 UnlockReleaseBuffer(buffer);
3756 if (have_tuple_lock)
3757 UnlockTupleTuplock(relation, &(oldtup.t_self), *lockmode);
3758 if (vmbuffer != InvalidBuffer)
3759 ReleaseBuffer(vmbuffer);
3761
3768 return result;
3769 }
3770
3771 /*
3772 * If we didn't pin the visibility map page and the page has become all
3773 * visible while we were busy locking the buffer, or during some
3774 * subsequent window during which we had it unlocked, we'll have to unlock
3775 * and re-lock, to avoid holding the buffer lock across an I/O. That's a
3776 * bit unfortunate, especially since we'll now have to recheck whether the
3777 * tuple has been locked or updated under us, but hopefully it won't
3778 * happen very often.
3779 */
3780 if (vmbuffer == InvalidBuffer && PageIsAllVisible(page))
3781 {
3783 visibilitymap_pin(relation, block, &vmbuffer);
3785 goto l2;
3786 }
3787
3788 /* Fill in transaction status data */
3789
3790 /*
3791 * If the tuple we're updating is locked, we need to preserve the locking
3792 * info in the old tuple's Xmax. Prepare a new Xmax value for this.
3793 */
3795 oldtup.t_data->t_infomask,
3796 oldtup.t_data->t_infomask2,
3797 xid, *lockmode, true,
3800
3801 /*
3802 * And also prepare an Xmax value for the new copy of the tuple. If there
3803 * was no xmax previously, or there was one but all lockers are now gone,
3804 * then use InvalidTransactionId; otherwise, get the xmax from the old
3805 * tuple. (In rare cases that might also be InvalidTransactionId and yet
3806 * not have the HEAP_XMAX_INVALID bit set; that's fine.)
3807 */
3808 if ((oldtup.t_data->t_infomask & HEAP_XMAX_INVALID) ||
3809 HEAP_LOCKED_UPGRADED(oldtup.t_data->t_infomask) ||
3812 else
3814
3816 {
3819 }
3820 else
3821 {
3822 /*
3823 * If we found a valid Xmax for the new tuple, then the infomask bits
3824 * to use on the new tuple depend on what was there on the old one.
3825 * Note that since we're doing an update, the only possibility is that
3826 * the lockers had FOR KEY SHARE lock.
3827 */
3828 if (oldtup.t_data->t_infomask & HEAP_XMAX_IS_MULTI)
3829 {
3832 }
3833 else
3834 {
3837 }
3838 }
3839
3840 /*
3841 * Prepare the new tuple with the appropriate initial values of Xmin and
3842 * Xmax, as well as initial infomask bits as computed above.
3843 */
3844 newtup->t_data->t_infomask &= ~(HEAP_XACT_MASK);
3845 newtup->t_data->t_infomask2 &= ~(HEAP2_XACT_MASK);
3846 HeapTupleHeaderSetXmin(newtup->t_data, xid);
3848 newtup->t_data->t_infomask |= HEAP_UPDATED | infomask_new_tuple;
3849 newtup->t_data->t_infomask2 |= infomask2_new_tuple;
3851
3852 /*
3853 * Replace cid with a combo CID if necessary. Note that we already put
3854 * the plain cid into the new tuple.
3855 */
3857
3858 /*
3859 * If the toaster needs to be activated, OR if the new tuple will not fit
3860 * on the same page as the old, then we need to release the content lock
3861 * (but not the pin!) on the old tuple's buffer while we are off doing
3862 * TOAST and/or table-file-extension work. We must mark the old tuple to
3863 * show that it's locked, else other processes may try to update it
3864 * themselves.
3865 *
3866 * We need to invoke the toaster if there are already any out-of-line
3867 * toasted values present, or if the new tuple is over-threshold.
3868 */
3869 if (relation->rd_rel->relkind != RELKIND_RELATION &&
3870 relation->rd_rel->relkind != RELKIND_MATVIEW)
3871 {
3872 /* toast table entries should never be recursively toasted */
3875 need_toast = false;
3876 }
3877 else
3880 newtup->t_len > TOAST_TUPLE_THRESHOLD);
3881
3883
3884 newtupsize = MAXALIGN(newtup->t_len);
3885
3887 {
3891 bool cleared_all_frozen = false;
3892
3893 /*
3894 * To prevent concurrent sessions from updating the tuple, we have to
3895 * temporarily mark it locked, while we release the page-level lock.
3896 *
3897 * To satisfy the rule that any xid potentially appearing in a buffer
3898 * written out to disk, we unfortunately have to WAL log this
3899 * temporary modification. We can reuse xl_heap_lock for this
3900 * purpose. If we crash/error before following through with the
3901 * actual update, xmax will be of an aborted transaction, allowing
3902 * other sessions to proceed.
3903 */
3904
3905 /*
3906 * Compute xmax / infomask appropriate for locking the tuple. This has
3907 * to be done separately from the combo that's going to be used for
3908 * updating, because the potentially created multixact would otherwise
3909 * be wrong.
3910 */
3912 oldtup.t_data->t_infomask,
3913 oldtup.t_data->t_infomask2,
3914 xid, *lockmode, false,
3917
3919
3921
3922 /* Clear obsolete visibility flags ... */
3923 oldtup.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
3924 oldtup.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
3926 /* ... and store info about transaction updating this tuple */
3929 oldtup.t_data->t_infomask |= infomask_lock_old_tuple;
3930 oldtup.t_data->t_infomask2 |= infomask2_lock_old_tuple;
3932
3933 /* temporarily make it look not-updated, but locked */
3934 oldtup.t_data->t_ctid = oldtup.t_self;
3935
3936 /*
3937 * Clear all-frozen bit on visibility map if needed. We could
3938 * immediately reset ALL_VISIBLE, but given that the WAL logging
3939 * overhead would be unchanged, that doesn't seem necessarily
3940 * worthwhile.
3941 */
3942 if (PageIsAllVisible(page) &&
3943 visibilitymap_clear(relation, block, vmbuffer,
3945 cleared_all_frozen = true;
3946
3947 MarkBufferDirty(buffer);
3948
3949 if (RelationNeedsWAL(relation))
3950 {
3953
3956
3957 xlrec.offnum = ItemPointerGetOffsetNumber(&oldtup.t_self);
3959 xlrec.infobits_set = compute_infobits(oldtup.t_data->t_infomask,
3960 oldtup.t_data->t_infomask2);
3961 xlrec.flags =
3965 PageSetLSN(page, recptr);
3966 }
3967
3969
3971
3972 /*
3973 * Let the toaster do its thing, if needed.
3974 *
3975 * Note: below this point, heaptup is the data we actually intend to
3976 * store into the relation; newtup is the caller's original untoasted
3977 * data.
3978 */
3979 if (need_toast)
3980 {
3981 /* Note we always use WAL and FSM during updates */
3983 newtupsize = MAXALIGN(heaptup->t_len);
3984 }
3985 else
3986 heaptup = newtup;
3987
3988 /*
3989 * Now, do we need a new page for the tuple, or not? This is a bit
3990 * tricky since someone else could have added tuples to the page while
3991 * we weren't looking. We have to recheck the available space after
3992 * reacquiring the buffer lock. But don't bother to do that if the
3993 * former amount of free space is still not enough; it's unlikely
3994 * there's more free now than before.
3995 *
3996 * What's more, if we need to get a new page, we will need to acquire
3997 * buffer locks on both old and new pages. To avoid deadlock against
3998 * some other backend trying to get the same two locks in the other
3999 * order, we must be consistent about the order we get the locks in.
4000 * We use the rule "lock the lower-numbered page of the relation
4001 * first". To implement this, we must do RelationGetBufferForTuple
4002 * while not holding the lock on the old page, and we must rely on it
4003 * to get the locks on both pages in the correct order.
4004 *
4005 * Another consideration is that we need visibility map page pin(s) if
4006 * we will have to clear the all-visible flag on either page. If we
4007 * call RelationGetBufferForTuple, we rely on it to acquire any such
4008 * pins; but if we don't, we have to handle that here. Hence we need
4009 * a loop.
4010 */
4011 for (;;)
4012 {
4013 if (newtupsize > pagefree)
4014 {
4015 /* It doesn't fit, must use RelationGetBufferForTuple. */
4016 newbuf = RelationGetBufferForTuple(relation, heaptup->t_len,
4017 buffer, 0, NULL,
4018 &vmbuffer_new, &vmbuffer,
4019 0);
4020 /* We're all done. */
4021 break;
4022 }
4023 /* Acquire VM page pin if needed and we don't have it. */
4024 if (vmbuffer == InvalidBuffer && PageIsAllVisible(page))
4025 visibilitymap_pin(relation, block, &vmbuffer);
4026 /* Re-acquire the lock on the old tuple's page. */
4028 /* Re-check using the up-to-date free space */
4030 if (newtupsize > pagefree ||
4031 (vmbuffer == InvalidBuffer && PageIsAllVisible(page)))
4032 {
4033 /*
4034 * Rats, it doesn't fit anymore, or somebody just now set the
4035 * all-visible flag. We must now unlock and loop to avoid
4036 * deadlock. Fortunately, this path should seldom be taken.
4037 */
4039 }
4040 else
4041 {
4042 /* We're all done. */
4043 newbuf = buffer;
4044 break;
4045 }
4046 }
4047 }
4048 else
4049 {
4050 /* No TOAST work needed, and it'll fit on same page */
4051 newbuf = buffer;
4052 heaptup = newtup;
4053 }
4054
4055 /*
4056 * We're about to do the actual update -- check for conflict first, to
4057 * avoid possibly having to roll back work we've just done.
4058 *
4059 * This is safe without a recheck as long as there is no possibility of
4060 * another process scanning the pages between this check and the update
4061 * being visible to the scan (i.e., exclusive buffer content lock(s) are
4062 * continuously held from this point until the tuple update is visible).
4063 *
4064 * For the new tuple the only check needed is at the relation level, but
4065 * since both tuples are in the same relation and the check for oldtup
4066 * will include checking the relation level, there is no benefit to a
4067 * separate check for the new tuple.
4068 */
4069 CheckForSerializableConflictIn(relation, &oldtup.t_self,
4070 BufferGetBlockNumber(buffer));
4071
4072 /*
4073 * At this point newbuf and buffer are both pinned and locked, and newbuf
4074 * has enough space for the new tuple. If they are the same buffer, only
4075 * one pin is held.
4076 */
4077
4078 if (newbuf == buffer)
4079 {
4080 /*
4081 * Since the new tuple is going into the same page, we might be able
4082 * to do a HOT update. Check if any of the index columns have been
4083 * changed.
4084 */
4086 {
4087 use_hot_update = true;
4088
4089 /*
4090 * If none of the columns that are used in hot-blocking indexes
4091 * were updated, we can apply HOT, but we do still need to check
4092 * if we need to update the summarizing indexes, and update those
4093 * indexes if the columns were updated, or we may fail to detect
4094 * e.g. value bound changes in BRIN minmax indexes.
4095 */
4097 summarized_update = true;
4098 }
4099 }
4100 else
4101 {
4102 /* Set a hint that the old page could use prune/defrag */
4103 PageSetFull(page);
4104 }
4105
4106 /*
4107 * Compute replica identity tuple before entering the critical section so
4108 * we don't PANIC upon a memory allocation failure.
4109 * ExtractReplicaIdentity() will return NULL if nothing needs to be
4110 * logged. Pass old key required as true only if the replica identity key
4111 * columns are modified or it has external data.
4112 */
4117
4118 /* NO EREPORT(ERROR) from here till changes are logged */
4120
4121 /*
4122 * If this transaction commits, the old tuple will become DEAD sooner or
4123 * later. Set flag that this page is a candidate for pruning once our xid
4124 * falls below the OldestXmin horizon. If the transaction finally aborts,
4125 * the subsequent page pruning will be a no-op and the hint will be
4126 * cleared.
4127 *
4128 * XXX Should we set hint on newbuf as well? If the transaction aborts,
4129 * there would be a prunable tuple in the newbuf; but for now we choose
4130 * not to optimize for aborts. Note that heap_xlog_update must be kept in
4131 * sync if this decision changes.
4132 */
4133 PageSetPrunable(page, xid);
4134
4135 if (use_hot_update)
4136 {
4137 /* Mark the old tuple as HOT-updated */
4139 /* And mark the new tuple as heap-only */
4141 /* Mark the caller's copy too, in case different from heaptup */
4143 }
4144 else
4145 {
4146 /* Make sure tuples are correctly marked as not-HOT */
4150 }
4151
4152 RelationPutHeapTuple(relation, newbuf, heaptup, false); /* insert new tuple */
4153
4154
4155 /* Clear obsolete visibility flags, possibly set by ourselves above... */
4156 oldtup.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
4157 oldtup.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
4158 /* ... and store info about transaction updating this tuple */
4161 oldtup.t_data->t_infomask |= infomask_old_tuple;
4162 oldtup.t_data->t_infomask2 |= infomask2_old_tuple;
4164
4165 /* record address of new tuple in t_ctid of old one */
4166 oldtup.t_data->t_ctid = heaptup->t_self;
4167
4168 /* clear PD_ALL_VISIBLE flags, reset all visibilitymap bits */
4169 if (PageIsAllVisible(BufferGetPage(buffer)))
4170 {
4171 all_visible_cleared = true;
4173 visibilitymap_clear(relation, BufferGetBlockNumber(buffer),
4174 vmbuffer, VISIBILITYMAP_VALID_BITS);
4175 }
4176 if (newbuf != buffer && PageIsAllVisible(BufferGetPage(newbuf)))
4177 {
4182 }
4183
4184 if (newbuf != buffer)
4186 MarkBufferDirty(buffer);
4187
4188 /* XLOG stuff */
4189 if (RelationNeedsWAL(relation))
4190 {
4192
4193 /*
4194 * For logical decoding we need combo CIDs to properly decode the
4195 * catalog.
4196 */
4198 {
4199 log_heap_new_cid(relation, &oldtup);
4200 log_heap_new_cid(relation, heaptup);
4201 }
4202
4203 recptr = log_heap_update(relation, buffer,
4208 if (newbuf != buffer)
4209 {
4211 }
4213 }
4214
4216
4217 if (newbuf != buffer)
4220
4221 /*
4222 * Mark old tuple for invalidation from system caches at next command
4223 * boundary, and mark the new tuple for invalidation in case we abort. We
4224 * have to do this before releasing the buffer because oldtup is in the
4225 * buffer. (heaptup is all in local memory, but it's necessary to process
4226 * both tuple versions in one call to inval.c so we can avoid redundant
4227 * sinval messages.)
4228 */
4230
4231 /* Now we can release the buffer(s) */
4232 if (newbuf != buffer)
4234 ReleaseBuffer(buffer);
4237 if (BufferIsValid(vmbuffer))
4238 ReleaseBuffer(vmbuffer);
4239
4240 /*
4241 * Release the lmgr tuple lock, if we had it.
4242 */
4243 if (have_tuple_lock)
4244 UnlockTupleTuplock(relation, &(oldtup.t_self), *lockmode);
4245
4246 pgstat_count_heap_update(relation, use_hot_update, newbuf != buffer);
4247
4248 /*
4249 * If heaptup is a private copy, release it. Don't forget to copy t_self
4250 * back to the caller's image, too.
4251 */
4252 if (heaptup != newtup)
4253 {
4254 newtup->t_self = heaptup->t_self;
4256 }
4257
4258 /*
4259 * If it is a HOT update, the update may still need to update summarized
4260 * indexes, lest we fail to update those summaries and get incorrect
4261 * results (for example, minmax bounds of the block may change with this
4262 * update).
4263 */
4264 if (use_hot_update)
4265 {
4268 else
4270 }
4271 else
4273
4276
4283
4284 return TM_Ok;
4285}
4286
4287#ifdef USE_ASSERT_CHECKING
4288/*
4289 * Confirm adequate lock held during heap_update(), per rules from
4290 * README.tuplock section "Locking to write inplace-updated tables".
4291 */
4292static void
4294 const ItemPointerData *otid,
4296{
4297 /* LOCKTAG_TUPLE acceptable for any catalog */
4298 switch (RelationGetRelid(relation))
4299 {
4300 case RelationRelationId:
4301 case DatabaseRelationId:
4302 {
4304
4306 relation->rd_lockInfo.lockRelId.dbId,
4307 relation->rd_lockInfo.lockRelId.relId,
4311 return;
4312 }
4313 break;
4314 default:
4315 Assert(!IsInplaceUpdateRelation(relation));
4316 return;
4317 }
4318
4319 switch (RelationGetRelid(relation))
4320 {
4321 case RelationRelationId:
4322 {
4323 /* LOCKTAG_TUPLE or LOCKTAG_RELATION ok */
4325 Oid relid = classForm->oid;
4326 Oid dbid;
4327 LOCKTAG tag;
4328
4329 if (IsSharedRelation(relid))
4330 dbid = InvalidOid;
4331 else
4332 dbid = MyDatabaseId;
4333
4334 if (classForm->relkind == RELKIND_INDEX)
4335 {
4336 Relation irel = index_open(relid, AccessShareLock);
4337
4338 SET_LOCKTAG_RELATION(tag, dbid, irel->rd_index->indrelid);
4340 }
4341 else
4342 SET_LOCKTAG_RELATION(tag, dbid, relid);
4343
4344 if (!LockHeldByMe(&tag, ShareUpdateExclusiveLock, false) &&
4345 !LockHeldByMe(&tag, ShareRowExclusiveLock, true))
4346 elog(WARNING,
4347 "missing lock for relation \"%s\" (OID %u, relkind %c) @ TID (%u,%u)",
4348 NameStr(classForm->relname),
4349 relid,
4350 classForm->relkind,
4353 }
4354 break;
4355 case DatabaseRelationId:
4356 {
4357 /* LOCKTAG_TUPLE required */
4359
4360 elog(WARNING,
4361 "missing lock on database \"%s\" (OID %u) @ TID (%u,%u)",
4362 NameStr(dbForm->datname),
4363 dbForm->oid,
4366 }
4367 break;
4368 }
4369}
4370
4371/*
4372 * Confirm adequate relation lock held, per rules from README.tuplock section
4373 * "Locking to write inplace-updated tables".
4374 */
4375static void
4377{
4379 Oid relid = classForm->oid;
4380 Oid dbid;
4381 LOCKTAG tag;
4382
4383 if (IsSharedRelation(relid))
4384 dbid = InvalidOid;
4385 else
4386 dbid = MyDatabaseId;
4387
4388 if (classForm->relkind == RELKIND_INDEX)
4389 {
4390 Relation irel = index_open(relid, AccessShareLock);
4391
4392 SET_LOCKTAG_RELATION(tag, dbid, irel->rd_index->indrelid);
4394 }
4395 else
4396 SET_LOCKTAG_RELATION(tag, dbid, relid);
4397
4398 if (!LockHeldByMe(&tag, ShareUpdateExclusiveLock, true))
4399 elog(WARNING,
4400 "missing lock for relation \"%s\" (OID %u, relkind %c) @ TID (%u,%u)",
4401 NameStr(classForm->relname),
4402 relid,
4403 classForm->relkind,
4406}
4407#endif
4408
4409/*
4410 * Check if the specified attribute's values are the same. Subroutine for
4411 * HeapDetermineColumnsInfo.
4412 */
4413static bool
4414heap_attr_equals(TupleDesc tupdesc, int attrnum, Datum value1, Datum value2,
4415 bool isnull1, bool isnull2)
4416{
4417 /*
4418 * If one value is NULL and other is not, then they are certainly not
4419 * equal
4420 */
4421 if (isnull1 != isnull2)
4422 return false;
4423
4424 /*
4425 * If both are NULL, they can be considered equal.
4426 */
4427 if (isnull1)
4428 return true;
4429
4430 /*
4431 * We do simple binary comparison of the two datums. This may be overly
4432 * strict because there can be multiple binary representations for the
4433 * same logical value. But we should be OK as long as there are no false
4434 * positives. Using a type-specific equality operator is messy because
4435 * there could be multiple notions of equality in different operator
4436 * classes; furthermore, we cannot safely invoke user-defined functions
4437 * while holding exclusive buffer lock.
4438 */
4439 if (attrnum <= 0)
4440 {
4441 /* The only allowed system columns are OIDs, so do this */
4443 }
4444 else
4445 {
4447
4449 att = TupleDescCompactAttr(tupdesc, attrnum - 1);
4450 return datumIsEqual(value1, value2, att->attbyval, att->attlen);
4451 }
4452}
4453
4454/*
4455 * Check which columns are being updated.
4456 *
4457 * Given an updated tuple, determine (and return into the output bitmapset),
4458 * from those listed as interesting, the set of columns that changed.
4459 *
4460 * has_external indicates if any of the unmodified attributes (from those
4461 * listed as interesting) of the old tuple is a member of external_cols and is
4462 * stored externally.
4463 */
4464static Bitmapset *
/*
 * NOTE(review): signature lines 4465-4468 elided by the export -- the
 * function name and leading parameters (presumably relation,
 * interesting_cols, external_cols, oldtup, newtup) are missing here;
 * TODO confirm against upstream heapam.c.
 */
4469                          bool *has_external)
4470{
4471    int         attidx;
    /* NOTE(review): line 4472 elided -- presumably declares/initializes
     * the "modified" result bitmapset; TODO confirm. */
4473    TupleDesc   tupdesc = RelationGetDescr(relation);
4474
4475    attidx = -1;
4476    while ((attidx = bms_next_member(interesting_cols, attidx)) >= 0)
4477    {
4478        /* attidx is zero-based, attrnum is the normal attribute number */
        /* NOTE(review): line 4479 elided -- presumably derives attrnum
         * from attidx; TODO confirm. */
4480        Datum       value1,
4481                    value2;
4482        bool        isnull1,
4483                    isnull2;
4484
4485        /*
4486         * If it's a whole-tuple reference, say "not equal". It's not really
4487         * worth supporting this case, since it could only succeed after a
4488         * no-op update, which is hardly a case worth optimizing for.
4489         */
4490        if (attrnum == 0)
4491        {
4492            modified = bms_add_member(modified, attidx);
4493            continue;
4494        }
4495
4496        /*
4497         * Likewise, automatically say "not equal" for any system attribute
4498         * other than tableOID; we cannot expect these to be consistent in a
4499         * HOT chain, or even to be set correctly yet in the new tuple.
4500         */
4501        if (attrnum < 0)
4502        {
4503            if (attrnum != TableOidAttributeNumber)
4504            {
4505                modified = bms_add_member(modified, attidx);
4506                continue;
4507            }
4508        }
4509
4510        /*
4511         * Extract the corresponding values. XXX this is pretty inefficient
4512         * if there are many indexed columns. Should we do a single
4513         * heap_deform_tuple call on each tuple, instead? But that doesn't
4514         * work for system columns ...
4515         */
4516        value1 = heap_getattr(oldtup, attrnum, tupdesc, &isnull1);
4517        value2 = heap_getattr(newtup, attrnum, tupdesc, &isnull2);
4518
4519        if (!heap_attr_equals(tupdesc, attrnum, value1,
4520                              value2, isnull1, isnull2))
4521        {
4522            modified = bms_add_member(modified, attidx);
4523            continue;
4524        }
4525
4526        /*
4527         * No need to check attributes that can't be stored externally. Note
4528         * that system attributes can't be stored externally.
4529         */
4530        if (attrnum < 0 || isnull1 ||
4531            TupleDescCompactAttr(tupdesc, attrnum - 1)->attlen != -1)
4532            continue;
4533
4534        /*
4535         * Check if the old tuple's attribute is stored externally and is a
4536         * member of external_cols.
4537         */
        /* NOTE(review): lines 4538-4539 elided -- presumably the external-
         * storage and bms_is_member tests guarding the assignment below;
         * TODO confirm against upstream. */
4540            *has_external = true;
4541    }
4542
4543    return modified;
4544}
4545
4546/*
4547 * simple_heap_update - replace a tuple
4548 *
4549 * This routine may be used to update a tuple when concurrent updates of
4550 * the target tuple are not expected (for example, because we have a lock
4551 * on the relation associated with the tuple). Any failure is reported
4552 * via ereport().
4553 */
4554void
/*
 * NOTE(review): signature lines 4555-4556 elided by the export -- the
 * parameter list (presumably relation, otid, tup, update_indexes) is
 * missing; TODO confirm against upstream heapam.c.
 */
4557{
4558    TM_Result   result;
4559    TM_FailureData tmfd;
4560    LockTupleMode lockmode;
4561
4562    result = heap_update(relation, otid, tup,
                         /* NOTE(review): line 4563 elided -- presumably the
                          * command-id / snapshot arguments; TODO confirm. */
4564                         true /* wait for commit */ ,
4565                         &tmfd, &lockmode, update_indexes);
4566    switch (result)
4567    {
4568        case TM_SelfModified:
4569            /* Tuple was already updated in current command? */
4570            elog(ERROR, "tuple already updated by self");
4571            break;
4572
4573        case TM_Ok:
4574            /* done successfully */
4575            break;
4576
4577        case TM_Updated:
4578            elog(ERROR, "tuple concurrently updated");
4579            break;
4580
4581        case TM_Deleted:
4582            elog(ERROR, "tuple concurrently deleted");
4583            break;
4584
4585        default:
4586            elog(ERROR, "unrecognized heap_update status: %u", result);
4587            break;
4588    }
4589}
4590
4591
4592/*
4593 * Return the MultiXactStatus corresponding to the given tuple lock mode.
4594 */
4595static MultiXactStatus
/*
 * NOTE(review): line 4596 (the signature) elided by the export --
 * presumably takes a LockTupleMode "mode" and a bool "is_update",
 * per the uses below; TODO confirm against upstream heapam.c.
 */
4597{
4598    int         retval;
4599
4600    if (is_update)
4601        retval = tupleLockExtraInfo[mode].updstatus;
4602    else
4603        retval = tupleLockExtraInfo[mode].lockstatus;
4604
        /* -1 in the table means this mode/is_update combination is invalid */
4605    if (retval == -1)
4606        elog(ERROR, "invalid lock tuple mode %d/%s", mode,
4607             is_update ? "true" : "false");
4608
4609    return (MultiXactStatus) retval;
4610}
4611
4612/*
4613 * heap_lock_tuple - lock a tuple in shared or exclusive mode
4614 *
4615 * Note that this acquires a buffer pin, which the caller must release.
4616 *
4617 * Input parameters:
4618 *  relation: relation containing tuple (caller must hold suitable lock)
4619 *  cid: current command ID (used for visibility test, and stored into
4620 *      tuple's cmax if lock is successful)
4621 *  mode: indicates if shared or exclusive tuple lock is desired
4622 *  wait_policy: what to do if tuple lock is not available
4623 *  follow_updates: if true, follow the update chain to also lock descendant
4624 *      tuples.
4625 *
4626 * Output parameters:
4627 *  *tuple: all fields filled in
4628 *  *buffer: set to buffer holding tuple (pinned but not locked at exit)
4629 *  *tmfd: filled in failure cases (see below)
4630 *
4631 * Function results are the same as the ones for table_tuple_lock().
4632 *
4633 * In the failure cases other than TM_Invisible, the routine fills
4634 * *tmfd with the tuple's t_ctid, t_xmax (resolving a possible MultiXact,
4635 * if necessary), and t_cmax (the last only for TM_SelfModified,
4636 * since we cannot obtain cmax from a combo CID generated by another
4637 * transaction).
4638 * See comments for struct TM_FailureData for additional info.
4639 *
4640 * See README.tuplock for a thorough explanation of this mechanism.
4641 */
/*
 * NOTE(review): doxygen export -- the return-type line (4642), part of the
 * parameter list (4644), the buffer lock/unlock calls, the out_locked /
 * out_unlocked labels, and several condition lines were elided throughout
 * this function.  Confirm all elided spots against upstream heapam.c.
 */
4643heap_lock_tuple(Relation relation, HeapTuple tuple,
4645                bool follow_updates,
4646                Buffer *buffer, TM_FailureData *tmfd)
4647{
4648    TM_Result   result;
4649    ItemPointer tid = &(tuple->t_self);
4650    ItemId      lp;
4651    Page        page;
4652    Buffer      vmbuffer = InvalidBuffer;
4653    BlockNumber block;
4654    TransactionId xid,
4655                xmax;
    /* NOTE(review): lines 4656-4658 elided -- presumably infomask locals */
4659    bool        first_time = true;
4660    bool        skip_tuple_lock = false;
4661    bool        have_tuple_lock = false;
4662    bool        cleared_all_frozen = false;
4663
4664    *buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid));
4665    block = ItemPointerGetBlockNumber(tid);
4666
4667    /*
4668     * Before locking the buffer, pin the visibility map page if it appears to
4669     * be necessary. Since we haven't got the lock yet, someone else might be
4670     * in the middle of changing this, so we'll need to recheck after we have
4671     * the lock.
4672     */
4673    if (PageIsAllVisible(BufferGetPage(*buffer)))
4674        visibilitymap_pin(relation, block, &vmbuffer);
4675
    /* NOTE(review): line 4676 elided -- presumably takes the buffer lock */
4677
4678    page = BufferGetPage(*buffer);
    /* NOTE(review): lines 4679-4680 elided -- presumably fetch "lp" from
     * the page; TODO confirm */
4681
4682    tuple->t_data = (HeapTupleHeader) PageGetItem(page, lp);
4683    tuple->t_len = ItemIdGetLength(lp);
4684    tuple->t_tableOid = RelationGetRelid(relation);
4685
4686l3:
4687    result = HeapTupleSatisfiesUpdate(tuple, cid, *buffer);
4688
4689    if (result == TM_Invisible)
4690    {
4691        /*
4692         * This is possible, but only when locking a tuple for ON CONFLICT DO
4693         * SELECT/UPDATE. We return this value here rather than throwing an
4694         * error in order to give that case the opportunity to throw a more
4695         * specific error.
4696         */
4697        result = TM_Invisible;
4698        goto out_locked;
4699    }
4700    else if (result == TM_BeingModified ||
4701             result == TM_Updated ||
4702             result == TM_Deleted)
4703    {
        /* NOTE(review): lines 4704-4706 elided -- presumably declare xwait
         * and the saved infomask words used below */
4707        bool        require_sleep;
4708        ItemPointerData t_ctid;
4709
4710        /* must copy state data before unlocking buffer */
4712        infomask = tuple->t_data->t_infomask;
4713        infomask2 = tuple->t_data->t_infomask2;
4714        ItemPointerCopy(&tuple->t_data->t_ctid, &t_ctid);
4715
        /* NOTE(review): line 4716 elided -- presumably releases buffer lock */
4717
4718        /*
4719         * If any subtransaction of the current top transaction already holds
4720         * a lock as strong as or stronger than what we're requesting, we
4721         * effectively hold the desired lock already. We *must* succeed
4722         * without trying to take the tuple lock, else we will deadlock
4723         * against anyone wanting to acquire a stronger lock.
4724         *
4725         * Note we only do this the first time we loop on the HTSU result;
4726         * there is no point in testing in subsequent passes, because
4727         * evidently our own transaction cannot have acquired a new lock after
4728         * the first time we checked.
4729         */
4730        if (first_time)
4731        {
4732            first_time = false;
4733
            /* NOTE(review): line 4734 elided -- presumably the multixact
             * test guarding this branch */
4735            {
4736                int         i;
4737                int         nmembers;
4738                MultiXactMember *members;
4739
4740                /*
4741                 * We don't need to allow old multixacts here; if that had
4742                 * been the case, HeapTupleSatisfiesUpdate would have returned
4743                 * MayBeUpdated and we wouldn't be here.
4744                 */
4745                nmembers =
4746                    GetMultiXactIdMembers(xwait, &members, false,
4748
4749                for (i = 0; i < nmembers; i++)
4750                {
4751                    /* only consider members of our own transaction */
4752                    if (!TransactionIdIsCurrentTransactionId(members[i].xid))
4753                        continue;
4754
4755                    if (TUPLOCK_from_mxstatus(members[i].status) >= mode)
4756                    {
4757                        pfree(members);
4758                        result = TM_Ok;
4759                        goto out_unlocked;
4760                    }
4761                    else
4762                    {
4763                        /*
4764                         * Disable acquisition of the heavyweight tuple lock.
4765                         * Otherwise, when promoting a weaker lock, we might
4766                         * deadlock with another locker that has acquired the
4767                         * heavyweight tuple lock and is waiting for our
4768                         * transaction to finish.
4769                         *
4770                         * Note that in this case we still need to wait for
4771                         * the multixact if required, to avoid acquiring
4772                         * conflicting locks.
4773                         */
4774                        skip_tuple_lock = true;
4775                    }
4776                }
4777
4778                if (members)
4779                    pfree(members);
4780            }
            /* NOTE(review): line 4781 elided -- presumably the "own
             * transaction" else-if guarding the switch below */
4782            {
4783                switch (mode)
4784                {
4785                    case LockTupleKeyShare:
4789                        result = TM_Ok;
4790                        goto out_unlocked;
4791                    case LockTupleShare:
4794                        {
4795                            result = TM_Ok;
4796                            goto out_unlocked;
4797                        }
4798                        break;
                    /* NOTE(review): lines 4799-4800 elided -- presumably
                     * case LockTupleNoKeyExclusive plus its condition */
4801                        {
4802                            result = TM_Ok;
4803                            goto out_unlocked;
4804                        }
4805                        break;
4806                    case LockTupleExclusive:
4809                        {
4810                            result = TM_Ok;
4811                            goto out_unlocked;
4812                        }
4813                        break;
4814                }
4815            }
4816        }
4817
4818        /*
4819         * Initially assume that we will have to wait for the locking
4820         * transaction(s) to finish. We check various cases below in which
4821         * this can be turned off.
4822         */
4823        require_sleep = true;
4824        if (mode == LockTupleKeyShare)
4825        {
4826            /*
4827             * If we're requesting KeyShare, and there's no update present, we
4828             * don't need to wait. Even if there is an update, we can still
4829             * continue if the key hasn't been modified.
4830             *
4831             * However, if there are updates, we need to walk the update chain
4832             * to mark future versions of the row as locked, too. That way,
4833             * if somebody deletes that future version, we're protected
4834             * against the key going away. This locking of future versions
4835             * could block momentarily, if a concurrent transaction is
4836             * deleting a key; or it could return a value to the effect that
4837             * the transaction deleting the key has already committed. So we
4838             * do this before re-locking the buffer; otherwise this would be
4839             * prone to deadlocks.
4840             *
4841             * Note that the TID we're locking was grabbed before we unlocked
4842             * the buffer. For it to change while we're not looking, the
4843             * other properties we're testing for below after re-locking the
4844             * buffer would also change, in which case we would restart this
4845             * loop above.
4846             */
4848            {
4849                bool        updated;
4850
4852
4853                /*
4854                 * If there are updates, follow the update chain; bail out if
4855                 * that cannot be done.
4856                 */
4857                if (follow_updates && updated &&
4858                    !ItemPointerEquals(&tuple->t_self, &t_ctid))
4859                {
4860                    TM_Result   res;
4861
4862                    res = heap_lock_updated_tuple(relation,
4863                                                  infomask, xwait, &t_ctid,
4865                                                  mode);
4866                    if (res != TM_Ok)
4867                    {
4868                        result = res;
4869                        /* recovery code expects to have buffer lock held */
4871                        goto failed;
4872                    }
4873                }
4874
4876
4877                /*
4878                 * Make sure it's still an appropriate lock, else start over.
4879                 * Also, if it wasn't updated before we released the lock, but
4880                 * is updated now, we start over too; the reason is that we
4881                 * now need to follow the update chain to lock the new
4882                 * versions.
4883                 */
4884                if (!HeapTupleHeaderIsOnlyLocked(tuple->t_data) &&
4885                    ((tuple->t_data->t_infomask2 & HEAP_KEYS_UPDATED) ||
4886                     !updated))
4887                    goto l3;
4888
4889                /* Things look okay, so we can skip sleeping */
4890                require_sleep = false;
4891
4892                /*
4893                 * Note we allow Xmax to change here; other updaters/lockers
4894                 * could have modified it before we grabbed the buffer lock.
4895                 * However, this is not a problem, because with the recheck we
4896                 * just did we ensure that they still don't conflict with the
4897                 * lock we want.
4898                 */
4899            }
4900        }
4901        else if (mode == LockTupleShare)
4902        {
4903            /*
4904             * If we're requesting Share, we can similarly avoid sleeping if
4905             * there's no update and no exclusive lock present.
4906             */
4909            {
4911
4912                /*
4913                 * Make sure it's still an appropriate lock, else start over.
4914                 * See above about allowing xmax to change.
4915                 */
4918                    goto l3;
4919                require_sleep = false;
4920            }
4921        }
4922        else if (mode == LockTupleNoKeyExclusive)
4923        {
4924            /*
4925             * If we're requesting NoKeyExclusive, we might also be able to
4926             * avoid sleeping; just ensure that there no conflicting lock
4927             * already acquired.
4928             */
4930            {
4932                                       mode, NULL))
4933                {
4934                    /*
4935                     * No conflict, but if the xmax changed under us in the
4936                     * meantime, start over.
4937                     */
4941                                         xwait))
4942                        goto l3;
4943
4944                    /* otherwise, we're good */
4945                    require_sleep = false;
4946                }
4947            }
4949            {
4951
4952            /* if the xmax changed in the meantime, start over */
4955                                 xwait))
4956                goto l3;
4957            /* otherwise, we're good */
4958            require_sleep = false;
4959            }
4960        }
4961
4962        /*
4963         * As a check independent from those above, we can also avoid sleeping
4964         * if the current transaction is the sole locker of the tuple. Note
4965         * that the strength of the lock already held is irrelevant; this is
4966         * not about recording the lock in Xmax (which will be done regardless
4967         * of this optimization, below). Also, note that the cases where we
4968         * hold a lock stronger than we are requesting are already handled
4969         * above by not doing anything.
4970         *
4971         * Note we only deal with the non-multixact case here; MultiXactIdWait
4972         * is well equipped to deal with this situation on its own.
4973         */
        /* NOTE(review): lines 4974-4975 elided -- presumably the
         * current-transaction sole-locker test guarding this block */
4976        {
4977            /* ... but if the xmax changed in the meantime, start over */
4981                                 xwait))
4982                goto l3;
4984            require_sleep = false;
4985        }
4986
4987        /*
4988         * Time to sleep on the other transaction/multixact, if necessary.
4989         *
4990         * If the other transaction is an update/delete that's already
4991         * committed, then sleeping cannot possibly do any good: if we're
4992         * required to sleep, get out to raise an error instead.
4993         *
4994         * By here, we either have already acquired the buffer exclusive lock,
4995         * or we must wait for the locking transaction or multixact; so below
4996         * we ensure that we grab buffer lock after the sleep.
4997         */
4998    if (require_sleep && (result == TM_Updated || result == TM_Deleted))
4999    {
5001        goto failed;
5002    }
5003    else if (require_sleep)
5004    {
5005        /*
5006         * Acquire tuple lock to establish our priority for the tuple, or
5007         * die trying. LockTuple will release us when we are next-in-line
5008         * for the tuple. We must do this even if we are share-locking,
5009         * but not if we already have a weaker lock on the tuple.
5010         *
5011         * If we are forced to "start over" below, we keep the tuple lock;
5012         * this arranges that we stay at the head of the line while
5013         * rechecking tuple state.
5014         */
5015        if (!skip_tuple_lock &&
5016            !heap_acquire_tuplock(relation, tid, mode, wait_policy,
5018        {
5019            /*
5020             * This can only happen if wait_policy is Skip and the lock
5021             * couldn't be obtained.
5022             */
5023            result = TM_WouldBlock;
5024            /* recovery code expects to have buffer lock held */
5026            goto failed;
5027        }
5028
        /* NOTE(review): line 5029 elided -- presumably the multixact test
         * guarding this branch */
5030        {
5032
5033            /* We only ever lock tuples, never update them */
5034            if (status >= MultiXactStatusNoKeyUpdate)
5035                elog(ERROR, "invalid lock mode in heap_lock_tuple");
5036
5037            /* wait for multixact to end, or die trying */
5038            switch (wait_policy)
5039            {
5040                case LockWaitBlock:
5042                                  relation, &tuple->t_self, XLTW_Lock, NULL);
5043                    break;
5044                case LockWaitSkip:
5046                                                status, infomask, relation,
5047                                                NULL, false))
5048                    {
5049                        result = TM_WouldBlock;
5050                        /* recovery code expects to have buffer lock held */
5052                        goto failed;
5053                    }
5054                    break;
5055                case LockWaitError:
5057                                                status, infomask, relation,
5059                        ereport(ERROR,
5061                                errmsg("could not obtain lock on row in relation \"%s\"",
5062                                       RelationGetRelationName(relation))));
5063
5064                    break;
5065            }
5066
5067            /*
5068             * Of course, the multixact might not be done here: if we're
5069             * requesting a light lock mode, other transactions with light
5070             * locks could still be alive, as well as locks owned by our
5071             * own xact or other subxacts of this backend. We need to
5072             * preserve the surviving MultiXact members. Note that it
5073             * isn't absolutely necessary in the latter case, but doing so
5074             * is simpler.
5075             */
5076        }
5077        else
5078        {
5079            /* wait for regular transaction to end, or die trying */
5080            switch (wait_policy)
5081            {
5082                case LockWaitBlock:
5083                    XactLockTableWait(xwait, relation, &tuple->t_self,
5084                                      XLTW_Lock);
5085                    break;
5086                case LockWaitSkip:
5088                    {
5089                        result = TM_WouldBlock;
5090                        /* recovery code expects to have buffer lock held */
5092                        goto failed;
5093                    }
5094                    break;
5095                case LockWaitError:
5097                        ereport(ERROR,
5099                                errmsg("could not obtain lock on row in relation \"%s\"",
5100                                       RelationGetRelationName(relation))));
5101                    break;
5102            }
5103        }
5104
5105        /* if there are updates, follow the update chain */
5107            !ItemPointerEquals(&tuple->t_self, &t_ctid))
5108        {
5109            TM_Result   res;
5110
5111            res = heap_lock_updated_tuple(relation,
5112                                          infomask, xwait, &t_ctid,
5114                                          mode);
5115            if (res != TM_Ok)
5116            {
5117                result = res;
5118                /* recovery code expects to have buffer lock held */
5120                goto failed;
5121            }
5122        }
5123
        /* NOTE(review): line 5124 elided -- presumably re-takes buffer lock */
5125
5126        /*
5127         * xwait is done, but if xwait had just locked the tuple then some
5128         * other xact could update this tuple before we get to this point.
5129         * Check for xmax change, and start over if so.
5130         */
5133                             xwait))
5134            goto l3;
5135
5137        {
5138            /*
5139             * Otherwise check if it committed or aborted. Note we cannot
5140             * be here if the tuple was only locked by somebody who didn't
5141             * conflict with us; that would have been handled above. So
5142             * that transaction must necessarily be gone by now. But
5143             * don't check for this in the multixact case, because some
5144             * locker transactions might still be running.
5145             */
5146            UpdateXmaxHintBits(tuple->t_data, *buffer, xwait);
5147        }
5148    }
5149
5150    /* By here, we're certain that we hold buffer exclusive lock again */
5151
5152    /*
5153     * We may lock if previous xmax aborted, or if it committed but only
5154     * locked the tuple without updating it; or if we didn't have to wait
5155     * at all for whatever reason.
5156     */
5157    if (!require_sleep ||
5158        (tuple->t_data->t_infomask & HEAP_XMAX_INVALID) ||
5161        result = TM_Ok;
5162    else if (!ItemPointerEquals(&tuple->t_self, &tuple->t_data->t_ctid))
5163        result = TM_Updated;
5164    else
5165        result = TM_Deleted;
5166    }
5167
5168failed:
5169    if (result != TM_Ok)
5170    {
5171        Assert(result == TM_SelfModified || result == TM_Updated ||
5172               result == TM_Deleted || result == TM_WouldBlock);
5173
5174        /*
5175         * When locking a tuple under LockWaitSkip semantics and we fail with
5176         * TM_WouldBlock above, it's possible for concurrent transactions to
5177         * release the lock and set HEAP_XMAX_INVALID in the meantime. So
5178         * this assert is slightly different from the equivalent one in
5179         * heap_delete and heap_update.
5180         */
5181        Assert((result == TM_WouldBlock) ||
5182               !(tuple->t_data->t_infomask & HEAP_XMAX_INVALID));
5183        Assert(result != TM_Updated ||
5184               !ItemPointerEquals(&tuple->t_self, &tuple->t_data->t_ctid));
5185        tmfd->ctid = tuple->t_data->t_ctid;
5186        tmfd->xmax = HeapTupleHeaderGetUpdateXid(tuple->t_data);
5187        if (result == TM_SelfModified)
5188            tmfd->cmax = HeapTupleHeaderGetCmax(tuple->t_data);
5189        else
5190            tmfd->cmax = InvalidCommandId;
5191        goto out_locked;
5192    }
5193
5194    /*
5195     * If we didn't pin the visibility map page and the page has become all
5196     * visible while we were busy locking the buffer, or during some
5197     * subsequent window during which we had it unlocked, we'll have to unlock
5198     * and re-lock, to avoid holding the buffer lock across I/O. That's a bit
5199     * unfortunate, especially since we'll now have to recheck whether the
5200     * tuple has been locked or updated under us, but hopefully it won't
5201     * happen very often.
5202     */
5203    if (vmbuffer == InvalidBuffer && PageIsAllVisible(page))
5204    {
5206        visibilitymap_pin(relation, block, &vmbuffer);
5208        goto l3;
5209    }
5210
5211    xmax = HeapTupleHeaderGetRawXmax(tuple->t_data);
5212    old_infomask = tuple->t_data->t_infomask;
5213
5214    /*
5215     * If this is the first possibly-multixact-able operation in the current
5216     * transaction, set my per-backend OldestMemberMXactId setting. We can be
5217     * certain that the transaction will never become a member of any older
5218     * MultiXactIds than that. (We have to do this even if we end up just
5219     * using our own TransactionId below, since some other backend could
5220     * incorporate our XID into a MultiXact immediately afterwards.)
5221     */
5223
5224    /*
5225     * Compute the new xmax and infomask to store into the tuple. Note we do
5226     * not modify the tuple just yet, because that would leave it in the wrong
5227     * state if multixact.c elogs.
5228     */
5230                              GetCurrentTransactionId(), mode, false,
5231                              &xid, &new_infomask, &new_infomask2);
5232
    /* NOTE(review): line 5233 elided -- presumably starts a critical
     * section around the tuple/WAL modifications below; TODO confirm */
5234
5235    /*
5236     * Store transaction information of xact locking the tuple.
5237     *
5238     * Note: Cmax is meaningless in this context, so don't set it; this avoids
5239     * possibly generating a useless combo CID. Moreover, if we're locking a
5240     * previously updated tuple, it's important to preserve the Cmax.
5241     *
5242     * Also reset the HOT UPDATE bit, but only if there's no update; otherwise
5243     * we would break the HOT chain.
5244     */
5247    tuple->t_data->t_infomask |= new_infomask;
5248    tuple->t_data->t_infomask2 |= new_infomask2;
5251    HeapTupleHeaderSetXmax(tuple->t_data, xid);
5252
5253    /*
5254     * Make sure there is no forward chain link in t_ctid. Note that in the
5255     * cases where the tuple has been updated, we must not overwrite t_ctid,
5256     * because it was set by the updater. Moreover, if the tuple has been
5257     * updated, we need to follow the update chain to lock the new versions of
5258     * the tuple as well.
5259     */
5261        tuple->t_data->t_ctid = *tid;
5262
5263    /* Clear only the all-frozen bit on visibility map if needed */
5264    if (PageIsAllVisible(page) &&
5265        visibilitymap_clear(relation, block, vmbuffer,
5267        cleared_all_frozen = true;
5268
5269
5270    MarkBufferDirty(*buffer);
5271
5272    /*
5273     * XLOG stuff. You might think that we don't need an XLOG record because
5274     * there is no state change worth restoring after a crash. You would be
5275     * wrong however: we have just written either a TransactionId or a
5276     * MultiXactId that may never have been seen on disk before, and we need
5277     * to make sure that there are XLOG entries covering those ID numbers.
5278     * Else the same IDs might be re-used after a crash, which would be
5279     * disastrous if this page made it to disk before the crash. Essentially
5280     * we have to enforce the WAL log-before-data rule even in this case.
5281     * (Also, in a PITR log-shipping or 2PC environment, we have to have XLOG
5282     * entries for everything anyway.)
5283     */
5284    if (RelationNeedsWAL(relation))
5285    {
        /* NOTE(review): lines 5286-5291 elided -- presumably declare the
         * xl_heap_lock record / recptr and begin the WAL insertion */
5288
5291
5292        xlrec.offnum = ItemPointerGetOffsetNumber(&tuple->t_self);
5293        xlrec.xmax = xid;
5294        xlrec.infobits_set = compute_infobits(new_infomask,
5295                                              tuple->t_data->t_infomask2);
5298
5299        /* we don't decode row locks atm, so no need to log the origin */
5300
5302
5303        PageSetLSN(page, recptr);
5304    }
5305
5307
5308    result = TM_Ok;
5309
    /* NOTE(review): lines 5310-5313 elided -- presumably the out_locked
     * label plus buffer unlock, and the out_unlocked label targeted by the
     * gotos above; TODO confirm against upstream */
5312
5314    if (BufferIsValid(vmbuffer))
5315        ReleaseBuffer(vmbuffer);
5316
5317    /*
5318     * Don't update the visibility map here. Locking a tuple doesn't change
5319     * visibility info.
5320     */
5321
5322    /*
5323     * Now that we have successfully marked the tuple as locked, we can
5324     * release the lmgr tuple lock, if we had it.
5325     */
5326    if (have_tuple_lock)
5327        UnlockTupleTuplock(relation, tid, mode);
5328
5329    return result;
5330}
5331
5332/*
5333 * Acquire heavyweight lock on the given tuple, in preparation for acquiring
5334 * its normal, Xmax-based tuple lock.
5335 *
5336 * have_tuple_lock is an input and output parameter: on input, it indicates
5337 * whether the lock has previously been acquired (and this function does
5338 * nothing in that case). If this function returns success, have_tuple_lock
5339 * has been flipped to true.
5340 *
5341 * Returns false if it was unable to obtain the lock; this can only happen if
5342 * wait_policy is Skip.
5343 */
5344static bool
/*
 * NOTE(review): signature lines 5345-5346 elided by the export --
 * presumably (relation, tid, mode, wait_policy, have_tuple_lock) per the
 * uses below; TODO confirm against upstream heapam.c.
 */
5347{
    /* Fast path: caller already holds the heavyweight tuple lock. */
5348    if (*have_tuple_lock)
5349        return true;
5350
5351    switch (wait_policy)
5352    {
5353        case LockWaitBlock:
5354            LockTupleTuplock(relation, tid, mode);
5355            break;
5356
5357        case LockWaitSkip:
5358            if (!ConditionalLockTupleTuplock(relation, tid, mode, false))
5359                return false;
5360            break;
5361
5362        case LockWaitError:
            /* NOTE(review): line 5363 elided -- presumably the conditional
             * lock attempt whose failure triggers the ereport below */
5364                ereport(ERROR,
5366                        errmsg("could not obtain lock on row in relation \"%s\"",
5367                               RelationGetRelationName(relation))));
5368            break;
5369    }
5370    *have_tuple_lock = true;
5371
5372    return true;
5373}
5374
5375/*
5376 * Given an original set of Xmax and infomask, and a transaction (identified by
5377 * add_to_xmax) acquiring a new lock of some mode, compute the new Xmax and
5378 * corresponding infomasks to use on the tuple.
5379 *
5380 * Note that this might have side effects such as creating a new MultiXactId.
5381 *
5382 * Most callers will have called HeapTupleSatisfiesUpdate before this function;
5383 * that will have set the HEAP_XMAX_INVALID bit if the xmax was a MultiXactId
5384 * but it was not running anymore. There is a race condition, which is that the
5385 * MultiXactId may have finished since then, but that uncommon case is handled
5386 * either here, or within MultiXactIdExpand.
5387 *
5388 * There is a similar race condition possible when the old xmax was a regular
5389 * TransactionId. We test TransactionIdIsInProgress again just to narrow the
5390 * window, but it's still possible to end up creating an unnecessary
5391 * MultiXactId. Fortunately this is harmless.
5392 */
5393static void
/*
 * NOTE(review): signature lines 5394-5398 elided by the export --
 * presumably (xmax, old_infomask, old_infomask2, add_to_xmax, mode,
 * is_update, result_xmax, result_infomask, result_infomask2) per the
 * uses below; TODO confirm against upstream heapam.c.  Many of the
 * infomask-bit assignment lines inside were likewise elided.
 */
5399{
5400    TransactionId new_xmax;
5403
5405
5406l5:
5407    new_infomask = 0;
5408    new_infomask2 = 0;
    /* NOTE(review): line 5409 elided -- presumably tests HEAP_XMAX_INVALID */
5410    {
5411        /*
5412         * No previous locker; we just insert our own TransactionId.
5413         *
5414         * Note that it's critical that this case be the first one checked,
5415         * because there are several blocks below that come back to this one
5416         * to implement certain optimizations; old_infomask might contain
5417         * other dirty bits in those cases, but we don't really care.
5418         */
5419        if (is_update)
5420        {
5421            new_xmax = add_to_xmax;
5422            if (mode == LockTupleExclusive)
5424        }
5425        else
5426        {
5428            switch (mode)
5429            {
5430                case LockTupleKeyShare:
5431                    new_xmax = add_to_xmax;
5433                    break;
5434                case LockTupleShare:
5435                    new_xmax = add_to_xmax;
5437                    break;
5439                    new_xmax = add_to_xmax;
5441                    break;
5442                case LockTupleExclusive:
5443                    new_xmax = add_to_xmax;
5446                    break;
5447                default:
5448                    new_xmax = InvalidTransactionId;    /* silence compiler */
5449                    elog(ERROR, "invalid lock mode");
5450            }
5451        }
5452    }
    /* NOTE(review): line 5453 elided -- presumably the multixact else-if */
5454    {
5456
5457        /*
5458         * Currently we don't allow XMAX_COMMITTED to be set for multis, so
5459         * cross-check.
5460         */
5462
5463        /*
5464         * A multixact together with LOCK_ONLY set but neither lock bit set
5465         * (i.e. a pg_upgraded share locked tuple) cannot possibly be running
5466         * anymore. This check is critical for databases upgraded by
5467         * pg_upgrade; both MultiXactIdIsRunning and MultiXactIdExpand assume
5468         * that such multis are never passed.
5469         */
5471        {
5474            goto l5;
5475        }
5476
5477        /*
5478         * If the XMAX is already a MultiXactId, then we need to expand it to
5479         * include add_to_xmax; but if all the members were lockers and are
5480         * all gone, we can do away with the IS_MULTI bit and just set
5481         * add_to_xmax as the only locker/updater. If all lockers are gone
5482         * and we have an updater that aborted, we can also do without a
5483         * multi.
5484         *
5485         * The cost of doing GetMultiXactIdMembers would be paid by
5486         * MultiXactIdExpand if we weren't to do this, so this check is not
5487         * incurring extra work anyhow.
5488         */
5490        {
5493                                     old_infomask)))
5494            {
5495                /*
5496                 * Reset these bits and restart; otherwise fall through to
5497                 * create a new multi below.
5498                 */
5501                goto l5;
5502            }
5503        }
5504
5506
5507        new_xmax = MultiXactIdExpand((MultiXactId) xmax, add_to_xmax,
5508                                     new_status);
5510    }
    /* NOTE(review): line 5511 elided -- presumably tests XMAX_COMMITTED */
5512    {
5513        /*
5514         * It's a committed update, so we need to preserve him as updater of
5515         * the tuple.
5516         */
5517        MultiXactStatus status;
5519
5521            status = MultiXactStatusUpdate;
5522        else
5524
5526
5527        /*
5528         * since it's not running, it's obviously impossible for the old
5529         * updater to be identical to the current one, so we need not check
5530         * for that case as we do in the block above.
5531         */
5532        new_xmax = MultiXactIdCreate(xmax, status, add_to_xmax, new_status);
5534    }
5535    else if (TransactionIdIsInProgress(xmax))
5536    {
5537        /*
5538         * If the XMAX is a valid, in-progress TransactionId, then we need to
5539         * create a new MultiXactId that includes both the old locker or
5540         * updater and our own TransactionId.
5541         */
5545
5547        {
5553            {
5556            else
5558        }
5559            else
5560            {
5561                /*
5562                 * LOCK_ONLY can be present alone only when a page has been
5563                 * upgraded by pg_upgrade. But in that case,
5564                 * TransactionIdIsInProgress() should have returned false. We
5565                 * assume it's no longer locked in this case.
5566                 */
5567                elog(WARNING, "LOCK_ONLY found for Xid in progress %u", xmax);
5570                goto l5;
5571            }
5572        }
5573        else
5574        {
5575            /* it's an update, but which kind? */
5578            else
5580        }
5581
5583
5584        /*
5585         * If the lock to be acquired is for the same TransactionId as the
5586         * existing lock, there's an optimization possible: consider only the
5587         * strongest of both locks as the only one present, and restart.
5588         */
5589        if (xmax == add_to_xmax)
5590        {
5591            /*
5592             * Note that it's not possible for the original tuple to be
5593             * updated: we wouldn't be here because the tuple would have been
5594             * invisible and we wouldn't try to update it. As a subtlety,
5595             * this code can also run when traversing an update chain to lock
5596             * future versions of a tuple. But we wouldn't be here either,
5597             * because the add_to_xmax would be different from the original
5598             * updater.
5599             */
5601
5602            /* acquire the strongest of both */
5603            if (mode < old_mode)
5604                mode = old_mode;
5605            /* mustn't touch is_update */
5606
5608            goto l5;
5609        }
5610
5611        /* otherwise, just fall back to creating a new multixact */
5613        new_xmax = MultiXactIdCreate(xmax, old_status,
5616    }
    /* NOTE(review): lines 5617-5618 elided -- presumably the "committed
     * update" else-if guarding the block below */
5619    {
5620        /*
5621         * It's a committed update, so we gotta preserve him as updater of the
5622         * tuple.
5623         */
5624        MultiXactStatus status;
5626
5628            status = MultiXactStatusUpdate;
5629        else
5631
5633
5634        /*
5635         * since it's not running, it's obviously impossible for the old
5636         * updater to be identical to the current one, so we need not check
5637         * for that case as we do in the block above.
5638         */
5639        new_xmax = MultiXactIdCreate(xmax, status, add_to_xmax, new_status);
5641    }
5642    else
5643    {
5644        /*
5645         * Can get here iff the locking/updating transaction was running when
5646         * the infomask was extracted from the tuple, but finished before
5647         * TransactionIdIsInProgress got to run. Deal with it as if there was
5648         * no locker at all in the first place.
5649         */
5651        goto l5;
5652    }
5653
    /* NOTE(review): lines 5654-5655 elided -- presumably store the computed
     * infomask words into the result parameters; TODO confirm */
5656    *result_xmax = new_xmax;
5657}
5658
5659/*
5660 * Subroutine for heap_lock_updated_tuple_rec.
5661 *
5662 * Given a hypothetical multixact status held by the transaction identified
5663 * with the given xid, does the current transaction need to wait, fail, or can
5664 * it continue if it wanted to acquire a lock of the given mode? "needwait"
5665 * is set to true if waiting is necessary; if it can continue, then TM_Ok is
5666 * returned. If the lock is already held by the current transaction, return
5667 * TM_SelfModified. In case of a conflict with another transaction, a
5668 * different HeapTupleSatisfiesUpdate return code is returned.
5669 *
5670 * The held status is said to be hypothetical because it might correspond to a
5671 * lock held by a single Xid, i.e. not a real MultiXactId; we express it this
5672 * way for simplicity of API.
5673 */
5674static TM_Result
/*
 * NOTE(review): signature lines 5675-5676 elided by the export --
 * presumably (status, xid, mode, tup, needwait) per the uses below;
 * TODO confirm against upstream heapam.c.
 */
5677                          bool *needwait)
5678{
5680
5681    *needwait = false;
    /* NOTE(review): line 5682 elided -- presumably computes the wanted
     * multixact status for "mode"; TODO confirm */
5683
5684    /*
5685     * Note: we *must* check TransactionIdIsInProgress before
5686     * TransactionIdDidAbort/Commit; see comment at top of heapam_visibility.c
5687     * for an explanation.
5688     */
    /* NOTE(review): line 5689 elided -- presumably the
     * TransactionIdIsCurrentTransactionId(xid) test */
5690    {
5691        /*
5692         * The tuple has already been locked by our own transaction. This is
5693         * very rare but can happen if multiple transactions are trying to
5694         * lock an ancient version of the same tuple.
5695         */
5696        return TM_SelfModified;
5697    }
5698    else if (TransactionIdIsInProgress(xid))
5699    {
5700        /*
5701         * If the locking transaction is running, what we do depends on
5702         * whether the lock modes conflict: if they do, then we must wait for
5703         * it to finish; otherwise we can fall through to lock this tuple
5704         * version without waiting.
5705         */
        /* NOTE(review): lines 5706-5707 elided -- presumably the lock-mode
         * conflict test guarding the assignment below */
5708        {
5709            *needwait = true;
5710        }
5711
5712        /*
5713         * If we set needwait above, then this value doesn't matter;
5714         * otherwise, this value signals to caller that it's okay to proceed.
5715         */
5716        return TM_Ok;
5717    }
5718    else if (TransactionIdDidAbort(xid))
5719        return TM_Ok;
5720    else if (TransactionIdDidCommit(xid))
5721    {
5722        /*
5723         * The other transaction committed. If it was only a locker, then the
5724         * lock is completely gone now and we can return success; but if it
5725         * was an update, then what we do depends on whether the two lock
5726         * modes conflict. If they conflict, then we must report error to
5727         * caller. But if they don't, we can fall through to allow the current
5728         * transaction to lock the tuple.
5729         *
5730         * Note: the reason we worry about ISUPDATE here is because as soon as
5731         * a transaction ends, all its locks are gone and meaningless, and
5732         * thus we can ignore them; whereas its updates persist. In the
5733         * TransactionIdIsInProgress case, above, we don't need to check
5734         * because we know the lock is still "alive" and thus a conflict needs
5735         * always be checked.
5736         */
5737        if (!ISUPDATE_from_mxstatus(status))
5738            return TM_Ok;
5739
        /* NOTE(review): lines 5740-5741 elided -- presumably the lock-mode
         * conflict test guarding the failure branch below */
5742        {
5743            /* bummer */
5744            if (!ItemPointerEquals(&tup->t_self, &tup->t_data->t_ctid))
5745                return TM_Updated;
5746            else
5747                return TM_Deleted;
5748        }
5749
5750        return TM_Ok;
5751    }
5752
5753    /* Not in progress, not aborted, not committed -- must have crashed */
5754    return TM_Ok;
5755}
5756
5757
5758/*
5759 * Recursive part of heap_lock_updated_tuple
5760 *
5761 * Fetch the tuple pointed to by tid in rel, and mark it as locked by the given
5762 * xid with the given mode; if this tuple is updated, recurse to lock the new
5763 * version as well.
5764 */
/*
 * NOTE(review): lossy extraction -- many upstream lines are missing here
 * (e.g. 5766/5768 of the signature, locals at 5771-5777, the conditions at
 * 5816/5824/5837-5841/5848-5849/5861/5877/5882-5883, and most of the WAL
 * block at 6037-6053).  The code below is incomplete and not compilable as
 * shown; only comments were added.  Reconcile against upstream heapam.c.
 *
 * Walks the update chain starting at 'tid', marking each version locked by
 * 'xid' in the requested mode, looping via the l4 label when it must retry
 * after waiting on a concurrent locker/updater.
 */
5765static TM_Result
5767 const ItemPointerData *tid, TransactionId xid,
5769{
5770 TM_Result result;
5773 Buffer buf;
5778 TransactionId xmax,
5779 new_xmax;
5780 bool cleared_all_frozen = false;
5782 Buffer vmbuffer = InvalidBuffer;
5783 BlockNumber block;
5784
5785 ItemPointerCopy(tid, &tupid);
5786
5787 for (;;)
5788 {
5789 new_infomask = 0;
5790 new_xmax = InvalidTransactionId;
5792 ItemPointerCopy(&tupid, &(mytup.t_self));
5793
5794 if (!heap_fetch(rel, SnapshotAny, &mytup, &buf, false))
5795 {
5796 /*
5797 * if we fail to find the updated version of the tuple, it's
5798 * because it was vacuumed/pruned away after its creator
5799 * transaction aborted. So behave as if we got to the end of the
5800 * chain, and there's no further tuple to lock: return success to
5801 * caller.
5802 */
5803 result = TM_Ok;
5804 goto out_unlocked;
5805 }
5806
5807l4:
5809
5810 /*
5811 * Before locking the buffer, pin the visibility map page if it
5812 * appears to be necessary. Since we haven't got the lock yet,
5813 * someone else might be in the middle of changing this, so we'll need
5814 * to recheck after we have the lock.
5815 */
5817 {
5818 visibilitymap_pin(rel, block, &vmbuffer);
5819 pinned_desired_page = true;
5820 }
5821 else
5822 pinned_desired_page = false;
5823
5825
5826 /*
5827 * If we didn't pin the visibility map page and the page has become
5828 * all visible while we were busy locking the buffer, we'll have to
5829 * unlock and re-lock, to avoid holding the buffer lock across I/O.
5830 * That's a bit unfortunate, but hopefully shouldn't happen often.
5831 *
5832 * Note: in some paths through this function, we will reach here
5833 * holding a pin on a vm page that may or may not be the one matching
5834 * this page. If this page isn't all-visible, we won't use the vm
5835 * page, but we hold onto such a pin till the end of the function.
5836 */
5838 {
5840 visibilitymap_pin(rel, block, &vmbuffer);
5842 }
5843
5844 /*
5845 * Check the tuple XMIN against prior XMAX, if any. If we reached the
5846 * end of the chain, we're done, so return success.
5847 */
5850 priorXmax))
5851 {
5852 result = TM_Ok;
5853 goto out_locked;
5854 }
5855
5856 /*
5857 * Also check Xmin: if this tuple was created by an aborted
5858 * (sub)transaction, then we already locked the last live one in the
5859 * chain, thus we're done, so return success.
5860 */
5862 {
5863 result = TM_Ok;
5864 goto out_locked;
5865 }
5866
5867 old_infomask = mytup.t_data->t_infomask;
5868 old_infomask2 = mytup.t_data->t_infomask2;
5869 xmax = HeapTupleHeaderGetRawXmax(mytup.t_data);
5870
5871 /*
5872 * If this tuple version has been updated or locked by some concurrent
5873 * transaction(s), what we do depends on whether our lock mode
5874 * conflicts with what those other transactions hold, and also on the
5875 * status of them.
5876 */
5878 {
5880 bool needwait;
5881
5884 {
5885 int nmembers;
5886 int i;
5887 MultiXactMember *members;
5888
5889 /*
5890 * We don't need a test for pg_upgrade'd tuples: this is only
5891 * applied to tuples after the first in an update chain. Said
5892 * first tuple in the chain may well be locked-in-9.2-and-
5893 * pg_upgraded, but that one was already locked by our caller,
5894 * not us; and any subsequent ones cannot be because our
5895 * caller must necessarily have obtained a snapshot later than
5896 * the pg_upgrade itself.
5897 */
5898 Assert(!HEAP_LOCKED_UPGRADED(mytup.t_data->t_infomask));
5899
5900 nmembers = GetMultiXactIdMembers(rawxmax, &members, false,
5902 for (i = 0; i < nmembers; i++)
5903 {
5904 result = test_lockmode_for_conflict(members[i].status,
5905 members[i].xid,
5906 mode,
5907 &mytup,
5908 &needwait);
5909
5910 /*
5911 * If the tuple was already locked by ourselves in a
5912 * previous iteration of this (say heap_lock_tuple was
5913 * forced to restart the locking loop because of a change
5914 * in xmax), then we hold the lock already on this tuple
5915 * version and we don't need to do anything; and this is
5916 * not an error condition either. We just need to skip
5917 * this tuple and continue locking the next version in the
5918 * update chain.
5919 */
5920 if (result == TM_SelfModified)
5921 {
5922 pfree(members);
5923 goto next;
5924 }
5925
5926 if (needwait)
5927 {
5929 XactLockTableWait(members[i].xid, rel,
5930 &mytup.t_self,
5932 pfree(members);
5933 goto l4;
5934 }
5935 if (result != TM_Ok)
5936 {
5937 pfree(members);
5938 goto out_locked;
5939 }
5940 }
5941 if (members)
5942 pfree(members);
5943 }
5944 else
5945 {
5946 MultiXactStatus status;
5947
5948 /*
5949 * For a non-multi Xmax, we first need to compute the
5950 * corresponding MultiXactStatus by using the infomask bits.
5951 */
5953 {
5957 status = MultiXactStatusForShare;
5959 {
5961 status = MultiXactStatusForUpdate;
5962 else
5964 }
5965 else
5966 {
5967 /*
5968 * LOCK_ONLY present alone (a pg_upgraded tuple marked
5969 * as share-locked in the old cluster) shouldn't be
5970 * seen in the middle of an update chain.
5971 */
5972 elog(ERROR, "invalid lock status in tuple");
5973 }
5974 }
5975 else
5976 {
5977 /* it's an update, but which kind? */
5979 status = MultiXactStatusUpdate;
5980 else
5982 }
5983
5984 result = test_lockmode_for_conflict(status, rawxmax, mode,
5985 &mytup, &needwait);
5986
5987 /*
5988 * If the tuple was already locked by ourselves in a previous
5989 * iteration of this (say heap_lock_tuple was forced to
5990 * restart the locking loop because of a change in xmax), then
5991 * we hold the lock already on this tuple version and we don't
5992 * need to do anything; and this is not an error condition
5993 * either. We just need to skip this tuple and continue
5994 * locking the next version in the update chain.
5995 */
5996 if (result == TM_SelfModified)
5997 goto next;
5998
5999 if (needwait)
6000 {
6002 XactLockTableWait(rawxmax, rel, &mytup.t_self,
6004 goto l4;
6005 }
6006 if (result != TM_Ok)
6007 {
6008 goto out_locked;
6009 }
6010 }
6011 }
6012
6013 /* compute the new Xmax and infomask values for the tuple ... */
6014 compute_new_xmax_infomask(xmax, old_infomask, mytup.t_data->t_infomask2,
6015 xid, mode, false,
6016 &new_xmax, &new_infomask, &new_infomask2);
6017
6019 visibilitymap_clear(rel, block, vmbuffer,
6021 cleared_all_frozen = true;
6022
6024
6025 /* ... and set them */
6026 HeapTupleHeaderSetXmax(mytup.t_data, new_xmax);
6027 mytup.t_data->t_infomask &= ~HEAP_XMAX_BITS;
6028 mytup.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
6029 mytup.t_data->t_infomask |= new_infomask;
6030 mytup.t_data->t_infomask2 |= new_infomask2;
6031
6033
6034 /* XLOG stuff */
6035 if (RelationNeedsWAL(rel))
6036 {
6039 Page page = BufferGetPage(buf);
6040
6043
6044 xlrec.offnum = ItemPointerGetOffsetNumber(&mytup.t_self);
6045 xlrec.xmax = new_xmax;
6047 xlrec.flags =
6049
6051
6053
6054 PageSetLSN(page, recptr);
6055 }
6056
6058
6059next:
6060 /* if we find the end of update chain, we're done. */
6061 if (mytup.t_data->t_infomask & HEAP_XMAX_INVALID ||
6063 ItemPointerEquals(&mytup.t_self, &mytup.t_data->t_ctid) ||
6065 {
6066 result = TM_Ok;
6067 goto out_locked;
6068 }
6069
6070 /* tail recursion */
6072 ItemPointerCopy(&(mytup.t_data->t_ctid), &tupid);
6074 }
6075
6076 result = TM_Ok;
6077
6080
6082 if (vmbuffer != InvalidBuffer)
6083 ReleaseBuffer(vmbuffer);
6084
6085 return result;
6086}
6087
6088/*
6089 * heap_lock_updated_tuple
6090 * Follow update chain when locking an updated tuple, acquiring locks (row
6091 * marks) on the updated versions.
6092 *
6093 * 'prior_infomask', 'prior_raw_xmax' and 'prior_ctid' are the corresponding
6094 * fields from the initial tuple. We will lock the tuples starting from the
6095 * one that 'prior_ctid' points to. Note: This function does not lock the
6096 * initial tuple itself.
6097 *
6098 * This function doesn't check visibility, it just unconditionally marks the
6099 * tuple(s) as locked. If any tuple in the updated chain is being deleted
6100 * concurrently (or updated with the key being modified), sleep until the
6101 * transaction doing it is finished.
6102 *
6103 * Note that we don't acquire heavyweight tuple locks on the tuples we walk
6104 * when we have to wait for other transactions to release them, as opposed to
6105 * what heap_lock_tuple does. The reason is that having more than one
6106 * transaction walking the chain is probably uncommon enough that risk of
6107 * starvation is not likely: one of the preconditions for being here is that
6108 * the snapshot in use predates the update that created this tuple (because we
6109 * started at an earlier version of the tuple), but at the same time such a
6110 * transaction cannot be using repeatable read or serializable isolation
6111 * levels, because that would lead to a serializability failure.
6112 */
/*
 * NOTE(review): lossy extraction -- the signature (6114-6118), the
 * partition-move test at 6126, and the body at 6139-6143 (presumably the
 * MultiXactIdSetOldestMember() call and the recursive
 * heap_lock_updated_tuple_rec() call) are missing.  Incomplete and not
 * compilable as shown; only comments were added.
 */
6113static TM_Result
6119{
6120 INJECTION_POINT("heap_lock_updated_tuple", NULL);
6121
6122 /*
6123 * If the tuple has moved into another partition (effectively a delete)
6124 * stop here.
6125 */
6127 {
6129
6130 /*
6131 * If this is the first possibly-multixact-able operation in the
6132 * current transaction, set my per-backend OldestMemberMXactId
6133 * setting. We can be certain that the transaction will never become a
6134 * member of any older MultiXactIds than that. (We have to do this
6135 * even if we end up just using our own TransactionId below, since
6136 * some other backend could incorporate our XID into a MultiXact
6137 * immediately afterwards.)
6138 */
6140
6144 }
6145
6146 /* nothing to lock */
6147 return TM_Ok;
6148}
6149
6150/*
6151 * heap_finish_speculative - mark speculative insertion as successful
6152 *
6153 * To successfully finish a speculative insertion we have to clear speculative
6154 * token from tuple. To do so the t_ctid field, which will contain a
6155 * speculative token value, is modified in place to point to the tuple itself,
6156 * which is characteristic of a newly inserted ordinary tuple.
6157 *
6158 * NB: It is not ok to commit without either finishing or aborting a
6159 * speculative insertion. We could treat speculative tuples of committed
6160 * transactions implicitly as completed, but then we would have to be prepared
6161 * to deal with speculative tokens on committed tuples. That wouldn't be
6162 * difficult - no-one looks at the ctid field of a tuple with invalid xmax -
6163 * but clearing the token at completion isn't very expensive either.
6164 * An explicit confirmation WAL record also makes logical decoding simpler.
6165 */
/*
 * NOTE(review): lossy extraction -- the signature line 6167, the buffer lock
 * at 6176, the offnum range check at 6180, the critical-section bracketing
 * at 6189-6191, and most of the WAL record construction (6204-6218) are
 * missing.  Incomplete and not compilable as shown; only comments added.
 *
 * Replaces the speculative-insertion token in a tuple's t_ctid with a
 * self-pointing ctid, marking the speculative insert as confirmed.
 */
6166void
6168{
6169 Buffer buffer;
6170 Page page;
6171 OffsetNumber offnum;
6172 ItemId lp;
6173 HeapTupleHeader htup;
6174
6175 buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid));
6177 page = BufferGetPage(buffer);
6178
6179 offnum = ItemPointerGetOffsetNumber(tid);
6181 elog(ERROR, "offnum out of range");
6182 lp = PageGetItemId(page, offnum);
6183 if (!ItemIdIsNormal(lp))
6184 elog(ERROR, "invalid lp");
6185
6186 htup = (HeapTupleHeader) PageGetItem(page, lp);
6187
6188 /* NO EREPORT(ERROR) from here till changes are logged */
6190
6192
6193 MarkBufferDirty(buffer);
6194
6195 /*
6196 * Replace the speculative insertion token with a real t_ctid, pointing to
6197 * itself like it does on regular tuples.
6198 */
6199 htup->t_ctid = *tid;
6200
6201 /* XLOG stuff */
6202 if (RelationNeedsWAL(relation))
6203 {
6206
6208
6210
6211 /* We want the same filtering on this as on a plain insert */
6213
6216
6218
6219 PageSetLSN(page, recptr);
6220 }
6221
6223
6224 UnlockReleaseBuffer(buffer);
6225}
6226
6227/*
6228 * heap_abort_speculative - kill a speculatively inserted tuple
6229 *
6230 * Marks a tuple that was speculatively inserted in the same command as dead,
6231 * by setting its xmin as invalid. That makes it immediately appear as dead
6232 * to all transactions, including our own. In particular, it makes
6233 * HeapTupleSatisfiesDirty() regard the tuple as dead, so that another backend
6234 * inserting a duplicate key value won't unnecessarily wait for our whole
6235 * transaction to finish (it'll just wait for our speculative insertion to
6236 * finish).
6237 *
6238 * Killing the tuple prevents "unprincipled deadlocks", which are deadlocks
6239 * that arise due to a mutual dependency that is not user visible. By
6240 * definition, unprincipled deadlocks cannot be prevented by the user
6241 * reordering lock acquisition in client code, because the implementation level
6242 * lock acquisitions are not under the user's direct control. If speculative
6243 * inserters did not take this precaution, then under high concurrency they
6244 * could deadlock with each other, which would not be acceptable.
6245 *
6246 * This is somewhat redundant with heap_delete, but we prefer to have a
6247 * dedicated routine with stripped down requirements. Note that this is also
6248 * used to delete the TOAST tuples created during speculative insertion.
6249 *
6250 * This routine does not affect logical decoding as it only looks at
6251 * confirmation records.
6252 */
/*
 * NOTE(review): lossy extraction -- the signature (6254), the xid local and
 * buffer lock (6256, 6263-6264, 6269-6270), the lp lookup (6277-6279), the
 * critical-section start (6301-6302), the infomask stores (6326-6328), the
 * xmin invalidation (6334-6335), and parts of the WAL block are missing.
 * Incomplete and not compilable as shown; only comments were added.
 *
 * Kills a tuple this transaction speculatively inserted, by invalidating
 * its xmin, so other backends stop waiting on the speculative token.
 */
6253void
6255{
6257 ItemId lp;
6258 HeapTupleData tp;
6259 Page page;
6260 BlockNumber block;
6261 Buffer buffer;
6262
6264
6265 block = ItemPointerGetBlockNumber(tid);
6266 buffer = ReadBuffer(relation, block);
6267 page = BufferGetPage(buffer);
6268
6270
6271 /*
6272 * Page can't be all visible, we just inserted into it, and are still
6273 * running.
6274 */
6275 Assert(!PageIsAllVisible(page));
6276
6279
6280 tp.t_tableOid = RelationGetRelid(relation);
6281 tp.t_data = (HeapTupleHeader) PageGetItem(page, lp);
6282 tp.t_len = ItemIdGetLength(lp);
6283 tp.t_self = *tid;
6284
6285 /*
6286 * Sanity check that the tuple really is a speculatively inserted tuple,
6287 * inserted by us.
6288 */
6289 if (tp.t_data->t_choice.t_heap.t_xmin != xid)
6290 elog(ERROR, "attempted to kill a tuple inserted by another transaction")
6291 if (!(IsToastRelation(relation) || HeapTupleHeaderIsSpeculative(tp.t_data)))
6292 elog(ERROR, "attempted to kill a non-speculative tuple");
6294
6295 /*
6296 * No need to check for serializable conflicts here. There is never a
6297 * need for a combo CID, either. No need to extract replica identity, or
6298 * do anything special with infomask bits.
6299 */
6300
6302
6303 /*
6304 * The tuple will become DEAD immediately. Flag that this page is a
6305 * candidate for pruning by setting xmin to TransactionXmin. While not
6306 * immediately prunable, it is the oldest xid we can cheaply determine
6307 * that's safe against wraparound / being older than the table's
6308 * relfrozenxid. To defend against the unlikely case of a new relation
6309 * having a newer relfrozenxid than our TransactionXmin, use relfrozenxid
6310 * if so (vacuum can't subsequently move relfrozenxid to beyond
6311 * TransactionXmin, so there's no race here).
6312 */
6314 {
6315 TransactionId relfrozenxid = relation->rd_rel->relfrozenxid;
6317
6318 if (TransactionIdPrecedes(TransactionXmin, relfrozenxid))
6319 prune_xid = relfrozenxid;
6320 else
6323 }
6324
6325 /* store transaction information of xact deleting the tuple */
6328
6329 /*
6330 * Set the tuple header xmin to InvalidTransactionId. This makes the
6331 * tuple immediately invisible to everyone. (In particular, to any
6332 * transactions waiting on the speculative token, woken up later.)
6333 */
6335
6336 /* Clear the speculative insertion token too */
6337 tp.t_data->t_ctid = tp.t_self;
6338
6339 MarkBufferDirty(buffer);
6340
6341 /*
6342 * XLOG stuff
6343 *
6344 * The WAL records generated here match heap_delete(). The same recovery
6345 * routines are used.
6346 */
6347 if (RelationNeedsWAL(relation))
6348 {
6351
6353 xlrec.infobits_set = compute_infobits(tp.t_data->t_infomask,
6354 tp.t_data->t_infomask2);
6356 xlrec.xmax = xid;
6357
6361
6362 /* No replica identity & replication origin logged */
6363
6365
6366 PageSetLSN(page, recptr);
6367 }
6368
6370
6372
6373 if (HeapTupleHasExternal(&tp))
6374 {
6375 Assert(!IsToastRelation(relation));
6376 heap_toast_delete(relation, &tp, true);
6377 }
6378
6379 /*
6380 * Never need to mark tuple for invalidation, since catalogs don't support
6381 * speculative insertion
6382 */
6383
6384 /* Now we can release the buffer */
6385 ReleaseBuffer(buffer);
6386
6387 /* count deletion, as we counted the insertion too */
6388 pgstat_count_heap_delete(relation);
6389}
6390
6391/*
6392 * heap_inplace_lock - protect inplace update from concurrent heap_update()
6393 *
6394 * Evaluate whether the tuple's state is compatible with a no-key update.
6395 * Current transaction rowmarks are fine, as is KEY SHARE from any
6396 * transaction. If compatible, return true with the buffer exclusive-locked,
6397 * and the caller must release that by calling
6398 * heap_inplace_update_and_unlock(), calling heap_inplace_unlock(), or raising
6399 * an error. Otherwise, call release_callback(arg), wait for blocking
6400 * transactions to end, and return false.
6401 *
6402 * Since this is intended for system catalogs and SERIALIZABLE doesn't cover
6403 * DDL, this doesn't guarantee any particular predicate locking.
6404 *
6405 * heap_delete() is a rarer source of blocking transactions (xwait). We'll
6406 * wait for such a transaction just like for the normal heap_update() case.
6407 * Normal concurrent DROP commands won't cause that, because all inplace
6408 * updaters take some lock that conflicts with DROP. An explicit SQL "DELETE
6409 * FROM pg_class" can cause it. By waiting, if the concurrent transaction
6410 * executed both "DELETE FROM pg_class" and "INSERT INTO pg_class", our caller
6411 * can find the successor tuple.
6412 *
6413 * Readers of inplace-updated fields expect changes to those fields are
6414 * durable. For example, vac_truncate_clog() reads datfrozenxid from
6415 * pg_database tuples via catalog snapshots. A future snapshot must not
6416 * return a lower datfrozenxid for the same database OID (lower in the
6417 * FullTransactionIdPrecedes() sense). We achieve that since no update of a
6418 * tuple can start while we hold a lock on its buffer. In cases like
6419 * BEGIN;GRANT;CREATE INDEX;COMMIT we're inplace-updating a tuple visible only
6420 * to this transaction. ROLLBACK then is one case where it's okay to lose
6421 * inplace updates. (Restoring relhasindex=false on ROLLBACK is fine, since
6422 * any concurrent CREATE INDEX would have blocked, then inplace-updated the
6423 * committed tuple.)
6424 *
6425 * In principle, we could avoid waiting by overwriting every tuple in the
6426 * updated tuple chain. Reader expectations permit updating a tuple only if
6427 * it's aborted, is the tail of the chain, or we already updated the tuple
6428 * referenced in its t_ctid. Hence, we would need to overwrite the tuples in
6429 * order from tail to head. That would imply either (a) mutating all tuples
6430 * in one critical section or (b) accepting a chance of partial completion.
6431 * Partial completion of a relfrozenxid update would have the weird
6432 * consequence that the table's next VACUUM could see the table's relfrozenxid
6433 * move forward between vacuum_get_cutoffs() and finishing.
6434 */
/*
 * NOTE(review): lossy extraction -- the first parameter line (6436), the
 * cache-inval registration (6462), the buffer lock (6465), the
 * HeapTupleSatisfiesUpdate call (6476), the xwait/infomask declarations
 * (6500-6504), and the branch conditions inside the TM_BeingModified arm
 * (6506-6533) are missing.  Incomplete and not compilable as shown; only
 * comments were added.  See the function header above for the contract.
 */
6435bool
6437 HeapTuple oldtup_ptr, Buffer buffer,
6438 void (*release_callback) (void *), void *arg)
6439{
6440 HeapTupleData oldtup = *oldtup_ptr; /* minimize diff vs. heap_update() */
6441 TM_Result result;
6442 bool ret;
6443
6444#ifdef USE_ASSERT_CHECKING
6445 if (RelationGetRelid(relation) == RelationRelationId)
6447#endif
6448
6449 Assert(BufferIsValid(buffer));
6450
6451 /*
6452 * Register shared cache invals if necessary. Other sessions may finish
6453 * inplace updates of this tuple between this step and LockTuple(). Since
6454 * inplace updates don't change cache keys, that's harmless.
6455 *
6456 * While it's tempting to register invals only after confirming we can
6457 * return true, the following obstacle precludes reordering steps that
6458 * way. Registering invals might reach a CatalogCacheInitializeCache()
6459 * that locks "buffer". That would hang indefinitely if running after our
6460 * own LockBuffer(). Hence, we must register invals before LockBuffer().
6461 */
6463
6464 LockTuple(relation, &oldtup.t_self, InplaceUpdateTupleLock);
6466
6467 /*----------
6468 * Interpret HeapTupleSatisfiesUpdate() like heap_update() does, except:
6469 *
6470 * - wait unconditionally
6471 * - already locked tuple above, since inplace needs that unconditionally
6472 * - don't recheck header after wait: simpler to defer to next iteration
6473 * - don't try to continue even if the updater aborts: likewise
6474 * - no crosscheck
6475 */
6477 buffer);
6478
6479 if (result == TM_Invisible)
6480 {
6481 /* no known way this can happen */
6482 ereport(ERROR,
6484 errmsg_internal("attempted to overwrite invisible tuple")));
6485 }
6486 else if (result == TM_SelfModified)
6487 {
6488 /*
6489 * CREATE INDEX might reach this if an expression is silly enough to
6490 * call e.g. SELECT ... FROM pg_class FOR SHARE. C code of other SQL
6491 * statements might get here after a heap_update() of the same row, in
6492 * the absence of an intervening CommandCounterIncrement().
6493 */
6494 ereport(ERROR,
6496 errmsg("tuple to be updated was already modified by an operation triggered by the current command")));
6497 }
6498 else if (result == TM_BeingModified)
6499 {
6502
6504 infomask = oldtup.t_data->t_infomask;
6505
6507 {
6510 int remain;
6511
6513 lockmode, NULL))
6514 {
6517 ret = false;
6519 relation, &oldtup.t_self, XLTW_Update,
6520 &remain);
6521 }
6522 else
6523 ret = true;
6524 }
6526 ret = true;
6528 ret = true;
6529 else
6530 {
6533 ret = false;
6534 XactLockTableWait(xwait, relation, &oldtup.t_self,
6535 XLTW_Update);
6536 }
6537 }
6538 else
6539 {
6540 ret = (result == TM_Ok);
6541 if (!ret)
6542 {
6545 }
6546 }
6547
6548 /*
6549 * GetCatalogSnapshot() relies on invalidation messages to know when to
6550 * take a new snapshot. COMMIT of xwait is responsible for sending the
6551 * invalidation. We're not acquiring heavyweight locks sufficient to
6552 * block if not yet sent, so we must take a new snapshot to ensure a later
6553 * attempt has a fair chance. While we don't need this if xwait aborted,
6554 * don't bother optimizing that.
6555 */
6556 if (!ret)
6557 {
6558 UnlockTuple(relation, &oldtup.t_self, InplaceUpdateTupleLock);
6561 }
6562 return ret;
6563}
6564
6565/*
6566 * heap_inplace_update_and_unlock - core of systable_inplace_update_finish
6567 *
6568 * The tuple cannot change size, and therefore its header fields and null
6569 * bitmap (if any) don't change either.
6570 *
6571 * Since we hold LOCKTAG_TUPLE, no updater has a local copy of this tuple.
6572 */
/*
 * NOTE(review): lossy extraction -- the name line (6574), the invalmsgs
 * local (6584), the inval-collection call (6597-6598), relcache-init-file
 * handling (6609), the DELAY_CHKPT / critical-section bracketing
 * (6642-6645, 6695), the XLogInsert and copied_buffer setup (6649-6650,
 * 6655, 6659, 6674-6677, 6686), and the inval send / buffer unlock lines
 * (6701, 6703-6704, 6715) are missing.  Incomplete and not compilable as
 * shown; only comments were added.
 *
 * Overwrites a same-size tuple body in place, WAL-logging first (see the
 * long comment below for why WAL precedes the memcpy), then sends
 * invalidations and releases the tuple lock.
 */
6573void
6575 HeapTuple oldtup, HeapTuple tuple,
6576 Buffer buffer)
6577{
6578 HeapTupleHeader htup = oldtup->t_data;
6579 uint32 oldlen;
6580 uint32 newlen;
6581 char *dst;
6582 char *src;
6583 int nmsgs = 0;
6585 bool RelcacheInitFileInval = false;
6586
6587 Assert(ItemPointerEquals(&oldtup->t_self, &tuple->t_self));
6588 oldlen = oldtup->t_len - htup->t_hoff;
6589 newlen = tuple->t_len - tuple->t_data->t_hoff;
6590 if (oldlen != newlen || htup->t_hoff != tuple->t_data->t_hoff)
6591 elog(ERROR, "wrong tuple length");
6592
6593 dst = (char *) htup + htup->t_hoff;
6594 src = (char *) tuple->t_data + tuple->t_data->t_hoff;
6595
6596 /* Like RecordTransactionCommit(), log only if needed */
6599 &RelcacheInitFileInval);
6600
6601 /*
6602 * Unlink relcache init files as needed. If unlinking, acquire
6603 * RelCacheInitLock until after associated invalidations. By doing this
6604 * in advance, if we checkpoint and then crash between inplace
6605 * XLogInsert() and inval, we don't rely on StartupXLOG() ->
6606 * RelationCacheInitFileRemove(). That uses elevel==LOG, so replay would
6607 * neglect to PANIC on EIO.
6608 */
6610
6611 /*----------
6612 * NO EREPORT(ERROR) from here till changes are complete
6613 *
6614 * Our buffer lock won't stop a reader having already pinned and checked
6615 * visibility for this tuple. Hence, we write WAL first, then mutate the
6616 * buffer. Like in MarkBufferDirtyHint() or RecordTransactionCommit(),
6617 * checkpoint delay makes that acceptable. With the usual order of
6618 * changes, a crash after memcpy() and before XLogInsert() could allow
6619 * datfrozenxid to overtake relfrozenxid:
6620 *
6621 * ["D" is a VACUUM (ONLY_DATABASE_STATS)]
6622 * ["R" is a VACUUM tbl]
6623 * D: vac_update_datfrozenxid() -> systable_beginscan(pg_class)
6624 * D: systable_getnext() returns pg_class tuple of tbl
6625 * R: memcpy() into pg_class tuple of tbl
6626 * D: raise pg_database.datfrozenxid, XLogInsert(), finish
6627 * [crash]
6628 * [recovery restores datfrozenxid w/o relfrozenxid]
6629 *
6630 * Mimic MarkBufferDirtyHint() subroutine XLogSaveBufferForHint().
6631 * Specifically, use DELAY_CHKPT_START, and copy the buffer to the stack.
6632 * The stack copy facilitates a FPI of the post-mutation block before we
6633 * accept other sessions seeing it. DELAY_CHKPT_START allows us to
6634 * XLogInsert() before MarkBufferDirty(). Since XLogSaveBufferForHint()
6635 * can operate under BUFFER_LOCK_SHARED, it can't avoid DELAY_CHKPT_START.
6636 * This function, however, likely could avoid it with the following order
6637 * of operations: MarkBufferDirty(), XLogInsert(), memcpy(). Opt to use
6638 * DELAY_CHKPT_START here, too, as a way to have fewer distinct code
6639 * patterns to analyze. Inplace update isn't so frequent that it should
6640 * pursue the small optimization of skipping DELAY_CHKPT_START.
6641 */
6645
6646 /* XLOG stuff */
6647 if (RelationNeedsWAL(relation))
6648 {
6651 char *origdata = (char *) BufferGetBlock(buffer);
6652 Page page = BufferGetPage(buffer);
6653 uint16 lower = ((PageHeader) page)->pd_lower;
6654 uint16 upper = ((PageHeader) page)->pd_upper;
6656 RelFileLocator rlocator;
6657 ForkNumber forkno;
6658 BlockNumber blkno;
6660
6661 xlrec.offnum = ItemPointerGetOffsetNumber(&tuple->t_self);
6662 xlrec.dbId = MyDatabaseId;
6664 xlrec.relcacheInitFileInval = RelcacheInitFileInval;
6665 xlrec.nmsgs = nmsgs;
6666
6669 if (nmsgs != 0)
6671 nmsgs * sizeof(SharedInvalidationMessage));
6672
6673 /* register block matching what buffer will look like after changes */
6678 BufferGetTag(buffer, &rlocator, &forkno, &blkno);
6679 Assert(forkno == MAIN_FORKNUM);
6680 XLogRegisterBlock(0, &rlocator, forkno, blkno, copied_buffer.data,
6682 XLogRegisterBufData(0, src, newlen);
6683
6684 /* inplace updates aren't decoded atm, don't log the origin */
6685
6687
6688 PageSetLSN(page, recptr);
6689 }
6690
6691 memcpy(dst, src, newlen);
6692
6693 MarkBufferDirty(buffer);
6694
6696
6697 /*
6698 * Send invalidations to shared queue. SearchSysCacheLocked1() assumes we
6699 * do this before UnlockTuple().
6700 */
6702
6705 UnlockTuple(relation, &tuple->t_self, InplaceUpdateTupleLock);
6706
6707 AcceptInvalidationMessages(); /* local processing of just-sent inval */
6708
6709 /*
6710 * Queue a transactional inval, for logical decoding and for third-party
6711 * code that might have been relying on it since long before inplace
6712 * update adopted immediate invalidation. See README.tuplock section
6713 * "Reading inplace-updated columns" for logical decoding details.
6714 */
6716 CacheInvalidateHeapTuple(relation, tuple, NULL);
6717}
6718
6719/*
6720 * heap_inplace_unlock - reverse of heap_inplace_lock
6721 */
/*
 * NOTE(review): lossy extraction -- the name/first-parameter line (6723)
 * and the buffer-unlock line (6726) are missing.  Incomplete and not
 * compilable as shown; only comments were added.
 *
 * Releases the tuple lock taken by heap_inplace_lock without applying any
 * in-place change.
 */
6722void
6724 HeapTuple oldtup, Buffer buffer)
6725{
6727 UnlockTuple(relation, &oldtup->t_self, InplaceUpdateTupleLock);
6729}
6730
6731#define FRM_NOOP 0x0001
6732#define FRM_INVALIDATE_XMAX 0x0002
6733#define FRM_RETURN_IS_XID 0x0004
6734#define FRM_RETURN_IS_MULTI 0x0008
6735#define FRM_MARK_COMMITTED 0x0010
6736
6737/*
6738 * FreezeMultiXactId
6739 * Determine what to do during freezing when a tuple is marked by a
6740 * MultiXactId.
6741 *
 * "flags" is an output value; it's used to tell caller what to do on return.
 * "pagefrz" is an input/output value, used to manage page level freezing.
 *
 * Possible values that we can set in "flags":
 * FRM_NOOP
 *		don't do anything -- keep existing Xmax
 * FRM_INVALIDATE_XMAX
 *		mark Xmax as InvalidTransactionId and set XMAX_INVALID flag.
 * FRM_RETURN_IS_XID
 *		The Xid return value is a single update Xid to set as xmax.
 * FRM_MARK_COMMITTED
 *		Xmax can be marked as HEAP_XMAX_COMMITTED
 * FRM_RETURN_IS_MULTI
 *		The return value is a new MultiXactId to set as new Xmax.
 *		(caller must obtain proper infomask bits using GetMultiXactIdHintBits)
 *
 * Caller delegates control of page freezing to us.  In practice we always
 * force freezing of caller's page unless FRM_NOOP processing is indicated.
 * We help caller ensure that XIDs < FreezeLimit and MXIDs < MultiXactCutoff
 * can never be left behind.  We freely choose when and how to process each
 * Multi, without ever violating the cutoff postconditions for freezing.
 *
 * It's useful to remove Multis on a proactive timeline (relative to freezing
 * XIDs) to keep MultiXact member SLRU buffer misses to a minimum.  It can also
 * be cheaper in the short run, for us, since we too can avoid SLRU buffer
 * misses through eager processing.
 *
 * NB: Creates a _new_ MultiXactId when FRM_RETURN_IS_MULTI is set, though only
 * when FreezeLimit and/or MultiXactCutoff cutoffs leave us with no choice.
 * This can usually be put off, which is usually enough to avoid it altogether.
 * Allocating new multis during VACUUM should be avoided on general principle;
 * only VACUUM can advance relminmxid, so allocating new Multis here comes with
 * its own special risks.
 *
 * NB: Caller must maintain "no freeze" NewRelfrozenXid/NewRelminMxid trackers
 * using heap_tuple_should_freeze when we haven't forced page-level freezing.
 *
 * NB: Caller should avoid needlessly calling heap_tuple_should_freeze when we
 * have already forced page-level freezing, since that might incur the same
 * SLRU buffer misses that we specifically intended to avoid by freezing.
 */
6783static TransactionId
6784FreezeMultiXactId(MultiXactId multi, uint16 t_infomask,
6785 const struct VacuumCutoffs *cutoffs, uint16 *flags,
6786 HeapPageFreeze *pagefrz)
6787{
6789 MultiXactMember *members;
6790 int nmembers;
6791 bool need_replace;
6792 int nnewmembers;
6794 bool has_lockers;
6796 bool update_committed;
6797 TransactionId FreezePageRelfrozenXid;
6798
6799 *flags = 0;
6800
6801 /* We should only be called in Multis */
6802 Assert(t_infomask & HEAP_XMAX_IS_MULTI);
6803
6804 if (!MultiXactIdIsValid(multi) ||
6805 HEAP_LOCKED_UPGRADED(t_infomask))
6806 {
6807 *flags |= FRM_INVALIDATE_XMAX;
6808 pagefrz->freeze_required = true;
6809 return InvalidTransactionId;
6810 }
6811 else if (MultiXactIdPrecedes(multi, cutoffs->relminmxid))
6812 ereport(ERROR,
6814 errmsg_internal("found multixact %u from before relminmxid %u",
6815 multi, cutoffs->relminmxid)));
6816 else if (MultiXactIdPrecedes(multi, cutoffs->OldestMxact))
6817 {
6819
6820 /*
6821 * This old multi cannot possibly have members still running, but
6822 * verify just in case. If it was a locker only, it can be removed
6823 * without any further consideration; but if it contained an update,
6824 * we might need to preserve it.
6825 */
6826 if (MultiXactIdIsRunning(multi,
6827 HEAP_XMAX_IS_LOCKED_ONLY(t_infomask)))
6828 ereport(ERROR,
6830 errmsg_internal("multixact %u from before multi freeze cutoff %u found to be still running",
6831 multi, cutoffs->OldestMxact)));
6832
6833 if (HEAP_XMAX_IS_LOCKED_ONLY(t_infomask))
6834 {
6835 *flags |= FRM_INVALIDATE_XMAX;
6836 pagefrz->freeze_required = true;
6837 return InvalidTransactionId;
6838 }
6839
6840 /* replace multi with single XID for its updater? */
6841 update_xact = MultiXactIdGetUpdateXid(multi, t_infomask);
6843 ereport(ERROR,
6845 errmsg_internal("multixact %u contains update XID %u from before relfrozenxid %u",
6846 multi, update_xact,
6847 cutoffs->relfrozenxid)));
6848 else if (TransactionIdPrecedes(update_xact, cutoffs->OldestXmin))
6849 {
6850 /*
6851 * Updater XID has to have aborted (otherwise the tuple would have
6852 * been pruned away instead, since updater XID is < OldestXmin).
6853 * Just remove xmax.
6854 */
6856 ereport(ERROR,
6858 errmsg_internal("multixact %u contains committed update XID %u from before removable cutoff %u",
6859 multi, update_xact,
6860 cutoffs->OldestXmin)));
6861 *flags |= FRM_INVALIDATE_XMAX;
6862 pagefrz->freeze_required = true;
6863 return InvalidTransactionId;
6864 }
6865
6866 /* Have to keep updater XID as new xmax */
6867 *flags |= FRM_RETURN_IS_XID;
6868 pagefrz->freeze_required = true;
6869 return update_xact;
6870 }
6871
6872 /*
6873 * Some member(s) of this Multi may be below FreezeLimit xid cutoff, so we
6874 * need to walk the whole members array to figure out what to do, if
6875 * anything.
6876 */
6877 nmembers =
6878 GetMultiXactIdMembers(multi, &members, false,
6879 HEAP_XMAX_IS_LOCKED_ONLY(t_infomask));
6880 if (nmembers <= 0)
6881 {
6882 /* Nothing worth keeping */
6883 *flags |= FRM_INVALIDATE_XMAX;
6884 pagefrz->freeze_required = true;
6885 return InvalidTransactionId;
6886 }
6887
6888 /*
6889 * The FRM_NOOP case is the only case where we might need to ratchet back
6890 * FreezePageRelfrozenXid or FreezePageRelminMxid. It is also the only
6891 * case where our caller might ratchet back its NoFreezePageRelfrozenXid
6892 * or NoFreezePageRelminMxid "no freeze" trackers to deal with a multi.
6893 * FRM_NOOP handling should result in the NewRelfrozenXid/NewRelminMxid
6894 * trackers managed by VACUUM being ratcheting back by xmax to the degree
6895 * required to make it safe to leave xmax undisturbed, independent of
6896 * whether or not page freezing is triggered somewhere else.
6897 *
6898 * Our policy is to force freezing in every case other than FRM_NOOP,
6899 * which obviates the need to maintain either set of trackers, anywhere.
6900 * Every other case will reliably execute a freeze plan for xmax that
6901 * either replaces xmax with an XID/MXID >= OldestXmin/OldestMxact, or
6902 * sets xmax to an InvalidTransactionId XID, rendering xmax fully frozen.
6903 * (VACUUM's NewRelfrozenXid/NewRelminMxid trackers are initialized with
6904 * OldestXmin/OldestMxact, so later values never need to be tracked here.)
6905 */
6906 need_replace = false;
6907 FreezePageRelfrozenXid = pagefrz->FreezePageRelfrozenXid;
6908 for (int i = 0; i < nmembers; i++)
6909 {
6910 TransactionId xid = members[i].xid;
6911
6912 Assert(!TransactionIdPrecedes(xid, cutoffs->relfrozenxid));
6913
6914 if (TransactionIdPrecedes(xid, cutoffs->FreezeLimit))
6915 {
6916 /* Can't violate the FreezeLimit postcondition */
6917 need_replace = true;
6918 break;
6919 }
6920 if (TransactionIdPrecedes(xid, FreezePageRelfrozenXid))
6921 FreezePageRelfrozenXid = xid;
6922 }
6923
6924 /* Can't violate the MultiXactCutoff postcondition, either */
6925 if (!need_replace)
6927
6928 if (!need_replace)
6929 {
6930 /*
6931 * vacuumlazy.c might ratchet back NewRelminMxid, NewRelfrozenXid, or
6932 * both together to make it safe to retain this particular multi after
6933 * freezing its page
6934 */
6935 *flags |= FRM_NOOP;
6936 pagefrz->FreezePageRelfrozenXid = FreezePageRelfrozenXid;
6937 if (MultiXactIdPrecedes(multi, pagefrz->FreezePageRelminMxid))
6938 pagefrz->FreezePageRelminMxid = multi;
6939 pfree(members);
6940 return multi;
6941 }
6942
6943 /*
6944 * Do a more thorough second pass over the multi to figure out which
6945 * member XIDs actually need to be kept. Checking the precise status of
6946 * individual members might even show that we don't need to keep anything.
6947 * That is quite possible even though the Multi must be >= OldestMxact,
6948 * since our second pass only keeps member XIDs when it's truly necessary;
6949 * even member XIDs >= OldestXmin often won't be kept by second pass.
6950 */
6951 nnewmembers = 0;
6953 has_lockers = false;
6955 update_committed = false;
6956
6957 /*
6958 * Determine whether to keep each member xid, or to ignore it instead
6959 */
6960 for (int i = 0; i < nmembers; i++)
6961 {
6962 TransactionId xid = members[i].xid;
6963 MultiXactStatus mstatus = members[i].status;
6964
6965 Assert(!TransactionIdPrecedes(xid, cutoffs->relfrozenxid));
6966
6967 if (!ISUPDATE_from_mxstatus(mstatus))
6968 {
6969 /*
6970 * Locker XID (not updater XID). We only keep lockers that are
6971 * still running.
6972 */
6975 {
6976 if (TransactionIdPrecedes(xid, cutoffs->OldestXmin))
6977 ereport(ERROR,
6979 errmsg_internal("multixact %u contains running locker XID %u from before removable cutoff %u",
6980 multi, xid,
6981 cutoffs->OldestXmin)));
6982 newmembers[nnewmembers++] = members[i];
6983 has_lockers = true;
6984 }
6985
6986 continue;
6987 }
6988
6989 /*
6990 * Updater XID (not locker XID). Should we keep it?
6991 *
6992 * Since the tuple wasn't totally removed when vacuum pruned, the
6993 * update Xid cannot possibly be older than OldestXmin cutoff unless
6994 * the updater XID aborted. If the updater transaction is known
6995 * aborted or crashed then it's okay to ignore it, otherwise not.
6996 *
6997 * In any case the Multi should never contain two updaters, whatever
6998 * their individual commit status. Check for that first, in passing.
6999 */
7001 ereport(ERROR,
7003 errmsg_internal("multixact %u has two or more updating members",
7004 multi),
7005 errdetail_internal("First updater XID=%u second updater XID=%u.",
7006 update_xid, xid)));
7007
7008 /*
7009 * As with all tuple visibility routines, it's critical to test
7010 * TransactionIdIsInProgress before TransactionIdDidCommit, because of
7011 * race conditions explained in detail in heapam_visibility.c.
7012 */
7015 update_xid = xid;
7016 else if (TransactionIdDidCommit(xid))
7017 {
7018 /*
7019 * The transaction committed, so we can tell caller to set
7020 * HEAP_XMAX_COMMITTED. (We can only do this because we know the
7021 * transaction is not running.)
7022 */
7023 update_committed = true;
7024 update_xid = xid;
7025 }
7026 else
7027 {
7028 /*
7029 * Not in progress, not committed -- must be aborted or crashed;
7030 * we can ignore it.
7031 */
7032 continue;
7033 }
7034
7035 /*
7036 * We determined that updater must be kept -- add it to pending new
7037 * members list
7038 */
7039 if (TransactionIdPrecedes(xid, cutoffs->OldestXmin))
7040 ereport(ERROR,
7042 errmsg_internal("multixact %u contains committed update XID %u from before removable cutoff %u",
7043 multi, xid, cutoffs->OldestXmin)));
7044 newmembers[nnewmembers++] = members[i];
7045 }
7046
7047 pfree(members);
7048
7049 /*
7050 * Determine what to do with caller's multi based on information gathered
7051 * during our second pass
7052 */
7053 if (nnewmembers == 0)
7054 {
7055 /* Nothing worth keeping */
7056 *flags |= FRM_INVALIDATE_XMAX;
7058 }
7060 {
7061 /*
7062 * If there's a single member and it's an update, pass it back alone
7063 * without creating a new Multi. (XXX we could do this when there's a
7064 * single remaining locker, too, but that would complicate the API too
7065 * much; moreover, the case with the single updater is more
7066 * interesting, because those are longer-lived.)
7067 */
7068 Assert(nnewmembers == 1);
7069 *flags |= FRM_RETURN_IS_XID;
7070 if (update_committed)
7071 *flags |= FRM_MARK_COMMITTED;
7073 }
7074 else
7075 {
7076 /*
7077 * Create a new multixact with the surviving members of the previous
7078 * one, to set as new Xmax in the tuple
7079 */
7081 *flags |= FRM_RETURN_IS_MULTI;
7082 }
7083
7085
7086 pagefrz->freeze_required = true;
7087 return newxmax;
7088}
7089
7090/*
7091 * heap_prepare_freeze_tuple
7092 *
7093 * Check to see whether any of the XID fields of a tuple (xmin, xmax, xvac)
7094 * are older than the OldestXmin and/or OldestMxact freeze cutoffs. If so,
7095 * setup enough state (in the *frz output argument) to enable caller to
7096 * process this tuple as part of freezing its page, and return true. Return
7097 * false if nothing can be changed about the tuple right now.
7098 *
7099 * Also sets *totally_frozen to true if the tuple will be totally frozen once
7100 * caller executes returned freeze plan (or if the tuple was already totally
7101 * frozen by an earlier VACUUM). This indicates that there are no remaining
7102 * XIDs or MultiXactIds that will need to be processed by a future VACUUM.
7103 *
7104 * VACUUM caller must assemble HeapTupleFreeze freeze plan entries for every
7105 * tuple that we returned true for, and then execute freezing. Caller must
7106 * initialize pagefrz fields for page as a whole before first call here for
7107 * each heap page.
7108 *
7109 * VACUUM caller decides on whether or not to freeze the page as a whole.
7110 * We'll often prepare freeze plans for a page that caller just discards.
7111 * However, VACUUM doesn't always get to make a choice; it must freeze when
7112 * pagefrz.freeze_required is set, to ensure that any XIDs < FreezeLimit (and
7113 * MXIDs < MultiXactCutoff) can never be left behind. We help to make sure
7114 * that VACUUM always follows that rule.
7115 *
7116 * We sometimes force freezing of xmax MultiXactId values long before it is
7117 * strictly necessary to do so just to ensure the FreezeLimit postcondition.
7118 * It's worth processing MultiXactIds proactively when it is cheap to do so,
7119 * and it's convenient to make that happen by piggy-backing it on the "force
7120 * freezing" mechanism. Conversely, we sometimes delay freezing MultiXactIds
7121 * because it is expensive right now (though only when it's still possible to
7122 * do so without violating the FreezeLimit/MultiXactCutoff postcondition).
7123 *
7124 * It is assumed that the caller has checked the tuple with
7125 * HeapTupleSatisfiesVacuum() and determined that it is not HEAPTUPLE_DEAD
7126 * (else we should be removing the tuple, not freezing it).
7127 *
7128 * NB: This function has side effects: it might allocate a new MultiXactId.
7129 * It will be set as tuple's new xmax when our *frz output is processed within
7130 * heap_execute_freeze_tuple later on. If the tuple is in a shared buffer
7131 * then caller had better have an exclusive lock on it already.
7132 */
7133bool
7135 const struct VacuumCutoffs *cutoffs,
7136 HeapPageFreeze *pagefrz,
7138{
7139 bool xmin_already_frozen = false,
7140 xmax_already_frozen = false;
7141 bool freeze_xmin = false,
7142 replace_xvac = false,
7143 replace_xmax = false,
7144 freeze_xmax = false;
7145 TransactionId xid;
7146
7147 frz->xmax = HeapTupleHeaderGetRawXmax(tuple);
7148 frz->t_infomask2 = tuple->t_infomask2;
7149 frz->t_infomask = tuple->t_infomask;
7150 frz->frzflags = 0;
7151 frz->checkflags = 0;
7152
7153 /*
7154 * Process xmin, while keeping track of whether it's already frozen, or
7155 * will become frozen iff our freeze plan is executed by caller (could be
7156 * neither).
7157 */
7158 xid = HeapTupleHeaderGetXmin(tuple);
7159 if (!TransactionIdIsNormal(xid))
7160 xmin_already_frozen = true;
7161 else
7162 {
7163 if (TransactionIdPrecedes(xid, cutoffs->relfrozenxid))
7164 ereport(ERROR,
7166 errmsg_internal("found xmin %u from before relfrozenxid %u",
7167 xid, cutoffs->relfrozenxid)));
7168
7169 /* Will set freeze_xmin flags in freeze plan below */
7171
7172 /* Verify that xmin committed if and when freeze plan is executed */
7173 if (freeze_xmin)
7175 }
7176
7177 /*
7178 * Old-style VACUUM FULL is gone, but we have to process xvac for as long
7179 * as we support having MOVED_OFF/MOVED_IN tuples in the database
7180 */
7181 xid = HeapTupleHeaderGetXvac(tuple);
7182 if (TransactionIdIsNormal(xid))
7183 {
7185 Assert(TransactionIdPrecedes(xid, cutoffs->OldestXmin));
7186
7187 /*
7188 * For Xvac, we always freeze proactively. This allows totally_frozen
7189 * tracking to ignore xvac.
7190 */
7191 replace_xvac = pagefrz->freeze_required = true;
7192
7193 /* Will set replace_xvac flags in freeze plan below */
7194 }
7195
7196 /* Now process xmax */
7197 xid = frz->xmax;
7198 if (tuple->t_infomask & HEAP_XMAX_IS_MULTI)
7199 {
7200 /* Raw xmax is a MultiXactId */
7202 uint16 flags;
7203
7204 /*
7205 * We will either remove xmax completely (in the "freeze_xmax" path),
7206 * process xmax by replacing it (in the "replace_xmax" path), or
7207 * perform no-op xmax processing. The only constraint is that the
7208 * FreezeLimit/MultiXactCutoff postcondition must never be violated.
7209 */
7210 newxmax = FreezeMultiXactId(xid, tuple->t_infomask, cutoffs,
7211 &flags, pagefrz);
7212
7213 if (flags & FRM_NOOP)
7214 {
7215 /*
7216 * xmax is a MultiXactId, and nothing about it changes for now.
7217 * This is the only case where 'freeze_required' won't have been
7218 * set for us by FreezeMultiXactId, as well as the only case where
7219 * neither freeze_xmax nor replace_xmax are set (given a multi).
7220 *
7221 * This is a no-op, but the call to FreezeMultiXactId might have
7222 * ratcheted back NewRelfrozenXid and/or NewRelminMxid trackers
7223 * for us (the "freeze page" variants, specifically). That'll
7224 * make it safe for our caller to freeze the page later on, while
7225 * leaving this particular xmax undisturbed.
7226 *
7227 * FreezeMultiXactId is _not_ responsible for the "no freeze"
7228 * NewRelfrozenXid/NewRelminMxid trackers, though -- that's our
7229 * job. A call to heap_tuple_should_freeze for this same tuple
7230 * will take place below if 'freeze_required' isn't set already.
7231 * (This repeats work from FreezeMultiXactId, but allows "no
7232 * freeze" tracker maintenance to happen in only one place.)
7233 */
7236 }
7237 else if (flags & FRM_RETURN_IS_XID)
7238 {
7239 /*
7240 * xmax will become an updater Xid (original MultiXact's updater
7241 * member Xid will be carried forward as a simple Xid in Xmax).
7242 */
7244
7245 /*
7246 * NB -- some of these transformations are only valid because we
7247 * know the return Xid is a tuple updater (i.e. not merely a
7248 * locker.) Also note that the only reason we don't explicitly
7249 * worry about HEAP_KEYS_UPDATED is because it lives in
7250 * t_infomask2 rather than t_infomask.
7251 */
7252 frz->t_infomask &= ~HEAP_XMAX_BITS;
7253 frz->xmax = newxmax;
7254 if (flags & FRM_MARK_COMMITTED)
7255 frz->t_infomask |= HEAP_XMAX_COMMITTED;
7256 replace_xmax = true;
7257 }
7258 else if (flags & FRM_RETURN_IS_MULTI)
7259 {
7262
7263 /*
7264 * xmax is an old MultiXactId that we have to replace with a new
7265 * MultiXactId, to carry forward two or more original member XIDs.
7266 */
7268
7269 /*
7270 * We can't use GetMultiXactIdHintBits directly on the new multi
7271 * here; that routine initializes the masks to all zeroes, which
7272 * would lose other bits we need. Doing it this way ensures all
7273 * unrelated bits remain untouched.
7274 */
7275 frz->t_infomask &= ~HEAP_XMAX_BITS;
7276 frz->t_infomask2 &= ~HEAP_KEYS_UPDATED;
7278 frz->t_infomask |= newbits;
7279 frz->t_infomask2 |= newbits2;
7280 frz->xmax = newxmax;
7281 replace_xmax = true;
7282 }
7283 else
7284 {
7285 /*
7286 * Freeze plan for tuple "freezes xmax" in the strictest sense:
7287 * it'll leave nothing in xmax (neither an Xid nor a MultiXactId).
7288 */
7289 Assert(flags & FRM_INVALIDATE_XMAX);
7291
7292 /* Will set freeze_xmax flags in freeze plan below */
7293 freeze_xmax = true;
7294 }
7295
7296 /* MultiXactId processing forces freezing (barring FRM_NOOP case) */
7297 Assert(pagefrz->freeze_required || (!freeze_xmax && !replace_xmax));
7298 }
7299 else if (TransactionIdIsNormal(xid))
7300 {
7301 /* Raw xmax is normal XID */
7302 if (TransactionIdPrecedes(xid, cutoffs->relfrozenxid))
7303 ereport(ERROR,
7305 errmsg_internal("found xmax %u from before relfrozenxid %u",
7306 xid, cutoffs->relfrozenxid)));
7307
7308 /* Will set freeze_xmax flags in freeze plan below */
7310
7311 /*
7312 * Verify that xmax aborted if and when freeze plan is executed,
7313 * provided it's from an update. (A lock-only xmax can be removed
7314 * independent of this, since the lock is released at xact end.)
7315 */
7317 frz->checkflags |= HEAP_FREEZE_CHECK_XMAX_ABORTED;
7318 }
7319 else if (!TransactionIdIsValid(xid))
7320 {
7321 /* Raw xmax is InvalidTransactionId XID */
7322 Assert((tuple->t_infomask & HEAP_XMAX_IS_MULTI) == 0);
7323 xmax_already_frozen = true;
7324 }
7325 else
7326 ereport(ERROR,
7328 errmsg_internal("found raw xmax %u (infomask 0x%04x) not invalid and not multi",
7329 xid, tuple->t_infomask)));
7330
7331 if (freeze_xmin)
7332 {
7334
7335 frz->t_infomask |= HEAP_XMIN_FROZEN;
7336 }
7337 if (replace_xvac)
7338 {
7339 /*
7340 * If a MOVED_OFF tuple is not dead, the xvac transaction must have
7341 * failed; whereas a non-dead MOVED_IN tuple must mean the xvac
7342 * transaction succeeded.
7343 */
7344 Assert(pagefrz->freeze_required);
7345 if (tuple->t_infomask & HEAP_MOVED_OFF)
7346 frz->frzflags |= XLH_INVALID_XVAC;
7347 else
7348 frz->frzflags |= XLH_FREEZE_XVAC;
7349 }
7350 if (replace_xmax)
7351 {
7353 Assert(pagefrz->freeze_required);
7354
7355 /* Already set replace_xmax flags in freeze plan earlier */
7356 }
7357 if (freeze_xmax)
7358 {
7360
7361 frz->xmax = InvalidTransactionId;
7362
7363 /*
7364 * The tuple might be marked either XMAX_INVALID or XMAX_COMMITTED +
7365 * LOCKED. Normalize to INVALID just to be sure no one gets confused.
7366 * Also get rid of the HEAP_KEYS_UPDATED bit.
7367 */
7368 frz->t_infomask &= ~HEAP_XMAX_BITS;
7369 frz->t_infomask |= HEAP_XMAX_INVALID;
7370 frz->t_infomask2 &= ~HEAP_HOT_UPDATED;
7371 frz->t_infomask2 &= ~HEAP_KEYS_UPDATED;
7372 }
7373
7374 /*
7375 * Determine if this tuple is already totally frozen, or will become
7376 * totally frozen (provided caller executes freeze plans for the page)
7377 */
7380
7381 if (!pagefrz->freeze_required && !(xmin_already_frozen &&
7383 {
7384 /*
7385 * So far no previous tuple from the page made freezing mandatory.
7386 * Does this tuple force caller to freeze the entire page?
7387 */
7388 pagefrz->freeze_required =
7389 heap_tuple_should_freeze(tuple, cutoffs,
7390 &pagefrz->NoFreezePageRelfrozenXid,
7391 &pagefrz->NoFreezePageRelminMxid);
7392 }
7393
7394 /* Tell caller if this tuple has a usable freeze plan set in *frz */
7396}
7397
7398/*
7399 * Perform xmin/xmax XID status sanity checks before actually executing freeze
7400 * plans.
7401 *
7402 * heap_prepare_freeze_tuple doesn't perform these checks directly because
7403 * pg_xact lookups are relatively expensive. They shouldn't be repeated by
7404 * successive VACUUMs that each decide against freezing the same page.
7405 */
7406void
7408 HeapTupleFreeze *tuples, int ntuples)
7409{
7410 Page page = BufferGetPage(buffer);
7411
7412 for (int i = 0; i < ntuples; i++)
7413 {
7414 HeapTupleFreeze *frz = tuples + i;
7415 ItemId itemid = PageGetItemId(page, frz->offset);
7416 HeapTupleHeader htup;
7417
7418 htup = (HeapTupleHeader) PageGetItem(page, itemid);
7419
7420 /* Deliberately avoid relying on tuple hint bits here */
7421 if (frz->checkflags & HEAP_FREEZE_CHECK_XMIN_COMMITTED)
7422 {
7424
7426 if (unlikely(!TransactionIdDidCommit(xmin)))
7427 ereport(ERROR,
7429 errmsg_internal("uncommitted xmin %u needs to be frozen",
7430 xmin)));
7431 }
7432
7433 /*
7434 * TransactionIdDidAbort won't work reliably in the presence of XIDs
7435 * left behind by transactions that were in progress during a crash,
7436 * so we can only check that xmax didn't commit
7437 */
7438 if (frz->checkflags & HEAP_FREEZE_CHECK_XMAX_ABORTED)
7439 {
7441
7444 ereport(ERROR,
7446 errmsg_internal("cannot freeze committed xmax %u",
7447 xmax)));
7448 }
7449 }
7450}
7451
7452/*
7453 * Helper which executes freezing of one or more heap tuples on a page on
7454 * behalf of caller. Caller passes an array of tuple plans from
7455 * heap_prepare_freeze_tuple. Caller must set 'offset' in each plan for us.
7456 * Must be called in a critical section that also marks the buffer dirty and,
7457 * if needed, emits WAL.
7458 */
7459void
7460heap_freeze_prepared_tuples(Buffer buffer, HeapTupleFreeze *tuples, int ntuples)
7461{
7462 Page page = BufferGetPage(buffer);
7463
7464 for (int i = 0; i < ntuples; i++)
7465 {
7466 HeapTupleFreeze *frz = tuples + i;
7467 ItemId itemid = PageGetItemId(page, frz->offset);
7468 HeapTupleHeader htup;
7469
7470 htup = (HeapTupleHeader) PageGetItem(page, itemid);
7472 }
7473}
7474
7475/*
7476 * heap_freeze_tuple
7477 * Freeze tuple in place, without WAL logging.
7478 *
7479 * Useful for callers like CLUSTER that perform their own WAL logging.
7480 */
7481bool
7483 TransactionId relfrozenxid, TransactionId relminmxid,
7484 TransactionId FreezeLimit, TransactionId MultiXactCutoff)
7485{
7487 bool do_freeze;
7488 bool totally_frozen;
7489 struct VacuumCutoffs cutoffs;
7490 HeapPageFreeze pagefrz;
7491
7492 cutoffs.relfrozenxid = relfrozenxid;
7493 cutoffs.relminmxid = relminmxid;
7494 cutoffs.OldestXmin = FreezeLimit;
7495 cutoffs.OldestMxact = MultiXactCutoff;
7496 cutoffs.FreezeLimit = FreezeLimit;
7498
7499 pagefrz.freeze_required = true;
7500 pagefrz.FreezePageRelfrozenXid = FreezeLimit;
7501 pagefrz.FreezePageRelminMxid = MultiXactCutoff;
7502 pagefrz.NoFreezePageRelfrozenXid = FreezeLimit;
7503 pagefrz.NoFreezePageRelminMxid = MultiXactCutoff;
7504
7505 do_freeze = heap_prepare_freeze_tuple(tuple, &cutoffs,
7506 &pagefrz, &frz, &totally_frozen);
7507
7508 /*
7509 * Note that because this is not a WAL-logged operation, we don't need to
7510 * fill in the offset in the freeze record.
7511 */
7512
7513 if (do_freeze)
7515 return do_freeze;
7516}
7517
7518/*
7519 * For a given MultiXactId, return the hint bits that should be set in the
7520 * tuple's infomask.
7521 *
7522 * Normally this should be called for a multixact that was just created, and
7523 * so is on our local cache, so the GetMembers call is fast.
7524 */
7525static void
7528{
7529 int nmembers;
7530 MultiXactMember *members;
7531 int i;
7533 uint16 bits2 = 0;
7534 bool has_update = false;
7536
7537 /*
7538 * We only use this in multis we just created, so they cannot be values
7539 * pre-pg_upgrade.
7540 */
7541 nmembers = GetMultiXactIdMembers(multi, &members, false, false);
7542
7543 for (i = 0; i < nmembers; i++)
7544 {
7546
7547 /*
7548 * Remember the strongest lock mode held by any member of the
7549 * multixact.
7550 */
7551 mode = TUPLOCK_from_mxstatus(members[i].status);
7552 if (mode > strongest)
7553 strongest = mode;
7554
7555 /* See what other bits we need */
7556 switch (members[i].status)
7557 {
7561 break;
7562
7565 break;
7566
7568 has_update = true;
7569 break;
7570
7573 has_update = true;
7574 break;
7575 }
7576 }
7577
7580 bits |= HEAP_XMAX_EXCL_LOCK;
7581 else if (strongest == LockTupleShare)
7582 bits |= HEAP_XMAX_SHR_LOCK;
7583 else if (strongest == LockTupleKeyShare)
7584 bits |= HEAP_XMAX_KEYSHR_LOCK;
7585
7586 if (!has_update)
7587 bits |= HEAP_XMAX_LOCK_ONLY;
7588
7589 if (nmembers > 0)
7590 pfree(members);
7591
7592 *new_infomask = bits;
7594}
7595
7596/*
7597 * MultiXactIdGetUpdateXid
7598 *
7599 * Given a multixact Xmax and corresponding infomask, which does not have the
7600 * HEAP_XMAX_LOCK_ONLY bit set, obtain and return the Xid of the updating
7601 * transaction.
7602 *
7603 * Caller is expected to check the status of the updating transaction, if
7604 * necessary.
7605 */
7606static TransactionId
7608{
7610 MultiXactMember *members;
7611 int nmembers;
7612
7613 Assert(!(t_infomask & HEAP_XMAX_LOCK_ONLY));
7614 Assert(t_infomask & HEAP_XMAX_IS_MULTI);
7615
7616 /*
7617 * Since we know the LOCK_ONLY bit is not set, this cannot be a multi from
7618 * pre-pg_upgrade.
7619 */
7620 nmembers = GetMultiXactIdMembers(xmax, &members, false, false);
7621
7622 if (nmembers > 0)
7623 {
7624 int i;
7625
7626 for (i = 0; i < nmembers; i++)
7627 {
7628 /* Ignore lockers */
7629 if (!ISUPDATE_from_mxstatus(members[i].status))
7630 continue;
7631
7632 /* there can be at most one updater */
7634 update_xact = members[i].xid;
7635#ifndef USE_ASSERT_CHECKING
7636
7637 /*
7638 * in an assert-enabled build, walk the whole array to ensure
7639 * there's no other updater.
7640 */
7641 break;
7642#endif
7643 }
7644
7645 pfree(members);
7646 }
7647
7648 return update_xact;
7649}
7650
7651/*
7652 * HeapTupleGetUpdateXid
7653 * As above, but use a HeapTupleHeader
7654 *
7655 * See also HeapTupleHeaderGetUpdateXid, which can be used without previously
7656 * checking the hint bits.
7657 */
7660{
7662 tup->t_infomask);
7663}
7664
7665/*
7666 * Does the given multixact conflict with the current transaction grabbing a
7667 * tuple lock of the given strength?
7668 *
7669 * The passed infomask pairs up with the given multixact in the tuple header.
7670 *
7671 * If current_is_member is not NULL, it is set to 'true' if the current
7672 * transaction is a member of the given multixact.
7673 */
7674static bool
7676 LockTupleMode lockmode, bool *current_is_member)
7677{
7678 int nmembers;
7679 MultiXactMember *members;
7680 bool result = false;
7681 LOCKMODE wanted = tupleLockExtraInfo[lockmode].hwlock;
7682
7684 return false;
7685
7686 nmembers = GetMultiXactIdMembers(multi, &members, false,
7688 if (nmembers >= 0)
7689 {
7690 int i;
7691
7692 for (i = 0; i < nmembers; i++)
7693 {
7696
7697 if (result && (current_is_member == NULL || *current_is_member))
7698 break;
7699
7700 memlockmode = LOCKMODE_from_mxstatus(members[i].status);
7701
7702 /* ignore members from current xact (but track their presence) */
7703 memxid = members[i].xid;
7705 {
7706 if (current_is_member != NULL)
7707 *current_is_member = true;
7708 continue;
7709 }
7710 else if (result)
7711 continue;
7712
7713 /* ignore members that don't conflict with the lock we want */
7715 continue;
7716
7717 if (ISUPDATE_from_mxstatus(members[i].status))
7718 {
7719 /* ignore aborted updaters */
7721 continue;
7722 }
7723 else
7724 {
7725 /* ignore lockers-only that are no longer in progress */
7727 continue;
7728 }
7729
7730 /*
7731 * Whatever remains are either live lockers that conflict with our
7732 * wanted lock, and updaters that are not aborted. Those conflict
7733 * with what we want. Set up to return true, but keep going to
7734 * look for the current transaction among the multixact members,
7735 * if needed.
7736 */
7737 result = true;
7738 }
7739 pfree(members);
7740 }
7741
7742 return result;
7743}
7744
7745/*
7746 * Do_MultiXactIdWait
7747 * Actual implementation for the two functions below.
7748 *
7749 * 'multi', 'status' and 'infomask' indicate what to sleep on (the status is
7750 * needed to ensure we only sleep on conflicting members, and the infomask is
7751 * used to optimize multixact access in case it's a lock-only multi); 'nowait'
7752 * indicates whether to use conditional lock acquisition, to allow callers to
7753 * fail if lock is unavailable. 'rel', 'ctid' and 'oper' are used to set up
7754 * context information for error messages. 'remaining', if not NULL, receives
7755 * the number of members that are still running, including any (non-aborted)
7756 * subtransactions of our own transaction. 'logLockFailure' indicates whether
7757 * to log details when a lock acquisition fails with 'nowait' enabled.
7758 *
7759 * We do this by sleeping on each member using XactLockTableWait. Any
7760 * members that belong to the current backend are *not* waited for, however;
7761 * this would not merely be useless but would lead to Assert failure inside
7762 * XactLockTableWait. By the time this returns, it is certain that all
7763 * transactions *of other backends* that were members of the MultiXactId
7764 * that conflict with the requested status are dead (and no new ones can have
7765 * been added, since it is not legal to add members to an existing
7766 * MultiXactId).
7767 *
7768 * But by the time we finish sleeping, someone else may have changed the Xmax
7769 * of the containing tuple, so the caller needs to iterate on us somehow.
7770 *
7771 * Note that in case we return false, the number of remaining members is
7772 * not to be trusted.
7773 */
7774static bool
7776 uint16 infomask, bool nowait,
7777 Relation rel, const ItemPointerData *ctid, XLTW_Oper oper,
7778 int *remaining, bool logLockFailure)
7779{
7780 bool result = true;
7781 MultiXactMember *members;
7782 int nmembers;
7783 int remain = 0;
7784
7785 /* for pre-pg_upgrade tuples, no need to sleep at all */
7786 nmembers = HEAP_LOCKED_UPGRADED(infomask) ? -1 :
7787 GetMultiXactIdMembers(multi, &members, false,
7789
7790 if (nmembers >= 0)
7791 {
7792 int i;
7793
7794 for (i = 0; i < nmembers; i++)
7795 {
7796 TransactionId memxid = members[i].xid;
7797 MultiXactStatus memstatus = members[i].status;
7798
7800 {
7801 remain++;
7802 continue;
7803 }
7804
7806 LOCKMODE_from_mxstatus(status)))
7807 {
7809 remain++;
7810 continue;
7811 }
7812
7813 /*
7814 * This member conflicts with our multi, so we have to sleep (or
7815 * return failure, if asked to avoid waiting.)
7816 *
7817 * Note that we don't set up an error context callback ourselves,
7818 * but instead we pass the info down to XactLockTableWait. This
7819 * might seem a bit wasteful because the context is set up and
7820 * tore down for each member of the multixact, but in reality it
7821 * should be barely noticeable, and it avoids duplicate code.
7822 */
7823 if (nowait)
7824 {
7826 if (!result)
7827 break;
7828 }
7829 else
7830 XactLockTableWait(memxid, rel, ctid, oper);
7831 }
7832
7833 pfree(members);
7834 }
7835
7836 if (remaining)
7837 *remaining = remain;
7838
7839 return result;
7840}
7841
7842/*
7843 * MultiXactIdWait
7844 *		Sleep on a MultiXactId.
7845 *
7846 * By the time we finish sleeping, someone else may have changed the Xmax
7847 * of the containing tuple, so the caller needs to iterate on us somehow.
7848 *
7849 * We return (in *remaining, if not NULL) the number of members that are still
7850 * running, including any (non-aborted) subtransactions of our own transaction.
7851 */
/*
 * NOTE(review): the first line of the parameter list (orig 7853) was elided
 * by the extraction; visible tokens preserved verbatim.
 */
7852static void
7854				Relation rel, const ItemPointerData *ctid, XLTW_Oper oper,
7855				int *remaining)
7856{
	/* Unconditional wait: nowait = false, logLockFailure = false. */
7857	(void) Do_MultiXactIdWait(multi, status, infomask, false,
7858							  rel, ctid, oper, remaining, false);
7859}
7860
7861/*
7862 * ConditionalMultiXactIdWait
7863 *		As above, but only lock if we can get the lock without blocking.
7864 *
7865 * By the time we finish sleeping, someone else may have changed the Xmax
7866 * of the containing tuple, so the caller needs to iterate on us somehow.
7867 *
7868 * If the multixact is now all gone, return true. Returns false if some
7869 * transactions might still be running.
7870 *
7871 * We return (in *remaining, if not NULL) the number of members that are still
7872 * running, including any (non-aborted) subtransactions of our own transaction.
7873 */
/*
 * NOTE(review): the signature line (orig 7875) and the trailing-argument
 * line of the Do_MultiXactIdWait call (orig 7880) were elided by the
 * extraction; visible tokens preserved verbatim.
 */
7874static bool
7876						   uint16 infomask, Relation rel, int *remaining,
7877						   bool logLockFailure)
7878{
	/* Conditional variant: nowait = true. */
7879	return Do_MultiXactIdWait(multi, status, infomask, true,
7881}
7882
7883/*
7884 * heap_tuple_needs_eventual_freeze
7885 *
7886 * Check to see whether any of the XID fields of a tuple (xmin, xmax, xvac)
7887 * will eventually require freezing (if tuple isn't removed by pruning first).
7888 */
/*
 * NOTE(review): the parameter line (orig 7890, presumably the
 * HeapTupleHeader argument — confirm against upstream) was elided by the
 * extraction; visible tokens preserved verbatim.
 */
7889bool
7891{
7892	TransactionId xid;
7893
7894	/*
7895	 * If xmin is a normal transaction ID, this tuple is definitely not
7896	 * frozen.
7897	 */
7898	xid = HeapTupleHeaderGetXmin(tuple);
7899	if (TransactionIdIsNormal(xid))
7900		return true;
7901
7902	/*
7903	 * If xmax is a valid xact or multixact, this tuple is also not frozen.
7904	 */
7905	if (tuple->t_infomask & HEAP_XMAX_IS_MULTI)
7906	{
7907		MultiXactId multi;
7908
7909		multi = HeapTupleHeaderGetRawXmax(tuple);
7910		if (MultiXactIdIsValid(multi))
7911			return true;
7912	}
7913	else
7914	{
7915		xid = HeapTupleHeaderGetRawXmax(tuple);
7916		if (TransactionIdIsNormal(xid))
7917			return true;
7918	}
7919
	/* Old-style VACUUM FULL moved tuples: xvac may also need freezing. */
7920	if (tuple->t_infomask & HEAP_MOVED)
7921	{
7922		xid = HeapTupleHeaderGetXvac(tuple);
7923		if (TransactionIdIsNormal(xid))
7924			return true;
7925	}
7926
7927	return false;
7928}
7929
7930/*
7931 * heap_tuple_should_freeze
7932 *
7933 * Return value indicates if heap_prepare_freeze_tuple sibling function would
7934 * (or should) force freezing of the heap page that contains caller's tuple.
7935 * Tuple header XIDs/MXIDs < FreezeLimit/MultiXactCutoff trigger freezing.
7936 * This includes (xmin, xmax, xvac) fields, as well as MultiXact member XIDs.
7937 *
7938 * The *NoFreezePageRelfrozenXid and *NoFreezePageRelminMxid input/output
7939 * arguments help VACUUM track the oldest extant XID/MXID remaining in rel.
7940 * Our working assumption is that caller won't decide to freeze this tuple.
7941 * It's up to caller to only ratchet back its own top-level trackers after the
7942 * point that it fully commits to not freezing the tuple/page in question.
7943 */
/*
 * NOTE(review): the extraction elided the tuple-parameter line (orig 7945)
 * and several interior lines (7958, 7966, 7975, 8000, 8008, 8013, 8028 —
 * mostly Assert/initialization lines judging by context — confirm against
 * upstream).  Visible tokens preserved verbatim.
 */
7944bool
7946						 const struct VacuumCutoffs *cutoffs,
7947						 TransactionId *NoFreezePageRelfrozenXid,
7948						 MultiXactId *NoFreezePageRelminMxid)
7949{
7950	TransactionId xid;
7951	MultiXactId multi;
7952	bool		freeze = false;
7953
7954	/* First deal with xmin */
7955	xid = HeapTupleHeaderGetXmin(tuple);
7956	if (TransactionIdIsNormal(xid))
7957	{
		/* Ratchet back the no-freeze tracker, then test the freeze cutoff. */
7959		if (TransactionIdPrecedes(xid, *NoFreezePageRelfrozenXid))
7960			*NoFreezePageRelfrozenXid = xid;
7961		if (TransactionIdPrecedes(xid, cutoffs->FreezeLimit))
7962			freeze = true;
7963	}
7964
7965	/* Now deal with xmax */
7967	multi = InvalidMultiXactId;
7968	if (tuple->t_infomask & HEAP_XMAX_IS_MULTI)
7969		multi = HeapTupleHeaderGetRawXmax(tuple);
7970	else
7971		xid = HeapTupleHeaderGetRawXmax(tuple);
7972
7973	if (TransactionIdIsNormal(xid))
7974	{
7976		/* xmax is a non-permanent XID */
7977		if (TransactionIdPrecedes(xid, *NoFreezePageRelfrozenXid))
7978			*NoFreezePageRelfrozenXid = xid;
7979		if (TransactionIdPrecedes(xid, cutoffs->FreezeLimit))
7980			freeze = true;
7981	}
7982	else if (!MultiXactIdIsValid(multi))
7983	{
7984		/* xmax is a permanent XID or invalid MultiXactId/XID */
7985	}
7986	else if (HEAP_LOCKED_UPGRADED(tuple->t_infomask))
7987	{
7988		/* xmax is a pg_upgrade'd MultiXact, which can't have updater XID */
7989		if (MultiXactIdPrecedes(multi, *NoFreezePageRelminMxid))
7990			*NoFreezePageRelminMxid = multi;
7991		/* heap_prepare_freeze_tuple always freezes pg_upgrade'd xmax */
7992		freeze = true;
7993	}
7994	else
7995	{
7996		/* xmax is a MultiXactId that may have an updater XID */
7997		MultiXactMember *members;
7998		int			nmembers;
7999
8001		if (MultiXactIdPrecedes(multi, *NoFreezePageRelminMxid))
8002			*NoFreezePageRelminMxid = multi;
8003		if (MultiXactIdPrecedes(multi, cutoffs->MultiXactCutoff))
8004			freeze = true;
8005
8006		/* need to check whether any member of the mxact is old */
8007		nmembers = GetMultiXactIdMembers(multi, &members, false,
8009
8010		for (int i = 0; i < nmembers; i++)
8011		{
8012			xid = members[i].xid;
8014			if (TransactionIdPrecedes(xid, *NoFreezePageRelfrozenXid))
8015				*NoFreezePageRelfrozenXid = xid;
8016			if (TransactionIdPrecedes(xid, cutoffs->FreezeLimit))
8017				freeze = true;
8018		}
8019		if (nmembers > 0)
8020			pfree(members);
8021	}
8022
	/* Finally, old-style VACUUM FULL xvac field */
8023	if (tuple->t_infomask & HEAP_MOVED)
8024	{
8025		xid = HeapTupleHeaderGetXvac(tuple);
8026		if (TransactionIdIsNormal(xid))
8027		{
8029			if (TransactionIdPrecedes(xid, *NoFreezePageRelfrozenXid))
8030				*NoFreezePageRelfrozenXid = xid;
8031			/* heap_prepare_freeze_tuple forces xvac freezing */
8032			freeze = true;
8033		}
8034	}
8035
8036	return freeze;
8037}
8038
8039/*
8040 * Maintain snapshotConflictHorizon for caller by ratcheting forward its value
8041 * using any committed XIDs contained in 'tuple', an obsolescent heap tuple
8042 * that caller is in the process of physically removing, e.g. via HOT pruning
8043 * or index deletion.
8044 *
8045 * Caller must initialize its value to InvalidTransactionId, which is
8046 * generally interpreted as "definitely no need for a recovery conflict".
8047 * Final value must reflect all heap tuples that caller will physically remove
8048 * (or remove TID references to) via its ongoing pruning/deletion operation.
8049 * ResolveRecoveryConflictWithSnapshot() is passed the final value (taken from
8050 * caller's WAL record) by REDO routine when it replays caller's operation.
8051 */
/*
 * NOTE(review): the extraction elided the function name line (orig 8053),
 * the local declarations of xmin/xmax/xvac (orig 8056-8058), and one
 * condition line (orig 8073).  Visible tokens preserved verbatim.
 */
8052void
8054									  TransactionId *snapshotConflictHorizon)
8055{
8059
	/* xvac from old-style VACUUM FULL can also force a recovery conflict */
8060	if (tuple->t_infomask & HEAP_MOVED)
8061	{
8062		if (TransactionIdPrecedes(*snapshotConflictHorizon, xvac))
8063			*snapshotConflictHorizon = xvac;
8064	}
8065
8066	/*
8067	 * Ignore tuples inserted by an aborted transaction or if the tuple was
8068	 * updated/deleted by the inserting transaction.
8069	 *
8070	 * Look for a committed hint bit, or if no xmin bit is set, check clog.
8071	 */
8072	if (HeapTupleHeaderXminCommitted(tuple) ||
8074	{
8075		if (xmax != xmin &&
8076			TransactionIdFollows(xmax, *snapshotConflictHorizon))
8077			*snapshotConflictHorizon = xmax;
8078	}
8079}
8080
8081#ifdef USE_PREFETCH
8082/*
8083 * Helper function for heap_index_delete_tuples. Issues prefetch requests for
8084 * prefetch_count buffers. The prefetch_state keeps track of all the buffers
8085 * we can prefetch, and which have already been prefetched; each call to this
8086 * function picks up where the previous call left off.
8087 *
8088 * Note: we expect the deltids array to be sorted in an order that groups TIDs
8089 * by heap block, with all TIDs for each block appearing together in exactly
8090 * one group.
8091 */
/*
 * NOTE(review): the extraction elided the signature lines (orig 8093-8094),
 * the cur_hblkno declaration (orig 8097), and the new-block test plus
 * PrefetchBuffer call (orig 8109-8114 interior lines).  Visible tokens
 * preserved verbatim.
 */
8092static void
8095							 int prefetch_count)
8096{
8098	int			count = 0;
8099	int			i;
8100	int			ndeltids = prefetch_state->ndeltids;
8101	TM_IndexDelete *deltids = prefetch_state->deltids;
8102
	/* Resume from where the previous call stopped. */
8103	for (i = prefetch_state->next_item;
8104		 i < ndeltids && count < prefetch_count;
8105		 i++)
8106	{
8107		ItemPointer htid = &deltids[i].tid;
8108
8111		{
8114			count++;
8115		}
8116	}
8117
8118	/*
8119	 * Save the prefetch position so that next time we can continue from that
8120	 * position.
8121	 */
8122	prefetch_state->next_item = i;
8123	prefetch_state->cur_hblkno = cur_hblkno;
8124}
8125#endif
8126
8127/*
8128 * Helper function for heap_index_delete_tuples. Checks for index corruption
8129 * involving an invalid TID in index AM caller's index page.
8130 *
8131 * This is an ideal place for these checks. The index AM must hold a buffer
8132 * lock on the index page containing the TIDs we examine here, so we don't
8133 * have to worry about concurrent VACUUMs at all. We can be sure that the
8134 * index is corrupt when htid points directly to an LP_UNUSED item or
8135 * heap-only tuple, which is not the case during standard index scans.
8136 */
/*
 * NOTE(review): the extraction elided the signature lines (orig 8138, 8140),
 * a local declaration (orig 8142), and the errcode/errmsg argument lines of
 * each ereport (orig 8149, 8151-8156, 8159-8165, 8170, 8173-8181 interior
 * lines).  Visible tokens preserved verbatim.
 */
8137static inline void
8139						Page page, OffsetNumber maxoff,
8141{
8143	ItemId		iid;
8144
8145	Assert(OffsetNumberIsValid(istatus->idxoffnum));
8146
	/* TID offset past the line pointer array: corruption */
8147	if (unlikely(indexpagehoffnum > maxoff))
8148		ereport(ERROR,
8150				errmsg_internal("heap tid from index tuple (%u,%u) points past end of heap page line pointer array at offset %u of block %u in index \"%s\"",
8153								istatus->idxoffnum, delstate->iblknum,
8155
	/* TID pointing at an LP_UNUSED item: corruption */
8157	if (unlikely(!ItemIdIsUsed(iid)))
8158		ereport(ERROR,
8160				errmsg_internal("heap tid from index tuple (%u,%u) points to unused heap page item at offset %u of block %u in index \"%s\"",
8163								istatus->idxoffnum, delstate->iblknum,
8165
	/* TID pointing directly at a heap-only tuple: corruption */
8166	if (ItemIdHasStorage(iid))
8167	{
8168		HeapTupleHeader htup;
8169
8171		htup = (HeapTupleHeader) PageGetItem(page, iid);
8172
8174			ereport(ERROR,
8176					errmsg_internal("heap tid from index tuple (%u,%u) points to heap-only tuple at offset %u of block %u in index \"%s\"",
8179									istatus->idxoffnum, delstate->iblknum,
8181	}
8182}
8183
8184/*
8185 * heapam implementation of tableam's index_delete_tuples interface.
8186 *
8187 * This helper function is called by index AMs during index tuple deletion.
8188 * See tableam header comments for an explanation of the interface implemented
8189 * here and a general theory of operation. Note that each call here is either
8190 * a simple index deletion call, or a bottom-up index deletion call.
8191 *
8192 * It's possible for this to generate a fair amount of I/O, since we may be
8193 * deleting hundreds of tuples from a single index block. To amortize that
8194 * cost to some degree, this uses prefetching and combines repeat accesses to
8195 * the same heap block.
8196 */
/*
 * NOTE(review): the extraction elided the return-type/signature lines (orig
 * 8197-8198) and many interior lines throughout this function (local
 * declarations, condition expressions, and several call statements); each
 * gap is visible as a jump in the embedded line numbers.  Visible tokens
 * preserved verbatim — consult upstream heapam.c before relying on any
 * control-flow detail that falls inside a gap.
 */
8199{
8200	/* Initial assumption is that earlier pruning took care of conflict */
8201	TransactionId snapshotConflictHorizon = InvalidTransactionId;
8204	Page		page = NULL;
8207#ifdef USE_PREFETCH
8210#endif
8212	int			finalndeltids = 0,
8213				nblocksaccessed = 0;
8214
8215	/* State that's only used in bottom-up index deletion case */
8216	int			nblocksfavorable = 0;
8217	int			curtargetfreespace = delstate->bottomupfreespace,
8218				lastfreespace = 0,
8219				actualfreespace = 0;
8220	bool		bottomup_final_block = false;
8221
8223
8224	/* Sort caller's deltids array by TID for further processing */
8226
8227	/*
8228	 * Bottom-up case: resort deltids array in an order attuned to where the
8229	 * greatest number of promising TIDs are to be found, and determine how
8230	 * many blocks from the start of sorted array should be considered
8231	 * favorable. This will also shrink the deltids array in order to
8232	 * eliminate completely unfavorable blocks up front.
8233	 */
8234	if (delstate->bottomup)
8236
8237#ifdef USE_PREFETCH
8238	/* Initialize prefetch state. */
8240	prefetch_state.next_item = 0;
8241	prefetch_state.ndeltids = delstate->ndeltids;
8242	prefetch_state.deltids = delstate->deltids;
8243
8244	/*
8245	 * Determine the prefetch distance that we will attempt to maintain.
8246	 *
8247	 * Since the caller holds a buffer lock somewhere in rel, we'd better make
8248	 * sure that isn't a catalog relation before we call code that does
8249	 * syscache lookups, to avoid risk of deadlock.
8250	 */
8251	if (IsCatalogRelation(rel))
8253	else
8256
8257	/* Cap initial prefetch distance for bottom-up deletion caller */
8258	if (delstate->bottomup)
8259	{
8263	}
8264
8265	/* Start prefetching. */
8267#endif
8268
8269	/* Iterate over deltids, determine which to delete, check their horizon */
8270	Assert(delstate->ndeltids > 0);
8271	for (int i = 0; i < delstate->ndeltids; i++)
8272	{
8273		TM_IndexDelete *ideltid = &delstate->deltids[i];
8274		TM_IndexStatus *istatus = delstate->status + ideltid->id;
8275		ItemPointer htid = &ideltid->tid;
8276		OffsetNumber offnum;
8277
8278		/*
8279		 * Read buffer, and perform required extra steps each time a new block
8280		 * is encountered. Avoid refetching if it's the same block as the one
8281		 * from the last htid.
8282		 */
8283		if (blkno == InvalidBlockNumber ||
8285		{
8286			/*
8287			 * Consider giving up early for bottom-up index deletion caller
8288			 * first. (Only prefetch next-next block afterwards, when it
8289			 * becomes clear that we're at least going to access the next
8290			 * block in line.)
8291			 *
8292			 * Sometimes the first block frees so much space for bottom-up
8293			 * caller that the deletion process can end without accessing any
8294			 * more blocks. It is usually necessary to access 2 or 3 blocks
8295			 * per bottom-up deletion operation, though.
8296			 */
8297			if (delstate->bottomup)
8298			{
8299				/*
8300				 * We often allow caller to delete a few additional items
8301				 * whose entries we reached after the point that space target
8302				 * from caller was satisfied. The cost of accessing the page
8303				 * was already paid at that point, so it made sense to finish
8304				 * it off. When that happened, we finalize everything here
8305				 * (by finishing off the whole bottom-up deletion operation
8306				 * without needlessly paying the cost of accessing any more
8307				 * blocks).
8308				 */
8310					break;
8311
8312				/*
8313				 * Give up when we didn't enable our caller to free any
8314				 * additional space as a result of processing the page that we
8315				 * just finished up with. This rule is the main way in which
8316				 * we keep the cost of bottom-up deletion under control.
8317				 */
8319					break;
8320				lastfreespace = actualfreespace;	/* for next time */
8321
8322				/*
8323				 * Deletion operation (which is bottom-up) will definitely
8324				 * access the next block in line. Prepare for that now.
8325				 *
8326				 * Decay target free space so that we don't hang on for too
8327				 * long with a marginal case. (Space target is only truly
8328				 * helpful when it allows us to recognize that we don't need
8329				 * to access more than 1 or 2 blocks to satisfy caller due to
8330				 * agreeable workload characteristics.)
8331				 *
8332				 * We are a bit more patient when we encounter contiguous
8333				 * blocks, though: these are treated as favorable blocks. The
8334				 * decay process is only applied when the next block in line
8335				 * is not a favorable/contiguous block. This is not an
8336				 * exception to the general rule; we still insist on finding
8337				 * at least one deletable item per block accessed. See
8338				 * bottomup_nblocksfavorable() for full details of the theory
8339				 * behind favorable blocks and heap block locality in general.
8340				 *
8341				 * Note: The first block in line is always treated as a
8342				 * favorable block, so the earliest possible point that the
8343				 * decay can be applied is just before we access the second
8344				 * block in line. The Assert() verifies this for us.
8345				 */
8347				if (nblocksfavorable > 0)
8349				else
8350					curtargetfreespace /= 2;
8351			}
8352
8353			/* release old buffer */
8354			if (BufferIsValid(buf))
8356
8358			buf = ReadBuffer(rel, blkno);
8360			Assert(!delstate->bottomup ||
8362
8363#ifdef USE_PREFETCH
8364
8365			/*
8366			 * To maintain the prefetch distance, prefetch one more page for
8367			 * each page we read.
8368			 */
8370#endif
8371
8373
8374			page = BufferGetPage(buf);
8375			maxoff = PageGetMaxOffsetNumber(page);
8376		}
8377
8378		/*
8379		 * In passing, detect index corruption involving an index page with a
8380		 * TID that points to a location in the heap that couldn't possibly be
8381		 * correct. We only do this with actual TIDs from caller's index page
8382		 * (not items reached by traversing through a HOT chain).
8383		 */
8385
8386		if (istatus->knowndeletable)
8387			Assert(!delstate->bottomup && !istatus->promising);
8388		else
8389		{
8390			ItemPointerData tmp = *htid;
8392
8393			/* Are any tuples from this HOT chain non-vacuumable? */
8395									   &heapTuple, NULL, true))
8396				continue;		/* can't delete entry */
8397
8398			/* Caller will delete, since whole HOT chain is vacuumable */
8399			istatus->knowndeletable = true;
8400
8401			/* Maintain index free space info for bottom-up deletion case */
8402			if (delstate->bottomup)
8403			{
8404				Assert(istatus->freespace > 0);
8405				actualfreespace += istatus->freespace;
8407					bottomup_final_block = true;
8408			}
8409		}
8410
8411		/*
8412		 * Maintain snapshotConflictHorizon value for deletion operation as a
8413		 * whole by advancing current value using heap tuple headers. This is
8414		 * loosely based on the logic for pruning a HOT chain.
8415		 */
8417		priorXmax = InvalidTransactionId;	/* cannot check first XMIN */
8418		for (;;)
8419		{
8420			ItemId		lp;
8421			HeapTupleHeader htup;
8422
8423			/* Sanity check (pure paranoia) */
8424			if (offnum < FirstOffsetNumber)
8425				break;
8426
8427			/*
8428			 * An offset past the end of page's line pointer array is possible
8429			 * when the array was truncated
8430			 */
8431			if (offnum > maxoff)
8432				break;
8433
8434			lp = PageGetItemId(page, offnum);
8436			{
8437				offnum = ItemIdGetRedirect(lp);
8438				continue;
8439			}
8440
8441			/*
8442			 * We'll often encounter LP_DEAD line pointers (especially with an
8443			 * entry marked knowndeletable by our caller up front). No heap
8444			 * tuple headers get examined for an htid that leads us to an
8445			 * LP_DEAD item. This is okay because the earlier pruning
8446			 * operation that made the line pointer LP_DEAD in the first place
8447			 * must have considered the original tuple header as part of
8448			 * generating its own snapshotConflictHorizon value.
8449			 *
8450			 * Relying on XLOG_HEAP2_PRUNE_VACUUM_SCAN records like this is
8451			 * the same strategy that index vacuuming uses in all cases. Index
8452			 * VACUUM WAL records don't even have a snapshotConflictHorizon
8453			 * field of their own for this reason.
8454			 */
8455			if (!ItemIdIsNormal(lp))
8456				break;
8457
8458			htup = (HeapTupleHeader) PageGetItem(page, lp);
8459
8460			/*
8461			 * Check the tuple XMIN against prior XMAX, if any
8462			 */
8465				break;
8466
8468												   &snapshotConflictHorizon);
8469
8470			/*
8471			 * If the tuple is not HOT-updated, then we are at the end of this
8472			 * HOT-chain. No need to visit later tuples from the same update
8473			 * chain (they get their own index entries) -- just move on to
8474			 * next htid from index AM caller.
8475			 */
8476			if (!HeapTupleHeaderIsHotUpdated(htup))
8477				break;
8478
8479			/* Advance to next HOT chain member */
8480			Assert(ItemPointerGetBlockNumber(&htup->t_ctid) == blkno);
8481			offnum = ItemPointerGetOffsetNumber(&htup->t_ctid);
8483		}
8484
8485		/* Enable further/final shrinking of deltids for caller */
8486		finalndeltids = i + 1;
8487	}
8488
8490
8491	/*
8492	 * Shrink deltids array to exclude non-deletable entries at the end. This
8493	 * is not just a minor optimization. Final deltids array size might be
8494	 * zero for a bottom-up caller. Index AM is explicitly allowed to rely on
8495	 * ndeltids being zero in all cases with zero total deletable entries.
8496	 */
8497	Assert(finalndeltids > 0 || delstate->bottomup);
8498	delstate->ndeltids = finalndeltids;
8499
8500	return snapshotConflictHorizon;
8501}
8502
8503/*
8504 * Specialized inlineable comparison function for index_delete_sort()
8505 */
/*
 * NOTE(review): the extraction elided the signature line (orig 8507) and
 * the block-number/offset extraction lines inside each scope (orig
 * 8513-8514, 8520-8521).  Visible tokens preserved verbatim.  Compares two
 * TIDs: block number first, then offset; reaching the end means the TIDs
 * were equal, which callers do not expect (hence Assert(false)).
 */
8506static inline int
8508{
8509	ItemPointer tid1 = &deltid1->tid;
8510	ItemPointer tid2 = &deltid2->tid;
8511
8512	{
8515
8516		if (blk1 != blk2)
8517			return (blk1 < blk2) ? -1 : 1;
8518	}
8519	{
8522
8523		if (pos1 != pos2)
8524			return (pos1 < pos2) ? -1 : 1;
8525	}
8526
	/* Duplicate TIDs should be impossible here */
8527	Assert(false);
8528
8529	return 0;
8530}
8531
8532/*
8533 * Sort deltids array from delstate by TID. This prepares it for further
8534 * processing by heap_index_delete_tuples().
8535 *
8536 * This operation becomes a noticeable consumer of CPU cycles with some
8537 * workloads, so we go to the trouble of specialization/micro optimization.
8538 * We use shellsort for this because it's easy to specialize, compiles to
8539 * relatively few instructions, and is adaptive to presorted inputs/subsets
8540 * (which are typical here).
8541 */
/*
 * NOTE(review): the extraction elided the signature line (orig 8543);
 * visible tokens preserved verbatim.
 */
8542static void
8544{
8545	TM_IndexDelete *deltids = delstate->deltids;
8546	int			ndeltids = delstate->ndeltids;
8547
8548	/*
8549	 * Shellsort gap sequence (taken from Sedgewick-Incerpi paper).
8550	 *
8551	 * This implementation is fast with array sizes up to ~4500. This covers
8552	 * all supported BLCKSZ values.
8553	 */
8554	const int	gaps[9] = {1968, 861, 336, 112, 48, 21, 7, 3, 1};
8555
8556	/* Think carefully before changing anything here -- keep swaps cheap */
8557	StaticAssertDecl(sizeof(TM_IndexDelete) <= 8,
8558					 "element size exceeds 8 bytes");
8559
	/* Standard shellsort: gapped insertion sort for each gap in sequence */
8560	for (int g = 0; g < lengthof(gaps); g++)
8561	{
8562		for (int hi = gaps[g], i = hi; i < ndeltids; i++)
8563		{
8564			TM_IndexDelete d = deltids[i];
8565			int			j = i;
8566
8567			while (j >= hi && index_delete_sort_cmp(&deltids[j - hi], &d) >= 0)
8568			{
8569				deltids[j] = deltids[j - hi];
8570				j -= hi;
8571			}
8572			deltids[j] = d;
8573		}
8574	}
8575}
8576
8577/*
8578 * Returns how many blocks should be considered favorable/contiguous for a
8579 * bottom-up index deletion pass. This is a number of heap blocks that starts
8580 * from and includes the first block in line.
8581 *
8582 * There is always at least one favorable block during bottom-up index
8583 * deletion. In the worst case (i.e. with totally random heap blocks) the
8584 * first block in line (the only favorable block) can be thought of as a
8585 * degenerate array of contiguous blocks that consists of a single block.
8586 * heap_index_delete_tuples() will expect this.
8587 *
8588 * Caller passes blockgroups, a description of the final order that deltids
8589 * will be sorted in for heap_index_delete_tuples() bottom-up index deletion
8590 * processing. Note that deltids need not actually be sorted just yet (caller
8591 * only passes deltids to us so that we can interpret blockgroups).
8592 *
8593 * You might guess that the existence of contiguous blocks cannot matter much,
8594 * since in general the main factor that determines which blocks we visit is
8595 * the number of promising TIDs, which is a fixed hint from the index AM.
8596 * We're not really targeting the general case, though -- the actual goal is
8597 * to adapt our behavior to a wide variety of naturally occurring conditions.
8598 * The effects of most of the heuristics we apply are only noticeable in the
8599 * aggregate, over time and across many _related_ bottom-up index deletion
8600 * passes.
8601 *
8602 * Deeming certain blocks favorable allows heapam to recognize and adapt to
8603 * workloads where heap blocks visited during bottom-up index deletion can be
8604 * accessed contiguously, in the sense that each newly visited block is the
8605 * neighbor of the block that bottom-up deletion just finished processing (or
8606 * close enough to it). It will likely be cheaper to access more favorable
8607 * blocks sooner rather than later (e.g. in this pass, not across a series of
8608 * related bottom-up passes). Either way it is probably only a matter of time
8609 * (or a matter of further correlated version churn) before all blocks that
8610 * appear together as a single large batch of favorable blocks get accessed by
8611 * _some_ bottom-up pass. Large batches of favorable blocks tend to either
8612 * appear almost constantly or not even once (it all depends on per-index
8613 * workload characteristics).
8614 *
8615 * Note that the blockgroups sort order applies a power-of-two bucketing
8616 * scheme that creates opportunities for contiguous groups of blocks to get
8617 * batched together, at least with workloads that are naturally amenable to
8618 * being driven by heap block locality. This doesn't just enhance the spatial
8619 * locality of bottom-up heap block processing in the obvious way. It also
8620 * enables temporal locality of access, since sorting by heap block number
8621 * naturally tends to make the bottom-up processing order deterministic.
8622 *
8623 * Consider the following example to get a sense of how temporal locality
8624 * might matter: There is a heap relation with several indexes, each of which
8625 * is low to medium cardinality. It is subject to constant non-HOT updates.
8626 * The updates are skewed (in one part of the primary key, perhaps). None of
8627 * the indexes are logically modified by the UPDATE statements (if they were
8628 * then bottom-up index deletion would not be triggered in the first place).
8629 * Naturally, each new round of index tuples (for each heap tuple that gets a
8630 * heap_update() call) will have the same heap TID in each and every index.
8631 * Since these indexes are low cardinality and never get logically modified,
8632 * heapam processing during bottom-up deletion passes will access heap blocks
8633 * in approximately sequential order. Temporal locality of access occurs due
8634 * to bottom-up deletion passes behaving very similarly across each of the
8635 * indexes at any given moment. This keeps the number of buffer misses needed
8636 * to visit heap blocks to a minimum.
8637 */
/*
 * NOTE(review): the extraction elided the signature line (orig 8639), an
 * Assert (orig 8646), the block extraction (orig 8661), the tolerance test
 * expression (orig 8664-8665), the counter increment (orig 8668), and the
 * final clamp (orig 8673).  Visible tokens preserved verbatim.
 */
8638static int
8640						 TM_IndexDelete *deltids)
8641{
8642	int64		lastblock = -1;
8643	int			nblocksfavorable = 0;
8644
8645	Assert(nblockgroups >= 1);
8647
8648	/*
8649	 * We tolerate heap blocks that will be accessed only slightly out of
8650	 * physical order. Small blips occur when a pair of almost-contiguous
8651	 * blocks happen to fall into different buckets (perhaps due only to a
8652	 * small difference in npromisingtids that the bucketing scheme didn't
8653	 * quite manage to ignore). We effectively ignore these blips by applying
8654	 * a small tolerance. The precise tolerance we use is a little arbitrary,
8655	 * but it works well enough in practice.
8656	 */
8657	for (int b = 0; b < nblockgroups; b++)
8658	{
8659		IndexDeleteCounts *group = blockgroups + b;
8660		TM_IndexDelete *firstdtid = deltids + group->ifirsttid;
8662
8663		if (lastblock != -1 &&
8666			break;
8667
8669		lastblock = block;
8670	}
8671
8672	/* Always indicate that there is at least 1 favorable block */
8674
8675	return nblocksfavorable;
8676}
8677
8678/*
8679 * qsort comparison function for bottomup_sort_and_shrink()
8680 */
/*
 * NOTE(review): the extraction elided the signature-adjacent group pointer
 * declarations (orig 8684-8685), the rounded ntids locals (orig 8707-8708),
 * and a pg_unreachable/Assert-style line before the final return (orig
 * 8731).  Visible tokens preserved verbatim.
 */
8681static int
8682bottomup_sort_and_shrink_cmp(const void *arg1, const void *arg2)
8683{
8686
8687	/*
8688	 * Most significant field is npromisingtids (which we invert the order of
8689	 * so as to sort in desc order).
8690	 *
8691	 * Caller should have already normalized npromisingtids fields into
8692	 * power-of-two values (buckets).
8693	 */
8694	if (group1->npromisingtids > group2->npromisingtids)
8695		return -1;
8696	if (group1->npromisingtids < group2->npromisingtids)
8697		return 1;
8698
8699	/*
8700	 * Tiebreak: desc ntids sort order.
8701	 *
8702	 * We cannot expect power-of-two values for ntids fields. We should
8703	 * behave as if they were already rounded up for us instead.
8704	 */
8705	if (group1->ntids != group2->ntids)
8706	{
8709
8710		if (ntids1 > ntids2)
8711			return -1;
8712		if (ntids1 < ntids2)
8713			return 1;
8714	}
8715
8716	/*
8717	 * Tiebreak: asc offset-into-deltids-for-block (offset to first TID for
8718	 * block in deltids array) order.
8719	 *
8720	 * This is equivalent to sorting in ascending heap block number order
8721	 * (among otherwise equal subsets of the array). This approach allows us
8722	 * to avoid accessing the out-of-line TID. (We rely on the assumption
8723	 * that the deltids array was sorted in ascending heap TID order when
8724	 * these offsets to the first TID from each heap block group were formed.)
8725	 */
8726	if (group1->ifirsttid > group2->ifirsttid)
8727		return 1;
8728	if (group1->ifirsttid < group2->ifirsttid)
8729		return -1;
8730
8732
8733	return 0;
8734}
8735
8736/*
8737 * heap_index_delete_tuples() helper function for bottom-up deletion callers.
8738 *
8739 * Sorts deltids array in the order needed for useful processing by bottom-up
8740 * deletion. The array should already be sorted in TID order when we're
8741 * called. The sort process groups heap TIDs from deltids into heap block
8742 * groupings. Earlier/more-promising groups/blocks are usually those that are
8743 * known to have the most "promising" TIDs.
8744 *
8745 * Sets new size of deltids array (ndeltids) in state. deltids will only have
8746 * TIDs from the BOTTOMUP_MAX_NBLOCKS most promising heap blocks when we
8747 * return. This often means that deltids will be shrunk to a small fraction
8748 * of its original size (we eliminate many heap blocks from consideration for
8749 * caller up front).
8750 *
8751 * Returns the number of "favorable" blocks. See bottomup_nblocksfavorable()
8752 * for a definition and full details.
8753 */
/*
 * NOTE(review): the extraction elided the signature line (orig 8755), local
 * declarations (orig 8757-8759), the new-block test (orig 8776), the group
 * capacity/assignment lines (orig 8781-8784), the power-of-two rounding
 * expression (orig 8835), the qsort call and reordereddeltids setup (orig
 * 8839-8845), the per-group memcpy source line (orig 8853), the final
 * memcpy destination line (orig 8859), and the pfree calls (orig
 * 8863-8864).  Visible tokens preserved verbatim.
 */
8754static int
8756{
8760	int			nblockgroups = 0;
8761	int			ncopied = 0;
8762	int			nblocksfavorable = 0;
8763
8764	Assert(delstate->bottomup);
8765	Assert(delstate->ndeltids > 0);
8766
8767	/* Calculate per-heap-block count of TIDs */
8769	for (int i = 0; i < delstate->ndeltids; i++)
8770	{
8771		TM_IndexDelete *ideltid = &delstate->deltids[i];
8772		TM_IndexStatus *istatus = delstate->status + ideltid->id;
8773		ItemPointer htid = &ideltid->tid;
8774		bool		promising = istatus->promising;
8775
8777		{
8778			/* New block group */
8779			nblockgroups++;
8780
8783
8785			blockgroups[nblockgroups - 1].ifirsttid = i;
8786			blockgroups[nblockgroups - 1].ntids = 1;
8787			blockgroups[nblockgroups - 1].npromisingtids = 0;
8788		}
8789		else
8790		{
8791			blockgroups[nblockgroups - 1].ntids++;
8792		}
8793
8794		if (promising)
8795			blockgroups[nblockgroups - 1].npromisingtids++;
8796	}
8797
8798	/*
8799	 * We're about ready to sort block groups to determine the optimal order
8800	 * for visiting heap blocks. But before we do, round the number of
8801	 * promising tuples for each block group up to the next power-of-two,
8802	 * unless it is very low (less than 4), in which case we round up to 4.
8803	 * npromisingtids is far too noisy to trust when choosing between a pair
8804	 * of block groups that both have very low values.
8805	 *
8806	 * This scheme divides heap blocks/block groups into buckets. Each bucket
8807	 * contains blocks that have _approximately_ the same number of promising
8808	 * TIDs as each other. The goal is to ignore relatively small differences
8809	 * in the total number of promising entries, so that the whole process can
8810	 * give a little weight to heapam factors (like heap block locality)
8811	 * instead. This isn't a trade-off, really -- we have nothing to lose. It
8812	 * would be foolish to interpret small differences in npromisingtids
8813	 * values as anything more than noise.
8814	 *
8815	 * We tiebreak on nhtids when sorting block group subsets that have the
8816	 * same npromisingtids, but this has the same issues as npromisingtids,
8817	 * and so nhtids is subject to the same power-of-two bucketing scheme. The
8818	 * only reason that we don't fix nhtids in the same way here too is that
8819	 * we'll need accurate nhtids values after the sort. We handle nhtids
8820	 * bucketization dynamically instead (in the sort comparator).
8821	 *
8822	 * See bottomup_nblocksfavorable() for a full explanation of when and how
8823	 * heap locality/favorable blocks can significantly influence when and how
8824	 * heap blocks are accessed.
8825	 */
8826	for (int b = 0; b < nblockgroups; b++)
8827	{
8828		IndexDeleteCounts *group = blockgroups + b;
8829
8830		/* Better off falling back on nhtids with low npromisingtids */
8831		if (group->npromisingtids <= 4)
8832			group->npromisingtids = 4;
8833		else
8834			group->npromisingtids =
8836	}
8837
8838	/* Sort groups and rearrange caller's deltids array */
8841	reordereddeltids = palloc(delstate->ndeltids * sizeof(TM_IndexDelete));
8842
8844	/* Determine number of favorable blocks at the start of final deltids */
8846											  delstate->deltids);
8847
	/* Gather each group's TIDs into the new (sorted-by-group) order */
8848	for (int b = 0; b < nblockgroups; b++)
8849	{
8850		IndexDeleteCounts *group = blockgroups + b;
8851		TM_IndexDelete *firstdtid = delstate->deltids + group->ifirsttid;
8852
8854			   sizeof(TM_IndexDelete) * group->ntids);
8855		ncopied += group->ntids;
8856	}
8857
8858	/* Copy final grouped and sorted TIDs back into start of caller's array */
8860		   sizeof(TM_IndexDelete) * ncopied);
8861	delstate->ndeltids = ncopied;
8862
8865
8866	return nblocksfavorable;
8867}
8868
8869/*
8870 * Perform XLogInsert for a heap-visible operation. 'block' is the block
8871 * being marked all-visible, and vm_buffer is the buffer containing the
8872 * corresponding visibility map block. Both should have already been modified
8873 * and dirtied.
8874 *
8875 * snapshotConflictHorizon comes from the largest xmin on the page being
8876 * marked all-visible. REDO routine uses it to generate recovery conflicts.
8877 *
8878 * If checksums or wal_log_hints are enabled, we may also generate a full-page
8879 * image of heap_buffer. Otherwise, we optimize away the FPI (by specifying
8880 * REGBUF_NO_IMAGE for the heap buffer), in which case the caller should *not*
8881 * update the heap page's LSN.
8882 */
/*
 * NOTE(review): this extraction elides several source lines here (the
 * function's opening signature line, the xl_heap_visible record and
 * XLogRecPtr declarations, and the XLogBeginInsert/XLogRegisterData/
 * XLogRegisterBuffer/XLogInsert calls) -- confirm against the full
 * source tree before editing this function.
 */
 8885 TransactionId snapshotConflictHorizon, uint8 vmflags)
 8886{
 8889 uint8 flags;
 8890
 8893
 /* Copy the caller-supplied values into the WAL record payload. */
 8894 xlrec.snapshotConflictHorizon = snapshotConflictHorizon;
 8895 xlrec.flags = vmflags;
 8900
 8902
 /*
 * Register the heap buffer. When hint-bit logging is not required
 * (no checksums, no wal_log_hints), suppress the full-page image so
 * the record stays small; per the header comment, the caller must then
 * not update the heap page's LSN.
 */
 8903 flags = REGBUF_STANDARD;
 8904 if (!XLogHintBitIsNeeded())
 8905 flags |= REGBUF_NO_IMAGE;
 8907
 8909
 8910 return recptr;
 8911}
8912
8913/*
8914 * Perform XLogInsert for a heap-update operation. Caller must already
8915 * have modified the buffer(s) and marked them dirty.
8916 */
/*
 * NOTE(review): the extraction elides many source lines in this function
 * (the parameter list, several local declarations such as xlrec/xlhdr/
 * xlhdr_idx/recptr, and most XLogBeginInsert/XLogRegisterData/
 * XLogRegisterBuffer calls).  Only comments have been added below;
 * consult the full source before modifying.
 */
8917static XLogRecPtr
8922{
8926 uint8 info;
8928 uint16 prefixlen = 0,
8929 suffixlen = 0;
8931 Page page = BufferGetPage(newbuf);
8933 bool init;
8934 int bufflags;
8935
8936 /* Caller should not call me on a non-WAL-logged relation */
8938
8940
 /* HOT updates get their own record type so REDO can skip index work. */
8942 info = XLOG_HEAP_HOT_UPDATE;
8943 else
8944 info = XLOG_HEAP_UPDATE;
8945
8946 /*
8947 * If the old and new tuple are on the same page, we only need to log the
8948 * parts of the new tuple that were changed. That saves on the amount of
8949 * WAL we need to write. Currently, we just count any unchanged bytes in
8950 * the beginning and end of the tuple. That's quick to check, and
8951 * perfectly covers the common case that only one field is updated.
8952 *
8953 * We could do this even if the old and new tuple are on different pages,
8954 * but only if we don't make a full-page image of the old page, which is
8955 * difficult to know in advance. Also, if the old tuple is corrupt for
8956 * some reason, it would allow the corruption to propagate the new page,
8957 * so it seems best to avoid. Under the general assumption that most
8958 * updates tend to create the new tuple version on the same page, there
8959 * isn't much to be gained by doing this across pages anyway.
8960 *
8961 * Skip this if we're taking a full-page image of the new page, as we
8962 * don't include the new tuple in the WAL record in that case. Also
8963 * disable if effective_wal_level='logical', as logical decoding needs to
8964 * be able to read the new tuple in whole from the WAL record alone.
8965 */
8966 if (oldbuf == newbuf && !need_tuple_data &&
8968 {
8969 char *oldp = (char *) oldtup->t_data + oldtup->t_data->t_hoff;
8970 char *newp = (char *) newtup->t_data + newtup->t_data->t_hoff;
8971 int oldlen = oldtup->t_len - oldtup->t_data->t_hoff;
8972 int newlen = newtup->t_len - newtup->t_data->t_hoff;
8973
8974 /* Check for common prefix between old and new tuple */
8975 for (prefixlen = 0; prefixlen < Min(oldlen, newlen); prefixlen++)
8976 {
8977 if (newp[prefixlen] != oldp[prefixlen])
8978 break;
8979 }
8980
8981 /*
8982 * Storing the length of the prefix takes 2 bytes, so we need to save
8983 * at least 3 bytes or there's no point.
8984 */
8985 if (prefixlen < 3)
8986 prefixlen = 0;
8987
8988 /* Same for suffix */
8990 {
8991 if (newp[newlen - suffixlen - 1] != oldp[oldlen - suffixlen - 1])
8992 break;
8993 }
8994 if (suffixlen < 3)
8995 suffixlen = 0;
8996 }
8997
8998 /* Prepare main WAL data chain */
8999 xlrec.flags = 0;
9004 if (prefixlen > 0)
9006 if (suffixlen > 0)
9008 if (need_tuple_data)
9009 {
 /* Logical decoding needs the replica identity of the old row too. */
9011 if (old_key_tuple)
9012 {
9013 if (reln->rd_rel->relreplident == REPLICA_IDENTITY_FULL)
9015 else
9017 }
9018 }
9019
9020 /* If new tuple is the single and first tuple on page... */
9023 {
9024 info |= XLOG_HEAP_INIT_PAGE;
9025 init = true;
9026 }
9027 else
9028 init = false;
9029
9030 /* Prepare WAL data for the old page */
9031 xlrec.old_offnum = ItemPointerGetOffsetNumber(&oldtup->t_self);
9032 xlrec.old_xmax = HeapTupleHeaderGetRawXmax(oldtup->t_data);
9033 xlrec.old_infobits_set = compute_infobits(oldtup->t_data->t_infomask,
9034 oldtup->t_data->t_infomask2);
9035
9036 /* Prepare WAL data for the new page */
9037 xlrec.new_offnum = ItemPointerGetOffsetNumber(&newtup->t_self);
9038 xlrec.new_xmax = HeapTupleHeaderGetRawXmax(newtup->t_data);
9039
9041 if (init)
9043 if (need_tuple_data)
9045
9047 if (oldbuf != newbuf)
9049
9051
9052 /*
9053 * Prepare WAL data for the new tuple.
9054 */
/* Record which (if any) common prefix/suffix lengths the tuple data omits. */
9055 if (prefixlen > 0 || suffixlen > 0)
9056 {
9057 if (prefixlen > 0 && suffixlen > 0)
9058 {
9061 XLogRegisterBufData(0, &prefix_suffix, sizeof(uint16) * 2);
9062 }
9063 else if (prefixlen > 0)
9064 {
9065 XLogRegisterBufData(0, &prefixlen, sizeof(uint16));
9066 }
9067 else
9068 {
9069 XLogRegisterBufData(0, &suffixlen, sizeof(uint16));
9070 }
9071 }
9072
 /* Log the new tuple's header fields separately from its data. */
9073 xlhdr.t_infomask2 = newtup->t_data->t_infomask2;
9074 xlhdr.t_infomask = newtup->t_data->t_infomask;
9075 xlhdr.t_hoff = newtup->t_data->t_hoff;
9077
9078 /*
9079 * PG73FORMAT: write bitmap [+ padding] [+ oid] + data
9080 *
9081 * The 'data' doesn't include the common prefix or suffix.
9082 */
9084 if (prefixlen == 0)
9085 {
9087 (char *) newtup->t_data + SizeofHeapTupleHeader,
9089 }
9090 else
9091 {
9092 /*
9093 * Have to write the null bitmap and data after the common prefix as
9094 * two separate rdata entries.
9095 */
9096 /* bitmap [+ padding] [+ oid] */
9097 if (newtup->t_data->t_hoff - SizeofHeapTupleHeader > 0)
9098 {
9100 (char *) newtup->t_data + SizeofHeapTupleHeader,
9101 newtup->t_data->t_hoff - SizeofHeapTupleHeader);
9102 }
9103
9104 /* data after common prefix */
9106 (char *) newtup->t_data + newtup->t_data->t_hoff + prefixlen,
9107 newtup->t_len - newtup->t_data->t_hoff - prefixlen - suffixlen);
9108 }
9109
9110 /* We need to log a tuple identity */
9112 {
9113 /* don't really need this, but its more comfy to decode */
9114 xlhdr_idx.t_infomask2 = old_key_tuple->t_data->t_infomask2;
9115 xlhdr_idx.t_infomask = old_key_tuple->t_data->t_infomask;
9116 xlhdr_idx.t_hoff = old_key_tuple->t_data->t_hoff;
9117
9119
9120 /* PG73FORMAT: write bitmap [+ padding] [+ oid] + data */
9123 }
9124
9125 /* filtering by origin on a row level is much more efficient */
9127
9128 recptr = XLogInsert(RM_HEAP_ID, info);
9129
9130 return recptr;
9131}
9132
9133/*
9134 * Perform XLogInsert of an XLOG_HEAP2_NEW_CID record
9135 *
9136 * This is only used when effective_wal_level is logical, and only for
9137 * catalog tuples.
9138 */
/*
 * NOTE(review): the extraction elides several lines here (the parameter
 * list, the xl_heap_new_cid and XLogRecPtr declarations, the
 * LOCK-ONLY/combo-CID condition details, and the XLogBeginInsert/
 * XLogRegisterData/XLogInsert calls).  Confirm against the full source
 * before editing.
 */
9139static XLogRecPtr
9141{
9143
9145 HeapTupleHeader hdr = tup->t_data;
9146
 /* Caller must pass a tuple with a valid TID and table OID. */
9147 Assert(ItemPointerIsValid(&tup->t_self));
9148 Assert(tup->t_tableOid != InvalidOid);
9149
9150 xlrec.top_xid = GetTopTransactionId();
9151 xlrec.target_locator = relation->rd_locator;
9152 xlrec.target_tid = tup->t_self;
9153
9154 /*
9155 * If the tuple got inserted & deleted in the same TX we definitely have a
9156 * combo CID, set cmin and cmax.
9157 */
9158 if (hdr->t_infomask & HEAP_COMBOCID)
9159 {
9162 xlrec.cmin = HeapTupleHeaderGetCmin(hdr);
9163 xlrec.cmax = HeapTupleHeaderGetCmax(hdr);
9164 xlrec.combocid = HeapTupleHeaderGetRawCommandId(hdr);
9165 }
9166 /* No combo CID, so only cmin or cmax can be set by this TX */
9167 else
9168 {
9169 /*
9170 * Tuple inserted.
9171 *
9172 * We need to check for LOCK ONLY because multixacts might be
9173 * transferred to the new tuple in case of FOR KEY SHARE updates in
9174 * which case there will be an xmax, although the tuple just got
9175 * inserted.
9176 */
9177 if (hdr->t_infomask & HEAP_XMAX_INVALID ||
9179 {
9181 xlrec.cmax = InvalidCommandId;
9182 }
9183 /* Tuple from a different tx updated or deleted. */
9184 else
9185 {
9186 xlrec.cmin = InvalidCommandId;
9188 }
9189 xlrec.combocid = InvalidCommandId;
9190 }
9191
9192 /*
9193 * Note that we don't need to register the buffer here, because this
9194 * operation does not modify the page. The insert/update/delete that
9195 * called us certainly did, but that's WAL-logged separately.
9196 */
9199
9200 /* will be looked at irrespective of origin */
9201
9203
9204 return recptr;
9205}
9206
9207/*
9208 * Build a heap tuple representing the configured REPLICA IDENTITY to represent
9209 * the old tuple in an UPDATE or DELETE.
9210 *
9211 * Returns NULL if there's no need to log an identity or if there's no suitable
9212 * key defined.
9213 *
9214 * Pass key_required true if any replica identity columns changed value, or if
9215 * any of them have any external data. Delete must always pass true.
9216 *
9217 * *copy is set to true if the returned tuple is a modified copy rather than
9218 * the same tuple that was passed in.
9219 */
/*
 * NOTE(review): the extraction elides some lines here (the first signature
 * line, the values[]/key_tuple/idattrs declarations, the
 * RelationGetIndexAttrBitmap() call that fills idattrs, the bms_free, and
 * the final flatten-if-external branch body).  Confirm against the full
 * source before editing.
 */
9220static HeapTuple
9222 bool *copy)
9223{
9224 TupleDesc desc = RelationGetDescr(relation);
9225 char replident = relation->rd_rel->relreplident;
9228 bool nulls[MaxHeapAttributeNumber];
9230
9231 *copy = false;
9232
 /* Nothing to log unless the relation participates in logical decoding. */
9233 if (!RelationIsLogicallyLogged(relation))
9234 return NULL;
9235
9236 if (replident == REPLICA_IDENTITY_NOTHING)
9237 return NULL;
9238
9239 if (replident == REPLICA_IDENTITY_FULL)
9240 {
9241 /*
9242 * When logging the entire old tuple, it very well could contain
9243 * toasted columns. If so, force them to be inlined.
9244 */
9245 if (HeapTupleHasExternal(tp))
9246 {
9247 *copy = true;
9248 tp = toast_flatten_tuple(tp, desc);
9249 }
9250 return tp;
9251 }
9252
9253 /* if the key isn't required and we're only logging the key, we're done */
9254 if (!key_required)
9255 return NULL;
9256
9257 /* find out the replica identity columns */
9260
9261 /*
9262 * If there's no defined replica identity columns, treat as !key_required.
9263 * (This case should not be reachable from heap_update, since that should
9264 * calculate key_required accurately. But heap_delete just passes
9265 * constant true for key_required, so we can hit this case in deletes.)
9266 */
9267 if (bms_is_empty(idattrs))
9268 return NULL;
9269
9270 /*
9271 * Construct a new tuple containing only the replica identity columns,
9272 * with nulls elsewhere. While we're at it, assert that the replica
9273 * identity columns aren't null.
9274 */
9275 heap_deform_tuple(tp, desc, values, nulls);
9276
9277 for (int i = 0; i < desc->natts; i++)
9278 {
9280 idattrs))
9281 Assert(!nulls[i]);
9282 else
9283 nulls[i] = true;
9284 }
9285
9286 key_tuple = heap_form_tuple(desc, values, nulls);
9287 *copy = true;
9288
9290
9291 /*
9292 * If the tuple, which by here only contains indexed columns, still has
9293 * toasted columns, force them to be inlined. This is somewhat unlikely
9294 * since there's limits on the size of indexed columns, so we don't
9295 * duplicate toast_flatten_tuple()s functionality in the above loop over
9296 * the indexed columns, even if it would be more efficient.
9297 */
9299 {
9301
9304 }
9305
9306 return key_tuple;
9307}
9308
9309/*
9310 * HeapCheckForSerializableConflictOut
9311 * We are reading a tuple. If it's not visible, there may be a
9312 * rw-conflict out with the inserter. Otherwise, if it is visible to us
9313 * but has been deleted, there may be a rw-conflict out with the deleter.
9314 *
9315 * We will determine the top level xid of the writing transaction with which
9316 * we may be in conflict, and ask CheckForSerializableConflictOut() to check
9317 * for overlap with our own transaction.
9318 *
9319 * This function should be called just about anywhere in heapam.c where a
9320 * tuple has been read. The caller must hold at least a shared lock on the
9321 * buffer, because this function might set hint bits on the tuple. There is
9322 * currently no known reason to call this function from an index AM.
9323 */
/*
 * NOTE(review): the extraction elides several lines here (the htsvResult
 * declaration, the HeapTupleSatisfiesVacuum() call, two case labels --
 * presumably HEAPTUPLE_DELETE_IN_PROGRESS and the insert-in-progress/
 * recently-dead cases -- the warning-silencing xid assignment in the
 * default branch, and the "too early"/"own xid" TransactionId checks near
 * the end).  Confirm against the full source before editing.
 */
9324void
9325HeapCheckForSerializableConflictOut(bool visible, Relation relation,
9326 HeapTuple tuple, Buffer buffer,
9327 Snapshot snapshot)
9328{
9329 TransactionId xid;
9331
 /* Fast exit when SSI isn't in effect for this relation/snapshot. */
9332 if (!CheckForSerializableConflictOutNeeded(relation, snapshot))
9333 return;
9334
9335 /*
9336 * Check to see whether the tuple has been written to by a concurrent
9337 * transaction, either to create it not visible to us, or to delete it
9338 * while it is visible to us. The "visible" bool indicates whether the
9339 * tuple is visible to us, while HeapTupleSatisfiesVacuum checks what else
9340 * is going on with it.
9341 *
9342 * In the event of a concurrently inserted tuple that also happens to have
9343 * been concurrently updated (by a separate transaction), the xmin of the
9344 * tuple will be used -- not the updater's xid.
9345 */
9347 switch (htsvResult)
9348 {
9349 case HEAPTUPLE_LIVE:
9350 if (visible)
9351 return;
9352 xid = HeapTupleHeaderGetXmin(tuple->t_data);
9353 break;
9356 if (visible)
9357 xid = HeapTupleHeaderGetUpdateXid(tuple->t_data);
9358 else
9359 xid = HeapTupleHeaderGetXmin(tuple->t_data);
9360
9362 {
9363 /* This is like the HEAPTUPLE_DEAD case */
9364 Assert(!visible);
9365 return;
9366 }
9367 break;
9369 xid = HeapTupleHeaderGetXmin(tuple->t_data);
9370 break;
9371 case HEAPTUPLE_DEAD:
9372 Assert(!visible);
9373 return;
9374 default:
9375
9376 /*
9377 * The only way to get to this default clause is if a new value is
9378 * added to the enum type without adding it to this switch
9379 * statement. That's a bug, so elog.
9380 */
9381 elog(ERROR, "unrecognized return value from HeapTupleSatisfiesVacuum: %u", htsvResult);
9382
9383 /*
9384 * In spite of having all enum values covered and calling elog on
9385 * this default, some compilers think this is a code path which
9386 * allows xid to be used below without initialization. Silence
9387 * that warning.
9388 */
9390 }
9391
9394
9395 /*
9396 * Find top level xid. Bail out if xid is too early to be a conflict, or
9397 * if it's our own xid.
9398 */
9400 return;
9403 return;
9404
9405 CheckForSerializableConflictOut(relation, xid, snapshot);
9406}
int16 AttrNumber
Definition attnum.h:21
int bms_next_member(const Bitmapset *a, int prevbit)
Definition bitmapset.c:1290
void bms_free(Bitmapset *a)
Definition bitmapset.c:239
bool bms_is_member(int x, const Bitmapset *a)
Definition bitmapset.c:510
Bitmapset * bms_add_member(Bitmapset *a, int x)
Definition bitmapset.c:799
Bitmapset * bms_add_members(Bitmapset *a, const Bitmapset *b)
Definition bitmapset.c:901
bool bms_overlap(const Bitmapset *a, const Bitmapset *b)
Definition bitmapset.c:575
#define bms_is_empty(a)
Definition bitmapset.h:118
uint32 BlockNumber
Definition block.h:31
#define InvalidBlockNumber
Definition block.h:33
static bool BlockNumberIsValid(BlockNumber blockNumber)
Definition block.h:71
static int32 next
Definition blutils.c:225
static Datum values[MAXATTR]
Definition bootstrap.c:147
int Buffer
Definition buf.h:23
#define InvalidBuffer
Definition buf.h:25
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition bufmgr.c:4357
PrefetchBufferResult PrefetchBuffer(Relation reln, ForkNumber forkNum, BlockNumber blockNum)
Definition bufmgr.c:773
void BufferGetTag(Buffer buffer, RelFileLocator *rlocator, ForkNumber *forknum, BlockNumber *blknum)
Definition bufmgr.c:4378
bool BufferIsDirty(Buffer buffer)
Definition bufmgr.c:3025
void ReleaseBuffer(Buffer buffer)
Definition bufmgr.c:5502
void UnlockReleaseBuffer(Buffer buffer)
Definition bufmgr.c:5519
void MarkBufferDirty(Buffer buffer)
Definition bufmgr.c:3057
int maintenance_io_concurrency
Definition bufmgr.c:192
Buffer ReadBuffer(Relation reln, BlockNumber blockNum)
Definition bufmgr.c:865
@ BAS_BULKREAD
Definition bufmgr.h:37
@ BAS_BULKWRITE
Definition bufmgr.h:39
#define RelationGetNumberOfBlocks(reln)
Definition bufmgr.h:307
static Page BufferGetPage(Buffer buffer)
Definition bufmgr.h:466
static Block BufferGetBlock(Buffer buffer)
Definition bufmgr.h:433
@ BUFFER_LOCK_SHARE
Definition bufmgr.h:210
@ BUFFER_LOCK_EXCLUSIVE
Definition bufmgr.h:220
@ BUFFER_LOCK_UNLOCK
Definition bufmgr.h:205
static void LockBuffer(Buffer buffer, BufferLockMode mode)
Definition bufmgr.h:328
static bool BufferIsValid(Buffer bufnum)
Definition bufmgr.h:417
Size PageGetHeapFreeSpace(const PageData *page)
Definition bufpage.c:990
PageHeaderData * PageHeader
Definition bufpage.h:173
static bool PageIsAllVisible(const PageData *page)
Definition bufpage.h:428
static void PageClearAllVisible(Page page)
Definition bufpage.h:438
#define SizeOfPageHeaderData
Definition bufpage.h:216
static void PageSetAllVisible(Page page)
Definition bufpage.h:433
static ItemId PageGetItemId(Page page, OffsetNumber offsetNumber)
Definition bufpage.h:243
static void * PageGetItem(PageData *page, const ItemIdData *itemId)
Definition bufpage.h:353
static void PageSetFull(Page page)
Definition bufpage.h:417
static void PageSetLSN(Page page, XLogRecPtr lsn)
Definition bufpage.h:390
PageData * Page
Definition bufpage.h:81
#define PageSetPrunable(page, xid)
Definition bufpage.h:446
static OffsetNumber PageGetMaxOffsetNumber(const PageData *page)
Definition bufpage.h:371
#define NameStr(name)
Definition c.h:777
#define InvalidCommandId
Definition c.h:695
#define pg_noinline
Definition c.h:307
#define Min(x, y)
Definition c.h:1019
#define likely(x)
Definition c.h:423
#define MAXALIGN(LEN)
Definition c.h:838
uint8_t uint8
Definition c.h:556
#define Assert(condition)
Definition c.h:885
int64_t int64
Definition c.h:555
TransactionId MultiXactId
Definition c.h:688
#define pg_attribute_always_inline
Definition c.h:291
int16_t int16
Definition c.h:553
#define SHORTALIGN(LEN)
Definition c.h:834
uint16_t uint16
Definition c.h:557
#define pg_unreachable()
Definition c.h:353
#define unlikely(x)
Definition c.h:424
uint32_t uint32
Definition c.h:558
#define lengthof(array)
Definition c.h:815
#define StaticAssertDecl(condition, errmessage)
Definition c.h:950
uint32 CommandId
Definition c.h:692
uint32 TransactionId
Definition c.h:678
#define OidIsValid(objectId)
Definition c.h:800
size_t Size
Definition c.h:631
bool IsToastRelation(Relation relation)
Definition catalog.c:206
bool IsCatalogRelation(Relation relation)
Definition catalog.c:104
bool IsSharedRelation(Oid relationId)
Definition catalog.c:304
bool IsInplaceUpdateRelation(Relation relation)
Definition catalog.c:183
CommandId HeapTupleHeaderGetCmin(const HeapTupleHeaderData *tup)
Definition combocid.c:104
void HeapTupleHeaderAdjustCmax(const HeapTupleHeaderData *tup, CommandId *cmax, bool *iscombo)
Definition combocid.c:153
CommandId HeapTupleHeaderGetCmax(const HeapTupleHeaderData *tup)
Definition combocid.c:118
bool datumIsEqual(Datum value1, Datum value2, bool typByVal, int typLen)
Definition datum.c:223
Datum arg
Definition elog.c:1322
int errcode(int sqlerrcode)
Definition elog.c:874
int errmsg(const char *fmt,...)
Definition elog.c:1093
int int errdetail_internal(const char *fmt,...) pg_attribute_printf(1
int int errmsg_internal(const char *fmt,...) pg_attribute_printf(1
#define WARNING
Definition elog.h:36
#define ERROR
Definition elog.h:39
#define elog(elevel,...)
Definition elog.h:226
#define ereport(elevel,...)
Definition elog.h:150
HeapTuple ExecFetchSlotHeapTuple(TupleTableSlot *slot, bool materialize, bool *shouldFree)
TupleTableSlot * ExecStoreBufferHeapTuple(HeapTuple tuple, TupleTableSlot *slot, Buffer buffer)
#define palloc_object(type)
Definition fe_memutils.h:74
#define palloc_array(type, count)
Definition fe_memutils.h:76
BufferAccessStrategy GetAccessStrategy(BufferAccessStrategyType btype)
Definition freelist.c:461
void FreeAccessStrategy(BufferAccessStrategy strategy)
Definition freelist.c:643
int NBuffers
Definition globals.c:142
Oid MyDatabaseTableSpace
Definition globals.c:96
Oid MyDatabaseId
Definition globals.c:94
void simple_heap_update(Relation relation, const ItemPointerData *otid, HeapTuple tup, TU_UpdateIndexes *update_indexes)
Definition heapam.c:4556
static bool DoesMultiXactIdConflict(MultiXactId multi, uint16 infomask, LockTupleMode lockmode, bool *current_is_member)
Definition heapam.c:7676
void heap_insert(Relation relation, HeapTuple tup, CommandId cid, int options, BulkInsertState bistate)
Definition heapam.c:2142
static XLogRecPtr log_heap_new_cid(Relation relation, HeapTuple tup)
Definition heapam.c:9141
XLogRecPtr log_heap_visible(Relation rel, Buffer heap_buffer, Buffer vm_buffer, TransactionId snapshotConflictHorizon, uint8 vmflags)
Definition heapam.c:8885
static void compute_new_xmax_infomask(TransactionId xmax, uint16 old_infomask, uint16 old_infomask2, TransactionId add_to_xmax, LockTupleMode mode, bool is_update, TransactionId *result_xmax, uint16 *result_infomask, uint16 *result_infomask2)
Definition heapam.c:5395
static TM_Result heap_lock_updated_tuple_rec(Relation rel, TransactionId priorXmax, const ItemPointerData *tid, TransactionId xid, LockTupleMode mode)
Definition heapam.c:5767
static void heap_fetch_next_buffer(HeapScanDesc scan, ScanDirection dir)
Definition heapam.c:707
bool heap_inplace_lock(Relation relation, HeapTuple oldtup_ptr, Buffer buffer, void(*release_callback)(void *), void *arg)
Definition heapam.c:6437
bool heap_fetch(Relation relation, Snapshot snapshot, HeapTuple tuple, Buffer *userbuf, bool keep_buf)
Definition heapam.c:1659
#define BOTTOMUP_TOLERANCE_NBLOCKS
Definition heapam.c:190
static HeapTuple heap_prepare_insert(Relation relation, HeapTuple tup, TransactionId xid, CommandId cid, int options)
Definition heapam.c:2333
static BlockNumber heap_scan_stream_read_next_parallel(ReadStream *stream, void *callback_private_data, void *per_buffer_data)
Definition heapam.c:252
static int bottomup_sort_and_shrink(TM_IndexDeleteOp *delstate)
Definition heapam.c:8756
static bool heap_acquire_tuplock(Relation relation, const ItemPointerData *tid, LockTupleMode mode, LockWaitPolicy wait_policy, bool *have_tuple_lock)
Definition heapam.c:5346
static int heap_multi_insert_pages(HeapTuple *heaptuples, int done, int ntuples, Size saveFreeSpace)
Definition heapam.c:2381
static pg_attribute_always_inline int page_collect_tuples(HeapScanDesc scan, Snapshot snapshot, Page page, Buffer buffer, BlockNumber block, int lines, bool all_visible, bool check_serializable)
Definition heapam.c:522
static BlockNumber heap_scan_stream_read_next_serial(ReadStream *stream, void *callback_private_data, void *per_buffer_data)
Definition heapam.c:292
static void GetMultiXactIdHintBits(MultiXactId multi, uint16 *new_infomask, uint16 *new_infomask2)
Definition heapam.c:7527
void heap_finish_speculative(Relation relation, const ItemPointerData *tid)
Definition heapam.c:6168
void HeapTupleHeaderAdvanceConflictHorizon(HeapTupleHeader tuple, TransactionId *snapshotConflictHorizon)
Definition heapam.c:8054
bool heap_getnextslot(TableScanDesc sscan, ScanDirection direction, TupleTableSlot *slot)
Definition heapam.c:1449
#define LOCKMODE_from_mxstatus(status)
Definition heapam.c:159
void heap_endscan(TableScanDesc sscan)
Definition heapam.c:1371
#define FRM_RETURN_IS_XID
Definition heapam.c:6734
#define TUPLOCK_from_mxstatus(status)
Definition heapam.c:218
void heap_rescan(TableScanDesc sscan, ScanKey key, bool set_params, bool allow_strat, bool allow_sync, bool allow_pagemode)
Definition heapam.c:1318
void heap_inplace_unlock(Relation relation, HeapTuple oldtup, Buffer buffer)
Definition heapam.c:6724
TM_Result heap_update(Relation relation, const ItemPointerData *otid, HeapTuple newtup, CommandId cid, Snapshot crosscheck, bool wait, TM_FailureData *tmfd, LockTupleMode *lockmode, TU_UpdateIndexes *update_indexes)
Definition heapam.c:3312
static int index_delete_sort_cmp(TM_IndexDelete *deltid1, TM_IndexDelete *deltid2)
Definition heapam.c:8508
static bool ConditionalMultiXactIdWait(MultiXactId multi, MultiXactStatus status, uint16 infomask, Relation rel, int *remaining, bool logLockFailure)
Definition heapam.c:7876
bool heap_tuple_needs_eventual_freeze(HeapTupleHeader tuple)
Definition heapam.c:7891
TM_Result heap_delete(Relation relation, const ItemPointerData *tid, CommandId cid, Snapshot crosscheck, bool wait, TM_FailureData *tmfd, bool changingPart)
Definition heapam.c:2843
static TransactionId FreezeMultiXactId(MultiXactId multi, uint16 t_infomask, const struct VacuumCutoffs *cutoffs, uint16 *flags, HeapPageFreeze *pagefrz)
Definition heapam.c:6785
static HeapTuple ExtractReplicaIdentity(Relation relation, HeapTuple tp, bool key_required, bool *copy)
Definition heapam.c:9222
static pg_noinline BlockNumber heapgettup_initial_block(HeapScanDesc scan, ScanDirection dir)
Definition heapam.c:752
static TM_Result heap_lock_updated_tuple(Relation rel, uint16 prior_infomask, TransactionId prior_raw_xmax, const ItemPointerData *prior_ctid, TransactionId xid, LockTupleMode mode)
Definition heapam.c:6115
#define LockTupleTuplock(rel, tup, mode)
Definition heapam.c:167
bool heap_tuple_should_freeze(HeapTupleHeader tuple, const struct VacuumCutoffs *cutoffs, TransactionId *NoFreezePageRelfrozenXid, MultiXactId *NoFreezePageRelminMxid)
Definition heapam.c:7946
bool heap_freeze_tuple(HeapTupleHeader tuple, TransactionId relfrozenxid, TransactionId relminmxid, TransactionId FreezeLimit, TransactionId MultiXactCutoff)
Definition heapam.c:7483
void heap_inplace_update_and_unlock(Relation relation, HeapTuple oldtup, HeapTuple tuple, Buffer buffer)
Definition heapam.c:6575
static BlockNumber heapgettup_advance_block(HeapScanDesc scan, BlockNumber block, ScanDirection dir)
Definition heapam.c:876
static TransactionId MultiXactIdGetUpdateXid(TransactionId xmax, uint16 t_infomask)
Definition heapam.c:7608
#define BOTTOMUP_MAX_NBLOCKS
Definition heapam.c:189
void ReleaseBulkInsertStatePin(BulkInsertState bistate)
Definition heapam.c:2104
#define FRM_MARK_COMMITTED
Definition heapam.c:6736
#define FRM_NOOP
Definition heapam.c:6732
static void index_delete_check_htid(TM_IndexDeleteOp *delstate, Page page, OffsetNumber maxoff, const ItemPointerData *htid, TM_IndexStatus *istatus)
Definition heapam.c:8139
HeapTuple heap_getnext(TableScanDesc sscan, ScanDirection direction)
Definition heapam.c:1410
bool heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer, Snapshot snapshot, HeapTuple heapTuple, bool *all_dead, bool first_call)
Definition heapam.c:1779
void heap_freeze_prepared_tuples(Buffer buffer, HeapTupleFreeze *tuples, int ntuples)
Definition heapam.c:7461
bool heap_getnextslot_tidrange(TableScanDesc sscan, ScanDirection direction, TupleTableSlot *slot)
Definition heapam.c:1552
static void MultiXactIdWait(MultiXactId multi, MultiXactStatus status, uint16 infomask, Relation rel, const ItemPointerData *ctid, XLTW_Oper oper, int *remaining)
Definition heapam.c:7854
void heap_set_tidrange(TableScanDesc sscan, ItemPointer mintid, ItemPointer maxtid)
Definition heapam.c:1479
void heap_abort_speculative(Relation relation, const ItemPointerData *tid)
Definition heapam.c:6255
static BlockNumber bitmapheap_stream_read_next(ReadStream *pgsr, void *private_data, void *per_buffer_data)
Definition heapam.c:317
TableScanDesc heap_beginscan(Relation relation, Snapshot snapshot, int nkeys, ScanKey key, ParallelTableScanDesc parallel_scan, uint32 flags)
Definition heapam.c:1164
static void heapgettup(HeapScanDesc scan, ScanDirection dir, int nkeys, ScanKey key)
Definition heapam.c:960
static Page heapgettup_continue_page(HeapScanDesc scan, ScanDirection dir, int *linesleft, OffsetNumber *lineoff)
Definition heapam.c:830
static uint8 compute_infobits(uint16 infomask, uint16 infomask2)
Definition heapam.c:2798
#define FRM_RETURN_IS_MULTI
Definition heapam.c:6735
#define FRM_INVALIDATE_XMAX
Definition heapam.c:6733
static bool heap_attr_equals(TupleDesc tupdesc, int attrnum, Datum value1, Datum value2, bool isnull1, bool isnull2)
Definition heapam.c:4415
static void index_delete_sort(TM_IndexDeleteOp *delstate)
Definition heapam.c:8544
void heap_prepare_pagescan(TableScanDesc sscan)
Definition heapam.c:616
static Bitmapset * HeapDetermineColumnsInfo(Relation relation, Bitmapset *interesting_cols, Bitmapset *external_cols, HeapTuple oldtup, HeapTuple newtup, bool *has_external)
Definition heapam.c:4466
static const int MultiXactStatusLock[MaxMultiXactStatus+1]
Definition heapam.c:207
void simple_heap_insert(Relation relation, HeapTuple tup)
Definition heapam.c:2785
static bool xmax_infomask_changed(uint16 new_infomask, uint16 old_infomask)
Definition heapam.c:2820
#define UnlockTupleTuplock(rel, tup, mode)
Definition heapam.c:169
static TM_Result test_lockmode_for_conflict(MultiXactStatus status, TransactionId xid, LockTupleMode mode, HeapTuple tup, bool *needwait)
Definition heapam.c:5676
bool heap_prepare_freeze_tuple(HeapTupleHeader tuple, const struct VacuumCutoffs *cutoffs, HeapPageFreeze *pagefrz, HeapTupleFreeze *frz, bool *totally_frozen)
Definition heapam.c:7135
static void AssertHasSnapshotForToast(Relation rel)
Definition heapam.c:225
void simple_heap_delete(Relation relation, const ItemPointerData *tid)
Definition heapam.c:3266
static const struct @15 tupleLockExtraInfo[]
static XLogRecPtr log_heap_update(Relation reln, Buffer oldbuf, Buffer newbuf, HeapTuple oldtup, HeapTuple newtup, HeapTuple old_key_tuple, bool all_visible_cleared, bool new_all_visible_cleared)
Definition heapam.c:8919
TransactionId HeapTupleGetUpdateXid(const HeapTupleHeaderData *tup)
Definition heapam.c:7660
TransactionId heap_index_delete_tuples(Relation rel, TM_IndexDeleteOp *delstate)
Definition heapam.c:8199
void heap_multi_insert(Relation relation, TupleTableSlot **slots, int ntuples, CommandId cid, int options, BulkInsertState bistate)
Definition heapam.c:2413
#define ConditionalLockTupleTuplock(rel, tup, mode, log)
Definition heapam.c:171
static void initscan(HeapScanDesc scan, ScanKey key, bool keep_startblock)
Definition heapam.c:357
static int bottomup_nblocksfavorable(IndexDeleteCounts *blockgroups, int nblockgroups, TM_IndexDelete *deltids)
Definition heapam.c:8640
static void heapgettup_pagemode(HeapScanDesc scan, ScanDirection dir, int nkeys, ScanKey key)
Definition heapam.c:1070
TM_Result heap_lock_tuple(Relation relation, HeapTuple tuple, CommandId cid, LockTupleMode mode, LockWaitPolicy wait_policy, bool follow_updates, Buffer *buffer, TM_FailureData *tmfd)
Definition heapam.c:4644
static void UpdateXmaxHintBits(HeapTupleHeader tuple, Buffer buffer, TransactionId xid)
Definition heapam.c:2053
static bool Do_MultiXactIdWait(MultiXactId multi, MultiXactStatus status, uint16 infomask, bool nowait, Relation rel, const ItemPointerData *ctid, XLTW_Oper oper, int *remaining, bool logLockFailure)
Definition heapam.c:7776
static int bottomup_sort_and_shrink_cmp(const void *arg1, const void *arg2)
Definition heapam.c:8683
void heap_get_latest_tid(TableScanDesc sscan, ItemPointer tid)
Definition heapam.c:1931
void heap_setscanlimits(TableScanDesc sscan, BlockNumber startBlk, BlockNumber numBlks)
Definition heapam.c:500
void HeapCheckForSerializableConflictOut(bool visible, Relation relation, HeapTuple tuple, Buffer buffer, Snapshot snapshot)
Definition heapam.c:9326
static Page heapgettup_start_page(HeapScanDesc scan, ScanDirection dir, int *linesleft, OffsetNumber *lineoff)
Definition heapam.c:799
static MultiXactStatus get_mxact_status_for_lock(LockTupleMode mode, bool is_update)
Definition heapam.c:4597
void heap_pre_freeze_checks(Buffer buffer, HeapTupleFreeze *tuples, int ntuples)
Definition heapam.c:7408
BulkInsertState GetBulkInsertState(void)
Definition heapam.c:2075
void FreeBulkInsertState(BulkInsertState bistate)
Definition heapam.c:2092
#define HEAP_INSERT_SPECULATIVE
Definition heapam.h:40
#define HEAP_FREEZE_CHECK_XMAX_ABORTED
Definition heapam.h:138
struct HeapScanDescData * HeapScanDesc
Definition heapam.h:102
HTSV_Result
Definition heapam.h:125
@ HEAPTUPLE_RECENTLY_DEAD
Definition heapam.h:128
@ HEAPTUPLE_INSERT_IN_PROGRESS
Definition heapam.h:129
@ HEAPTUPLE_LIVE
Definition heapam.h:127
@ HEAPTUPLE_DELETE_IN_PROGRESS
Definition heapam.h:130
@ HEAPTUPLE_DEAD
Definition heapam.h:126
struct BitmapHeapScanDescData * BitmapHeapScanDesc
Definition heapam.h:110
#define HEAP_INSERT_FROZEN
Definition heapam.h:38
static void heap_execute_freeze_tuple(HeapTupleHeader tuple, HeapTupleFreeze *frz)
Definition heapam.h:492
#define HEAP_FREEZE_CHECK_XMIN_COMMITTED
Definition heapam.h:137
#define HEAP_INSERT_NO_LOGICAL
Definition heapam.h:39
struct BulkInsertStateData * BulkInsertState
Definition heapam.h:46
const TableAmRoutine * GetHeapamTableAmRoutine(void)
void HeapTupleSetHintBits(HeapTupleHeader tuple, Buffer buffer, uint16 infomask, TransactionId xid)
bool HeapTupleSatisfiesVisibility(HeapTuple htup, Snapshot snapshot, Buffer buffer)
bool HeapTupleIsSurelyDead(HeapTuple htup, GlobalVisState *vistest)
HTSV_Result HeapTupleSatisfiesVacuum(HeapTuple htup, TransactionId OldestXmin, Buffer buffer)
int HeapTupleSatisfiesMVCCBatch(Snapshot snapshot, Buffer buffer, int ntups, BatchMVCCState *batchmvcc, OffsetNumber *vistuples_dense)
bool HeapTupleHeaderIsOnlyLocked(HeapTupleHeader tuple)
TM_Result HeapTupleSatisfiesUpdate(HeapTuple htup, CommandId curcid, Buffer buffer)
#define XLH_INSERT_ON_TOAST_RELATION
Definition heapam_xlog.h:76
#define SizeOfHeapMultiInsert
#define XLOG_HEAP2_MULTI_INSERT
Definition heapam_xlog.h:64
#define SizeOfHeapUpdate
#define XLH_INVALID_XVAC
#define XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED
Definition heapam_xlog.h:87
#define SizeOfHeapVisible
#define XLOG_HEAP_HOT_UPDATE
Definition heapam_xlog.h:37
#define XLOG_HEAP_DELETE
Definition heapam_xlog.h:34
#define XLH_INSERT_IS_SPECULATIVE
Definition heapam_xlog.h:74
#define XLH_LOCK_ALL_FROZEN_CLEARED
#define XLH_DELETE_CONTAINS_OLD_KEY
#define XLH_UPDATE_CONTAINS_NEW_TUPLE
Definition heapam_xlog.h:90
#define XLH_INSERT_LAST_IN_MULTI
Definition heapam_xlog.h:73
#define XLH_INSERT_ALL_FROZEN_SET
Definition heapam_xlog.h:79
#define XLH_FREEZE_XVAC
#define XLOG_HEAP_UPDATE
Definition heapam_xlog.h:35
#define XLHL_XMAX_KEYSHR_LOCK
#define XLH_DELETE_ALL_VISIBLE_CLEARED
#define XLH_UPDATE_CONTAINS_OLD_TUPLE
Definition heapam_xlog.h:88
#define SizeOfHeapNewCid
#define SizeOfHeapLockUpdated
#define XLHL_XMAX_IS_MULTI
#define XLH_INSERT_ALL_VISIBLE_CLEARED
Definition heapam_xlog.h:72
#define SizeOfHeapHeader
#define XLH_DELETE_IS_PARTITION_MOVE
#define MinSizeOfHeapInplace
#define XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED
Definition heapam_xlog.h:85
#define XLHL_XMAX_LOCK_ONLY
#define XLOG_HEAP_INPLACE
Definition heapam_xlog.h:40
#define XLOG_HEAP2_LOCK_UPDATED
Definition heapam_xlog.h:65
#define XLH_UPDATE_SUFFIX_FROM_OLD
Definition heapam_xlog.h:92
#define XLH_UPDATE_PREFIX_FROM_OLD
Definition heapam_xlog.h:91
#define SizeOfMultiInsertTuple
#define XLHL_XMAX_EXCL_LOCK
#define XLOG_HEAP2_NEW_CID
Definition heapam_xlog.h:66
#define XLH_DELETE_CONTAINS_OLD_TUPLE
#define XLOG_HEAP_LOCK
Definition heapam_xlog.h:39
#define XLOG_HEAP_INSERT
Definition heapam_xlog.h:33
#define SizeOfHeapInsert
#define SizeOfHeapDelete
#define XLH_DELETE_IS_SUPER
#define XLH_UPDATE_CONTAINS_OLD_KEY
Definition heapam_xlog.h:89
#define XLHL_KEYS_UPDATED
#define XLOG_HEAP2_VISIBLE
Definition heapam_xlog.h:63
#define XLH_INSERT_CONTAINS_NEW_TUPLE
Definition heapam_xlog.h:75
#define XLOG_HEAP_INIT_PAGE
Definition heapam_xlog.h:47
#define SizeOfHeapConfirm
#define SizeOfHeapLock
#define XLOG_HEAP_CONFIRM
Definition heapam_xlog.h:38
void heap_toast_delete(Relation rel, HeapTuple oldtup, bool is_speculative)
Definition heaptoast.c:43
HeapTuple heap_toast_insert_or_update(Relation rel, HeapTuple newtup, HeapTuple oldtup, int options)
Definition heaptoast.c:96
HeapTuple toast_flatten_tuple(HeapTuple tup, TupleDesc tupleDesc)
Definition heaptoast.c:350
#define TOAST_TUPLE_THRESHOLD
Definition heaptoast.h:48
HeapTuple heap_form_tuple(TupleDesc tupleDescriptor, const Datum *values, const bool *isnull)
Definition heaptuple.c:1117
void heap_deform_tuple(HeapTuple tuple, TupleDesc tupleDesc, Datum *values, bool *isnull)
Definition heaptuple.c:1346
void heap_freetuple(HeapTuple htup)
Definition heaptuple.c:1435
void RelationPutHeapTuple(Relation relation, Buffer buffer, HeapTuple tuple, bool token)
Definition hio.c:35
Buffer RelationGetBufferForTuple(Relation relation, Size len, Buffer otherBuffer, int options, BulkInsertState bistate, Buffer *vmbuffer, Buffer *vmbuffer_other, int num_pages)
Definition hio.c:500
HeapTupleHeaderData * HeapTupleHeader
Definition htup.h:23
#define HEAP_MOVED_OFF
#define HEAP_XMAX_SHR_LOCK
static bool HeapTupleIsHotUpdated(const HeapTupleData *tuple)
#define HEAP_XMIN_FROZEN
static Datum heap_getattr(HeapTuple tup, int attnum, TupleDesc tupleDesc, bool *isnull)
static bool HeapTupleHeaderXminFrozen(const HeapTupleHeaderData *tup)
#define HeapTupleHeaderGetNatts(tup)
static void HeapTupleHeaderSetXminFrozen(HeapTupleHeaderData *tup)
#define SizeofHeapTupleHeader
#define HEAP_KEYS_UPDATED
static bool HEAP_XMAX_IS_SHR_LOCKED(uint16 infomask)
static bool HEAP_XMAX_IS_LOCKED_ONLY(uint16 infomask)
static bool HeapTupleHeaderXminInvalid(const HeapTupleHeaderData *tup)
static void HeapTupleClearHotUpdated(const HeapTupleData *tuple)
static bool HeapTupleHasExternal(const HeapTupleData *tuple)
static TransactionId HeapTupleHeaderGetXvac(const HeapTupleHeaderData *tup)
#define HEAP2_XACT_MASK
static void HeapTupleHeaderSetCmax(HeapTupleHeaderData *tup, CommandId cid, bool iscombo)
#define HEAP_XMAX_LOCK_ONLY
static void HeapTupleHeaderClearHotUpdated(HeapTupleHeaderData *tup)
static void HeapTupleHeaderSetCmin(HeapTupleHeaderData *tup, CommandId cid)
#define HEAP_XMAX_BITS
#define HEAP_LOCK_MASK
static CommandId HeapTupleHeaderGetRawCommandId(const HeapTupleHeaderData *tup)
static TransactionId HeapTupleHeaderGetRawXmax(const HeapTupleHeaderData *tup)
static bool HeapTupleHeaderIsHeapOnly(const HeapTupleHeaderData *tup)
static bool HeapTupleIsHeapOnly(const HeapTupleData *tuple)
#define HEAP_MOVED
static void HeapTupleSetHeapOnly(const HeapTupleData *tuple)
#define HEAP_XMAX_IS_MULTI
static bool HEAP_XMAX_IS_KEYSHR_LOCKED(uint16 infomask)
#define HEAP_XMAX_COMMITTED
static TransactionId HeapTupleHeaderGetXmin(const HeapTupleHeaderData *tup)
#define HEAP_COMBOCID
#define HEAP_XACT_MASK
static bool HeapTupleHeaderIndicatesMovedPartitions(const HeapTupleHeaderData *tup)
static void HeapTupleSetHotUpdated(const HeapTupleData *tuple)
#define HEAP_XMAX_EXCL_LOCK
static bool HeapTupleHeaderIsHotUpdated(const HeapTupleHeaderData *tup)
#define HEAP_XMAX_INVALID
static TransactionId HeapTupleHeaderGetRawXmin(const HeapTupleHeaderData *tup)
static void * GETSTRUCT(const HeapTupleData *tuple)
static void HeapTupleClearHeapOnly(const HeapTupleData *tuple)
#define MaxHeapAttributeNumber
static bool HeapTupleHeaderIsSpeculative(const HeapTupleHeaderData *tup)
static TransactionId HeapTupleHeaderGetUpdateXid(const HeapTupleHeaderData *tup)
#define MaxHeapTuplesPerPage
static bool HEAP_XMAX_IS_EXCL_LOCKED(uint16 infomask)
static void HeapTupleHeaderSetXmin(HeapTupleHeaderData *tup, TransactionId xid)
static bool HEAP_LOCKED_UPGRADED(uint16 infomask)
#define HEAP_UPDATED
#define HEAP_XMAX_KEYSHR_LOCK
static void HeapTupleHeaderSetMovedPartitions(HeapTupleHeaderData *tup)
static void HeapTupleHeaderSetXmax(HeapTupleHeaderData *tup, TransactionId xid)
static bool HeapTupleHeaderXminCommitted(const HeapTupleHeaderData *tup)
#define IsParallelWorker()
Definition parallel.h:62
void index_close(Relation relation, LOCKMODE lockmode)
Definition indexam.c:177
Relation index_open(Oid relationId, LOCKMODE lockmode)
Definition indexam.c:133
int remaining
Definition informix.c:692
#define INJECTION_POINT(name, arg)
void AcceptInvalidationMessages(void)
Definition inval.c:930
int inplaceGetInvalidationMessages(SharedInvalidationMessage **msgs, bool *RelcacheInitFileInval)
Definition inval.c:1088
void PreInplace_Inval(void)
Definition inval.c:1250
void CacheInvalidateHeapTupleInplace(Relation relation, HeapTuple key_equivalent_tuple)
Definition inval.c:1593
void AtInplace_Inval(void)
Definition inval.c:1263
void ForgetInplace_Inval(void)
Definition inval.c:1286
void CacheInvalidateHeapTuple(Relation relation, HeapTuple tuple, HeapTuple newtuple)
Definition inval.c:1571
int b
Definition isn.c:74
int j
Definition isn.c:78
int i
Definition isn.c:77
#define ItemIdGetLength(itemId)
Definition itemid.h:59
#define ItemIdIsNormal(itemId)
Definition itemid.h:99
#define ItemIdGetRedirect(itemId)
Definition itemid.h:78
#define ItemIdIsUsed(itemId)
Definition itemid.h:92
#define ItemIdIsRedirected(itemId)
Definition itemid.h:106
#define ItemIdHasStorage(itemId)
Definition itemid.h:120
int32 ItemPointerCompare(const ItemPointerData *arg1, const ItemPointerData *arg2)
Definition itemptr.c:51
bool ItemPointerEquals(const ItemPointerData *pointer1, const ItemPointerData *pointer2)
Definition itemptr.c:35
static void ItemPointerSet(ItemPointerData *pointer, BlockNumber blockNumber, OffsetNumber offNum)
Definition itemptr.h:135
static void ItemPointerSetInvalid(ItemPointerData *pointer)
Definition itemptr.h:184
static void ItemPointerSetOffsetNumber(ItemPointerData *pointer, OffsetNumber offsetNumber)
Definition itemptr.h:158
static void ItemPointerSetBlockNumber(ItemPointerData *pointer, BlockNumber blockNumber)
Definition itemptr.h:147
static OffsetNumber ItemPointerGetOffsetNumber(const ItemPointerData *pointer)
Definition itemptr.h:124
static bool ItemPointerIndicatesMovedPartitions(const ItemPointerData *pointer)
Definition itemptr.h:197
static BlockNumber ItemPointerGetBlockNumber(const ItemPointerData *pointer)
Definition itemptr.h:103
static BlockNumber ItemPointerGetBlockNumberNoCheck(const ItemPointerData *pointer)
Definition itemptr.h:93
static void ItemPointerCopy(const ItemPointerData *fromPointer, ItemPointerData *toPointer)
Definition itemptr.h:172
static bool ItemPointerIsValid(const ItemPointerData *pointer)
Definition itemptr.h:83
void UnlockTuple(Relation relation, const ItemPointerData *tid, LOCKMODE lockmode)
Definition lmgr.c:601
bool ConditionalXactLockTableWait(TransactionId xid, bool logLockFailure)
Definition lmgr.c:739
void LockTuple(Relation relation, const ItemPointerData *tid, LOCKMODE lockmode)
Definition lmgr.c:562
void XactLockTableWait(TransactionId xid, Relation rel, const ItemPointerData *ctid, XLTW_Oper oper)
Definition lmgr.c:663
XLTW_Oper
Definition lmgr.h:25
@ XLTW_None
Definition lmgr.h:26
@ XLTW_Lock
Definition lmgr.h:29
@ XLTW_Delete
Definition lmgr.h:28
@ XLTW_LockUpdated
Definition lmgr.h:30
@ XLTW_Update
Definition lmgr.h:27
bool LockHeldByMe(const LOCKTAG *locktag, LOCKMODE lockmode, bool orstronger)
Definition lock.c:643
bool DoLockModesConflict(LOCKMODE mode1, LOCKMODE mode2)
Definition lock.c:623
bool log_lock_failures
Definition lock.c:54
#define SET_LOCKTAG_RELATION(locktag, dboid, reloid)
Definition lock.h:183
#define SET_LOCKTAG_TUPLE(locktag, dboid, reloid, blocknum, offnum)
Definition lock.h:219
int LOCKMODE
Definition lockdefs.h:26
#define ShareRowExclusiveLock
Definition lockdefs.h:41
#define AccessShareLock
Definition lockdefs.h:36
#define InplaceUpdateTupleLock
Definition lockdefs.h:48
#define ShareUpdateExclusiveLock
Definition lockdefs.h:39
LockWaitPolicy
Definition lockoptions.h:38
@ LockWaitSkip
Definition lockoptions.h:42
@ LockWaitBlock
Definition lockoptions.h:40
@ LockWaitError
Definition lockoptions.h:44
LockTupleMode
Definition lockoptions.h:51
@ LockTupleExclusive
Definition lockoptions.h:59
@ LockTupleNoKeyExclusive
Definition lockoptions.h:57
@ LockTupleShare
Definition lockoptions.h:55
@ LockTupleKeyShare
Definition lockoptions.h:53
void pfree(void *pointer)
Definition mcxt.c:1616
void * palloc(Size size)
Definition mcxt.c:1387
#define IsBootstrapProcessingMode()
Definition miscadmin.h:477
#define START_CRIT_SECTION()
Definition miscadmin.h:150
#define CHECK_FOR_INTERRUPTS()
Definition miscadmin.h:123
#define IsNormalProcessingMode()
Definition miscadmin.h:479
#define END_CRIT_SECTION()
Definition miscadmin.h:152
MultiXactId MultiXactIdExpand(MultiXactId multi, TransactionId xid, MultiXactStatus status)
Definition multixact.c:352
bool MultiXactIdPrecedes(MultiXactId multi1, MultiXactId multi2)
Definition multixact.c:2765
bool MultiXactIdPrecedesOrEquals(MultiXactId multi1, MultiXactId multi2)
Definition multixact.c:2779
bool MultiXactIdIsRunning(MultiXactId multi, bool isLockOnly)
Definition multixact.c:463
void MultiXactIdSetOldestMember(void)
Definition multixact.c:537
MultiXactId MultiXactIdCreateFromMembers(int nmembers, MultiXactMember *members)
Definition multixact.c:656
MultiXactId MultiXactIdCreate(TransactionId xid1, MultiXactStatus status1, TransactionId xid2, MultiXactStatus status2)
Definition multixact.c:299
int GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members, bool from_pgupgrade, bool isLockOnly)
Definition multixact.c:1113
#define MultiXactIdIsValid(multi)
Definition multixact.h:29
MultiXactStatus
Definition multixact.h:37
@ MultiXactStatusForShare
Definition multixact.h:39
@ MultiXactStatusForNoKeyUpdate
Definition multixact.h:40
@ MultiXactStatusNoKeyUpdate
Definition multixact.h:43
@ MultiXactStatusUpdate
Definition multixact.h:45
@ MultiXactStatusForUpdate
Definition multixact.h:41
@ MultiXactStatusForKeyShare
Definition multixact.h:38
#define ISUPDATE_from_mxstatus(status)
Definition multixact.h:51
#define InvalidMultiXactId
Definition multixact.h:25
#define MaxMultiXactStatus
Definition multixact.h:48
#define InvalidOffsetNumber
Definition off.h:26
#define OffsetNumberIsValid(offsetNumber)
Definition off.h:39
#define OffsetNumberNext(offsetNumber)
Definition off.h:52
uint16 OffsetNumber
Definition off.h:24
#define FirstOffsetNumber
Definition off.h:27
#define OffsetNumberPrev(offsetNumber)
Definition off.h:54
#define MaxOffsetNumber
Definition off.h:28
Datum lower(PG_FUNCTION_ARGS)
Datum upper(PG_FUNCTION_ARGS)
Operator oper(ParseState *pstate, List *opname, Oid ltypeId, Oid rtypeId, bool noError, int location)
Definition parse_oper.c:372
int16 attlen
#define ERRCODE_DATA_CORRUPTED
static uint32 pg_nextpower2_32(uint32 num)
static PgChecksumMode mode
static const struct exclude_list_item skip[]
FormData_pg_class * Form_pg_class
Definition pg_class.h:160
END_CATALOG_STRUCT typedef FormData_pg_database * Form_pg_database
static char buf[DEFAULT_XLOG_SEG_SIZE]
#define pgstat_count_heap_getnext(rel)
Definition pgstat.h:699
#define pgstat_count_heap_scan(rel)
Definition pgstat.h:694
void pgstat_count_heap_update(Relation rel, bool hot, bool newpage)
void pgstat_count_heap_delete(Relation rel)
void pgstat_count_heap_insert(Relation rel, PgStat_Counter n)
#define qsort(a, b, c, d)
Definition port.h:495
static Oid DatumGetObjectId(Datum X)
Definition postgres.h:252
uint64_t Datum
Definition postgres.h:70
static Pointer DatumGetPointer(Datum X)
Definition postgres.h:342
#define InvalidOid
unsigned int Oid
void CheckForSerializableConflictIn(Relation relation, const ItemPointerData *tid, BlockNumber blkno)
Definition predicate.c:4334
void CheckForSerializableConflictOut(Relation relation, TransactionId xid, Snapshot snapshot)
Definition predicate.c:4021
void PredicateLockRelation(Relation relation, Snapshot snapshot)
Definition predicate.c:2574
void PredicateLockTID(Relation relation, const ItemPointerData *tid, Snapshot snapshot, TransactionId tuple_xid)
Definition predicate.c:2619
bool CheckForSerializableConflictOutNeeded(Relation relation, Snapshot snapshot)
Definition predicate.c:3989
static int fb(int x)
#define DELAY_CHKPT_START
Definition proc.h:136
GlobalVisState * GlobalVisTestFor(Relation rel)
Definition procarray.c:4118
bool TransactionIdIsInProgress(TransactionId xid)
Definition procarray.c:1405
void heap_page_prune_opt(Relation relation, Buffer buffer)
Definition pruneheap.c:209
void read_stream_reset(ReadStream *stream)
Buffer read_stream_next_buffer(ReadStream *stream, void **per_buffer_data)
ReadStream * read_stream_begin_relation(int flags, BufferAccessStrategy strategy, Relation rel, ForkNumber forknum, ReadStreamBlockNumberCB callback, void *callback_private_data, size_t per_buffer_data_size)
void read_stream_end(ReadStream *stream)
#define READ_STREAM_USE_BATCHING
Definition read_stream.h:64
BlockNumber(* ReadStreamBlockNumberCB)(ReadStream *stream, void *callback_private_data, void *per_buffer_data)
Definition read_stream.h:77
#define READ_STREAM_DEFAULT
Definition read_stream.h:21
#define READ_STREAM_SEQUENTIAL
Definition read_stream.h:36
#define RelationGetRelid(relation)
Definition rel.h:514
#define RelationIsLogicallyLogged(relation)
Definition rel.h:710
#define RelationGetTargetPageFreeSpace(relation, defaultff)
Definition rel.h:389
#define RelationGetDescr(relation)
Definition rel.h:540
#define RelationGetNumberOfAttributes(relation)
Definition rel.h:520
#define RelationGetRelationName(relation)
Definition rel.h:548
#define RelationIsAccessibleInLogicalDecoding(relation)
Definition rel.h:693
#define RelationNeedsWAL(relation)
Definition rel.h:637
#define RelationUsesLocalBuffers(relation)
Definition rel.h:646
#define HEAP_DEFAULT_FILLFACTOR
Definition rel.h:360
void RelationDecrementReferenceCount(Relation rel)
Definition relcache.c:2195
Bitmapset * RelationGetIndexAttrBitmap(Relation relation, IndexAttrBitmapKind attrKind)
Definition relcache.c:5298
void RelationIncrementReferenceCount(Relation rel)
Definition relcache.c:2182
@ INDEX_ATTR_BITMAP_KEY
Definition relcache.h:69
@ INDEX_ATTR_BITMAP_HOT_BLOCKING
Definition relcache.h:72
@ INDEX_ATTR_BITMAP_SUMMARIZED
Definition relcache.h:73
@ INDEX_ATTR_BITMAP_IDENTITY_KEY
Definition relcache.h:71
ForkNumber
Definition relpath.h:56
@ MAIN_FORKNUM
Definition relpath.h:58
struct ParallelBlockTableScanDescData * ParallelBlockTableScanDesc
Definition relscan.h:103
#define ScanDirectionIsForward(direction)
Definition sdir.h:64
#define ScanDirectionIsBackward(direction)
Definition sdir.h:50
ScanDirection
Definition sdir.h:25
@ ForwardScanDirection
Definition sdir.h:28
TransactionId RecentXmin
Definition snapmgr.c:160
void UnregisterSnapshot(Snapshot snapshot)
Definition snapmgr.c:866
TransactionId TransactionXmin
Definition snapmgr.c:159
bool HaveRegisteredOrActiveSnapshot(void)
Definition snapmgr.c:1644
void InvalidateCatalogSnapshot(void)
Definition snapmgr.c:455
#define IsHistoricMVCCSnapshot(snapshot)
Definition snapmgr.h:59
#define SnapshotAny
Definition snapmgr.h:33
#define InitNonVacuumableSnapshot(snapshotdata, vistestp)
Definition snapmgr.h:50
#define IsMVCCSnapshot(snapshot)
Definition snapmgr.h:55
#define InvalidSnapshot
Definition snapshot.h:119
int get_tablespace_maintenance_io_concurrency(Oid spcid)
Definition spccache.c:230
#define init()
PGPROC * MyProc
Definition proc.c:67
BlockNumber last_free
Definition hio.h:49
BufferAccessStrategy strategy
Definition hio.h:31
uint32 already_extended_by
Definition hio.h:50
BlockNumber next_free
Definition hio.h:48
Buffer current_buf
Definition hio.h:32
MultiXactId NoFreezePageRelminMxid
Definition heapam.h:220
TransactionId FreezePageRelfrozenXid
Definition heapam.h:208
bool freeze_required
Definition heapam.h:182
MultiXactId FreezePageRelminMxid
Definition heapam.h:209
TransactionId NoFreezePageRelfrozenXid
Definition heapam.h:219
BufferAccessStrategy rs_strategy
Definition heapam.h:73
ScanDirection rs_dir
Definition heapam.h:88
uint32 rs_ntuples
Definition heapam.h:99
OffsetNumber rs_coffset
Definition heapam.h:68
Buffer rs_cbuf
Definition heapam.h:70
ParallelBlockTableScanWorkerData * rs_parallelworkerdata
Definition heapam.h:95
BlockNumber rs_startblock
Definition heapam.h:62
HeapTupleData rs_ctup
Definition heapam.h:75
OffsetNumber rs_vistuples[MaxHeapTuplesPerPage]
Definition heapam.h:100
BlockNumber rs_numblocks
Definition heapam.h:63
BlockNumber rs_nblocks
Definition heapam.h:61
ReadStream * rs_read_stream
Definition heapam.h:78
uint32 rs_cindex
Definition heapam.h:98
BlockNumber rs_prefetch_block
Definition heapam.h:89
BlockNumber rs_cblock
Definition heapam.h:69
TableScanDescData rs_base
Definition heapam.h:58
ItemPointerData t_self
Definition htup.h:65
uint32 t_len
Definition htup.h:64
HeapTupleHeader t_data
Definition htup.h:68
Oid t_tableOid
Definition htup.h:66
TransactionId t_xmin
union HeapTupleHeaderData::@51 t_choice
ItemPointerData t_ctid
HeapTupleFields t_heap
int16 npromisingtids
Definition heapam.c:198
LockRelId lockRelId
Definition rel.h:46
Oid relId
Definition rel.h:40
Oid dbId
Definition rel.h:41
TransactionId xid
Definition multixact.h:57
MultiXactStatus status
Definition multixact.h:58
int delayChkptFlags
Definition proc.h:252
LockInfoData rd_lockInfo
Definition rel.h:114
Form_pg_index rd_index
Definition rel.h:192
RelFileLocator rd_locator
Definition rel.h:57
Form_pg_class rd_rel
Definition rel.h:111
bool takenDuringRecovery
Definition snapshot.h:180
TransactionId xmax
Definition tableam.h:150
CommandId cmax
Definition tableam.h:151
ItemPointerData ctid
Definition tableam.h:149
ItemPointerData tid
Definition tableam.h:212
Relation rs_rd
Definition relscan.h:35
uint32 rs_flags
Definition relscan.h:63
struct ScanKeyData * rs_key
Definition relscan.h:38
struct SnapshotData * rs_snapshot
Definition relscan.h:36
struct ParallelTableScanDescData * rs_parallel
Definition relscan.h:65
TransactionId FreezeLimit
Definition vacuum.h:289
TransactionId OldestXmin
Definition vacuum.h:279
TransactionId relfrozenxid
Definition vacuum.h:263
MultiXactId relminmxid
Definition vacuum.h:264
MultiXactId MultiXactCutoff
Definition vacuum.h:290
MultiXactId OldestMxact
Definition vacuum.h:280
Definition c.h:718
OffsetNumber offnum
TransactionId SubTransGetTopmostTransaction(TransactionId xid)
Definition subtrans.c:162
void ss_report_location(Relation rel, BlockNumber location)
Definition syncscan.c:289
BlockNumber ss_get_location(Relation rel, BlockNumber relnblocks)
Definition syncscan.c:254
#define FirstLowInvalidHeapAttributeNumber
Definition sysattr.h:27
#define TableOidAttributeNumber
Definition sysattr.h:26
bool RelationSupportsSysCache(Oid relid)
Definition syscache.c:762
void table_block_parallelscan_startblock_init(Relation rel, ParallelBlockTableScanWorker pbscanwork, ParallelBlockTableScanDesc pbscan, BlockNumber startblock, BlockNumber numblocks)
Definition tableam.c:451
BlockNumber table_block_parallelscan_nextpage(Relation rel, ParallelBlockTableScanWorker pbscanwork, ParallelBlockTableScanDesc pbscan)
Definition tableam.c:546
bool synchronize_seqscans
Definition tableam.c:50
@ SO_ALLOW_STRAT
Definition tableam.h:58
@ SO_TYPE_TIDRANGESCAN
Definition tableam.h:53
@ SO_TEMP_SNAPSHOT
Definition tableam.h:65
@ SO_ALLOW_PAGEMODE
Definition tableam.h:62
@ SO_TYPE_SAMPLESCAN
Definition tableam.h:51
@ SO_ALLOW_SYNC
Definition tableam.h:60
@ SO_TYPE_SEQSCAN
Definition tableam.h:49
@ SO_TYPE_BITMAPSCAN
Definition tableam.h:50
TU_UpdateIndexes
Definition tableam.h:111
@ TU_Summarizing
Definition tableam.h:119
@ TU_All
Definition tableam.h:116
@ TU_None
Definition tableam.h:113
TM_Result
Definition tableam.h:73
@ TM_Ok
Definition tableam.h:78
@ TM_BeingModified
Definition tableam.h:100
@ TM_Deleted
Definition tableam.h:93
@ TM_WouldBlock
Definition tableam.h:103
@ TM_Updated
Definition tableam.h:90
@ TM_SelfModified
Definition tableam.h:84
@ TM_Invisible
Definition tableam.h:81
bool tbm_iterate(TBMIterator *iterator, TBMIterateResult *tbmres)
Definition tidbitmap.c:1614
bool TransactionIdDidCommit(TransactionId transactionId)
Definition transam.c:126
bool TransactionIdDidAbort(TransactionId transactionId)
Definition transam.c:188
static bool TransactionIdFollows(TransactionId id1, TransactionId id2)
Definition transam.h:297
#define InvalidTransactionId
Definition transam.h:31
static bool TransactionIdPrecedesOrEquals(TransactionId id1, TransactionId id2)
Definition transam.h:282
static bool TransactionIdFollowsOrEquals(TransactionId id1, TransactionId id2)
Definition transam.h:312
#define TransactionIdEquals(id1, id2)
Definition transam.h:43
#define TransactionIdIsValid(xid)
Definition transam.h:41
#define TransactionIdIsNormal(xid)
Definition transam.h:42
static bool TransactionIdPrecedes(TransactionId id1, TransactionId id2)
Definition transam.h:263
static CompactAttribute * TupleDescCompactAttr(TupleDesc tupdesc, int i)
Definition tupdesc.h:175
static TupleTableSlot * ExecClearTuple(TupleTableSlot *slot)
Definition tuptable.h:457
static bool HeapKeyTest(HeapTuple tuple, TupleDesc tupdesc, int nkeys, ScanKey keys)
Definition valid.h:28
static bool VARATT_IS_EXTERNAL(const void *PTR)
Definition varatt.h:354
bool visibilitymap_clear(Relation rel, BlockNumber heapBlk, Buffer vmbuf, uint8 flags)
void visibilitymap_pin(Relation rel, BlockNumber heapBlk, Buffer *vmbuf)
void visibilitymap_set_vmbits(BlockNumber heapBlk, Buffer vmBuf, uint8 flags, const RelFileLocator rlocator)
#define VISIBILITYMAP_VALID_BITS
#define VISIBILITYMAP_ALL_FROZEN
#define VISIBILITYMAP_XLOG_CATALOG_REL
#define VISIBILITYMAP_ALL_VISIBLE
TransactionId GetTopTransactionId(void)
Definition xact.c:427
TransactionId GetTopTransactionIdIfAny(void)
Definition xact.c:442
bool TransactionIdIsCurrentTransactionId(TransactionId xid)
Definition xact.c:942
bool IsInParallelMode(void)
Definition xact.c:1090
TransactionId GetCurrentTransactionId(void)
Definition xact.c:455
CommandId GetCurrentCommandId(bool used)
Definition xact.c:830
#define IsolationIsSerializable()
Definition xact.h:53
#define XLOG_INCLUDE_ORIGIN
Definition xlog.h:165
#define XLogHintBitIsNeeded()
Definition xlog.h:122
#define XLogStandbyInfoActive()
Definition xlog.h:125
uint64 XLogRecPtr
Definition xlogdefs.h:21
XLogRecPtr XLogInsert(RmgrId rmid, uint8 info)
Definition xloginsert.c:478
void XLogRegisterBufData(uint8 block_id, const void *data, uint32 len)
Definition xloginsert.c:409
bool XLogCheckBufferNeedsBackup(Buffer buffer)
void XLogRegisterData(const void *data, uint32 len)
Definition xloginsert.c:368
void XLogSetRecordFlags(uint8 flags)
Definition xloginsert.c:460
void XLogRegisterBlock(uint8 block_id, RelFileLocator *rlocator, ForkNumber forknum, BlockNumber blknum, const PageData *page, uint8 flags)
Definition xloginsert.c:313
void XLogRegisterBuffer(uint8 block_id, Buffer buffer, uint8 flags)
Definition xloginsert.c:245
void XLogBeginInsert(void)
Definition xloginsert.c:152
#define REGBUF_STANDARD
Definition xloginsert.h:35
#define REGBUF_NO_IMAGE
Definition xloginsert.h:33
#define REGBUF_KEEP_DATA
Definition xloginsert.h:36
#define REGBUF_WILL_INIT
Definition xloginsert.h:34

◆ FRM_INVALIDATE_XMAX

#define FRM_INVALIDATE_XMAX   0x0002

Definition at line 6733 of file heapam.c.

◆ FRM_MARK_COMMITTED

#define FRM_MARK_COMMITTED   0x0010

Definition at line 6736 of file heapam.c.

◆ FRM_NOOP

#define FRM_NOOP   0x0001

Definition at line 6732 of file heapam.c.

◆ FRM_RETURN_IS_MULTI

#define FRM_RETURN_IS_MULTI   0x0008

Definition at line 6735 of file heapam.c.

◆ FRM_RETURN_IS_XID

#define FRM_RETURN_IS_XID   0x0004

Definition at line 6734 of file heapam.c.

◆ LOCKMODE_from_mxstatus

#define LOCKMODE_from_mxstatus (   status)     (tupleLockExtraInfo[TUPLOCK_from_mxstatus((status))].hwlock)

Definition at line 159 of file heapam.c.

◆ LockTupleTuplock

#define LockTupleTuplock (   rel,
  tup,
  mode 
)     LockTuple((rel), (tup), tupleLockExtraInfo[mode].hwlock)

Definition at line 167 of file heapam.c.

◆ TUPLOCK_from_mxstatus

#define TUPLOCK_from_mxstatus (   status)     (MultiXactStatusLock[(status)])

Definition at line 218 of file heapam.c.

◆ UnlockTupleTuplock

#define UnlockTupleTuplock (   rel,
  tup,
  mode 
)     UnlockTuple((rel), (tup), tupleLockExtraInfo[mode].hwlock)

Definition at line 169 of file heapam.c.

Typedef Documentation

◆ IndexDeleteCounts

Function Documentation

◆ AssertHasSnapshotForToast()

static void AssertHasSnapshotForToast ( Relation  rel)
inlinestatic

Definition at line 225 of file heapam.c.

226{
227#ifdef USE_ASSERT_CHECKING
228
229 /* bootstrap mode in particular breaks this rule */
231 return;
232
233 /* if the relation doesn't have a TOAST table, we are good */
234 if (!OidIsValid(rel->rd_rel->reltoastrelid))
235 return;
236
238
239#endif /* USE_ASSERT_CHECKING */
240}

References Assert, HaveRegisteredOrActiveSnapshot(), IsNormalProcessingMode, OidIsValid, and RelationData::rd_rel.

Referenced by heap_delete(), heap_insert(), heap_multi_insert(), and heap_update().

◆ bitmapheap_stream_read_next()

static BlockNumber bitmapheap_stream_read_next ( ReadStream pgsr,
void private_data,
void per_buffer_data 
)
static

Definition at line 317 of file heapam.c.

319{
320 TBMIterateResult *tbmres = per_buffer_data;
323 TableScanDesc sscan = &hscan->rs_base;
324
325 for (;;)
326 {
328
329 /* no more entries in the bitmap */
330 if (!tbm_iterate(&sscan->st.rs_tbmiterator, tbmres))
331 return InvalidBlockNumber;
332
333 /*
334 * Ignore any claimed entries past what we think is the end of the
335 * relation. It may have been extended after the start of our scan (we
336 * only hold an AccessShareLock, and it could be inserts from this
337 * backend). We don't take this optimization in SERIALIZABLE
338 * isolation though, as we need to examine all invisible tuples
339 * reachable by the index.
340 */
342 tbmres->blockno >= hscan->rs_nblocks)
343 continue;
344
345 return tbmres->blockno;
346 }
347
348 /* not reachable */
349 Assert(false);
350}

References Assert, CHECK_FOR_INTERRUPTS, fb(), InvalidBlockNumber, IsolationIsSerializable, and tbm_iterate().

Referenced by heap_beginscan().

◆ bottomup_nblocksfavorable()

static int bottomup_nblocksfavorable ( IndexDeleteCounts blockgroups,
int  nblockgroups,
TM_IndexDelete deltids 
)
static

Definition at line 8640 of file heapam.c.

8642{
8643 int64 lastblock = -1;
8644 int nblocksfavorable = 0;
8645
8646 Assert(nblockgroups >= 1);
8648
8649 /*
8650 * We tolerate heap blocks that will be accessed only slightly out of
8651 * physical order. Small blips occur when a pair of almost-contiguous
8652 * blocks happen to fall into different buckets (perhaps due only to a
8653 * small difference in npromisingtids that the bucketing scheme didn't
8654 * quite manage to ignore). We effectively ignore these blips by applying
8655 * a small tolerance. The precise tolerance we use is a little arbitrary,
8656 * but it works well enough in practice.
8657 */
8658 for (int b = 0; b < nblockgroups; b++)
8659 {
8660 IndexDeleteCounts *group = blockgroups + b;
8661 TM_IndexDelete *firstdtid = deltids + group->ifirsttid;
8663
8664 if (lastblock != -1 &&
8667 break;
8668
8670 lastblock = block;
8671 }
8672
8673 /* Always indicate that there is at least 1 favorable block */
8675
8676 return nblocksfavorable;
8677}

References Assert, b, BOTTOMUP_MAX_NBLOCKS, BOTTOMUP_TOLERANCE_NBLOCKS, fb(), IndexDeleteCounts::ifirsttid, and ItemPointerGetBlockNumber().

Referenced by bottomup_sort_and_shrink().

◆ bottomup_sort_and_shrink()

static int bottomup_sort_and_shrink ( TM_IndexDeleteOp delstate)
static

Definition at line 8756 of file heapam.c.

8757{
8761 int nblockgroups = 0;
8762 int ncopied = 0;
8763 int nblocksfavorable = 0;
8764
8765 Assert(delstate->bottomup);
8766 Assert(delstate->ndeltids > 0);
8767
8768 /* Calculate per-heap-block count of TIDs */
8770 for (int i = 0; i < delstate->ndeltids; i++)
8771 {
8772 TM_IndexDelete *ideltid = &delstate->deltids[i];
8773 TM_IndexStatus *istatus = delstate->status + ideltid->id;
8774 ItemPointer htid = &ideltid->tid;
8775 bool promising = istatus->promising;
8776
8778 {
8779 /* New block group */
8780 nblockgroups++;
8781
8784
8786 blockgroups[nblockgroups - 1].ifirsttid = i;
8787 blockgroups[nblockgroups - 1].ntids = 1;
8788 blockgroups[nblockgroups - 1].npromisingtids = 0;
8789 }
8790 else
8791 {
8792 blockgroups[nblockgroups - 1].ntids++;
8793 }
8794
8795 if (promising)
8796 blockgroups[nblockgroups - 1].npromisingtids++;
8797 }
8798
8799 /*
8800 * We're about ready to sort block groups to determine the optimal order
8801 * for visiting heap blocks. But before we do, round the number of
8802 * promising tuples for each block group up to the next power-of-two,
8803 * unless it is very low (less than 4), in which case we round up to 4.
8804 * npromisingtids is far too noisy to trust when choosing between a pair
8805 * of block groups that both have very low values.
8806 *
8807 * This scheme divides heap blocks/block groups into buckets. Each bucket
8808 * contains blocks that have _approximately_ the same number of promising
8809 * TIDs as each other. The goal is to ignore relatively small differences
8810 * in the total number of promising entries, so that the whole process can
8811 * give a little weight to heapam factors (like heap block locality)
8812 * instead. This isn't a trade-off, really -- we have nothing to lose. It
8813 * would be foolish to interpret small differences in npromisingtids
8814 * values as anything more than noise.
8815 *
8816 * We tiebreak on nhtids when sorting block group subsets that have the
8817 * same npromisingtids, but this has the same issues as npromisingtids,
8818 * and so nhtids is subject to the same power-of-two bucketing scheme. The
8819 * only reason that we don't fix nhtids in the same way here too is that
8820 * we'll need accurate nhtids values after the sort. We handle nhtids
8821 * bucketization dynamically instead (in the sort comparator).
8822 *
8823 * See bottomup_nblocksfavorable() for a full explanation of when and how
8824 * heap locality/favorable blocks can significantly influence when and how
8825 * heap blocks are accessed.
8826 */
8827 for (int b = 0; b < nblockgroups; b++)
8828 {
8829 IndexDeleteCounts *group = blockgroups + b;
8830
8831 /* Better off falling back on nhtids with low npromisingtids */
8832 if (group->npromisingtids <= 4)
8833 group->npromisingtids = 4;
8834 else
8835 group->npromisingtids =
8837 }
8838
8839 /* Sort groups and rearrange caller's deltids array */
8842 reordereddeltids = palloc(delstate->ndeltids * sizeof(TM_IndexDelete));
8843
8845 /* Determine number of favorable blocks at the start of final deltids */
8847 delstate->deltids);
8848
8849 for (int b = 0; b < nblockgroups; b++)
8850 {
8851 IndexDeleteCounts *group = blockgroups + b;
8852 TM_IndexDelete *firstdtid = delstate->deltids + group->ifirsttid;
8853
8855 sizeof(TM_IndexDelete) * group->ntids);
8856 ncopied += group->ntids;
8857 }
8858
8859 /* Copy final grouped and sorted TIDs back into start of caller's array */
8861 sizeof(TM_IndexDelete) * ncopied);
8862 delstate->ndeltids = ncopied;
8863
8866
8867 return nblocksfavorable;
8868}

References Assert, b, BlockNumberIsValid(), BOTTOMUP_MAX_NBLOCKS, bottomup_nblocksfavorable(), bottomup_sort_and_shrink_cmp(), fb(), i, IndexDeleteCounts::ifirsttid, InvalidBlockNumber, ItemPointerGetBlockNumber(), Min, IndexDeleteCounts::npromisingtids, IndexDeleteCounts::ntids, palloc(), palloc_array, pfree(), pg_nextpower2_32(), and qsort.

Referenced by heap_index_delete_tuples().

◆ bottomup_sort_and_shrink_cmp()

static int bottomup_sort_and_shrink_cmp ( const void arg1,
const void arg2 
)
static

Definition at line 8683 of file heapam.c.

8684{
8687
8688 /*
8689 * Most significant field is npromisingtids (which we invert the order of
8690 * so as to sort in desc order).
8691 *
8692 * Caller should have already normalized npromisingtids fields into
8693 * power-of-two values (buckets).
8694 */
8695 if (group1->npromisingtids > group2->npromisingtids)
8696 return -1;
8697 if (group1->npromisingtids < group2->npromisingtids)
8698 return 1;
8699
8700 /*
8701 * Tiebreak: desc ntids sort order.
8702 *
8703 * We cannot expect power-of-two values for ntids fields. We should
8704 * behave as if they were already rounded up for us instead.
8705 */
8706 if (group1->ntids != group2->ntids)
8707 {
8710
8711 if (ntids1 > ntids2)
8712 return -1;
8713 if (ntids1 < ntids2)
8714 return 1;
8715 }
8716
8717 /*
8718 * Tiebreak: asc offset-into-deltids-for-block (offset to first TID for
8719 * block in deltids array) order.
8720 *
8721 * This is equivalent to sorting in ascending heap block number order
8722 * (among otherwise equal subsets of the array). This approach allows us
8723 * to avoid accessing the out-of-line TID. (We rely on the assumption
8724 * that the deltids array was sorted in ascending heap TID order when
8725 * these offsets to the first TID from each heap block group were formed.)
8726 */
8727 if (group1->ifirsttid > group2->ifirsttid)
8728 return 1;
8729 if (group1->ifirsttid < group2->ifirsttid)
8730 return -1;
8731
8733
8734 return 0;
8735}

References fb(), pg_nextpower2_32(), and pg_unreachable.

Referenced by bottomup_sort_and_shrink().

◆ compute_infobits()

◆ compute_new_xmax_infomask()

static void compute_new_xmax_infomask ( TransactionId  xmax,
uint16  old_infomask,
uint16  old_infomask2,
TransactionId  add_to_xmax,
LockTupleMode  mode,
bool  is_update,
TransactionId result_xmax,
uint16 result_infomask,
uint16 result_infomask2 
)
static

Definition at line 5395 of file heapam.c.

5400{
5401 TransactionId new_xmax;
5404
5406
5407l5:
5408 new_infomask = 0;
5409 new_infomask2 = 0;
5411 {
5412 /*
5413 * No previous locker; we just insert our own TransactionId.
5414 *
5415 * Note that it's critical that this case be the first one checked,
5416 * because there are several blocks below that come back to this one
5417 * to implement certain optimizations; old_infomask might contain
5418 * other dirty bits in those cases, but we don't really care.
5419 */
5420 if (is_update)
5421 {
5422 new_xmax = add_to_xmax;
5423 if (mode == LockTupleExclusive)
5425 }
5426 else
5427 {
5429 switch (mode)
5430 {
5431 case LockTupleKeyShare:
5432 new_xmax = add_to_xmax;
5434 break;
5435 case LockTupleShare:
5436 new_xmax = add_to_xmax;
5438 break;
5440 new_xmax = add_to_xmax;
5442 break;
5443 case LockTupleExclusive:
5444 new_xmax = add_to_xmax;
5447 break;
5448 default:
5449 new_xmax = InvalidTransactionId; /* silence compiler */
5450 elog(ERROR, "invalid lock mode");
5451 }
5452 }
5453 }
5455 {
5457
5458 /*
5459 * Currently we don't allow XMAX_COMMITTED to be set for multis, so
5460 * cross-check.
5461 */
5463
5464 /*
5465 * A multixact together with LOCK_ONLY set but neither lock bit set
5466 * (i.e. a pg_upgraded share locked tuple) cannot possibly be running
5467 * anymore. This check is critical for databases upgraded by
5468 * pg_upgrade; both MultiXactIdIsRunning and MultiXactIdExpand assume
5469 * that such multis are never passed.
5470 */
5472 {
5475 goto l5;
5476 }
5477
5478 /*
5479 * If the XMAX is already a MultiXactId, then we need to expand it to
5480 * include add_to_xmax; but if all the members were lockers and are
5481 * all gone, we can do away with the IS_MULTI bit and just set
5482 * add_to_xmax as the only locker/updater. If all lockers are gone
5483 * and we have an updater that aborted, we can also do without a
5484 * multi.
5485 *
5486 * The cost of doing GetMultiXactIdMembers would be paid by
5487 * MultiXactIdExpand if we weren't to do this, so this check is not
5488 * incurring extra work anyhow.
5489 */
5491 {
5494 old_infomask)))
5495 {
5496 /*
5497 * Reset these bits and restart; otherwise fall through to
5498 * create a new multi below.
5499 */
5502 goto l5;
5503 }
5504 }
5505
5507
5508 new_xmax = MultiXactIdExpand((MultiXactId) xmax, add_to_xmax,
5509 new_status);
5511 }
5513 {
5514 /*
 5515 * It's a committed update, so we need to preserve it as the updater
 5516 * of the tuple.
5517 */
5518 MultiXactStatus status;
5520
5522 status = MultiXactStatusUpdate;
5523 else
5525
5527
5528 /*
5529 * since it's not running, it's obviously impossible for the old
5530 * updater to be identical to the current one, so we need not check
5531 * for that case as we do in the block above.
5532 */
5533 new_xmax = MultiXactIdCreate(xmax, status, add_to_xmax, new_status);
5535 }
5536 else if (TransactionIdIsInProgress(xmax))
5537 {
5538 /*
5539 * If the XMAX is a valid, in-progress TransactionId, then we need to
5540 * create a new MultiXactId that includes both the old locker or
5541 * updater and our own TransactionId.
5542 */
5546
5548 {
5554 {
5557 else
5559 }
5560 else
5561 {
5562 /*
5563 * LOCK_ONLY can be present alone only when a page has been
5564 * upgraded by pg_upgrade. But in that case,
5565 * TransactionIdIsInProgress() should have returned false. We
5566 * assume it's no longer locked in this case.
5567 */
5568 elog(WARNING, "LOCK_ONLY found for Xid in progress %u", xmax);
5571 goto l5;
5572 }
5573 }
5574 else
5575 {
5576 /* it's an update, but which kind? */
5579 else
5581 }
5582
5584
5585 /*
5586 * If the lock to be acquired is for the same TransactionId as the
5587 * existing lock, there's an optimization possible: consider only the
5588 * strongest of both locks as the only one present, and restart.
5589 */
5590 if (xmax == add_to_xmax)
5591 {
5592 /*
5593 * Note that it's not possible for the original tuple to be
5594 * updated: we wouldn't be here because the tuple would have been
5595 * invisible and we wouldn't try to update it. As a subtlety,
5596 * this code can also run when traversing an update chain to lock
5597 * future versions of a tuple. But we wouldn't be here either,
5598 * because the add_to_xmax would be different from the original
5599 * updater.
5600 */
5602
5603 /* acquire the strongest of both */
5604 if (mode < old_mode)
5605 mode = old_mode;
5606 /* mustn't touch is_update */
5607
5609 goto l5;
5610 }
5611
5612 /* otherwise, just fall back to creating a new multixact */
5614 new_xmax = MultiXactIdCreate(xmax, old_status,
5617 }
5620 {
 5621 /*
 5622 * It's a committed update, so we have to preserve it as the updater
 5623 * of the tuple.
5624 */
5625 MultiXactStatus status;
5627
5629 status = MultiXactStatusUpdate;
5630 else
5632
5634
5635 /*
5636 * since it's not running, it's obviously impossible for the old
5637 * updater to be identical to the current one, so we need not check
5638 * for that case as we do in the block above.
5639 */
5640 new_xmax = MultiXactIdCreate(xmax, status, add_to_xmax, new_status);
5642 }
5643 else
5644 {
5645 /*
5646 * Can get here iff the locking/updating transaction was running when
5647 * the infomask was extracted from the tuple, but finished before
5648 * TransactionIdIsInProgress got to run. Deal with it as if there was
5649 * no locker at all in the first place.
5650 */
5652 goto l5;
5653 }
5654
5657 *result_xmax = new_xmax;
5658}

References Assert, elog, ERROR, fb(), get_mxact_status_for_lock(), GetMultiXactIdHintBits(), HEAP_KEYS_UPDATED, HEAP_LOCKED_UPGRADED(), HEAP_XMAX_COMMITTED, HEAP_XMAX_EXCL_LOCK, HEAP_XMAX_INVALID, HEAP_XMAX_IS_EXCL_LOCKED(), HEAP_XMAX_IS_KEYSHR_LOCKED(), HEAP_XMAX_IS_LOCKED_ONLY(), HEAP_XMAX_IS_MULTI, HEAP_XMAX_IS_SHR_LOCKED(), HEAP_XMAX_KEYSHR_LOCK, HEAP_XMAX_LOCK_ONLY, HEAP_XMAX_SHR_LOCK, InvalidTransactionId, LockTupleExclusive, LockTupleKeyShare, LockTupleNoKeyExclusive, LockTupleShare, mode, MultiXactIdCreate(), MultiXactIdExpand(), MultiXactIdGetUpdateXid(), MultiXactIdIsRunning(), MultiXactStatusForKeyShare, MultiXactStatusForNoKeyUpdate, MultiXactStatusForShare, MultiXactStatusForUpdate, MultiXactStatusNoKeyUpdate, MultiXactStatusUpdate, TransactionIdDidCommit(), TransactionIdIsCurrentTransactionId(), TransactionIdIsInProgress(), TUPLOCK_from_mxstatus, and WARNING.

Referenced by heap_delete(), heap_lock_tuple(), heap_lock_updated_tuple_rec(), and heap_update().

◆ ConditionalMultiXactIdWait()

static bool ConditionalMultiXactIdWait ( MultiXactId  multi,
MultiXactStatus  status,
uint16  infomask,
Relation  rel,
int remaining,
bool  logLockFailure 
)
static

Definition at line 7876 of file heapam.c.

7879{
7880 return Do_MultiXactIdWait(multi, status, infomask, true,
7882}

References Do_MultiXactIdWait(), fb(), remaining, and XLTW_None.

Referenced by heap_lock_tuple().

◆ Do_MultiXactIdWait()

static bool Do_MultiXactIdWait ( MultiXactId  multi,
MultiXactStatus  status,
uint16  infomask,
bool  nowait,
Relation  rel,
const ItemPointerData ctid,
XLTW_Oper  oper,
int remaining,
bool  logLockFailure 
)
static

Definition at line 7776 of file heapam.c.

7780{
7781 bool result = true;
7782 MultiXactMember *members;
7783 int nmembers;
7784 int remain = 0;
7785
7786 /* for pre-pg_upgrade tuples, no need to sleep at all */
7787 nmembers = HEAP_LOCKED_UPGRADED(infomask) ? -1 :
7788 GetMultiXactIdMembers(multi, &members, false,
7790
7791 if (nmembers >= 0)
7792 {
7793 int i;
7794
7795 for (i = 0; i < nmembers; i++)
7796 {
7797 TransactionId memxid = members[i].xid;
7798 MultiXactStatus memstatus = members[i].status;
7799
7801 {
7802 remain++;
7803 continue;
7804 }
7805
7807 LOCKMODE_from_mxstatus(status)))
7808 {
7810 remain++;
7811 continue;
7812 }
7813
7814 /*
7815 * This member conflicts with our multi, so we have to sleep (or
7816 * return failure, if asked to avoid waiting.)
7817 *
7818 * Note that we don't set up an error context callback ourselves,
7819 * but instead we pass the info down to XactLockTableWait. This
7820 * might seem a bit wasteful because the context is set up and
 7821 * torn down for each member of the multixact, but in reality it
7822 * should be barely noticeable, and it avoids duplicate code.
7823 */
7824 if (nowait)
7825 {
7827 if (!result)
7828 break;
7829 }
7830 else
7831 XactLockTableWait(memxid, rel, ctid, oper);
7832 }
7833
7834 pfree(members);
7835 }
7836
7837 if (remaining)
7838 *remaining = remain;
7839
7840 return result;
7841}

References ConditionalXactLockTableWait(), DoLockModesConflict(), fb(), GetMultiXactIdMembers(), HEAP_LOCKED_UPGRADED(), HEAP_XMAX_IS_LOCKED_ONLY(), i, LOCKMODE_from_mxstatus, oper(), pfree(), remaining, MultiXactMember::status, TransactionIdIsCurrentTransactionId(), TransactionIdIsInProgress(), XactLockTableWait(), and MultiXactMember::xid.

Referenced by ConditionalMultiXactIdWait(), and MultiXactIdWait().

◆ DoesMultiXactIdConflict()

static bool DoesMultiXactIdConflict ( MultiXactId  multi,
uint16  infomask,
LockTupleMode  lockmode,
bool current_is_member 
)
static

Definition at line 7676 of file heapam.c.

7678{
7679 int nmembers;
7680 MultiXactMember *members;
7681 bool result = false;
7682 LOCKMODE wanted = tupleLockExtraInfo[lockmode].hwlock;
7683
7685 return false;
7686
7687 nmembers = GetMultiXactIdMembers(multi, &members, false,
7689 if (nmembers >= 0)
7690 {
7691 int i;
7692
7693 for (i = 0; i < nmembers; i++)
7694 {
7697
7698 if (result && (current_is_member == NULL || *current_is_member))
7699 break;
7700
7701 memlockmode = LOCKMODE_from_mxstatus(members[i].status);
7702
7703 /* ignore members from current xact (but track their presence) */
7704 memxid = members[i].xid;
7706 {
7707 if (current_is_member != NULL)
7708 *current_is_member = true;
7709 continue;
7710 }
7711 else if (result)
7712 continue;
7713
7714 /* ignore members that don't conflict with the lock we want */
7716 continue;
7717
7718 if (ISUPDATE_from_mxstatus(members[i].status))
7719 {
7720 /* ignore aborted updaters */
7722 continue;
7723 }
7724 else
7725 {
7726 /* ignore lockers-only that are no longer in progress */
7728 continue;
7729 }
7730
7731 /*
7732 * Whatever remains are either live lockers that conflict with our
7733 * wanted lock, and updaters that are not aborted. Those conflict
7734 * with what we want. Set up to return true, but keep going to
7735 * look for the current transaction among the multixact members,
7736 * if needed.
7737 */
7738 result = true;
7739 }
7740 pfree(members);
7741 }
7742
7743 return result;
7744}

References DoLockModesConflict(), fb(), GetMultiXactIdMembers(), HEAP_LOCKED_UPGRADED(), HEAP_XMAX_IS_LOCKED_ONLY(), i, ISUPDATE_from_mxstatus, LOCKMODE_from_mxstatus, pfree(), TransactionIdDidAbort(), TransactionIdIsCurrentTransactionId(), TransactionIdIsInProgress(), tupleLockExtraInfo, and MultiXactMember::xid.

Referenced by heap_delete(), heap_inplace_lock(), heap_lock_tuple(), and heap_update().

◆ ExtractReplicaIdentity()

static HeapTuple ExtractReplicaIdentity ( Relation  relation,
HeapTuple  tp,
bool  key_required,
bool copy 
)
static

Definition at line 9222 of file heapam.c.

9224{
9225 TupleDesc desc = RelationGetDescr(relation);
9226 char replident = relation->rd_rel->relreplident;
9229 bool nulls[MaxHeapAttributeNumber];
9231
9232 *copy = false;
9233
9234 if (!RelationIsLogicallyLogged(relation))
9235 return NULL;
9236
9237 if (replident == REPLICA_IDENTITY_NOTHING)
9238 return NULL;
9239
9240 if (replident == REPLICA_IDENTITY_FULL)
9241 {
9242 /*
9243 * When logging the entire old tuple, it very well could contain
9244 * toasted columns. If so, force them to be inlined.
9245 */
9246 if (HeapTupleHasExternal(tp))
9247 {
9248 *copy = true;
9249 tp = toast_flatten_tuple(tp, desc);
9250 }
9251 return tp;
9252 }
9253
9254 /* if the key isn't required and we're only logging the key, we're done */
9255 if (!key_required)
9256 return NULL;
9257
9258 /* find out the replica identity columns */
9261
9262 /*
9263 * If there's no defined replica identity columns, treat as !key_required.
9264 * (This case should not be reachable from heap_update, since that should
9265 * calculate key_required accurately. But heap_delete just passes
9266 * constant true for key_required, so we can hit this case in deletes.)
9267 */
9268 if (bms_is_empty(idattrs))
9269 return NULL;
9270
9271 /*
9272 * Construct a new tuple containing only the replica identity columns,
9273 * with nulls elsewhere. While we're at it, assert that the replica
9274 * identity columns aren't null.
9275 */
9276 heap_deform_tuple(tp, desc, values, nulls);
9277
9278 for (int i = 0; i < desc->natts; i++)
9279 {
9281 idattrs))
9282 Assert(!nulls[i]);
9283 else
9284 nulls[i] = true;
9285 }
9286
9287 key_tuple = heap_form_tuple(desc, values, nulls);
9288 *copy = true;
9289
9291
9292 /*
9293 * If the tuple, which by here only contains indexed columns, still has
9294 * toasted columns, force them to be inlined. This is somewhat unlikely
9295 * since there's limits on the size of indexed columns, so we don't
 9296 * duplicate toast_flatten_tuple()'s functionality in the above loop over
9297 * the indexed columns, even if it would be more efficient.
9298 */
9300 {
9302
9305 }
9306
9307 return key_tuple;
9308}

References Assert, bms_free(), bms_is_empty, bms_is_member(), fb(), FirstLowInvalidHeapAttributeNumber, heap_deform_tuple(), heap_form_tuple(), heap_freetuple(), HeapTupleHasExternal(), i, INDEX_ATTR_BITMAP_IDENTITY_KEY, MaxHeapAttributeNumber, TupleDescData::natts, RelationData::rd_rel, RelationGetDescr, RelationGetIndexAttrBitmap(), RelationIsLogicallyLogged, toast_flatten_tuple(), and values.

Referenced by heap_delete(), and heap_update().

◆ FreeBulkInsertState()

◆ FreezeMultiXactId()

static TransactionId FreezeMultiXactId ( MultiXactId  multi,
uint16  t_infomask,
const struct VacuumCutoffs cutoffs,
uint16 flags,
HeapPageFreeze pagefrz 
)
static

Definition at line 6785 of file heapam.c.

6788{
6790 MultiXactMember *members;
6791 int nmembers;
6792 bool need_replace;
6793 int nnewmembers;
6795 bool has_lockers;
6797 bool update_committed;
6798 TransactionId FreezePageRelfrozenXid;
6799
6800 *flags = 0;
6801
6802 /* We should only be called in Multis */
6803 Assert(t_infomask & HEAP_XMAX_IS_MULTI);
6804
6805 if (!MultiXactIdIsValid(multi) ||
6806 HEAP_LOCKED_UPGRADED(t_infomask))
6807 {
6808 *flags |= FRM_INVALIDATE_XMAX;
6809 pagefrz->freeze_required = true;
6810 return InvalidTransactionId;
6811 }
6812 else if (MultiXactIdPrecedes(multi, cutoffs->relminmxid))
6813 ereport(ERROR,
6815 errmsg_internal("found multixact %u from before relminmxid %u",
6816 multi, cutoffs->relminmxid)));
6817 else if (MultiXactIdPrecedes(multi, cutoffs->OldestMxact))
6818 {
6820
6821 /*
6822 * This old multi cannot possibly have members still running, but
6823 * verify just in case. If it was a locker only, it can be removed
6824 * without any further consideration; but if it contained an update,
6825 * we might need to preserve it.
6826 */
6827 if (MultiXactIdIsRunning(multi,
6828 HEAP_XMAX_IS_LOCKED_ONLY(t_infomask)))
6829 ereport(ERROR,
6831 errmsg_internal("multixact %u from before multi freeze cutoff %u found to be still running",
6832 multi, cutoffs->OldestMxact)));
6833
6834 if (HEAP_XMAX_IS_LOCKED_ONLY(t_infomask))
6835 {
6836 *flags |= FRM_INVALIDATE_XMAX;
6837 pagefrz->freeze_required = true;
6838 return InvalidTransactionId;
6839 }
6840
6841 /* replace multi with single XID for its updater? */
6842 update_xact = MultiXactIdGetUpdateXid(multi, t_infomask);
6844 ereport(ERROR,
6846 errmsg_internal("multixact %u contains update XID %u from before relfrozenxid %u",
6847 multi, update_xact,
6848 cutoffs->relfrozenxid)));
6849 else if (TransactionIdPrecedes(update_xact, cutoffs->OldestXmin))
6850 {
6851 /*
6852 * Updater XID has to have aborted (otherwise the tuple would have
6853 * been pruned away instead, since updater XID is < OldestXmin).
6854 * Just remove xmax.
6855 */
6857 ereport(ERROR,
6859 errmsg_internal("multixact %u contains committed update XID %u from before removable cutoff %u",
6860 multi, update_xact,
6861 cutoffs->OldestXmin)));
6862 *flags |= FRM_INVALIDATE_XMAX;
6863 pagefrz->freeze_required = true;
6864 return InvalidTransactionId;
6865 }
6866
6867 /* Have to keep updater XID as new xmax */
6868 *flags |= FRM_RETURN_IS_XID;
6869 pagefrz->freeze_required = true;
6870 return update_xact;
6871 }
6872
6873 /*
6874 * Some member(s) of this Multi may be below FreezeLimit xid cutoff, so we
6875 * need to walk the whole members array to figure out what to do, if
6876 * anything.
6877 */
6878 nmembers =
6879 GetMultiXactIdMembers(multi, &members, false,
6880 HEAP_XMAX_IS_LOCKED_ONLY(t_infomask));
6881 if (nmembers <= 0)
6882 {
6883 /* Nothing worth keeping */
6884 *flags |= FRM_INVALIDATE_XMAX;
6885 pagefrz->freeze_required = true;
6886 return InvalidTransactionId;
6887 }
6888
6889 /*
6890 * The FRM_NOOP case is the only case where we might need to ratchet back
6891 * FreezePageRelfrozenXid or FreezePageRelminMxid. It is also the only
6892 * case where our caller might ratchet back its NoFreezePageRelfrozenXid
6893 * or NoFreezePageRelminMxid "no freeze" trackers to deal with a multi.
6894 * FRM_NOOP handling should result in the NewRelfrozenXid/NewRelminMxid
6895 * trackers managed by VACUUM being ratcheting back by xmax to the degree
6896 * required to make it safe to leave xmax undisturbed, independent of
6897 * whether or not page freezing is triggered somewhere else.
6898 *
6899 * Our policy is to force freezing in every case other than FRM_NOOP,
6900 * which obviates the need to maintain either set of trackers, anywhere.
6901 * Every other case will reliably execute a freeze plan for xmax that
6902 * either replaces xmax with an XID/MXID >= OldestXmin/OldestMxact, or
6903 * sets xmax to an InvalidTransactionId XID, rendering xmax fully frozen.
6904 * (VACUUM's NewRelfrozenXid/NewRelminMxid trackers are initialized with
6905 * OldestXmin/OldestMxact, so later values never need to be tracked here.)
6906 */
6907 need_replace = false;
6908 FreezePageRelfrozenXid = pagefrz->FreezePageRelfrozenXid;
6909 for (int i = 0; i < nmembers; i++)
6910 {
6911 TransactionId xid = members[i].xid;
6912
6913 Assert(!TransactionIdPrecedes(xid, cutoffs->relfrozenxid));
6914
6915 if (TransactionIdPrecedes(xid, cutoffs->FreezeLimit))
6916 {
6917 /* Can't violate the FreezeLimit postcondition */
6918 need_replace = true;
6919 break;
6920 }
6921 if (TransactionIdPrecedes(xid, FreezePageRelfrozenXid))
6922 FreezePageRelfrozenXid = xid;
6923 }
6924
6925 /* Can't violate the MultiXactCutoff postcondition, either */
6926 if (!need_replace)
6928
6929 if (!need_replace)
6930 {
6931 /*
6932 * vacuumlazy.c might ratchet back NewRelminMxid, NewRelfrozenXid, or
6933 * both together to make it safe to retain this particular multi after
6934 * freezing its page
6935 */
6936 *flags |= FRM_NOOP;
6937 pagefrz->FreezePageRelfrozenXid = FreezePageRelfrozenXid;
6938 if (MultiXactIdPrecedes(multi, pagefrz->FreezePageRelminMxid))
6939 pagefrz->FreezePageRelminMxid = multi;
6940 pfree(members);
6941 return multi;
6942 }
6943
6944 /*
6945 * Do a more thorough second pass over the multi to figure out which
6946 * member XIDs actually need to be kept. Checking the precise status of
6947 * individual members might even show that we don't need to keep anything.
6948 * That is quite possible even though the Multi must be >= OldestMxact,
6949 * since our second pass only keeps member XIDs when it's truly necessary;
6950 * even member XIDs >= OldestXmin often won't be kept by second pass.
6951 */
6952 nnewmembers = 0;
6954 has_lockers = false;
6956 update_committed = false;
6957
6958 /*
6959 * Determine whether to keep each member xid, or to ignore it instead
6960 */
6961 for (int i = 0; i < nmembers; i++)
6962 {
6963 TransactionId xid = members[i].xid;
6964 MultiXactStatus mstatus = members[i].status;
6965
6966 Assert(!TransactionIdPrecedes(xid, cutoffs->relfrozenxid));
6967
6968 if (!ISUPDATE_from_mxstatus(mstatus))
6969 {
6970 /*
6971 * Locker XID (not updater XID). We only keep lockers that are
6972 * still running.
6973 */
6976 {
6977 if (TransactionIdPrecedes(xid, cutoffs->OldestXmin))
6978 ereport(ERROR,
6980 errmsg_internal("multixact %u contains running locker XID %u from before removable cutoff %u",
6981 multi, xid,
6982 cutoffs->OldestXmin)));
6983 newmembers[nnewmembers++] = members[i];
6984 has_lockers = true;
6985 }
6986
6987 continue;
6988 }
6989
6990 /*
6991 * Updater XID (not locker XID). Should we keep it?
6992 *
6993 * Since the tuple wasn't totally removed when vacuum pruned, the
6994 * update Xid cannot possibly be older than OldestXmin cutoff unless
6995 * the updater XID aborted. If the updater transaction is known
6996 * aborted or crashed then it's okay to ignore it, otherwise not.
6997 *
6998 * In any case the Multi should never contain two updaters, whatever
6999 * their individual commit status. Check for that first, in passing.
7000 */
7002 ereport(ERROR,
7004 errmsg_internal("multixact %u has two or more updating members",
7005 multi),
7006 errdetail_internal("First updater XID=%u second updater XID=%u.",
7007 update_xid, xid)));
7008
7009 /*
7010 * As with all tuple visibility routines, it's critical to test
7011 * TransactionIdIsInProgress before TransactionIdDidCommit, because of
7012 * race conditions explained in detail in heapam_visibility.c.
7013 */
7016 update_xid = xid;
7017 else if (TransactionIdDidCommit(xid))
7018 {
7019 /*
7020 * The transaction committed, so we can tell caller to set
7021 * HEAP_XMAX_COMMITTED. (We can only do this because we know the
7022 * transaction is not running.)
7023 */
7024 update_committed = true;
7025 update_xid = xid;
7026 }
7027 else
7028 {
7029 /*
7030 * Not in progress, not committed -- must be aborted or crashed;
7031 * we can ignore it.
7032 */
7033 continue;
7034 }
7035
7036 /*
7037 * We determined that updater must be kept -- add it to pending new
7038 * members list
7039 */
7040 if (TransactionIdPrecedes(xid, cutoffs->OldestXmin))
7041 ereport(ERROR,
7043 errmsg_internal("multixact %u contains committed update XID %u from before removable cutoff %u",
7044 multi, xid, cutoffs->OldestXmin)));
7045 newmembers[nnewmembers++] = members[i];
7046 }
7047
7048 pfree(members);
7049
7050 /*
7051 * Determine what to do with caller's multi based on information gathered
7052 * during our second pass
7053 */
7054 if (nnewmembers == 0)
7055 {
7056 /* Nothing worth keeping */
7057 *flags |= FRM_INVALIDATE_XMAX;
7059 }
7061 {
7062 /*
7063 * If there's a single member and it's an update, pass it back alone
7064 * without creating a new Multi. (XXX we could do this when there's a
7065 * single remaining locker, too, but that would complicate the API too
7066 * much; moreover, the case with the single updater is more
7067 * interesting, because those are longer-lived.)
7068 */
7069 Assert(nnewmembers == 1);
7070 *flags |= FRM_RETURN_IS_XID;
7071 if (update_committed)
7072 *flags |= FRM_MARK_COMMITTED;
7074 }
7075 else
7076 {
7077 /*
7078 * Create a new multixact with the surviving members of the previous
7079 * one, to set as new Xmax in the tuple
7080 */
7082 *flags |= FRM_RETURN_IS_MULTI;
7083 }
7084
7086
7087 pagefrz->freeze_required = true;
7088 return newxmax;
7089}

References Assert, ereport, errcode(), ERRCODE_DATA_CORRUPTED, errdetail_internal(), errmsg_internal(), ERROR, fb(), HeapPageFreeze::freeze_required, VacuumCutoffs::FreezeLimit, HeapPageFreeze::FreezePageRelfrozenXid, HeapPageFreeze::FreezePageRelminMxid, FRM_INVALIDATE_XMAX, FRM_MARK_COMMITTED, FRM_NOOP, FRM_RETURN_IS_MULTI, FRM_RETURN_IS_XID, GetMultiXactIdMembers(), HEAP_LOCKED_UPGRADED(), HEAP_XMAX_IS_LOCKED_ONLY(), HEAP_XMAX_IS_MULTI, i, InvalidTransactionId, ISUPDATE_from_mxstatus, VacuumCutoffs::MultiXactCutoff, MultiXactIdCreateFromMembers(), MultiXactIdGetUpdateXid(), MultiXactIdIsRunning(), MultiXactIdIsValid, MultiXactIdPrecedes(), VacuumCutoffs::OldestMxact, VacuumCutoffs::OldestXmin, palloc_array, pfree(), VacuumCutoffs::relfrozenxid, VacuumCutoffs::relminmxid, MultiXactMember::status, TransactionIdDidCommit(), TransactionIdIsCurrentTransactionId(), TransactionIdIsInProgress(), TransactionIdIsValid, TransactionIdPrecedes(), and MultiXactMember::xid.

Referenced by heap_prepare_freeze_tuple().

◆ get_mxact_status_for_lock()

static MultiXactStatus get_mxact_status_for_lock ( LockTupleMode  mode,
bool  is_update 
)
static

Definition at line 4597 of file heapam.c.

4598{
4599 int retval;
4600
4601 if (is_update)
4602 retval = tupleLockExtraInfo[mode].updstatus;
4603 else
4604 retval = tupleLockExtraInfo[mode].lockstatus;
4605
4606 if (retval == -1)
4607 elog(ERROR, "invalid lock tuple mode %d/%s", mode,
4608 is_update ? "true" : "false");
4609
4610 return (MultiXactStatus) retval;
4611}

References elog, ERROR, fb(), mode, and tupleLockExtraInfo.

Referenced by compute_new_xmax_infomask(), heap_lock_tuple(), and test_lockmode_for_conflict().

◆ GetBulkInsertState()

◆ GetMultiXactIdHintBits()

static void GetMultiXactIdHintBits ( MultiXactId  multi,
uint16 new_infomask,
uint16 new_infomask2 
)
static

Definition at line 7527 of file heapam.c.

7529{
7530 int nmembers;
7531 MultiXactMember *members;
7532 int i;
7534 uint16 bits2 = 0;
7535 bool has_update = false;
7537
7538 /*
7539 * We only use this in multis we just created, so they cannot be values
7540 * pre-pg_upgrade.
7541 */
7542 nmembers = GetMultiXactIdMembers(multi, &members, false, false);
7543
7544 for (i = 0; i < nmembers; i++)
7545 {
7547
7548 /*
7549 * Remember the strongest lock mode held by any member of the
7550 * multixact.
7551 */
7552 mode = TUPLOCK_from_mxstatus(members[i].status);
7553 if (mode > strongest)
7554 strongest = mode;
7555
7556 /* See what other bits we need */
7557 switch (members[i].status)
7558 {
7562 break;
7563
7566 break;
7567
7569 has_update = true;
7570 break;
7571
7574 has_update = true;
7575 break;
7576 }
7577 }
7578
7581 bits |= HEAP_XMAX_EXCL_LOCK;
7582 else if (strongest == LockTupleShare)
7583 bits |= HEAP_XMAX_SHR_LOCK;
7584 else if (strongest == LockTupleKeyShare)
7585 bits |= HEAP_XMAX_KEYSHR_LOCK;
7586
7587 if (!has_update)
7588 bits |= HEAP_XMAX_LOCK_ONLY;
7589
7590 if (nmembers > 0)
7591 pfree(members);
7592
7593 *new_infomask = bits;
7595}

References fb(), GetMultiXactIdMembers(), HEAP_KEYS_UPDATED, HEAP_XMAX_EXCL_LOCK, HEAP_XMAX_IS_MULTI, HEAP_XMAX_KEYSHR_LOCK, HEAP_XMAX_LOCK_ONLY, HEAP_XMAX_SHR_LOCK, i, LockTupleExclusive, LockTupleKeyShare, LockTupleNoKeyExclusive, LockTupleShare, mode, MultiXactStatusForKeyShare, MultiXactStatusForNoKeyUpdate, MultiXactStatusForShare, MultiXactStatusForUpdate, MultiXactStatusNoKeyUpdate, MultiXactStatusUpdate, pfree(), and TUPLOCK_from_mxstatus.

Referenced by compute_new_xmax_infomask(), heap_prepare_freeze_tuple(), and heap_update().

◆ heap_abort_speculative()

void heap_abort_speculative ( Relation  relation,
const ItemPointerData tid 
)

Definition at line 6255 of file heapam.c.

6256{
6258 ItemId lp;
6259 HeapTupleData tp;
6260 Page page;
6261 BlockNumber block;
6262 Buffer buffer;
6263
6265
6266 block = ItemPointerGetBlockNumber(tid);
6267 buffer = ReadBuffer(relation, block);
6268 page = BufferGetPage(buffer);
6269
6271
6272 /*
6273 * Page can't be all visible, we just inserted into it, and are still
6274 * running.
6275 */
6276 Assert(!PageIsAllVisible(page));
6277
6280
6281 tp.t_tableOid = RelationGetRelid(relation);
6282 tp.t_data = (HeapTupleHeader) PageGetItem(page, lp);
6283 tp.t_len = ItemIdGetLength(lp);
6284 tp.t_self = *tid;
6285
6286 /*
6287 * Sanity check that the tuple really is a speculatively inserted tuple,
6288 * inserted by us.
6289 */
6290 if (tp.t_data->t_choice.t_heap.t_xmin != xid)
6291 elog(ERROR, "attempted to kill a tuple inserted by another transaction");
6292 if (!(IsToastRelation(relation) || HeapTupleHeaderIsSpeculative(tp.t_data)))
6293 elog(ERROR, "attempted to kill a non-speculative tuple");
6295
6296 /*
6297 * No need to check for serializable conflicts here. There is never a
6298 * need for a combo CID, either. No need to extract replica identity, or
6299 * do anything special with infomask bits.
6300 */
6301
6303
6304 /*
6305 * The tuple will become DEAD immediately. Flag that this page is a
6306 * candidate for pruning by setting xmin to TransactionXmin. While not
6307 * immediately prunable, it is the oldest xid we can cheaply determine
6308 * that's safe against wraparound / being older than the table's
6309 * relfrozenxid. To defend against the unlikely case of a new relation
6310 * having a newer relfrozenxid than our TransactionXmin, use relfrozenxid
6311 * if so (vacuum can't subsequently move relfrozenxid to beyond
6312 * TransactionXmin, so there's no race here).
6313 */
6315 {
6316 TransactionId relfrozenxid = relation->rd_rel->relfrozenxid;
6318
6319 if (TransactionIdPrecedes(TransactionXmin, relfrozenxid))
6320 prune_xid = relfrozenxid;
6321 else
6324 }
6325
6326 /* store transaction information of xact deleting the tuple */
6329
 6330 /*
 6331 * Set the tuple header xmin to InvalidTransactionId. This makes the
 6332 * tuple immediately invisible to everyone. (In particular, to any
 6333 * transactions waiting on the speculative token, woken up later.)
 6334 */
6334 */
6336
6337 /* Clear the speculative insertion token too */
6338 tp.t_data->t_ctid = tp.t_self;
6339
6340 MarkBufferDirty(buffer);
6341
6342 /*
6343 * XLOG stuff
6344 *
6345 * The WAL records generated here match heap_delete(). The same recovery
6346 * routines are used.
6347 */
6348 if (RelationNeedsWAL(relation))
6349 {
6352
6354 xlrec.infobits_set = compute_infobits(tp.t_data->t_infomask,
6355 tp.t_data->t_infomask2);
6357 xlrec.xmax = xid;
6358
6362
6363 /* No replica identity & replication origin logged */
6364
6366
6367 PageSetLSN(page, recptr);
6368 }
6369
6371
6373
6374 if (HeapTupleHasExternal(&tp))
6375 {
6376 Assert(!IsToastRelation(relation));
6377 heap_toast_delete(relation, &tp, true);
6378 }
6379
6380 /*
6381 * Never need to mark tuple for invalidation, since catalogs don't support
6382 * speculative insertion
6383 */
6384
6385 /* Now we can release the buffer */
6386 ReleaseBuffer(buffer);
6387
6388 /* count deletion, as we counted the insertion too */
6389 pgstat_count_heap_delete(relation);
6390}

References Assert, BUFFER_LOCK_EXCLUSIVE, BUFFER_LOCK_UNLOCK, BufferGetPage(), compute_infobits(), elog, END_CRIT_SECTION, ERROR, fb(), xl_heap_delete::flags, GetCurrentTransactionId(), HEAP_MOVED, heap_toast_delete(), HEAP_XMAX_BITS, HeapTupleHasExternal(), HeapTupleHeaderIsHeapOnly(), HeapTupleHeaderIsSpeculative(), HeapTupleHeaderSetXmin(), InvalidTransactionId, IsToastRelation(), ItemIdGetLength, ItemIdIsNormal, ItemPointerGetBlockNumber(), ItemPointerGetOffsetNumber(), ItemPointerIsValid(), LockBuffer(), MarkBufferDirty(), PageGetItem(), PageGetItemId(), PageIsAllVisible(), PageSetLSN(), PageSetPrunable, pgstat_count_heap_delete(), RelationData::rd_rel, ReadBuffer(), REGBUF_STANDARD, RelationGetRelid, RelationNeedsWAL, ReleaseBuffer(), SizeOfHeapDelete, START_CRIT_SECTION, HeapTupleHeaderData::t_choice, HeapTupleHeaderData::t_ctid, HeapTupleData::t_data, HeapTupleHeaderData::t_heap, HeapTupleHeaderData::t_infomask, HeapTupleHeaderData::t_infomask2, HeapTupleData::t_len, HeapTupleData::t_self, HeapTupleData::t_tableOid, HeapTupleFields::t_xmin, TransactionIdIsValid, TransactionIdPrecedes(), TransactionXmin, XLH_DELETE_IS_SUPER, XLOG_HEAP_DELETE, XLogBeginInsert(), XLogInsert(), XLogRegisterBuffer(), and XLogRegisterData().

Referenced by heapam_tuple_complete_speculative(), and toast_delete_datum().

◆ heap_acquire_tuplock()

static bool heap_acquire_tuplock ( Relation  relation,
const ItemPointerData tid,
LockTupleMode  mode,
LockWaitPolicy  wait_policy,
bool have_tuple_lock 
)
static

Definition at line 5346 of file heapam.c.

5348{
5349 if (*have_tuple_lock)
5350 return true;
5351
5352 switch (wait_policy)
5353 {
5354 case LockWaitBlock:
5355 LockTupleTuplock(relation, tid, mode);
5356 break;
5357
5358 case LockWaitSkip:
5359 if (!ConditionalLockTupleTuplock(relation, tid, mode, false))
5360 return false;
5361 break;
5362
5363 case LockWaitError:
5365 ereport(ERROR,
5367 errmsg("could not obtain lock on row in relation \"%s\"",
5368 RelationGetRelationName(relation))));
5369 break;
5370 }
5371 *have_tuple_lock = true;
5372
5373 return true;
5374}

References ConditionalLockTupleTuplock, ereport, errcode(), errmsg(), ERROR, fb(), LockTupleTuplock, LockWaitBlock, LockWaitError, LockWaitSkip, log_lock_failures, mode, and RelationGetRelationName.

Referenced by heap_delete(), heap_lock_tuple(), and heap_update().

◆ heap_attr_equals()

static bool heap_attr_equals ( TupleDesc  tupdesc,
int  attrnum,
Datum  value1,
Datum  value2,
bool  isnull1,
bool  isnull2 
)
static

Definition at line 4415 of file heapam.c.

4417{
4418 /*
4419 * If one value is NULL and other is not, then they are certainly not
4420 * equal
4421 */
4422 if (isnull1 != isnull2)
4423 return false;
4424
4425 /*
4426 * If both are NULL, they can be considered equal.
4427 */
4428 if (isnull1)
4429 return true;
4430
4431 /*
4432 * We do simple binary comparison of the two datums. This may be overly
4433 * strict because there can be multiple binary representations for the
4434 * same logical value. But we should be OK as long as there are no false
4435 * positives. Using a type-specific equality operator is messy because
4436 * there could be multiple notions of equality in different operator
4437 * classes; furthermore, we cannot safely invoke user-defined functions
4438 * while holding exclusive buffer lock.
4439 */
4440 if (attrnum <= 0)
4441 {
4442 /* The only allowed system columns are OIDs, so do this */
4444 }
4445 else
4446 {
4448
4450 att = TupleDescCompactAttr(tupdesc, attrnum - 1);
4451 return datumIsEqual(value1, value2, att->attbyval, att->attlen);
4452 }
4453}

References Assert, DatumGetObjectId(), datumIsEqual(), fb(), and TupleDescCompactAttr().

Referenced by HeapDetermineColumnsInfo().

◆ heap_beginscan()

TableScanDesc heap_beginscan ( Relation  relation,
Snapshot  snapshot,
int  nkeys,
ScanKey  key,
ParallelTableScanDesc  parallel_scan,
uint32  flags 
)

Definition at line 1164 of file heapam.c.

1168{
1169 HeapScanDesc scan;
1170
1171 /*
1172 * increment relation ref count while scanning relation
1173 *
1174 * This is just to make really sure the relcache entry won't go away while
1175 * the scan has a pointer to it. Caller should be holding the rel open
1176 * anyway, so this is redundant in all normal scenarios...
1177 */
1179
1180 /*
1181 * allocate and initialize scan descriptor
1182 */
1183 if (flags & SO_TYPE_BITMAPSCAN)
1184 {
1186
1187 /*
1188 * Bitmap Heap scans do not have any fields that a normal Heap Scan
1189 * does not have, so no special initializations required here.
1190 */
1191 scan = (HeapScanDesc) bscan;
1192 }
1193 else
1195
1196 scan->rs_base.rs_rd = relation;
1197 scan->rs_base.rs_snapshot = snapshot;
1198 scan->rs_base.rs_nkeys = nkeys;
1199 scan->rs_base.rs_flags = flags;
1200 scan->rs_base.rs_parallel = parallel_scan;
1201 scan->rs_strategy = NULL; /* set in initscan */
1202 scan->rs_cbuf = InvalidBuffer;
1203
1204 /*
1205 * Disable page-at-a-time mode if it's not a MVCC-safe snapshot.
1206 */
1207 if (!(snapshot && IsMVCCSnapshot(snapshot)))
1209
1210 /* Check that a historic snapshot is not used for non-catalog tables */
1211 if (snapshot &&
1212 IsHistoricMVCCSnapshot(snapshot) &&
1214 {
1215 ereport(ERROR,
1217 errmsg("cannot query non-catalog table \"%s\" during logical decoding",
1218 RelationGetRelationName(relation))));
1219 }
1220
1221 /*
1222 * For seqscan and sample scans in a serializable transaction, acquire a
1223 * predicate lock on the entire relation. This is required not only to
1224 * lock all the matching tuples, but also to conflict with new insertions
1225 * into the table. In an indexscan, we take page locks on the index pages
1226 * covering the range specified in the scan qual, but in a heap scan there
1227 * is nothing more fine-grained to lock. A bitmap scan is a different
1228 * story, there we have already scanned the index and locked the index
1229 * pages covering the predicate. But in that case we still have to lock
1230 * any matching heap tuples. For sample scan we could optimize the locking
1231 * to be at least page-level granularity, but we'd need to add per-tuple
1232 * locking for that.
1233 */
1235 {
1236 /*
1237 * Ensure a missing snapshot is noticed reliably, even if the
1238 * isolation mode means predicate locking isn't performed (and
1239 * therefore the snapshot isn't used here).
1240 */
1241 Assert(snapshot);
1242 PredicateLockRelation(relation, snapshot);
1243 }
1244
1245 /* we only need to set this up once */
1246 scan->rs_ctup.t_tableOid = RelationGetRelid(relation);
1247
1248 /*
1249 * Allocate memory to keep track of page allocation for parallel workers
1250 * when doing a parallel scan.
1251 */
1252 if (parallel_scan != NULL)
1254 else
1256
1257 /*
1258 * we do this here instead of in initscan() because heap_rescan also calls
1259 * initscan() and we don't want to allocate memory again
1260 */
1261 if (nkeys > 0)
1262 scan->rs_base.rs_key = palloc_array(ScanKeyData, nkeys);
1263 else
1264 scan->rs_base.rs_key = NULL;
1265
1266 initscan(scan, key, false);
1267
1268 scan->rs_read_stream = NULL;
1269
1270 /*
1271 * Set up a read stream for sequential scans and TID range scans. This
1272 * should be done after initscan() because initscan() allocates the
1273 * BufferAccessStrategy object passed to the read stream API.
1274 */
1275 if (scan->rs_base.rs_flags & SO_TYPE_SEQSCAN ||
1277 {
1279
1280 if (scan->rs_base.rs_parallel)
1282 else
1284
1285 /* ---
1286 * It is safe to use batchmode as the only locks taken by `cb`
1287 * are never taken while waiting for IO:
1288 * - SyncScanLock is used in the non-parallel case
1289 * - in the parallel case, only spinlocks and atomics are used
1290 * ---
1291 */
1294 scan->rs_strategy,
1295 scan->rs_base.rs_rd,
1297 cb,
1298 scan,
1299 0);
1300 }
1301 else if (scan->rs_base.rs_flags & SO_TYPE_BITMAPSCAN)
1302 {
1305 scan->rs_strategy,
1306 scan->rs_base.rs_rd,
1309 scan,
1310 sizeof(TBMIterateResult));
1311 }
1312
1313
1314 return (TableScanDesc) scan;
1315}

References Assert, bitmapheap_stream_read_next(), ereport, errcode(), errmsg(), ERROR, fb(), heap_scan_stream_read_next_parallel(), heap_scan_stream_read_next_serial(), initscan(), InvalidBuffer, IsHistoricMVCCSnapshot, IsMVCCSnapshot, MAIN_FORKNUM, palloc_array, palloc_object, PredicateLockRelation(), read_stream_begin_relation(), READ_STREAM_DEFAULT, READ_STREAM_SEQUENTIAL, READ_STREAM_USE_BATCHING, RelationGetRelationName, RelationGetRelid, RelationIncrementReferenceCount(), RelationIsAccessibleInLogicalDecoding, HeapScanDescData::rs_base, HeapScanDescData::rs_cbuf, HeapScanDescData::rs_ctup, TableScanDescData::rs_flags, TableScanDescData::rs_key, TableScanDescData::rs_nkeys, TableScanDescData::rs_parallel, HeapScanDescData::rs_parallelworkerdata, TableScanDescData::rs_rd, HeapScanDescData::rs_read_stream, TableScanDescData::rs_snapshot, HeapScanDescData::rs_strategy, SO_TYPE_BITMAPSCAN, SO_TYPE_SAMPLESCAN, SO_TYPE_SEQSCAN, SO_TYPE_TIDRANGESCAN, and HeapTupleData::t_tableOid.

◆ heap_delete()

TM_Result heap_delete ( Relation  relation,
const ItemPointerData tid,
CommandId  cid,
Snapshot  crosscheck,
bool  wait,
TM_FailureData tmfd,
bool  changingPart 
)

Definition at line 2843 of file heapam.c.

2846{
2847 TM_Result result;
2849 ItemId lp;
2850 HeapTupleData tp;
2851 Page page;
2852 BlockNumber block;
2853 Buffer buffer;
2854 Buffer vmbuffer = InvalidBuffer;
2855 TransactionId new_xmax;
2858 bool have_tuple_lock = false;
2859 bool iscombo;
2860 bool all_visible_cleared = false;
2861 HeapTuple old_key_tuple = NULL; /* replica identity of the tuple */
2862 bool old_key_copied = false;
2863
2865
2866 AssertHasSnapshotForToast(relation);
2867
2868 /*
2869 * Forbid this during a parallel operation, lest it allocate a combo CID.
2870 * Other workers might need that combo CID for visibility checks, and we
2871 * have no provision for broadcasting it to them.
2872 */
2873 if (IsInParallelMode())
2874 ereport(ERROR,
2876 errmsg("cannot delete tuples during a parallel operation")));
2877
2878 block = ItemPointerGetBlockNumber(tid);
2879 buffer = ReadBuffer(relation, block);
2880 page = BufferGetPage(buffer);
2881
2882 /*
2883 * Before locking the buffer, pin the visibility map page if it appears to
2884 * be necessary. Since we haven't got the lock yet, someone else might be
2885 * in the middle of changing this, so we'll need to recheck after we have
2886 * the lock.
2887 */
2888 if (PageIsAllVisible(page))
2889 visibilitymap_pin(relation, block, &vmbuffer);
2890
2892
2895
2896 tp.t_tableOid = RelationGetRelid(relation);
2897 tp.t_data = (HeapTupleHeader) PageGetItem(page, lp);
2898 tp.t_len = ItemIdGetLength(lp);
2899 tp.t_self = *tid;
2900
2901l1:
2902
2903 /*
2904 * If we didn't pin the visibility map page and the page has become all
2905 * visible while we were busy locking the buffer, we'll have to unlock and
2906 * re-lock, to avoid holding the buffer lock across an I/O. That's a bit
2907 * unfortunate, but hopefully shouldn't happen often.
2908 */
2909 if (vmbuffer == InvalidBuffer && PageIsAllVisible(page))
2910 {
2912 visibilitymap_pin(relation, block, &vmbuffer);
2914 }
2915
2916 result = HeapTupleSatisfiesUpdate(&tp, cid, buffer);
2917
2918 if (result == TM_Invisible)
2919 {
2920 UnlockReleaseBuffer(buffer);
2921 ereport(ERROR,
2923 errmsg("attempted to delete invisible tuple")));
2924 }
2925 else if (result == TM_BeingModified && wait)
2926 {
2929
2930 /* must copy state data before unlocking buffer */
2933
2934 /*
2935 * Sleep until concurrent transaction ends -- except when there's a
2936 * single locker and it's our own transaction. Note we don't care
2937 * which lock mode the locker has, because we need the strongest one.
2938 *
2939 * Before sleeping, we need to acquire tuple lock to establish our
2940 * priority for the tuple (see heap_lock_tuple). LockTuple will
2941 * release us when we are next-in-line for the tuple.
2942 *
2943 * If we are forced to "start over" below, we keep the tuple lock;
2944 * this arranges that we stay at the head of the line while rechecking
2945 * tuple state.
2946 */
2948 {
2949 bool current_is_member = false;
2950
2953 {
2955
2956 /*
2957 * Acquire the lock, if necessary (but skip it when we're
2958 * requesting a lock and already have one; avoids deadlock).
2959 */
2960 if (!current_is_member)
2963
2964 /* wait for multixact */
2966 relation, &(tp.t_self), XLTW_Delete,
2967 NULL);
2969
2970 /*
2971 * If xwait had just locked the tuple then some other xact
2972 * could update this tuple before we get to this point. Check
2973 * for xmax change, and start over if so.
2974 *
2975 * We also must start over if we didn't pin the VM page, and
2976 * the page has become all visible.
2977 */
2978 if ((vmbuffer == InvalidBuffer && PageIsAllVisible(page)) ||
2981 xwait))
2982 goto l1;
2983 }
2984
2985 /*
2986 * You might think the multixact is necessarily done here, but not
2987 * so: it could have surviving members, namely our own xact or
2988 * other subxacts of this backend. It is legal for us to delete
2989 * the tuple in either case, however (the latter case is
2990 * essentially a situation of upgrading our former shared lock to
2991 * exclusive). We don't bother changing the on-disk hint bits
2992 * since we are about to overwrite the xmax altogether.
2993 */
2994 }
2996 {
2997 /*
2998 * Wait for regular transaction to end; but first, acquire tuple
2999 * lock.
3000 */
3004 XactLockTableWait(xwait, relation, &(tp.t_self), XLTW_Delete);
3006
3007 /*
3008 * xwait is done, but if xwait had just locked the tuple then some
3009 * other xact could update this tuple before we get to this point.
3010 * Check for xmax change, and start over if so.
3011 *
3012 * We also must start over if we didn't pin the VM page, and the
3013 * page has become all visible.
3014 */
3015 if ((vmbuffer == InvalidBuffer && PageIsAllVisible(page)) ||
3018 xwait))
3019 goto l1;
3020
3021 /* Otherwise check if it committed or aborted */
3022 UpdateXmaxHintBits(tp.t_data, buffer, xwait);
3023 }
3024
3025 /*
3026 * We may overwrite if previous xmax aborted, or if it committed but
3027 * only locked the tuple without updating it.
3028 */
3029 if ((tp.t_data->t_infomask & HEAP_XMAX_INVALID) ||
3032 result = TM_Ok;
3033 else if (!ItemPointerEquals(&tp.t_self, &tp.t_data->t_ctid))
3034 result = TM_Updated;
3035 else
3036 result = TM_Deleted;
3037 }
3038
3039 /* sanity check the result HeapTupleSatisfiesUpdate() and the logic above */
3040 if (result != TM_Ok)
3041 {
3042 Assert(result == TM_SelfModified ||
3043 result == TM_Updated ||
3044 result == TM_Deleted ||
3045 result == TM_BeingModified);
3047 Assert(result != TM_Updated ||
3049 }
3050
3051 if (crosscheck != InvalidSnapshot && result == TM_Ok)
3052 {
3053 /* Perform additional check for transaction-snapshot mode RI updates */
3054 if (!HeapTupleSatisfiesVisibility(&tp, crosscheck, buffer))
3055 result = TM_Updated;
3056 }
3057
3058 if (result != TM_Ok)
3059 {
3060 tmfd->ctid = tp.t_data->t_ctid;
3062 if (result == TM_SelfModified)
3064 else
3065 tmfd->cmax = InvalidCommandId;
3066 UnlockReleaseBuffer(buffer);
3067 if (have_tuple_lock)
3069 if (vmbuffer != InvalidBuffer)
3070 ReleaseBuffer(vmbuffer);
3071 return result;
3072 }
3073
3074 /*
3075 * We're about to do the actual delete -- check for conflict first, to
3076 * avoid possibly having to roll back work we've just done.
3077 *
3078 * This is safe without a recheck as long as there is no possibility of
3079 * another process scanning the page between this check and the delete
3080 * being visible to the scan (i.e., an exclusive buffer content lock is
3081 * continuously held from this point until the tuple delete is visible).
3082 */
3084
3085 /* replace cid with a combo CID if necessary */
3087
3088 /*
3089 * Compute replica identity tuple before entering the critical section so
3090 * we don't PANIC upon a memory allocation failure.
3091 */
3092 old_key_tuple = ExtractReplicaIdentity(relation, &tp, true, &old_key_copied);
3093
3094 /*
3095 * If this is the first possibly-multixact-able operation in the current
3096 * transaction, set my per-backend OldestMemberMXactId setting. We can be
3097 * certain that the transaction will never become a member of any older
3098 * MultiXactIds than that. (We have to do this even if we end up just
3099 * using our own TransactionId below, since some other backend could
3100 * incorporate our XID into a MultiXact immediately afterwards.)
3101 */
3103
3106 xid, LockTupleExclusive, true,
3107 &new_xmax, &new_infomask, &new_infomask2);
3108
3110
3111 /*
3112 * If this transaction commits, the tuple will become DEAD sooner or
3113 * later. Set flag that this page is a candidate for pruning once our xid
3114 * falls below the OldestXmin horizon. If the transaction finally aborts,
3115 * the subsequent page pruning will be a no-op and the hint will be
3116 * cleared.
3117 */
3118 PageSetPrunable(page, xid);
3119
3120 if (PageIsAllVisible(page))
3121 {
3122 all_visible_cleared = true;
3123 PageClearAllVisible(page);
3124 visibilitymap_clear(relation, BufferGetBlockNumber(buffer),
3125 vmbuffer, VISIBILITYMAP_VALID_BITS);
3126 }
3127
3128 /* store transaction information of xact deleting the tuple */
3134 HeapTupleHeaderSetXmax(tp.t_data, new_xmax);
3136 /* Make sure there is no forward chain link in t_ctid */
3137 tp.t_data->t_ctid = tp.t_self;
3138
3139 /* Signal that this is actually a move into another partition */
3140 if (changingPart)
3142
3143 MarkBufferDirty(buffer);
3144
3145 /*
3146 * XLOG stuff
3147 *
3148 * NB: heap_abort_speculative() uses the same xlog record and replay
3149 * routines.
3150 */
3151 if (RelationNeedsWAL(relation))
3152 {
3156
3157 /*
3158 * For logical decode we need combo CIDs to properly decode the
3159 * catalog
3160 */
3162 log_heap_new_cid(relation, &tp);
3163
3164 xlrec.flags = 0;
3167 if (changingPart)
3169 xlrec.infobits_set = compute_infobits(tp.t_data->t_infomask,
3170 tp.t_data->t_infomask2);
3172 xlrec.xmax = new_xmax;
3173
3174 if (old_key_tuple != NULL)
3175 {
3176 if (relation->rd_rel->relreplident == REPLICA_IDENTITY_FULL)
3178 else
3180 }
3181
3184
3186
3187 /*
3188 * Log replica identity of the deleted tuple if there is one
3189 */
3190 if (old_key_tuple != NULL)
3191 {
3192 xlhdr.t_infomask2 = old_key_tuple->t_data->t_infomask2;
3193 xlhdr.t_infomask = old_key_tuple->t_data->t_infomask;
3194 xlhdr.t_hoff = old_key_tuple->t_data->t_hoff;
3195
3197 XLogRegisterData((char *) old_key_tuple->t_data
3199 old_key_tuple->t_len
3201 }
3202
3203 /* filtering by origin on a row level is much more efficient */
3205
3207
3208 PageSetLSN(page, recptr);
3209 }
3210
3212
3214
3215 if (vmbuffer != InvalidBuffer)
3216 ReleaseBuffer(vmbuffer);
3217
3218 /*
3219 * If the tuple has toasted out-of-line attributes, we need to delete
3220 * those items too. We have to do this before releasing the buffer
3221 * because we need to look at the contents of the tuple, but it's OK to
3222 * release the content lock on the buffer first.
3223 */
3224 if (relation->rd_rel->relkind != RELKIND_RELATION &&
3225 relation->rd_rel->relkind != RELKIND_MATVIEW)
3226 {
3227 /* toast table entries should never be recursively toasted */
3229 }
3230 else if (HeapTupleHasExternal(&tp))
3231 heap_toast_delete(relation, &tp, false);
3232
3233 /*
3234 * Mark tuple for invalidation from system caches at next command
3235 * boundary. We have to do this before releasing the buffer because we
3236 * need to look at the contents of the tuple.
3237 */
3238 CacheInvalidateHeapTuple(relation, &tp, NULL);
3239
3240 /* Now we can release the buffer */
3241 ReleaseBuffer(buffer);
3242
3243 /*
3244 * Release the lmgr tuple lock, if we had it.
3245 */
3246 if (have_tuple_lock)
3248
3249 pgstat_count_heap_delete(relation);
3250
3253
3254 return TM_Ok;
3255}

References Assert, AssertHasSnapshotForToast(), BUFFER_LOCK_EXCLUSIVE, BUFFER_LOCK_UNLOCK, BufferGetBlockNumber(), BufferGetPage(), CacheInvalidateHeapTuple(), CheckForSerializableConflictIn(), TM_FailureData::cmax, compute_infobits(), compute_new_xmax_infomask(), TM_FailureData::ctid, DoesMultiXactIdConflict(), END_CRIT_SECTION, ereport, errcode(), errmsg(), ERROR, ExtractReplicaIdentity(), fb(), GetCurrentTransactionId(), heap_acquire_tuplock(), heap_freetuple(), HEAP_MOVED, heap_toast_delete(), HEAP_XMAX_BITS, HEAP_XMAX_INVALID, HEAP_XMAX_IS_LOCKED_ONLY(), HEAP_XMAX_IS_MULTI, HeapTupleHasExternal(), HeapTupleHeaderAdjustCmax(), HeapTupleHeaderClearHotUpdated(), HeapTupleHeaderGetCmax(), HeapTupleHeaderGetRawXmax(), HeapTupleHeaderGetUpdateXid(), HeapTupleHeaderIsOnlyLocked(), HeapTupleHeaderSetCmax(), HeapTupleHeaderSetMovedPartitions(), HeapTupleHeaderSetXmax(), HeapTupleSatisfiesUpdate(), HeapTupleSatisfiesVisibility(), InvalidBuffer, InvalidCommandId, InvalidSnapshot, IsInParallelMode(), ItemIdGetLength, ItemIdIsNormal, ItemPointerEquals(), ItemPointerGetBlockNumber(), ItemPointerGetOffsetNumber(), ItemPointerIsValid(), LockBuffer(), LockTupleExclusive, LockWaitBlock, log_heap_new_cid(), MarkBufferDirty(), MultiXactIdSetOldestMember(), MultiXactIdWait(), MultiXactStatusUpdate, PageClearAllVisible(), PageGetItem(), PageGetItemId(), PageIsAllVisible(), PageSetLSN(), PageSetPrunable, pgstat_count_heap_delete(), RelationData::rd_rel, ReadBuffer(), REGBUF_STANDARD, RelationGetRelid, RelationIsAccessibleInLogicalDecoding, RelationNeedsWAL, ReleaseBuffer(), SizeOfHeapDelete, SizeOfHeapHeader, SizeofHeapTupleHeader, START_CRIT_SECTION, HeapTupleHeaderData::t_ctid, HeapTupleData::t_data, HeapTupleHeaderData::t_infomask, HeapTupleHeaderData::t_infomask2, HeapTupleData::t_len, HeapTupleData::t_self, HeapTupleData::t_tableOid, TM_BeingModified, TM_Deleted, TM_Invisible, TM_Ok, TM_SelfModified, TM_Updated, TransactionIdEquals, TransactionIdIsCurrentTransactionId(), 
UnlockReleaseBuffer(), UnlockTupleTuplock, UpdateXmaxHintBits(), visibilitymap_clear(), visibilitymap_pin(), VISIBILITYMAP_VALID_BITS, XactLockTableWait(), XLH_DELETE_ALL_VISIBLE_CLEARED, XLH_DELETE_CONTAINS_OLD_KEY, XLH_DELETE_CONTAINS_OLD_TUPLE, XLH_DELETE_IS_PARTITION_MOVE, XLOG_HEAP_DELETE, XLOG_INCLUDE_ORIGIN, XLogBeginInsert(), XLogInsert(), XLogRegisterBuffer(), XLogRegisterData(), XLogSetRecordFlags(), XLTW_Delete, TM_FailureData::xmax, and xmax_infomask_changed().

Referenced by heapam_tuple_delete(), and simple_heap_delete().

◆ heap_endscan()

void heap_endscan ( TableScanDesc  sscan)

Definition at line 1371 of file heapam.c.

1372{
1374
1375 /* Note: no locking manipulations needed */
1376
1377 /*
1378 * unpin scan buffers
1379 */
1380 if (BufferIsValid(scan->rs_cbuf))
1381 ReleaseBuffer(scan->rs_cbuf);
1382
1383 /*
1384 * Must free the read stream before freeing the BufferAccessStrategy.
1385 */
1386 if (scan->rs_read_stream)
1388
1389 /*
1390 * decrement relation reference count and free scan descriptor storage
1391 */
1393
1394 if (scan->rs_base.rs_key)
1395 pfree(scan->rs_base.rs_key);
1396
1397 if (scan->rs_strategy != NULL)
1399
1400 if (scan->rs_parallelworkerdata != NULL)
1402
1403 if (scan->rs_base.rs_flags & SO_TEMP_SNAPSHOT)
1405
1406 pfree(scan);
1407}

References BufferIsValid(), fb(), FreeAccessStrategy(), pfree(), read_stream_end(), RelationDecrementReferenceCount(), ReleaseBuffer(), HeapScanDescData::rs_base, HeapScanDescData::rs_cbuf, TableScanDescData::rs_flags, TableScanDescData::rs_key, HeapScanDescData::rs_parallelworkerdata, TableScanDescData::rs_rd, HeapScanDescData::rs_read_stream, TableScanDescData::rs_snapshot, HeapScanDescData::rs_strategy, SO_TEMP_SNAPSHOT, and UnregisterSnapshot().

◆ heap_fetch()

bool heap_fetch ( Relation  relation,
Snapshot  snapshot,
HeapTuple  tuple,
Buffer userbuf,
bool  keep_buf 
)

Definition at line 1659 of file heapam.c.

1664{
1665 ItemPointer tid = &(tuple->t_self);
1666 ItemId lp;
1667 Buffer buffer;
1668 Page page;
1669 OffsetNumber offnum;
1670 bool valid;
1671
1672 /*
1673 * Fetch and pin the appropriate page of the relation.
1674 */
1675 buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid));
1676
1677 /*
1678 * Need share lock on buffer to examine tuple commit status.
1679 */
1681 page = BufferGetPage(buffer);
1682
1683 /*
1684 * We'd better check for out-of-range offnum in case of VACUUM since the
1685 * TID was obtained.
1686 */
1687 offnum = ItemPointerGetOffsetNumber(tid);
1689 {
1691 ReleaseBuffer(buffer);
1693 tuple->t_data = NULL;
1694 return false;
1695 }
1696
1697 /*
1698 * get the item line pointer corresponding to the requested tid
1699 */
1700 lp = PageGetItemId(page, offnum);
1701
1702 /*
1703 * Must check for deleted tuple.
1704 */
1705 if (!ItemIdIsNormal(lp))
1706 {
1708 ReleaseBuffer(buffer);
1710 tuple->t_data = NULL;
1711 return false;
1712 }
1713
1714 /*
1715 * fill in *tuple fields
1716 */
1717 tuple->t_data = (HeapTupleHeader) PageGetItem(page, lp);
1718 tuple->t_len = ItemIdGetLength(lp);
1719 tuple->t_tableOid = RelationGetRelid(relation);
1720
1721 /*
1722 * check tuple visibility, then release lock
1723 */
1724 valid = HeapTupleSatisfiesVisibility(tuple, snapshot, buffer);
1725
1726 if (valid)
1727 PredicateLockTID(relation, &(tuple->t_self), snapshot,
1729
1730 HeapCheckForSerializableConflictOut(valid, relation, tuple, buffer, snapshot);
1731
1733
1734 if (valid)
1735 {
1736 /*
1737 * All checks passed, so return the tuple as valid. Caller is now
1738 * responsible for releasing the buffer.
1739 */
1740 *userbuf = buffer;
1741
1742 return true;
1743 }
1744
1745 /* Tuple failed time qual, but maybe caller wants to see it anyway. */
1746 if (keep_buf)
1747 *userbuf = buffer;
1748 else
1749 {
1750 ReleaseBuffer(buffer);
1752 tuple->t_data = NULL;
1753 }
1754
1755 return false;
1756}

References BUFFER_LOCK_SHARE, BUFFER_LOCK_UNLOCK, BufferGetPage(), fb(), HeapCheckForSerializableConflictOut(), HeapTupleHeaderGetXmin(), HeapTupleSatisfiesVisibility(), InvalidBuffer, ItemIdGetLength, ItemIdIsNormal, ItemPointerGetBlockNumber(), ItemPointerGetOffsetNumber(), LockBuffer(), PageGetItem(), PageGetItemId(), PageGetMaxOffsetNumber(), PredicateLockTID(), ReadBuffer(), RelationGetRelid, ReleaseBuffer(), HeapTupleData::t_data, HeapTupleData::t_len, HeapTupleData::t_self, and HeapTupleData::t_tableOid.

Referenced by heap_lock_updated_tuple_rec(), heapam_fetch_row_version(), and heapam_tuple_lock().

◆ heap_fetch_next_buffer()

static void heap_fetch_next_buffer ( HeapScanDesc  scan,
ScanDirection  dir 
)
inlinestatic

Definition at line 707 of file heapam.c.

708{
709 Assert(scan->rs_read_stream);
710
711 /* release previous scan buffer, if any */
712 if (BufferIsValid(scan->rs_cbuf))
713 {
714 ReleaseBuffer(scan->rs_cbuf);
715 scan->rs_cbuf = InvalidBuffer;
716 }
717
718 /*
719 * Be sure to check for interrupts at least once per page. Checks at
720 * higher code levels won't be able to stop a seqscan that encounters many
721 * pages' worth of consecutive dead tuples.
722 */
724
725 /*
726 * If the scan direction is changing, reset the prefetch block to the
727 * current block. Otherwise, we will incorrectly prefetch the blocks
728 * between the prefetch block and the current block again before
729 * prefetching blocks in the new, correct scan direction.
730 */
731 if (unlikely(scan->rs_dir != dir))
732 {
733 scan->rs_prefetch_block = scan->rs_cblock;
735 }
736
737 scan->rs_dir = dir;
738
740 if (BufferIsValid(scan->rs_cbuf))
742}

References Assert, BufferGetBlockNumber(), BufferIsValid(), CHECK_FOR_INTERRUPTS, fb(), InvalidBuffer, read_stream_next_buffer(), read_stream_reset(), ReleaseBuffer(), HeapScanDescData::rs_cblock, HeapScanDescData::rs_cbuf, HeapScanDescData::rs_dir, HeapScanDescData::rs_prefetch_block, HeapScanDescData::rs_read_stream, and unlikely.

Referenced by heapgettup(), and heapgettup_pagemode().

◆ heap_finish_speculative()

void heap_finish_speculative ( Relation  relation,
const ItemPointerData tid 
)

Definition at line 6168 of file heapam.c.

6169{
6170 Buffer buffer;
6171 Page page;
6172 OffsetNumber offnum;
6173 ItemId lp;
6174 HeapTupleHeader htup;
6175
6176 buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid));
6178 page = BufferGetPage(buffer);
6179
6180 offnum = ItemPointerGetOffsetNumber(tid);
6182 elog(ERROR, "offnum out of range");
6183 lp = PageGetItemId(page, offnum);
6184 if (!ItemIdIsNormal(lp))
6185 elog(ERROR, "invalid lp");
6186
6187 htup = (HeapTupleHeader) PageGetItem(page, lp);
6188
6189 /* NO EREPORT(ERROR) from here till changes are logged */
6191
6193
6194 MarkBufferDirty(buffer);
6195
6196 /*
6197 * Replace the speculative insertion token with a real t_ctid, pointing to
6198 * itself like it does on regular tuples.
6199 */
6200 htup->t_ctid = *tid;
6201
6202 /* XLOG stuff */
6203 if (RelationNeedsWAL(relation))
6204 {
6207
6209
6211
6212 /* We want the same filtering on this as on a plain insert */
6214
6217
6219
6220 PageSetLSN(page, recptr);
6221 }
6222
6224
6225 UnlockReleaseBuffer(buffer);
6226}

References Assert, BUFFER_LOCK_EXCLUSIVE, BufferGetPage(), elog, END_CRIT_SECTION, ERROR, fb(), HeapTupleHeaderIsSpeculative(), ItemIdIsNormal, ItemPointerGetBlockNumber(), ItemPointerGetOffsetNumber(), LockBuffer(), MarkBufferDirty(), xl_heap_confirm::offnum, PageGetItem(), PageGetItemId(), PageGetMaxOffsetNumber(), PageSetLSN(), ReadBuffer(), REGBUF_STANDARD, RelationNeedsWAL, SizeOfHeapConfirm, START_CRIT_SECTION, HeapTupleHeaderData::t_ctid, UnlockReleaseBuffer(), XLOG_HEAP_CONFIRM, XLOG_INCLUDE_ORIGIN, XLogBeginInsert(), XLogInsert(), XLogRegisterBuffer(), XLogRegisterData(), and XLogSetRecordFlags().

Referenced by heapam_tuple_complete_speculative().

◆ heap_freeze_prepared_tuples()

void heap_freeze_prepared_tuples ( Buffer  buffer,
HeapTupleFreeze tuples,
int  ntuples 
)

Definition at line 7461 of file heapam.c.

7462{
7463 Page page = BufferGetPage(buffer);
7464
7465 for (int i = 0; i < ntuples; i++)
7466 {
7467 HeapTupleFreeze *frz = tuples + i;
7468 ItemId itemid = PageGetItemId(page, frz->offset);
7469 HeapTupleHeader htup;
7470
7471 htup = (HeapTupleHeader) PageGetItem(page, itemid);
7473 }
7474}

References BufferGetPage(), fb(), heap_execute_freeze_tuple(), i, PageGetItem(), and PageGetItemId().

Referenced by heap_page_prune_and_freeze().

◆ heap_freeze_tuple()

bool heap_freeze_tuple ( HeapTupleHeader  tuple,
TransactionId  relfrozenxid,
TransactionId  relminmxid,
TransactionId  FreezeLimit,
TransactionId  MultiXactCutoff 
)

Definition at line 7483 of file heapam.c.

7486{
7488 bool do_freeze;
7489 bool totally_frozen;
7490 struct VacuumCutoffs cutoffs;
7491 HeapPageFreeze pagefrz;
7492
7493 cutoffs.relfrozenxid = relfrozenxid;
7494 cutoffs.relminmxid = relminmxid;
7495 cutoffs.OldestXmin = FreezeLimit;
7496 cutoffs.OldestMxact = MultiXactCutoff;
7497 cutoffs.FreezeLimit = FreezeLimit;
7498 cutoffs.MultiXactCutoff = MultiXactCutoff;
7499
7500 pagefrz.freeze_required = true;
7501 pagefrz.FreezePageRelfrozenXid = FreezeLimit;
7502 pagefrz.FreezePageRelminMxid = MultiXactCutoff;
7503 pagefrz.NoFreezePageRelfrozenXid = FreezeLimit;
7504 pagefrz.NoFreezePageRelminMxid = MultiXactCutoff;
7505
7506 do_freeze = heap_prepare_freeze_tuple(tuple, &cutoffs,
7507 &pagefrz, &frz, &totally_frozen);
7508
7509 /*
7510 * Note that because this is not a WAL-logged operation, we don't need to
7511 * fill in the offset in the freeze record.
7512 */
7513
7514 if (do_freeze)
7516 return do_freeze;
7517}

References fb(), VacuumCutoffs::FreezeLimit, heap_execute_freeze_tuple(), heap_prepare_freeze_tuple(), VacuumCutoffs::MultiXactCutoff, VacuumCutoffs::OldestMxact, VacuumCutoffs::OldestXmin, VacuumCutoffs::relfrozenxid, and VacuumCutoffs::relminmxid.

Referenced by rewrite_heap_tuple().

◆ heap_get_latest_tid()

void heap_get_latest_tid ( TableScanDesc  sscan,
ItemPointer  tid 
)

Definition at line 1931 of file heapam.c.

1933{
1934 Relation relation = sscan->rs_rd;
1935 Snapshot snapshot = sscan->rs_snapshot;
1936 ItemPointerData ctid;
1938
1939 /*
1940 * table_tuple_get_latest_tid() verified that the passed in tid is valid.
1941 * Assume that t_ctid links are valid however - there shouldn't be invalid
1942 * ones in the table.
1943 */
1945
1946 /*
1947 * Loop to chase down t_ctid links. At top of loop, ctid is the tuple we
1948 * need to examine, and *tid is the TID we will return if ctid turns out
1949 * to be bogus.
1950 *
1951 * Note that we will loop until we reach the end of the t_ctid chain.
1952 * Depending on the snapshot passed, there might be at most one visible
1953 * version of the row, but we don't try to optimize for that.
1954 */
1955 ctid = *tid;
1956 priorXmax = InvalidTransactionId; /* cannot check first XMIN */
1957 for (;;)
1958 {
1959 Buffer buffer;
1960 Page page;
1961 OffsetNumber offnum;
1962 ItemId lp;
1963 HeapTupleData tp;
1964 bool valid;
1965
1966 /*
1967 * Read, pin, and lock the page.
1968 */
1969 buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(&ctid));
1971 page = BufferGetPage(buffer);
1972
1973 /*
1974 * Check for bogus item number. This is not treated as an error
1975 * condition because it can happen while following a t_ctid link. We
1976 * just assume that the prior tid is OK and return it unchanged.
1977 */
1978 offnum = ItemPointerGetOffsetNumber(&ctid);
1980 {
1981 UnlockReleaseBuffer(buffer);
1982 break;
1983 }
1984 lp = PageGetItemId(page, offnum);
1985 if (!ItemIdIsNormal(lp))
1986 {
1987 UnlockReleaseBuffer(buffer);
1988 break;
1989 }
1990
1991 /* OK to access the tuple */
1992 tp.t_self = ctid;
1993 tp.t_data = (HeapTupleHeader) PageGetItem(page, lp);
1994 tp.t_len = ItemIdGetLength(lp);
1995 tp.t_tableOid = RelationGetRelid(relation);
1996
1997 /*
1998 * After following a t_ctid link, we might arrive at an unrelated
1999 * tuple. Check for XMIN match.
2000 */
2003 {
2004 UnlockReleaseBuffer(buffer);
2005 break;
2006 }
2007
2008 /*
2009 * Check tuple visibility; if visible, set it as the new result
2010 * candidate.
2011 */
2012 valid = HeapTupleSatisfiesVisibility(&tp, snapshot, buffer);
2013 HeapCheckForSerializableConflictOut(valid, relation, &tp, buffer, snapshot);
2014 if (valid)
2015 *tid = ctid;
2016
2017 /*
2018 * If there's a valid t_ctid link, follow it, else we're done.
2019 */
2020 if ((tp.t_data->t_infomask & HEAP_XMAX_INVALID) ||
2024 {
2025 UnlockReleaseBuffer(buffer);
2026 break;
2027 }
2028
2029 ctid = tp.t_data->t_ctid;
2031 UnlockReleaseBuffer(buffer);
2032 } /* end of loop */
2033}

References Assert, BUFFER_LOCK_SHARE, BufferGetPage(), fb(), HEAP_XMAX_INVALID, HeapCheckForSerializableConflictOut(), HeapTupleHeaderGetUpdateXid(), HeapTupleHeaderGetXmin(), HeapTupleHeaderIndicatesMovedPartitions(), HeapTupleHeaderIsOnlyLocked(), HeapTupleSatisfiesVisibility(), InvalidTransactionId, ItemIdGetLength, ItemIdIsNormal, ItemPointerEquals(), ItemPointerGetBlockNumber(), ItemPointerGetOffsetNumber(), ItemPointerIsValid(), LockBuffer(), PageGetItem(), PageGetItemId(), PageGetMaxOffsetNumber(), ReadBuffer(), RelationGetRelid, HeapTupleHeaderData::t_ctid, HeapTupleData::t_data, HeapTupleHeaderData::t_infomask, HeapTupleData::t_len, HeapTupleData::t_self, HeapTupleData::t_tableOid, TransactionIdEquals, TransactionIdIsValid, and UnlockReleaseBuffer().

◆ heap_getnext()

HeapTuple heap_getnext ( TableScanDesc  sscan,
ScanDirection  direction 
)

Definition at line 1410 of file heapam.c.

1411{
1413
1414 /*
1415 * This is still widely used directly, without going through table AM, so
1416 * add a safety check. It's possible we should, at a later point,
1417 * downgrade this to an assert. The reason for checking the AM routine,
1418 * rather than the AM oid, is that this allows writing regression tests
1419 * that create another AM reusing the heap handler.
1420 */
1421 if (unlikely(sscan->rs_rd->rd_tableam != GetHeapamTableAmRoutine()))
1422 ereport(ERROR,
1424 errmsg_internal("only heap AM is supported")));
1425
1426 /* Note: no locking manipulations needed */
1427
1429 heapgettup_pagemode(scan, direction,
1430 scan->rs_base.rs_nkeys, scan->rs_base.rs_key);
1431 else
1432 heapgettup(scan, direction,
1433 scan->rs_base.rs_nkeys, scan->rs_base.rs_key);
1434
1435 if (scan->rs_ctup.t_data == NULL)
1436 return NULL;
1437
1438 /*
1439 * if we get here it means we have a new current scan tuple, so point to
1440 * the proper return buffer and return the tuple.
1441 */
1442
1444
1445 return &scan->rs_ctup;
1446}

References ereport, errcode(), errmsg_internal(), ERROR, fb(), GetHeapamTableAmRoutine(), heapgettup(), heapgettup_pagemode(), pgstat_count_heap_getnext, HeapScanDescData::rs_base, HeapScanDescData::rs_ctup, TableScanDescData::rs_flags, TableScanDescData::rs_key, TableScanDescData::rs_nkeys, TableScanDescData::rs_rd, SO_ALLOW_PAGEMODE, HeapTupleData::t_data, and unlikely.

Referenced by AlterTableMoveAll(), AlterTableSpaceOptions(), check_db_file_conflict(), CreateDatabaseUsingFileCopy(), do_autovacuum(), DropSetting(), DropTableSpace(), find_typed_table_dependencies(), get_all_vacuum_rels(), get_database_list(), get_subscription_list(), get_tables_to_cluster(), get_tablespace_name(), get_tablespace_oid(), GetAllPublicationRelations(), getRelationsInNamespace(), GetSchemaPublicationRelations(), heapam_index_build_range_scan(), heapam_index_validate_scan(), objectsInSchemaToOids(), pgrowlocks(), pgstat_heap(), populate_typ_list(), ReindexMultipleTables(), remove_dbtablespaces(), RemoveSubscriptionRel(), RenameTableSpace(), ThereIsAtLeastOneRole(), and vac_truncate_clog().

◆ heap_getnextslot()

bool heap_getnextslot ( TableScanDesc  sscan,
ScanDirection  direction,
TupleTableSlot slot 
)

Definition at line 1449 of file heapam.c.

1450{
1452
1453 /* Note: no locking manipulations needed */
1454
1455 if (sscan->rs_flags & SO_ALLOW_PAGEMODE)
1456 heapgettup_pagemode(scan, direction, sscan->rs_nkeys, sscan->rs_key);
1457 else
1458 heapgettup(scan, direction, sscan->rs_nkeys, sscan->rs_key);
1459
1460 if (scan->rs_ctup.t_data == NULL)
1461 {
1462 ExecClearTuple(slot);
1463 return false;
1464 }
1465
1466 /*
1467 * if we get here it means we have a new current scan tuple, so point to
1468 * the proper return buffer and return the tuple.
1469 */
1470
1472
1473 ExecStoreBufferHeapTuple(&scan->rs_ctup, slot,
1474 scan->rs_cbuf);
1475 return true;
1476}

References ExecClearTuple(), ExecStoreBufferHeapTuple(), fb(), heapgettup(), heapgettup_pagemode(), pgstat_count_heap_getnext, HeapScanDescData::rs_base, HeapScanDescData::rs_cbuf, HeapScanDescData::rs_ctup, TableScanDescData::rs_rd, SO_ALLOW_PAGEMODE, and HeapTupleData::t_data.

◆ heap_getnextslot_tidrange()

bool heap_getnextslot_tidrange ( TableScanDesc  sscan,
ScanDirection  direction,
TupleTableSlot slot 
)

Definition at line 1552 of file heapam.c.

1554{
1556 ItemPointer mintid = &sscan->st.tidrange.rs_mintid;
1557 ItemPointer maxtid = &sscan->st.tidrange.rs_maxtid;
1558
1559 /* Note: no locking manipulations needed */
1560 for (;;)
1561 {
1562 if (sscan->rs_flags & SO_ALLOW_PAGEMODE)
1563 heapgettup_pagemode(scan, direction, sscan->rs_nkeys, sscan->rs_key);
1564 else
1565 heapgettup(scan, direction, sscan->rs_nkeys, sscan->rs_key);
1566
1567 if (scan->rs_ctup.t_data == NULL)
1568 {
1569 ExecClearTuple(slot);
1570 return false;
1571 }
1572
1573 /*
1574 * heap_set_tidrange will have used heap_setscanlimits to limit the
1575 * range of pages we scan to only ones that can contain the TID range
1576 * we're scanning for. Here we must filter out any tuples from these
1577 * pages that are outside of that range.
1578 */
1579 if (ItemPointerCompare(&scan->rs_ctup.t_self, mintid) < 0)
1580 {
1581 ExecClearTuple(slot);
1582
1583 /*
1584 * When scanning backwards, the TIDs will be in descending order.
1585 * Future tuples in this direction will be lower still, so we can
1586 * just return false to indicate there will be no more tuples.
1587 */
1588 if (ScanDirectionIsBackward(direction))
1589 return false;
1590
1591 continue;
1592 }
1593
1594 /*
1595 * Likewise for the final page, we must filter out TIDs greater than
1596 * maxtid.
1597 */
1598 if (ItemPointerCompare(&scan->rs_ctup.t_self, maxtid) > 0)
1599 {
1600 ExecClearTuple(slot);
1601
1602 /*
1603 * When scanning forward, the TIDs will be in ascending order.
1604 * Future tuples in this direction will be higher still, so we can
1605 * just return false to indicate there will be no more tuples.
1606 */
1607 if (ScanDirectionIsForward(direction))
1608 return false;
1609 continue;
1610 }
1611
1612 break;
1613 }
1614
1615 /*
1616 * if we get here it means we have a new current scan tuple, so point to
1617 * the proper return buffer and return the tuple.
1618 */
1620
1621 ExecStoreBufferHeapTuple(&scan->rs_ctup, slot, scan->rs_cbuf);
1622 return true;
1623}

References ExecClearTuple(), ExecStoreBufferHeapTuple(), fb(), heapgettup(), heapgettup_pagemode(), ItemPointerCompare(), pgstat_count_heap_getnext, HeapScanDescData::rs_base, HeapScanDescData::rs_cbuf, HeapScanDescData::rs_ctup, TableScanDescData::rs_rd, ScanDirectionIsBackward, ScanDirectionIsForward, SO_ALLOW_PAGEMODE, HeapTupleData::t_data, and HeapTupleData::t_self.

◆ heap_hot_search_buffer()

bool heap_hot_search_buffer ( ItemPointer  tid,
Relation  relation,
Buffer  buffer,
Snapshot  snapshot,
HeapTuple  heapTuple,
bool all_dead,
bool  first_call 
)

Definition at line 1779 of file heapam.c.

1782{
1783 Page page = BufferGetPage(buffer);
1785 BlockNumber blkno;
1786 OffsetNumber offnum;
1787 bool at_chain_start;
1788 bool valid;
1789 bool skip;
1790 GlobalVisState *vistest = NULL;
1791
1792 /* If this is not the first call, previous call returned a (live!) tuple */
1793 if (all_dead)
1795
1796 blkno = ItemPointerGetBlockNumber(tid);
1797 offnum = ItemPointerGetOffsetNumber(tid);
1799 skip = !first_call;
1800
1801 /* XXX: we should assert that a snapshot is pushed or registered */
1803 Assert(BufferGetBlockNumber(buffer) == blkno);
1804
1805 /* Scan through possibly multiple members of the HOT chain */
1806 for (;;)
1807 {
1808 ItemId lp;
1809
1810 /* check for bogus TID */
1812 break;
1813
1814 lp = PageGetItemId(page, offnum);
1815
1816 /* check for unused, dead, or redirected items */
1817 if (!ItemIdIsNormal(lp))
1818 {
1819 /* We should only see a redirect at start of chain */
1821 {
1822 /* Follow the redirect */
1823 offnum = ItemIdGetRedirect(lp);
1824 at_chain_start = false;
1825 continue;
1826 }
1827 /* else must be end of chain */
1828 break;
1829 }
1830
1831 /*
1832 * Update heapTuple to point to the element of the HOT chain we're
1833 * currently investigating. Having t_self set correctly is important
1834 * because the SSI checks and the *Satisfies routine for historical
1835 * MVCC snapshots need the correct tid to decide about the visibility.
1836 */
1837 heapTuple->t_data = (HeapTupleHeader) PageGetItem(page, lp);
1838 heapTuple->t_len = ItemIdGetLength(lp);
1839 heapTuple->t_tableOid = RelationGetRelid(relation);
1840 ItemPointerSet(&heapTuple->t_self, blkno, offnum);
1841
1842 /*
1843 * Shouldn't see a HEAP_ONLY tuple at chain start.
1844 */
1846 break;
1847
1848 /*
1849 * The xmin should match the previous xmax value, else chain is
1850 * broken.
1851 */
1855 break;
1856
1857 /*
1858 * When first_call is true (and thus, skip is initially false) we'll
1859 * return the first tuple we find. But on later passes, heapTuple
1860 * will initially be pointing to the tuple we returned last time.
1861 * Returning it again would be incorrect (and would loop forever), so
1862 * we skip it and return the next match we find.
1863 */
1864 if (!skip)
1865 {
1866 /* If it's visible per the snapshot, we must return it */
1867 valid = HeapTupleSatisfiesVisibility(heapTuple, snapshot, buffer);
1869 buffer, snapshot);
1870
1871 if (valid)
1872 {
1873 ItemPointerSetOffsetNumber(tid, offnum);
1874 PredicateLockTID(relation, &heapTuple->t_self, snapshot,
1876 if (all_dead)
1877 *all_dead = false;
1878 return true;
1879 }
1880 }
1881 skip = false;
1882
1883 /*
1884 * If we can't see it, maybe no one else can either. At caller
1885 * request, check whether all chain members are dead to all
1886 * transactions.
1887 *
1888 * Note: if you change the criterion here for what is "dead", fix the
1889 * planner's get_actual_variable_range() function to match.
1890 */
1891 if (all_dead && *all_dead)
1892 {
1893 if (!vistest)
1894 vistest = GlobalVisTestFor(relation);
1895
1896 if (!HeapTupleIsSurelyDead(heapTuple, vistest))
1897 *all_dead = false;
1898 }
1899
1900 /*
1901 * Check to see if HOT chain continues past this tuple; if so fetch
1902 * the next offnum and loop around.
1903 */
1905 {
1906 Assert(ItemPointerGetBlockNumber(&heapTuple->t_data->t_ctid) ==
1907 blkno);
1908 offnum = ItemPointerGetOffsetNumber(&heapTuple->t_data->t_ctid);
1909 at_chain_start = false;
1911 }
1912 else
1913 break; /* end of chain */
1914 }
1915
1916 return false;
1917}

References Assert, BufferGetBlockNumber(), BufferGetPage(), fb(), GlobalVisTestFor(), HeapCheckForSerializableConflictOut(), HeapTupleHeaderGetUpdateXid(), HeapTupleHeaderGetXmin(), HeapTupleIsHeapOnly(), HeapTupleIsHotUpdated(), HeapTupleIsSurelyDead(), HeapTupleSatisfiesVisibility(), InvalidTransactionId, ItemIdGetLength, ItemIdGetRedirect, ItemIdIsNormal, ItemIdIsRedirected, ItemPointerGetBlockNumber(), ItemPointerGetOffsetNumber(), ItemPointerSet(), ItemPointerSetOffsetNumber(), PageGetItem(), PageGetItemId(), PageGetMaxOffsetNumber(), PredicateLockTID(), RecentXmin, RelationGetRelid, skip, TransactionIdEquals, and TransactionIdIsValid.

Referenced by BitmapHeapScanNextBlock(), heap_index_delete_tuples(), and heapam_index_fetch_tuple().

◆ heap_index_delete_tuples()

TransactionId heap_index_delete_tuples ( Relation  rel,
TM_IndexDeleteOp delstate 
)

Definition at line 8199 of file heapam.c.

8200{
8201 /* Initial assumption is that earlier pruning took care of conflict */
8202 TransactionId snapshotConflictHorizon = InvalidTransactionId;
8205 Page page = NULL;
8208#ifdef USE_PREFETCH
8211#endif
8213 int finalndeltids = 0,
8214 nblocksaccessed = 0;
8215
8216 /* State that's only used in bottom-up index deletion case */
8217 int nblocksfavorable = 0;
8218 int curtargetfreespace = delstate->bottomupfreespace,
8219 lastfreespace = 0,
8220 actualfreespace = 0;
8221 bool bottomup_final_block = false;
8222
8224
8225 /* Sort caller's deltids array by TID for further processing */
8227
8228 /*
8229 * Bottom-up case: resort deltids array in an order attuned to where the
8230 * greatest number of promising TIDs are to be found, and determine how
8231 * many blocks from the start of sorted array should be considered
8232 * favorable. This will also shrink the deltids array in order to
8233 * eliminate completely unfavorable blocks up front.
8234 */
8235 if (delstate->bottomup)
8237
8238#ifdef USE_PREFETCH
8239 /* Initialize prefetch state. */
8241 prefetch_state.next_item = 0;
8242 prefetch_state.ndeltids = delstate->ndeltids;
8243 prefetch_state.deltids = delstate->deltids;
8244
8245 /*
8246 * Determine the prefetch distance that we will attempt to maintain.
8247 *
8248 * Since the caller holds a buffer lock somewhere in rel, we'd better make
8249 * sure that isn't a catalog relation before we call code that does
8250 * syscache lookups, to avoid risk of deadlock.
8251 */
8252 if (IsCatalogRelation(rel))
8254 else
8257
8258 /* Cap initial prefetch distance for bottom-up deletion caller */
8259 if (delstate->bottomup)
8260 {
8264 }
8265
8266 /* Start prefetching. */
8268#endif
8269
8270 /* Iterate over deltids, determine which to delete, check their horizon */
8271 Assert(delstate->ndeltids > 0);
8272 for (int i = 0; i < delstate->ndeltids; i++)
8273 {
8274 TM_IndexDelete *ideltid = &delstate->deltids[i];
8275 TM_IndexStatus *istatus = delstate->status + ideltid->id;
8276 ItemPointer htid = &ideltid->tid;
8277 OffsetNumber offnum;
8278
8279 /*
8280 * Read buffer, and perform required extra steps each time a new block
8281 * is encountered. Avoid refetching if it's the same block as the one
8282 * from the last htid.
8283 */
8284 if (blkno == InvalidBlockNumber ||
8286 {
8287 /*
8288 * Consider giving up early for bottom-up index deletion caller
8289 * first. (Only prefetch next-next block afterwards, when it
8290 * becomes clear that we're at least going to access the next
8291 * block in line.)
8292 *
8293 * Sometimes the first block frees so much space for bottom-up
8294 * caller that the deletion process can end without accessing any
8295 * more blocks. It is usually necessary to access 2 or 3 blocks
8296 * per bottom-up deletion operation, though.
8297 */
8298 if (delstate->bottomup)
8299 {
8300 /*
8301 * We often allow caller to delete a few additional items
8302 * whose entries we reached after the point that space target
8303 * from caller was satisfied. The cost of accessing the page
8304 * was already paid at that point, so it made sense to finish
8305 * it off. When that happened, we finalize everything here
8306 * (by finishing off the whole bottom-up deletion operation
8307 * without needlessly paying the cost of accessing any more
8308 * blocks).
8309 */
8311 break;
8312
8313 /*
8314 * Give up when we didn't enable our caller to free any
8315 * additional space as a result of processing the page that we
8316 * just finished up with. This rule is the main way in which
8317 * we keep the cost of bottom-up deletion under control.
8318 */
8320 break;
8321 lastfreespace = actualfreespace; /* for next time */
8322
8323 /*
8324 * Deletion operation (which is bottom-up) will definitely
8325 * access the next block in line. Prepare for that now.
8326 *
8327 * Decay target free space so that we don't hang on for too
8328 * long with a marginal case. (Space target is only truly
8329 * helpful when it allows us to recognize that we don't need
8330 * to access more than 1 or 2 blocks to satisfy caller due to
8331 * agreeable workload characteristics.)
8332 *
8333 * We are a bit more patient when we encounter contiguous
8334 * blocks, though: these are treated as favorable blocks. The
8335 * decay process is only applied when the next block in line
8336 * is not a favorable/contiguous block. This is not an
8337 * exception to the general rule; we still insist on finding
8338 * at least one deletable item per block accessed. See
8339 * bottomup_nblocksfavorable() for full details of the theory
8340 * behind favorable blocks and heap block locality in general.
8341 *
8342 * Note: The first block in line is always treated as a
8343 * favorable block, so the earliest possible point that the
8344 * decay can be applied is just before we access the second
8345 * block in line. The Assert() verifies this for us.
8346 */
8348 if (nblocksfavorable > 0)
8350 else
8351 curtargetfreespace /= 2;
8352 }
8353
8354 /* release old buffer */
8355 if (BufferIsValid(buf))
8357
8359 buf = ReadBuffer(rel, blkno);
8361 Assert(!delstate->bottomup ||
8363
8364#ifdef USE_PREFETCH
8365
8366 /*
8367 * To maintain the prefetch distance, prefetch one more page for
8368 * each page we read.
8369 */
8371#endif
8372
8374
8375 page = BufferGetPage(buf);
8376 maxoff = PageGetMaxOffsetNumber(page);
8377 }
8378
8379 /*
8380 * In passing, detect index corruption involving an index page with a
8381 * TID that points to a location in the heap that couldn't possibly be
8382 * correct. We only do this with actual TIDs from caller's index page
8383 * (not items reached by traversing through a HOT chain).
8384 */
8386
8387 if (istatus->knowndeletable)
8388 Assert(!delstate->bottomup && !istatus->promising);
8389 else
8390 {
8391 ItemPointerData tmp = *htid;
8393
8394 /* Are any tuples from this HOT chain non-vacuumable? */
8396 &heapTuple, NULL, true))
8397 continue; /* can't delete entry */
8398
8399 /* Caller will delete, since whole HOT chain is vacuumable */
8400 istatus->knowndeletable = true;
8401
8402 /* Maintain index free space info for bottom-up deletion case */
8403 if (delstate->bottomup)
8404 {
8405 Assert(istatus->freespace > 0);
8406 actualfreespace += istatus->freespace;
8408 bottomup_final_block = true;
8409 }
8410 }
8411
8412 /*
8413 * Maintain snapshotConflictHorizon value for deletion operation as a
8414 * whole by advancing current value using heap tuple headers. This is
8415 * loosely based on the logic for pruning a HOT chain.
8416 */
8418 priorXmax = InvalidTransactionId; /* cannot check first XMIN */
8419 for (;;)
8420 {
8421 ItemId lp;
8422 HeapTupleHeader htup;
8423
8424 /* Sanity check (pure paranoia) */
8425 if (offnum < FirstOffsetNumber)
8426 break;
8427
8428 /*
8429 * An offset past the end of the page's line pointer array is
8430 * possible when the array was truncated
8431 */
8432 if (offnum > maxoff)
8433 break;
8434
8435 lp = PageGetItemId(page, offnum);
8437 {
8438 offnum = ItemIdGetRedirect(lp);
8439 continue;
8440 }
8441
8442 /*
8443 * We'll often encounter LP_DEAD line pointers (especially with an
8444 * entry marked knowndeletable by our caller up front). No heap
8445 * tuple headers get examined for an htid that leads us to an
8446 * LP_DEAD item. This is okay because the earlier pruning
8447 * operation that made the line pointer LP_DEAD in the first place
8448 * must have considered the original tuple header as part of
8449 * generating its own snapshotConflictHorizon value.
8450 *
8451 * Relying on XLOG_HEAP2_PRUNE_VACUUM_SCAN records like this is
8452 * the same strategy that index vacuuming uses in all cases. Index
8453 * VACUUM WAL records don't even have a snapshotConflictHorizon
8454 * field of their own for this reason.
8455 */
8456 if (!ItemIdIsNormal(lp))
8457 break;
8458
8459 htup = (HeapTupleHeader) PageGetItem(page, lp);
8460
8461 /*
8462 * Check the tuple XMIN against prior XMAX, if any
8463 */
8466 break;
8467
8469 &snapshotConflictHorizon);
8470
8471 /*
8472 * If the tuple is not HOT-updated, then we are at the end of this
8473 * HOT-chain. No need to visit later tuples from the same update
8474 * chain (they get their own index entries) -- just move on to
8475 * next htid from index AM caller.
8476 */
8477 if (!HeapTupleHeaderIsHotUpdated(htup))
8478 break;
8479
8480 /* Advance to next HOT chain member */
8481 Assert(ItemPointerGetBlockNumber(&htup->t_ctid) == blkno);
8482 offnum = ItemPointerGetOffsetNumber(&htup->t_ctid);
8484 }
8485
8486 /* Enable further/final shrinking of deltids for caller */
8487 finalndeltids = i + 1;
8488 }
8489
8491
8492 /*
8493 * Shrink deltids array to exclude non-deletable entries at the end. This
8494 * is not just a minor optimization. Final deltids array size might be
8495 * zero for a bottom-up caller. Index AM is explicitly allowed to rely on
8496 * ndeltids being zero in all cases with zero total deletable entries.
8497 */
8498 Assert(finalndeltids > 0 || delstate->bottomup);
8499 delstate->ndeltids = finalndeltids;
8500
8501 return snapshotConflictHorizon;
8502}

References Assert, BOTTOMUP_MAX_NBLOCKS, bottomup_sort_and_shrink(), buf, BUFFER_LOCK_SHARE, BufferGetPage(), BufferIsValid(), fb(), FirstOffsetNumber, get_tablespace_maintenance_io_concurrency(), GlobalVisTestFor(), heap_hot_search_buffer(), HeapTupleHeaderAdvanceConflictHorizon(), HeapTupleHeaderGetUpdateXid(), HeapTupleHeaderGetXmin(), HeapTupleHeaderIsHotUpdated(), i, index_delete_check_htid(), index_delete_sort(), InitNonVacuumableSnapshot, InvalidBlockNumber, InvalidBuffer, InvalidOffsetNumber, InvalidTransactionId, IsCatalogRelation(), ItemIdGetRedirect, ItemIdIsNormal, ItemIdIsRedirected, ItemPointerGetBlockNumber(), ItemPointerGetOffsetNumber(), LockBuffer(), maintenance_io_concurrency, Min, PageGetItem(), PageGetItemId(), PageGetMaxOffsetNumber(), RelationData::rd_rel, ReadBuffer(), HeapTupleHeaderData::t_ctid, TransactionIdEquals, TransactionIdIsValid, and UnlockReleaseBuffer().

◆ heap_inplace_lock()

bool heap_inplace_lock ( Relation  relation,
HeapTuple  oldtup_ptr,
Buffer  buffer,
void(*)(void *)  release_callback,
void arg 
)

Definition at line 6437 of file heapam.c.

6440{
6441 HeapTupleData oldtup = *oldtup_ptr; /* minimize diff vs. heap_update() */
6442 TM_Result result;
6443 bool ret;
6444
6445#ifdef USE_ASSERT_CHECKING
6446 if (RelationGetRelid(relation) == RelationRelationId)
6448#endif
6449
6450 Assert(BufferIsValid(buffer));
6451
6452 /*
6453 * Register shared cache invals if necessary. Other sessions may finish
6454 * inplace updates of this tuple between this step and LockTuple(). Since
6455 * inplace updates don't change cache keys, that's harmless.
6456 *
6457 * While it's tempting to register invals only after confirming we can
6458 * return true, the following obstacle precludes reordering steps that
6459 * way. Registering invals might reach a CatalogCacheInitializeCache()
6460 * that locks "buffer". That would hang indefinitely if running after our
6461 * own LockBuffer(). Hence, we must register invals before LockBuffer().
6462 */
6464
6465 LockTuple(relation, &oldtup.t_self, InplaceUpdateTupleLock);
6467
6468 /*----------
6469 * Interpret HeapTupleSatisfiesUpdate() like heap_update() does, except:
6470 *
6471 * - wait unconditionally
6472 * - already locked tuple above, since inplace needs that unconditionally
6473 * - don't recheck header after wait: simpler to defer to next iteration
6474 * - don't try to continue even if the updater aborts: likewise
6475 * - no crosscheck
6476 */
6478 buffer);
6479
6480 if (result == TM_Invisible)
6481 {
6482 /* no known way this can happen */
6483 ereport(ERROR,
6485 errmsg_internal("attempted to overwrite invisible tuple")));
6486 }
6487 else if (result == TM_SelfModified)
6488 {
6489 /*
6490 * CREATE INDEX might reach this if an expression is silly enough to
6491 * call e.g. SELECT ... FROM pg_class FOR SHARE. C code of other SQL
6492 * statements might get here after a heap_update() of the same row, in
6493 * the absence of an intervening CommandCounterIncrement().
6494 */
6495 ereport(ERROR,
6497 errmsg("tuple to be updated was already modified by an operation triggered by the current command")));
6498 }
6499 else if (result == TM_BeingModified)
6500 {
6503
6505 infomask = oldtup.t_data->t_infomask;
6506
6508 {
6511 int remain;
6512
6514 lockmode, NULL))
6515 {
6518 ret = false;
6520 relation, &oldtup.t_self, XLTW_Update,
6521 &remain);
6522 }
6523 else
6524 ret = true;
6525 }
6527 ret = true;
6529 ret = true;
6530 else
6531 {
6534 ret = false;
6535 XactLockTableWait(xwait, relation, &oldtup.t_self,
6536 XLTW_Update);
6537 }
6538 }
6539 else
6540 {
6541 ret = (result == TM_Ok);
6542 if (!ret)
6543 {
6546 }
6547 }
6548
6549 /*
6550 * GetCatalogSnapshot() relies on invalidation messages to know when to
6551 * take a new snapshot. COMMIT of xwait is responsible for sending the
6552 * invalidation. We're not acquiring heavyweight locks sufficient to
6553 * block if not yet sent, so we must take a new snapshot to ensure a later
6554 * attempt has a fair chance. While we don't need this if xwait aborted,
6555 * don't bother optimizing that.
6556 */
6557 if (!ret)
6558 {
6559 UnlockTuple(relation, &oldtup.t_self, InplaceUpdateTupleLock);
6562 }
6563 return ret;
6564}

References arg, Assert, BUFFER_LOCK_EXCLUSIVE, BUFFER_LOCK_UNLOCK, BufferIsValid(), CacheInvalidateHeapTupleInplace(), DoesMultiXactIdConflict(), ereport, errcode(), errmsg(), errmsg_internal(), ERROR, fb(), ForgetInplace_Inval(), GetCurrentCommandId(), HEAP_XMAX_IS_KEYSHR_LOCKED(), HEAP_XMAX_IS_MULTI, HeapTupleHeaderGetRawXmax(), HeapTupleSatisfiesUpdate(), InplaceUpdateTupleLock, InvalidateCatalogSnapshot(), LockBuffer(), LockTuple(), LockTupleNoKeyExclusive, MultiXactIdWait(), MultiXactStatusNoKeyUpdate, RelationGetRelid, TM_BeingModified, TM_Invisible, TM_Ok, TM_SelfModified, TransactionIdIsCurrentTransactionId(), UnlockTuple(), XactLockTableWait(), and XLTW_Update.

Referenced by systable_inplace_update_begin().

◆ heap_inplace_unlock()

void heap_inplace_unlock ( Relation  relation,
HeapTuple  oldtup,
Buffer  buffer 
)

◆ heap_inplace_update_and_unlock()

void heap_inplace_update_and_unlock ( Relation  relation,
HeapTuple  oldtup,
HeapTuple  tuple,
Buffer  buffer 
)

Definition at line 6575 of file heapam.c.

6578{
6579 HeapTupleHeader htup = oldtup->t_data;
6580 uint32 oldlen;
6581 uint32 newlen;
6582 char *dst;
6583 char *src;
6584 int nmsgs = 0;
6586 bool RelcacheInitFileInval = false;
6587
6588 Assert(ItemPointerEquals(&oldtup->t_self, &tuple->t_self));
6589 oldlen = oldtup->t_len - htup->t_hoff;
6590 newlen = tuple->t_len - tuple->t_data->t_hoff;
6591 if (oldlen != newlen || htup->t_hoff != tuple->t_data->t_hoff)
6592 elog(ERROR, "wrong tuple length");
6593
6594 dst = (char *) htup + htup->t_hoff;
6595 src = (char *) tuple->t_data + tuple->t_data->t_hoff;
6596
6597 /* Like RecordTransactionCommit(), log only if needed */
6600 &RelcacheInitFileInval);
6601
6602 /*
6603 * Unlink relcache init files as needed. If unlinking, acquire
6604 * RelCacheInitLock until after associated invalidations. By doing this
6605 * in advance, if we checkpoint and then crash between inplace
6606 * XLogInsert() and inval, we don't rely on StartupXLOG() ->
6607 * RelationCacheInitFileRemove(). That uses elevel==LOG, so replay would
6608 * neglect to PANIC on EIO.
6609 */
6611
6612 /*----------
6613 * NO EREPORT(ERROR) from here till changes are complete
6614 *
6615 * Our buffer lock won't stop a reader having already pinned and checked
6616 * visibility for this tuple. Hence, we write WAL first, then mutate the
6617 * buffer. Like in MarkBufferDirtyHint() or RecordTransactionCommit(),
6618 * checkpoint delay makes that acceptable. With the usual order of
6619 * changes, a crash after memcpy() and before XLogInsert() could allow
6620 * datfrozenxid to overtake relfrozenxid:
6621 *
6622 * ["D" is a VACUUM (ONLY_DATABASE_STATS)]
6623 * ["R" is a VACUUM tbl]
6624 * D: vac_update_datfrozenxid() -> systable_beginscan(pg_class)
6625 * D: systable_getnext() returns pg_class tuple of tbl
6626 * R: memcpy() into pg_class tuple of tbl
6627 * D: raise pg_database.datfrozenxid, XLogInsert(), finish
6628 * [crash]
6629 * [recovery restores datfrozenxid w/o relfrozenxid]
6630 *
6631 * Mimic MarkBufferDirtyHint() subroutine XLogSaveBufferForHint().
6632 * Specifically, use DELAY_CHKPT_START, and copy the buffer to the stack.
6633 * The stack copy facilitates a FPI of the post-mutation block before we
6634 * accept other sessions seeing it. DELAY_CHKPT_START allows us to
6635 * XLogInsert() before MarkBufferDirty(). Since XLogSaveBufferForHint()
6636 * can operate under BUFFER_LOCK_SHARED, it can't avoid DELAY_CHKPT_START.
6637 * This function, however, likely could avoid it with the following order
6638 * of operations: MarkBufferDirty(), XLogInsert(), memcpy(). Opt to use
6639 * DELAY_CHKPT_START here, too, as a way to have fewer distinct code
6640 * patterns to analyze. Inplace update isn't so frequent that it should
6641 * pursue the small optimization of skipping DELAY_CHKPT_START.
6642 */
6646
6647 /* XLOG stuff */
6648 if (RelationNeedsWAL(relation))
6649 {
6652 char *origdata = (char *) BufferGetBlock(buffer);
6653 Page page = BufferGetPage(buffer);
6654 uint16 lower = ((PageHeader) page)->pd_lower;
6655 uint16 upper = ((PageHeader) page)->pd_upper;
6657 RelFileLocator rlocator;
6658 ForkNumber forkno;
6659 BlockNumber blkno;
6661
6662 xlrec.offnum = ItemPointerGetOffsetNumber(&tuple->t_self);
6663 xlrec.dbId = MyDatabaseId;
6665 xlrec.relcacheInitFileInval = RelcacheInitFileInval;
6666 xlrec.nmsgs = nmsgs;
6667
6670 if (nmsgs != 0)
6672 nmsgs * sizeof(SharedInvalidationMessage));
6673
6674 /* register block matching what buffer will look like after changes */
6679 BufferGetTag(buffer, &rlocator, &forkno, &blkno);
6680 Assert(forkno == MAIN_FORKNUM);
6681 XLogRegisterBlock(0, &rlocator, forkno, blkno, copied_buffer.data,
6683 XLogRegisterBufData(0, src, newlen);
6684
6685 /* inplace updates aren't decoded atm, don't log the origin */
6686
6688
6689 PageSetLSN(page, recptr);
6690 }
6691
6692 memcpy(dst, src, newlen);
6693
6694 MarkBufferDirty(buffer);
6695
6697
6698 /*
6699 * Send invalidations to shared queue. SearchSysCacheLocked1() assumes we
6700 * do this before UnlockTuple().
6701 */
6703
6706 UnlockTuple(relation, &tuple->t_self, InplaceUpdateTupleLock);
6707
6708 AcceptInvalidationMessages(); /* local processing of just-sent inval */
6709
6710 /*
6711 * Queue a transactional inval, for logical decoding and for third-party
6712 * code that might have been relying on it since long before inplace
6713 * update adopted immediate invalidation. See README.tuplock section
6714 * "Reading inplace-updated columns" for logical decoding details.
6715 */
6717 CacheInvalidateHeapTuple(relation, tuple, NULL);
6718}

References AcceptInvalidationMessages(), Assert, AtInplace_Inval(), BUFFER_LOCK_UNLOCK, BufferGetBlock(), BufferGetPage(), BufferGetTag(), CacheInvalidateHeapTuple(), DELAY_CHKPT_START, PGPROC::delayChkptFlags, elog, END_CRIT_SECTION, ERROR, fb(), inplaceGetInvalidationMessages(), InplaceUpdateTupleLock, IsBootstrapProcessingMode, ItemPointerEquals(), ItemPointerGetOffsetNumber(), LockBuffer(), lower(), MAIN_FORKNUM, MarkBufferDirty(), MinSizeOfHeapInplace, MyDatabaseId, MyDatabaseTableSpace, MyProc, PageSetLSN(), PreInplace_Inval(), REGBUF_STANDARD, RelationNeedsWAL, START_CRIT_SECTION, HeapTupleData::t_data, HeapTupleHeaderData::t_hoff, HeapTupleData::t_len, HeapTupleData::t_self, UnlockTuple(), upper(), XLOG_HEAP_INPLACE, XLogBeginInsert(), XLogInsert(), XLogRegisterBlock(), XLogRegisterBufData(), XLogRegisterData(), and XLogStandbyInfoActive.

Referenced by systable_inplace_update_finish().

◆ heap_insert()

void heap_insert ( Relation  relation,
HeapTuple  tup,
CommandId  cid,
int  options,
BulkInsertState  bistate 
)

Definition at line 2142 of file heapam.c.

2144{
2147 Buffer buffer;
2148 Buffer vmbuffer = InvalidBuffer;
2149 bool all_visible_cleared = false;
2150
2151 /* Cheap, simplistic check that the tuple matches the rel's rowtype. */
2154
2155 AssertHasSnapshotForToast(relation);
2156
2157 /*
2158 * Fill in tuple header fields and toast the tuple if necessary.
2159 *
2160 * Note: below this point, heaptup is the data we actually intend to store
2161 * into the relation; tup is the caller's original untoasted data.
2162 */
2163 heaptup = heap_prepare_insert(relation, tup, xid, cid, options);
2164
2165 /*
2166 * Find buffer to insert this tuple into. If the page is all visible,
2167 * this will also pin the requisite visibility map page.
2168 */
2169 buffer = RelationGetBufferForTuple(relation, heaptup->t_len,
2170 InvalidBuffer, options, bistate,
2171 &vmbuffer, NULL,
2172 0);
2173
2174 /*
2175 * We're about to do the actual insert -- but check for conflict first, to
2176 * avoid possibly having to roll back work we've just done.
2177 *
2178 * This is safe without a recheck as long as there is no possibility of
2179 * another process scanning the page between this check and the insert
2180 * being visible to the scan (i.e., an exclusive buffer content lock is
2181 * continuously held from this point until the tuple insert is visible).
2182 *
2183 * For a heap insert, we only need to check for table-level SSI locks. Our
2184 * new tuple can't possibly conflict with existing tuple locks, and heap
2185 * page locks are only consolidated versions of tuple locks; they do not
2186 * lock "gaps" as index page locks do. So we don't need to specify a
2187 * buffer when making the call, which makes for a faster check.
2188 */
2190
2191 /* NO EREPORT(ERROR) from here till changes are logged */
2193
2194 RelationPutHeapTuple(relation, buffer, heaptup,
2196
2197 if (PageIsAllVisible(BufferGetPage(buffer)))
2198 {
2199 all_visible_cleared = true;
2201 visibilitymap_clear(relation,
2203 vmbuffer, VISIBILITYMAP_VALID_BITS);
2204 }
2205
2206 /*
2207 * XXX Should we set PageSetPrunable on this page ?
2208 *
2209 * The inserting transaction may eventually abort thus making this tuple
2210 * DEAD and hence available for pruning. Though we don't want to optimize
2211 * for aborts, if no other tuple in this page is UPDATEd/DELETEd, the
2212 * aborted tuple will never be pruned until next vacuum is triggered.
2213 *
2214 * If you do add PageSetPrunable here, add it in heap_xlog_insert too.
2215 */
2216
2217 MarkBufferDirty(buffer);
2218
2219 /* XLOG stuff */
2220 if (RelationNeedsWAL(relation))
2221 {
2225 Page page = BufferGetPage(buffer);
2226 uint8 info = XLOG_HEAP_INSERT;
2227 int bufflags = 0;
2228
2229 /*
2230 * If this is a catalog, we need to transmit combo CIDs to properly
2231 * decode, so log that as well.
2232 */
2234 log_heap_new_cid(relation, heaptup);
2235
2236 /*
2237 * If this is the single and first tuple on page, we can reinit the
2238 * page instead of restoring the whole thing. Set flag, and hide
2239 * buffer references from XLogInsert.
2240 */
2243 {
2244 info |= XLOG_HEAP_INIT_PAGE;
2246 }
2247
2248 xlrec.offnum = ItemPointerGetOffsetNumber(&heaptup->t_self);
2249 xlrec.flags = 0;
2255
2256 /*
2257 * For logical decoding, we need the tuple even if we're doing a full
2258 * page write, so make sure it's included even if we take a full-page
2259 * image. (XXX We could alternatively store a pointer into the FPW).
2260 */
2261 if (RelationIsLogicallyLogged(relation) &&
2263 {
2266
2267 if (IsToastRelation(relation))
2269 }
2270
2273
2274 xlhdr.t_infomask2 = heaptup->t_data->t_infomask2;
2275 xlhdr.t_infomask = heaptup->t_data->t_infomask;
2276 xlhdr.t_hoff = heaptup->t_data->t_hoff;
2277
2278 /*
2279 * note we mark xlhdr as belonging to buffer; if XLogInsert decides to
2280 * write the whole page to the xlog, we don't need to store
2281 * xl_heap_header in the xlog.
2282 */
2285 /* PG73FORMAT: write bitmap [+ padding] [+ oid] + data */
2287 (char *) heaptup->t_data + SizeofHeapTupleHeader,
2289
2290 /* filtering by origin on a row level is much more efficient */
2292
2293 recptr = XLogInsert(RM_HEAP_ID, info);
2294
2295 PageSetLSN(page, recptr);
2296 }
2297
2299
2300 UnlockReleaseBuffer(buffer);
2301 if (vmbuffer != InvalidBuffer)
2302 ReleaseBuffer(vmbuffer);
2303
2304 /*
2305 * If tuple is cacheable, mark it for invalidation from the caches in case
2306 * we abort. Note it is OK to do this after releasing the buffer, because
2307 * the heaptup data structure is all in local memory, not in the shared
2308 * buffer.
2309 */
2311
2312 /* Note: speculative insertions are counted too, even if aborted later */
2313 pgstat_count_heap_insert(relation, 1);
2314
2315 /*
2316 * If heaptup is a private copy, release it. Don't forget to copy t_self
2317 * back to the caller's image, too.
2318 */
2319 if (heaptup != tup)
2320 {
2321 tup->t_self = heaptup->t_self;
2323 }
2324}

References Assert, AssertHasSnapshotForToast(), BufferGetBlockNumber(), BufferGetPage(), CacheInvalidateHeapTuple(), CheckForSerializableConflictIn(), END_CRIT_SECTION, fb(), FirstOffsetNumber, GetCurrentTransactionId(), heap_freetuple(), HEAP_INSERT_NO_LOGICAL, HEAP_INSERT_SPECULATIVE, heap_prepare_insert(), HeapTupleHeaderGetNatts, InvalidBlockNumber, InvalidBuffer, IsToastRelation(), ItemPointerGetBlockNumber(), ItemPointerGetOffsetNumber(), log_heap_new_cid(), MarkBufferDirty(), PageClearAllVisible(), PageGetMaxOffsetNumber(), PageIsAllVisible(), PageSetLSN(), pgstat_count_heap_insert(), REGBUF_KEEP_DATA, REGBUF_STANDARD, REGBUF_WILL_INIT, RelationGetBufferForTuple(), RelationGetNumberOfAttributes, RelationIsAccessibleInLogicalDecoding, RelationIsLogicallyLogged, RelationNeedsWAL, RelationPutHeapTuple(), ReleaseBuffer(), SizeOfHeapHeader, SizeOfHeapInsert, SizeofHeapTupleHeader, START_CRIT_SECTION, UnlockReleaseBuffer(), visibilitymap_clear(), VISIBILITYMAP_VALID_BITS, XLH_INSERT_ALL_VISIBLE_CLEARED, XLH_INSERT_CONTAINS_NEW_TUPLE, XLH_INSERT_IS_SPECULATIVE, XLH_INSERT_ON_TOAST_RELATION, XLOG_HEAP_INIT_PAGE, XLOG_HEAP_INSERT, XLOG_INCLUDE_ORIGIN, XLogBeginInsert(), XLogInsert(), XLogRegisterBufData(), XLogRegisterBuffer(), XLogRegisterData(), and XLogSetRecordFlags().

Referenced by heapam_tuple_insert(), heapam_tuple_insert_speculative(), simple_heap_insert(), and toast_save_datum().

◆ heap_lock_tuple()

TM_Result heap_lock_tuple ( Relation  relation,
HeapTuple  tuple,
CommandId  cid,
LockTupleMode  mode,
LockWaitPolicy  wait_policy,
bool  follow_updates,
Buffer buffer,
TM_FailureData tmfd 
)

Definition at line 4644 of file heapam.c.

4648{
4649 TM_Result result;
4650 ItemPointer tid = &(tuple->t_self);
4651 ItemId lp;
4652 Page page;
4653 Buffer vmbuffer = InvalidBuffer;
4654 BlockNumber block;
4655 TransactionId xid,
4656 xmax;
4660 bool first_time = true;
4661 bool skip_tuple_lock = false;
4662 bool have_tuple_lock = false;
4663 bool cleared_all_frozen = false;
4664
4665 *buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid));
4666 block = ItemPointerGetBlockNumber(tid);
4667
4668 /*
4669 * Before locking the buffer, pin the visibility map page if it appears to
4670 * be necessary. Since we haven't got the lock yet, someone else might be
4671 * in the middle of changing this, so we'll need to recheck after we have
4672 * the lock.
4673 */
4674 if (PageIsAllVisible(BufferGetPage(*buffer)))
4675 visibilitymap_pin(relation, block, &vmbuffer);
4676
4678
4679 page = BufferGetPage(*buffer);
4682
4683 tuple->t_data = (HeapTupleHeader) PageGetItem(page, lp);
4684 tuple->t_len = ItemIdGetLength(lp);
4685 tuple->t_tableOid = RelationGetRelid(relation);
4686
4687l3:
4688 result = HeapTupleSatisfiesUpdate(tuple, cid, *buffer);
4689
4690 if (result == TM_Invisible)
4691 {
4692 /*
4693 * This is possible, but only when locking a tuple for ON CONFLICT DO
4694 * SELECT/UPDATE. We return this value here rather than throwing an
4695 * error in order to give that case the opportunity to throw a more
4696 * specific error.
4697 */
4698 result = TM_Invisible;
4699 goto out_locked;
4700 }
4701 else if (result == TM_BeingModified ||
4702 result == TM_Updated ||
4703 result == TM_Deleted)
4704 {
4708 bool require_sleep;
4709 ItemPointerData t_ctid;
4710
4711 /* must copy state data before unlocking buffer */
4713 infomask = tuple->t_data->t_infomask;
4714 infomask2 = tuple->t_data->t_infomask2;
4715 ItemPointerCopy(&tuple->t_data->t_ctid, &t_ctid);
4716
4718
4719 /*
4720 * If any subtransaction of the current top transaction already holds
4721 * a lock as strong as or stronger than what we're requesting, we
4722 * effectively hold the desired lock already. We *must* succeed
4723 * without trying to take the tuple lock, else we will deadlock
4724 * against anyone wanting to acquire a stronger lock.
4725 *
4726 * Note we only do this the first time we loop on the HTSU result;
4727 * there is no point in testing in subsequent passes, because
4728 * evidently our own transaction cannot have acquired a new lock after
4729 * the first time we checked.
4730 */
4731 if (first_time)
4732 {
4733 first_time = false;
4734
4736 {
4737 int i;
4738 int nmembers;
4739 MultiXactMember *members;
4740
4741 /*
4742 * We don't need to allow old multixacts here; if that had
4743 * been the case, HeapTupleSatisfiesUpdate would have returned
4744 * MayBeUpdated and we wouldn't be here.
4745 */
4746 nmembers =
4747 GetMultiXactIdMembers(xwait, &members, false,
4749
4750 for (i = 0; i < nmembers; i++)
4751 {
4752 /* only consider members of our own transaction */
4753 if (!TransactionIdIsCurrentTransactionId(members[i].xid))
4754 continue;
4755
4756 if (TUPLOCK_from_mxstatus(members[i].status) >= mode)
4757 {
4758 pfree(members);
4759 result = TM_Ok;
4760 goto out_unlocked;
4761 }
4762 else
4763 {
4764 /*
4765 * Disable acquisition of the heavyweight tuple lock.
4766 * Otherwise, when promoting a weaker lock, we might
4767 * deadlock with another locker that has acquired the
4768 * heavyweight tuple lock and is waiting for our
4769 * transaction to finish.
4770 *
4771 * Note that in this case we still need to wait for
4772 * the multixact if required, to avoid acquiring
4773 * conflicting locks.
4774 */
4775 skip_tuple_lock = true;
4776 }
4777 }
4778
4779 if (members)
4780 pfree(members);
4781 }
4783 {
4784 switch (mode)
4785 {
4786 case LockTupleKeyShare:
4790 result = TM_Ok;
4791 goto out_unlocked;
4792 case LockTupleShare:
4795 {
4796 result = TM_Ok;
4797 goto out_unlocked;
4798 }
4799 break;
4802 {
4803 result = TM_Ok;
4804 goto out_unlocked;
4805 }
4806 break;
4807 case LockTupleExclusive:
4810 {
4811 result = TM_Ok;
4812 goto out_unlocked;
4813 }
4814 break;
4815 }
4816 }
4817 }
4818
4819 /*
4820 * Initially assume that we will have to wait for the locking
4821 * transaction(s) to finish. We check various cases below in which
4822 * this can be turned off.
4823 */
4824 require_sleep = true;
4825 if (mode == LockTupleKeyShare)
4826 {
4827 /*
4828 * If we're requesting KeyShare, and there's no update present, we
4829 * don't need to wait. Even if there is an update, we can still
4830 * continue if the key hasn't been modified.
4831 *
4832 * However, if there are updates, we need to walk the update chain
4833 * to mark future versions of the row as locked, too. That way,
4834 * if somebody deletes that future version, we're protected
4835 * against the key going away. This locking of future versions
4836 * could block momentarily, if a concurrent transaction is
4837 * deleting a key; or it could return a value to the effect that
4838 * the transaction deleting the key has already committed. So we
4839 * do this before re-locking the buffer; otherwise this would be
4840 * prone to deadlocks.
4841 *
4842 * Note that the TID we're locking was grabbed before we unlocked
4843 * the buffer. For it to change while we're not looking, the
4844 * other properties we're testing for below after re-locking the
4845 * buffer would also change, in which case we would restart this
4846 * loop above.
4847 */
4849 {
4850 bool updated;
4851
4853
4854 /*
4855 * If there are updates, follow the update chain; bail out if
4856 * that cannot be done.
4857 */
4858 if (follow_updates && updated &&
4859 !ItemPointerEquals(&tuple->t_self, &t_ctid))
4860 {
4861 TM_Result res;
4862
4863 res = heap_lock_updated_tuple(relation,
4864 infomask, xwait, &t_ctid,
4866 mode);
4867 if (res != TM_Ok)
4868 {
4869 result = res;
4870 /* recovery code expects to have buffer lock held */
4872 goto failed;
4873 }
4874 }
4875
4877
4878 /*
4879 * Make sure it's still an appropriate lock, else start over.
4880 * Also, if it wasn't updated before we released the lock, but
4881 * is updated now, we start over too; the reason is that we
4882 * now need to follow the update chain to lock the new
4883 * versions.
4884 */
4885 if (!HeapTupleHeaderIsOnlyLocked(tuple->t_data) &&
4886 ((tuple->t_data->t_infomask2 & HEAP_KEYS_UPDATED) ||
4887 !updated))
4888 goto l3;
4889
4890 /* Things look okay, so we can skip sleeping */
4891 require_sleep = false;
4892
4893 /*
4894 * Note we allow Xmax to change here; other updaters/lockers
4895 * could have modified it before we grabbed the buffer lock.
4896 * However, this is not a problem, because with the recheck we
4897 * just did we ensure that they still don't conflict with the
4898 * lock we want.
4899 */
4900 }
4901 }
4902 else if (mode == LockTupleShare)
4903 {
4904 /*
4905 * If we're requesting Share, we can similarly avoid sleeping if
4906 * there's no update and no exclusive lock present.
4907 */
4910 {
4912
4913 /*
4914 * Make sure it's still an appropriate lock, else start over.
4915 * See above about allowing xmax to change.
4916 */
4919 goto l3;
4920 require_sleep = false;
4921 }
4922 }
4923 else if (mode == LockTupleNoKeyExclusive)
4924 {
4925 /*
4926 * If we're requesting NoKeyExclusive, we might also be able to
4927 * avoid sleeping; just ensure that there no conflicting lock
4928 * already acquired.
4929 */
4931 {
4933 mode, NULL))
4934 {
4935 /*
4936 * No conflict, but if the xmax changed under us in the
4937 * meantime, start over.
4938 */
4942 xwait))
4943 goto l3;
4944
4945 /* otherwise, we're good */
4946 require_sleep = false;
4947 }
4948 }
4950 {
4952
4953 /* if the xmax changed in the meantime, start over */
4956 xwait))
4957 goto l3;
4958 /* otherwise, we're good */
4959 require_sleep = false;
4960 }
4961 }
4962
4963 /*
4964 * As a check independent from those above, we can also avoid sleeping
4965 * if the current transaction is the sole locker of the tuple. Note
4966 * that the strength of the lock already held is irrelevant; this is
4967 * not about recording the lock in Xmax (which will be done regardless
4968 * of this optimization, below). Also, note that the cases where we
4969 * hold a lock stronger than we are requesting are already handled
4970 * above by not doing anything.
4971 *
4972 * Note we only deal with the non-multixact case here; MultiXactIdWait
4973 * is well equipped to deal with this situation on its own.
4974 */
4977 {
4978 /* ... but if the xmax changed in the meantime, start over */
4982 xwait))
4983 goto l3;
4985 require_sleep = false;
4986 }
4987
4988 /*
4989 * Time to sleep on the other transaction/multixact, if necessary.
4990 *
4991 * If the other transaction is an update/delete that's already
4992 * committed, then sleeping cannot possibly do any good: if we're
4993 * required to sleep, get out to raise an error instead.
4994 *
4995 * By here, we either have already acquired the buffer exclusive lock,
4996 * or we must wait for the locking transaction or multixact; so below
4997 * we ensure that we grab buffer lock after the sleep.
4998 */
4999 if (require_sleep && (result == TM_Updated || result == TM_Deleted))
5000 {
5002 goto failed;
5003 }
5004 else if (require_sleep)
5005 {
5006 /*
5007 * Acquire tuple lock to establish our priority for the tuple, or
5008 * die trying. LockTuple will release us when we are next-in-line
5009 * for the tuple. We must do this even if we are share-locking,
5010 * but not if we already have a weaker lock on the tuple.
5011 *
5012 * If we are forced to "start over" below, we keep the tuple lock;
5013 * this arranges that we stay at the head of the line while
5014 * rechecking tuple state.
5015 */
5016 if (!skip_tuple_lock &&
5017 !heap_acquire_tuplock(relation, tid, mode, wait_policy,
5019 {
5020 /*
5021 * This can only happen if wait_policy is Skip and the lock
5022 * couldn't be obtained.
5023 */
5024 result = TM_WouldBlock;
5025 /* recovery code expects to have buffer lock held */
5027 goto failed;
5028 }
5029
5031 {
5033
5034 /* We only ever lock tuples, never update them */
5035 if (status >= MultiXactStatusNoKeyUpdate)
5036 elog(ERROR, "invalid lock mode in heap_lock_tuple");
5037
5038 /* wait for multixact to end, or die trying */
5039 switch (wait_policy)
5040 {
5041 case LockWaitBlock:
5043 relation, &tuple->t_self, XLTW_Lock, NULL);
5044 break;
5045 case LockWaitSkip:
5047 status, infomask, relation,
5048 NULL, false))
5049 {
5050 result = TM_WouldBlock;
5051 /* recovery code expects to have buffer lock held */
5053 goto failed;
5054 }
5055 break;
5056 case LockWaitError:
5058 status, infomask, relation,
5060 ereport(ERROR,
5062 errmsg("could not obtain lock on row in relation \"%s\"",
5063 RelationGetRelationName(relation))));
5064
5065 break;
5066 }
5067
5068 /*
5069 * Of course, the multixact might not be done here: if we're
5070 * requesting a light lock mode, other transactions with light
5071 * locks could still be alive, as well as locks owned by our
5072 * own xact or other subxacts of this backend. We need to
5073 * preserve the surviving MultiXact members. Note that it
5074 * isn't absolutely necessary in the latter case, but doing so
5075 * is simpler.
5076 */
5077 }
5078 else
5079 {
5080 /* wait for regular transaction to end, or die trying */
5081 switch (wait_policy)
5082 {
5083 case LockWaitBlock:
5084 XactLockTableWait(xwait, relation, &tuple->t_self,
5085 XLTW_Lock);
5086 break;
5087 case LockWaitSkip:
5089 {
5090 result = TM_WouldBlock;
5091 /* recovery code expects to have buffer lock held */
5093 goto failed;
5094 }
5095 break;
5096 case LockWaitError:
5098 ereport(ERROR,
5100 errmsg("could not obtain lock on row in relation \"%s\"",
5101 RelationGetRelationName(relation))));
5102 break;
5103 }
5104 }
5105
5106 /* if there are updates, follow the update chain */
5108 !ItemPointerEquals(&tuple->t_self, &t_ctid))
5109 {
5110 TM_Result res;
5111
5112 res = heap_lock_updated_tuple(relation,
5113 infomask, xwait, &t_ctid,
5115 mode);
5116 if (res != TM_Ok)
5117 {
5118 result = res;
5119 /* recovery code expects to have buffer lock held */
5121 goto failed;
5122 }
5123 }
5124
5126
5127 /*
5128 * xwait is done, but if xwait had just locked the tuple then some
5129 * other xact could update this tuple before we get to this point.
5130 * Check for xmax change, and start over if so.
5131 */
5134 xwait))
5135 goto l3;
5136
5138 {
5139 /*
5140 * Otherwise check if it committed or aborted. Note we cannot
5141 * be here if the tuple was only locked by somebody who didn't
5142 * conflict with us; that would have been handled above. So
5143 * that transaction must necessarily be gone by now. But
5144 * don't check for this in the multixact case, because some
5145 * locker transactions might still be running.
5146 */
5147 UpdateXmaxHintBits(tuple->t_data, *buffer, xwait);
5148 }
5149 }
5150
5151 /* By here, we're certain that we hold buffer exclusive lock again */
5152
5153 /*
5154 * We may lock if previous xmax aborted, or if it committed but only
5155 * locked the tuple without updating it; or if we didn't have to wait
5156 * at all for whatever reason.
5157 */
5158 if (!require_sleep ||
5159 (tuple->t_data->t_infomask & HEAP_XMAX_INVALID) ||
5162 result = TM_Ok;
5163 else if (!ItemPointerEquals(&tuple->t_self, &tuple->t_data->t_ctid))
5164 result = TM_Updated;
5165 else
5166 result = TM_Deleted;
5167 }
5168
5169failed:
5170 if (result != TM_Ok)
5171 {
5172 Assert(result == TM_SelfModified || result == TM_Updated ||
5173 result == TM_Deleted || result == TM_WouldBlock);
5174
5175 /*
5176 * When locking a tuple under LockWaitSkip semantics and we fail with
5177 * TM_WouldBlock above, it's possible for concurrent transactions to
5178 * release the lock and set HEAP_XMAX_INVALID in the meantime. So
5179 * this assert is slightly different from the equivalent one in
5180 * heap_delete and heap_update.
5181 */
5182 Assert((result == TM_WouldBlock) ||
5183 !(tuple->t_data->t_infomask & HEAP_XMAX_INVALID));
5184 Assert(result != TM_Updated ||
5185 !ItemPointerEquals(&tuple->t_self, &tuple->t_data->t_ctid));
5186 tmfd->ctid = tuple->t_data->t_ctid;
5187 tmfd->xmax = HeapTupleHeaderGetUpdateXid(tuple->t_data);
5188 if (result == TM_SelfModified)
5189 tmfd->cmax = HeapTupleHeaderGetCmax(tuple->t_data);
5190 else
5191 tmfd->cmax = InvalidCommandId;
5192 goto out_locked;
5193 }
5194
5195 /*
5196 * If we didn't pin the visibility map page and the page has become all
5197 * visible while we were busy locking the buffer, or during some
5198 * subsequent window during which we had it unlocked, we'll have to unlock
5199 * and re-lock, to avoid holding the buffer lock across I/O. That's a bit
5200 * unfortunate, especially since we'll now have to recheck whether the
5201 * tuple has been locked or updated under us, but hopefully it won't
5202 * happen very often.
5203 */
5204 if (vmbuffer == InvalidBuffer && PageIsAllVisible(page))
5205 {
5207 visibilitymap_pin(relation, block, &vmbuffer);
5209 goto l3;
5210 }
5211
5212 xmax = HeapTupleHeaderGetRawXmax(tuple->t_data);
5213 old_infomask = tuple->t_data->t_infomask;
5214
5215 /*
5216 * If this is the first possibly-multixact-able operation in the current
5217 * transaction, set my per-backend OldestMemberMXactId setting. We can be
5218 * certain that the transaction will never become a member of any older
5219 * MultiXactIds than that. (We have to do this even if we end up just
5220 * using our own TransactionId below, since some other backend could
5221 * incorporate our XID into a MultiXact immediately afterwards.)
5222 */
5224
5225 /*
5226 * Compute the new xmax and infomask to store into the tuple. Note we do
5227 * not modify the tuple just yet, because that would leave it in the wrong
5228 * state if multixact.c elogs.
5229 */
5231 GetCurrentTransactionId(), mode, false,
5232 &xid, &new_infomask, &new_infomask2);
5233
5235
5236 /*
5237 * Store transaction information of xact locking the tuple.
5238 *
5239 * Note: Cmax is meaningless in this context, so don't set it; this avoids
5240 * possibly generating a useless combo CID. Moreover, if we're locking a
5241 * previously updated tuple, it's important to preserve the Cmax.
5242 *
5243 * Also reset the HOT UPDATE bit, but only if there's no update; otherwise
5244 * we would break the HOT chain.
5245 */
5248 tuple->t_data->t_infomask |= new_infomask;
5249 tuple->t_data->t_infomask2 |= new_infomask2;
5252 HeapTupleHeaderSetXmax(tuple->t_data, xid);
5253
5254 /*
5255 * Make sure there is no forward chain link in t_ctid. Note that in the
5256 * cases where the tuple has been updated, we must not overwrite t_ctid,
5257 * because it was set by the updater. Moreover, if the tuple has been
5258 * updated, we need to follow the update chain to lock the new versions of
5259 * the tuple as well.
5260 */
5262 tuple->t_data->t_ctid = *tid;
5263
5264 /* Clear only the all-frozen bit on visibility map if needed */
5265 if (PageIsAllVisible(page) &&
5266 visibilitymap_clear(relation, block, vmbuffer,
5268 cleared_all_frozen = true;
5269
5270
5271 MarkBufferDirty(*buffer);
5272
5273 /*
5274 * XLOG stuff. You might think that we don't need an XLOG record because
5275 * there is no state change worth restoring after a crash. You would be
5276 * wrong however: we have just written either a TransactionId or a
5277 * MultiXactId that may never have been seen on disk before, and we need
5278 * to make sure that there are XLOG entries covering those ID numbers.
5279 * Else the same IDs might be re-used after a crash, which would be
5280 * disastrous if this page made it to disk before the crash. Essentially
5281 * we have to enforce the WAL log-before-data rule even in this case.
5282 * (Also, in a PITR log-shipping or 2PC environment, we have to have XLOG
5283 * entries for everything anyway.)
5284 */
5285 if (RelationNeedsWAL(relation))
5286 {
5289
5292
5293 xlrec.offnum = ItemPointerGetOffsetNumber(&tuple->t_self);
5294 xlrec.xmax = xid;
5295 xlrec.infobits_set = compute_infobits(new_infomask,
5296 tuple->t_data->t_infomask2);
5299
5300 /* we don't decode row locks atm, so no need to log the origin */
5301
5303
5304 PageSetLSN(page, recptr);
5305 }
5306
5308
5309 result = TM_Ok;
5310
5313
5315 if (BufferIsValid(vmbuffer))
5316 ReleaseBuffer(vmbuffer);
5317
5318 /*
5319 * Don't update the visibility map here. Locking a tuple doesn't change
5320 * visibility info.
5321 */
5322
5323 /*
5324 * Now that we have successfully marked the tuple as locked, we can
5325 * release the lmgr tuple lock, if we had it.
5326 */
5327 if (have_tuple_lock)
5328 UnlockTupleTuplock(relation, tid, mode);
5329
5330 return result;
5331}

References Assert, BUFFER_LOCK_EXCLUSIVE, BUFFER_LOCK_UNLOCK, BufferGetPage(), BufferIsValid(), TM_FailureData::cmax, compute_infobits(), compute_new_xmax_infomask(), ConditionalMultiXactIdWait(), ConditionalXactLockTableWait(), TM_FailureData::ctid, DoesMultiXactIdConflict(), elog, END_CRIT_SECTION, ereport, errcode(), errmsg(), ERROR, fb(), get_mxact_status_for_lock(), GetCurrentTransactionId(), GetMultiXactIdMembers(), heap_acquire_tuplock(), HEAP_KEYS_UPDATED, heap_lock_updated_tuple(), HEAP_XMAX_INVALID, HEAP_XMAX_IS_EXCL_LOCKED(), HEAP_XMAX_IS_KEYSHR_LOCKED(), HEAP_XMAX_IS_LOCKED_ONLY(), HEAP_XMAX_IS_MULTI, HEAP_XMAX_IS_SHR_LOCKED(), HeapTupleHeaderClearHotUpdated(), HeapTupleHeaderGetCmax(), HeapTupleHeaderGetRawXmax(), HeapTupleHeaderGetUpdateXid(), HeapTupleHeaderIsOnlyLocked(), HeapTupleHeaderSetXmax(), HeapTupleSatisfiesUpdate(), i, InvalidBuffer, InvalidCommandId, ItemIdGetLength, ItemIdIsNormal, ItemPointerCopy(), ItemPointerEquals(), ItemPointerGetBlockNumber(), ItemPointerGetOffsetNumber(), LockBuffer(), LockTupleExclusive, LockTupleKeyShare, LockTupleNoKeyExclusive, LockTupleShare, LockWaitBlock, LockWaitError, LockWaitSkip, log_lock_failures, MarkBufferDirty(), mode, MultiXactIdSetOldestMember(), MultiXactIdWait(), MultiXactStatusNoKeyUpdate, PageGetItem(), PageGetItemId(), PageIsAllVisible(), PageSetLSN(), pfree(), ReadBuffer(), REGBUF_STANDARD, RelationGetRelationName, RelationGetRelid, RelationNeedsWAL, ReleaseBuffer(), SizeOfHeapLock, START_CRIT_SECTION, HeapTupleHeaderData::t_ctid, HeapTupleData::t_data, HeapTupleHeaderData::t_infomask, HeapTupleHeaderData::t_infomask2, HeapTupleData::t_len, HeapTupleData::t_self, HeapTupleData::t_tableOid, TM_BeingModified, TM_Deleted, TM_Invisible, TM_Ok, TM_SelfModified, TM_Updated, TM_WouldBlock, TransactionIdEquals, TransactionIdIsCurrentTransactionId(), TUPLOCK_from_mxstatus, UnlockTupleTuplock, UpdateXmaxHintBits(), VISIBILITYMAP_ALL_FROZEN, visibilitymap_clear(), visibilitymap_pin(), 
XactLockTableWait(), XLH_LOCK_ALL_FROZEN_CLEARED, XLOG_HEAP_LOCK, XLogBeginInsert(), XLogInsert(), XLogRegisterBuffer(), XLogRegisterData(), XLTW_Lock, TM_FailureData::xmax, and xmax_infomask_changed().

Referenced by heapam_tuple_lock().

◆ heap_lock_updated_tuple()

static TM_Result heap_lock_updated_tuple ( Relation  rel,
uint16  prior_infomask,
TransactionId  prior_raw_xmax,
const ItemPointerData prior_ctid,
TransactionId  xid,
LockTupleMode  mode 
)
static

Definition at line 6115 of file heapam.c.

6120{
6121 INJECTION_POINT("heap_lock_updated_tuple", NULL);
6122
6123 /*
6124 * If the tuple has moved into another partition (effectively a delete),
6125 * stop here.
6126 */
6128 {
6130
6131 /*
6132 * If this is the first possibly-multixact-able operation in the
6133 * current transaction, set my per-backend OldestMemberMXactId
6134 * setting. We can be certain that the transaction will never become a
6135 * member of any older MultiXactIds than that. (We have to do this
6136 * even if we end up just using our own TransactionId below, since
6137 * some other backend could incorporate our XID into a MultiXact
6138 * immediately afterwards.)
6139 */
6141
6145 }
6146
6147 /* nothing to lock */
6148 return TM_Ok;
6149}

References fb(), heap_lock_updated_tuple_rec(), HEAP_XMAX_IS_MULTI, INJECTION_POINT, ItemPointerIndicatesMovedPartitions(), mode, MultiXactIdGetUpdateXid(), MultiXactIdSetOldestMember(), and TM_Ok.

Referenced by heap_lock_tuple().

◆ heap_lock_updated_tuple_rec()

static TM_Result heap_lock_updated_tuple_rec ( Relation  rel,
TransactionId  priorXmax,
const ItemPointerData tid,
TransactionId  xid,
LockTupleMode  mode 
)
static

Definition at line 5767 of file heapam.c.

5770{
5771 TM_Result result;
5774 Buffer buf;
5779 TransactionId xmax,
5780 new_xmax;
5781 bool cleared_all_frozen = false;
5783 Buffer vmbuffer = InvalidBuffer;
5784 BlockNumber block;
5785
5786 ItemPointerCopy(tid, &tupid);
5787
5788 for (;;)
5789 {
5790 new_infomask = 0;
5791 new_xmax = InvalidTransactionId;
5793 ItemPointerCopy(&tupid, &(mytup.t_self));
5794
5795 if (!heap_fetch(rel, SnapshotAny, &mytup, &buf, false))
5796 {
5797 /*
5798 * if we fail to find the updated version of the tuple, it's
5799 * because it was vacuumed/pruned away after its creator
5800 * transaction aborted. So behave as if we got to the end of the
5801 * chain, and there's no further tuple to lock: return success to
5802 * caller.
5803 */
5804 result = TM_Ok;
5805 goto out_unlocked;
5806 }
5807
5808l4:
5810
5811 /*
5812 * Before locking the buffer, pin the visibility map page if it
5813 * appears to be necessary. Since we haven't got the lock yet,
5814 * someone else might be in the middle of changing this, so we'll need
5815 * to recheck after we have the lock.
5816 */
5818 {
5819 visibilitymap_pin(rel, block, &vmbuffer);
5820 pinned_desired_page = true;
5821 }
5822 else
5823 pinned_desired_page = false;
5824
5826
5827 /*
5828 * If we didn't pin the visibility map page and the page has become
5829 * all visible while we were busy locking the buffer, we'll have to
5830 * unlock and re-lock, to avoid holding the buffer lock across I/O.
5831 * That's a bit unfortunate, but hopefully shouldn't happen often.
5832 *
5833 * Note: in some paths through this function, we will reach here
5834 * holding a pin on a vm page that may or may not be the one matching
5835 * this page. If this page isn't all-visible, we won't use the vm
5836 * page, but we hold onto such a pin till the end of the function.
5837 */
5839 {
5841 visibilitymap_pin(rel, block, &vmbuffer);
5843 }
5844
5845 /*
5846 * Check the tuple XMIN against prior XMAX, if any. If we reached the
5847 * end of the chain, we're done, so return success.
5848 */
5851 priorXmax))
5852 {
5853 result = TM_Ok;
5854 goto out_locked;
5855 }
5856
5857 /*
5858 * Also check Xmin: if this tuple was created by an aborted
5859 * (sub)transaction, then we already locked the last live one in the
5860 * chain, thus we're done, so return success.
5861 */
5863 {
5864 result = TM_Ok;
5865 goto out_locked;
5866 }
5867
5868 old_infomask = mytup.t_data->t_infomask;
5869 old_infomask2 = mytup.t_data->t_infomask2;
5870 xmax = HeapTupleHeaderGetRawXmax(mytup.t_data);
5871
5872 /*
5873 * If this tuple version has been updated or locked by some concurrent
5874 * transaction(s), what we do depends on whether our lock mode
5875 * conflicts with what those other transactions hold, and also on the
5876 * status of them.
5877 */
5879 {
5881 bool needwait;
5882
5885 {
5886 int nmembers;
5887 int i;
5888 MultiXactMember *members;
5889
5890 /*
5891 * We don't need a test for pg_upgrade'd tuples: this is only
5892 * applied to tuples after the first in an update chain. Said
5893 * first tuple in the chain may well be locked-in-9.2-and-
5894 * pg_upgraded, but that one was already locked by our caller,
5895 * not us; and any subsequent ones cannot be because our
5896 * caller must necessarily have obtained a snapshot later than
5897 * the pg_upgrade itself.
5898 */
5899 Assert(!HEAP_LOCKED_UPGRADED(mytup.t_data->t_infomask));
5900
5901 nmembers = GetMultiXactIdMembers(rawxmax, &members, false,
5903 for (i = 0; i < nmembers; i++)
5904 {
5905 result = test_lockmode_for_conflict(members[i].status,
5906 members[i].xid,
5907 mode,
5908 &mytup,
5909 &needwait);
5910
5911 /*
5912 * If the tuple was already locked by ourselves in a
5913 * previous iteration of this (say heap_lock_tuple was
5914 * forced to restart the locking loop because of a change
5915 * in xmax), then we hold the lock already on this tuple
5916 * version and we don't need to do anything; and this is
5917 * not an error condition either. We just need to skip
5918 * this tuple and continue locking the next version in the
5919 * update chain.
5920 */
5921 if (result == TM_SelfModified)
5922 {
5923 pfree(members);
5924 goto next;
5925 }
5926
5927 if (needwait)
5928 {
5930 XactLockTableWait(members[i].xid, rel,
5931 &mytup.t_self,
5933 pfree(members);
5934 goto l4;
5935 }
5936 if (result != TM_Ok)
5937 {
5938 pfree(members);
5939 goto out_locked;
5940 }
5941 }
5942 if (members)
5943 pfree(members);
5944 }
5945 else
5946 {
5947 MultiXactStatus status;
5948
5949 /*
5950 * For a non-multi Xmax, we first need to compute the
5951 * corresponding MultiXactStatus by using the infomask bits.
5952 */
5954 {
5958 status = MultiXactStatusForShare;
5960 {
5962 status = MultiXactStatusForUpdate;
5963 else
5965 }
5966 else
5967 {
5968 /*
5969 * LOCK_ONLY present alone (a pg_upgraded tuple marked
5970 * as share-locked in the old cluster) shouldn't be
5971 * seen in the middle of an update chain.
5972 */
5973 elog(ERROR, "invalid lock status in tuple");
5974 }
5975 }
5976 else
5977 {
5978 /* it's an update, but which kind? */
5980 status = MultiXactStatusUpdate;
5981 else
5983 }
5984
5985 result = test_lockmode_for_conflict(status, rawxmax, mode,
5986 &mytup, &needwait);
5987
5988 /*
5989 * If the tuple was already locked by ourselves in a previous
5990 * iteration of this (say heap_lock_tuple was forced to
5991 * restart the locking loop because of a change in xmax), then
5992 * we hold the lock already on this tuple version and we don't
5993 * need to do anything; and this is not an error condition
5994 * either. We just need to skip this tuple and continue
5995 * locking the next version in the update chain.
5996 */
5997 if (result == TM_SelfModified)
5998 goto next;
5999
6000 if (needwait)
6001 {
6003 XactLockTableWait(rawxmax, rel, &mytup.t_self,
6005 goto l4;
6006 }
6007 if (result != TM_Ok)
6008 {
6009 goto out_locked;
6010 }
6011 }
6012 }
6013
6014 /* compute the new Xmax and infomask values for the tuple ... */
6015 compute_new_xmax_infomask(xmax, old_infomask, mytup.t_data->t_infomask2,
6016 xid, mode, false,
6017 &new_xmax, &new_infomask, &new_infomask2);
6018
6020 visibilitymap_clear(rel, block, vmbuffer,
6022 cleared_all_frozen = true;
6023
6025
6026 /* ... and set them */
6027 HeapTupleHeaderSetXmax(mytup.t_data, new_xmax);
6028 mytup.t_data->t_infomask &= ~HEAP_XMAX_BITS;
6029 mytup.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
6030 mytup.t_data->t_infomask |= new_infomask;
6031 mytup.t_data->t_infomask2 |= new_infomask2;
6032
6034
6035 /* XLOG stuff */
6036 if (RelationNeedsWAL(rel))
6037 {
6040 Page page = BufferGetPage(buf);
6041
6044
6045 xlrec.offnum = ItemPointerGetOffsetNumber(&mytup.t_self);
6046 xlrec.xmax = new_xmax;
6048 xlrec.flags =
6050
6052
6054
6055 PageSetLSN(page, recptr);
6056 }
6057
6059
6060next:
6061 /* if we find the end of update chain, we're done. */
6062 if (mytup.t_data->t_infomask & HEAP_XMAX_INVALID ||
6064 ItemPointerEquals(&mytup.t_self, &mytup.t_data->t_ctid) ||
6066 {
6067 result = TM_Ok;
6068 goto out_locked;
6069 }
6070
6071 /* tail recursion */
6073 ItemPointerCopy(&(mytup.t_data->t_ctid), &tupid);
6075 }
6076
6077 result = TM_Ok;
6078
6081
6083 if (vmbuffer != InvalidBuffer)
6084 ReleaseBuffer(vmbuffer);
6085
6086 return result;
6087}

References Assert, buf, BUFFER_LOCK_EXCLUSIVE, BUFFER_LOCK_UNLOCK, BufferGetPage(), CHECK_FOR_INTERRUPTS, compute_infobits(), compute_new_xmax_infomask(), elog, END_CRIT_SECTION, ERROR, fb(), GetMultiXactIdMembers(), heap_fetch(), HEAP_KEYS_UPDATED, HEAP_LOCKED_UPGRADED(), HEAP_XMAX_INVALID, HEAP_XMAX_IS_EXCL_LOCKED(), HEAP_XMAX_IS_KEYSHR_LOCKED(), HEAP_XMAX_IS_LOCKED_ONLY(), HEAP_XMAX_IS_MULTI, HEAP_XMAX_IS_SHR_LOCKED(), HeapTupleHeaderGetRawXmax(), HeapTupleHeaderGetUpdateXid(), HeapTupleHeaderGetXmin(), HeapTupleHeaderIndicatesMovedPartitions(), HeapTupleHeaderIsOnlyLocked(), HeapTupleHeaderSetXmax(), i, InvalidBuffer, InvalidTransactionId, ItemPointerCopy(), ItemPointerEquals(), ItemPointerGetBlockNumber(), ItemPointerGetOffsetNumber(), LockBuffer(), MarkBufferDirty(), mode, MultiXactStatusForKeyShare, MultiXactStatusForNoKeyUpdate, MultiXactStatusForShare, MultiXactStatusForUpdate, MultiXactStatusNoKeyUpdate, MultiXactStatusUpdate, next, PageIsAllVisible(), PageSetLSN(), pfree(), REGBUF_STANDARD, RelationNeedsWAL, ReleaseBuffer(), SizeOfHeapLockUpdated, SnapshotAny, START_CRIT_SECTION, test_lockmode_for_conflict(), TM_Ok, TM_SelfModified, TransactionIdDidAbort(), TransactionIdEquals, TransactionIdIsValid, UnlockReleaseBuffer(), VISIBILITYMAP_ALL_FROZEN, visibilitymap_clear(), visibilitymap_pin(), XactLockTableWait(), XLH_LOCK_ALL_FROZEN_CLEARED, XLOG_HEAP2_LOCK_UPDATED, XLogBeginInsert(), XLogInsert(), XLogRegisterBuffer(), XLogRegisterData(), and XLTW_LockUpdated.

Referenced by heap_lock_updated_tuple().

◆ heap_multi_insert()

void heap_multi_insert ( Relation  relation,
TupleTableSlot **  slots,
int  ntuples,
CommandId  cid,
int  options,
BulkInsertState  bistate 
)

Definition at line 2413 of file heapam.c.

2415{
2418 int i;
2419 int ndone;
2421 Page page;
2422 Buffer vmbuffer = InvalidBuffer;
2423 bool needwal;
2427 bool starting_with_empty_page = false;
2428 int npages = 0;
2429 int npages_used = 0;
2430
2431 /* currently not needed (thus unsupported) for heap_multi_insert() */
2433
2434 AssertHasSnapshotForToast(relation);
2435
2436 needwal = RelationNeedsWAL(relation);
2439
2440 /* Toast and set header data in all the slots */
2441 heaptuples = palloc(ntuples * sizeof(HeapTuple));
2442 for (i = 0; i < ntuples; i++)
2443 {
2444 HeapTuple tuple;
2445
2446 tuple = ExecFetchSlotHeapTuple(slots[i], true, NULL);
2447 slots[i]->tts_tableOid = RelationGetRelid(relation);
2448 tuple->t_tableOid = slots[i]->tts_tableOid;
2449 heaptuples[i] = heap_prepare_insert(relation, tuple, xid, cid,
2450 options);
2451 }
2452
2453 /*
2454 * We're about to do the actual inserts -- but check for conflict first,
2455 * to minimize the possibility of having to roll back work we've just
2456 * done.
2457 *
2458 * A check here does not definitively prevent a serialization anomaly;
2459 * that check MUST be done at least past the point of acquiring an
2460 * exclusive buffer content lock on every buffer that will be affected,
2461 * and MAY be done after all inserts are reflected in the buffers and
2462 * those locks are released; otherwise there is a race condition. Since
2463 * multiple buffers can be locked and unlocked in the loop below, and it
2464 * would not be feasible to identify and lock all of those buffers before
2465 * the loop, we must do a final check at the end.
2466 *
2467 * The check here could be omitted with no loss of correctness; it is
2468 * present strictly as an optimization.
2469 *
2470 * For heap inserts, we only need to check for table-level SSI locks. Our
2471 * new tuples can't possibly conflict with existing tuple locks, and heap
2472 * page locks are only consolidated versions of tuple locks; they do not
2473 * lock "gaps" as index page locks do. So we don't need to specify a
2474 * buffer when making the call, which makes for a faster check.
2475 */
2477
2478 ndone = 0;
2479 while (ndone < ntuples)
2480 {
2481 Buffer buffer;
2482 bool all_visible_cleared = false;
2483 bool all_frozen_set = false;
2484 int nthispage;
2485
2487
2488 /*
2489 * Compute number of pages needed to fit the to-be-inserted tuples in
2490 * the worst case. This will be used to determine how much to extend
2491 * the relation by in RelationGetBufferForTuple(), if needed. If we
2492 * filled a prior page from scratch, we can just update our last
2493 * computation, but if we started with a partially filled page,
2494 * recompute from scratch, as the number of potentially required pages
2495 * can vary due to tuples needing to fit onto the page, page headers
2496 * etc.
2497 */
2498 if (ndone == 0 || !starting_with_empty_page)
2499 {
2500 npages = heap_multi_insert_pages(heaptuples, ndone, ntuples,
2502 npages_used = 0;
2503 }
2504 else
2505 npages_used++;
2506
2507 /*
2508 * Find buffer where at least the next tuple will fit. If the page is
2509 * all-visible, this will also pin the requisite visibility map page.
2510 *
2511 * Also pin visibility map page if COPY FREEZE inserts tuples into an
2512 * empty page. See all_frozen_set below.
2513 */
2514 buffer = RelationGetBufferForTuple(relation, heaptuples[ndone]->t_len,
2515 InvalidBuffer, options, bistate,
2516 &vmbuffer, NULL,
2517 npages - npages_used);
2518 page = BufferGetPage(buffer);
2519
2521
2523 {
2524 all_frozen_set = true;
2525 /* Lock the vmbuffer before entering the critical section */
2527 }
2528
2529 /* NO EREPORT(ERROR) from here till changes are logged */
2531
2532 /*
2533 * RelationGetBufferForTuple has ensured that the first tuple fits.
2534 * Put that on the page, and then as many other tuples as fit.
2535 */
2536 RelationPutHeapTuple(relation, buffer, heaptuples[ndone], false);
2537
2538 /*
2539 * For logical decoding we need combo CIDs to properly decode the
2540 * catalog.
2541 */
2542 if (needwal && need_cids)
2543 log_heap_new_cid(relation, heaptuples[ndone]);
2544
2545 for (nthispage = 1; ndone + nthispage < ntuples; nthispage++)
2546 {
2548
2549 if (PageGetHeapFreeSpace(page) < MAXALIGN(heaptup->t_len) + saveFreeSpace)
2550 break;
2551
2552 RelationPutHeapTuple(relation, buffer, heaptup, false);
2553
2554 /*
2555 * For logical decoding we need combo CIDs to properly decode the
2556 * catalog.
2557 */
2558 if (needwal && need_cids)
2559 log_heap_new_cid(relation, heaptup);
2560 }
2561
2562 /*
2563 * If the page is all visible, need to clear that, unless we're only
2564 * going to add further frozen rows to it.
2565 *
2566 * If we're only adding already frozen rows to a previously empty
2567 * page, mark it as all-frozen and update the visibility map. We're
2568 * already holding a pin on the vmbuffer.
2569 */
2571 {
2572 all_visible_cleared = true;
2573 PageClearAllVisible(page);
2574 visibilitymap_clear(relation,
2575 BufferGetBlockNumber(buffer),
2576 vmbuffer, VISIBILITYMAP_VALID_BITS);
2577 }
2578 else if (all_frozen_set)
2579 {
2580 PageSetAllVisible(page);
2582 vmbuffer,
2585 relation->rd_locator);
2586 }
2587
2588 /*
2589 * XXX Should we set PageSetPrunable on this page ? See heap_insert()
2590 */
2591
2592 MarkBufferDirty(buffer);
2593
2594 /* XLOG stuff */
2595 if (needwal)
2596 {
2600 char *tupledata;
2601 int totaldatalen;
2602 char *scratchptr = scratch.data;
2603 bool init;
2604 int bufflags = 0;
2605
2606 /*
2607 * If the page was previously empty, we can reinit the page
2608 * instead of restoring the whole thing.
2609 */
2611
2612 /* allocate xl_heap_multi_insert struct from the scratch area */
2615
2616 /*
2617 * Allocate offsets array. Unless we're reinitializing the page,
2618 * in that case the tuples are stored in order starting at
2619 * FirstOffsetNumber and we don't need to store the offsets
2620 * explicitly.
2621 */
2622 if (!init)
2623 scratchptr += nthispage * sizeof(OffsetNumber);
2624
2625 /* the rest of the scratch space is used for tuple data */
2626 tupledata = scratchptr;
2627
2628 /* check that the mutually exclusive flags are not both set */
2630
2631 xlrec->flags = 0;
2634
2635 /*
2636 * We don't have to worry about including a conflict xid in the
2637 * WAL record, as HEAP_INSERT_FROZEN intentionally violates
2638 * visibility rules.
2639 */
2640 if (all_frozen_set)
2642
2643 xlrec->ntuples = nthispage;
2644
2645 /*
2646 * Write out an xl_multi_insert_tuple and the tuple data itself
2647 * for each tuple.
2648 */
2649 for (i = 0; i < nthispage; i++)
2650 {
2652 xl_multi_insert_tuple *tuphdr;
2653 int datalen;
2654
2655 if (!init)
2656 xlrec->offsets[i] = ItemPointerGetOffsetNumber(&heaptup->t_self);
2657 /* xl_multi_insert_tuple needs two-byte alignment. */
2659 scratchptr = ((char *) tuphdr) + SizeOfMultiInsertTuple;
2660
2661 tuphdr->t_infomask2 = heaptup->t_data->t_infomask2;
2662 tuphdr->t_infomask = heaptup->t_data->t_infomask;
2663 tuphdr->t_hoff = heaptup->t_data->t_hoff;
2664
2665 /* write bitmap [+ padding] [+ oid] + data */
2666 datalen = heaptup->t_len - SizeofHeapTupleHeader;
2668 (char *) heaptup->t_data + SizeofHeapTupleHeader,
2669 datalen);
2670 tuphdr->datalen = datalen;
2671 scratchptr += datalen;
2672 }
2673 totaldatalen = scratchptr - tupledata;
2674 Assert((scratchptr - scratch.data) < BLCKSZ);
2675
2676 if (need_tuple_data)
2678
2679 /*
2680 * Signal that this is the last xl_heap_multi_insert record
2681 * emitted by this call to heap_multi_insert(). Needed for logical
2682 * decoding so it knows when to cleanup temporary data.
2683 */
2684 if (ndone + nthispage == ntuples)
2686
2687 if (init)
2688 {
2689 info |= XLOG_HEAP_INIT_PAGE;
2691 }
2692
2693 /*
2694 * If we're doing logical decoding, include the new tuple data
2695 * even if we take a full-page image of the page.
2696 */
2697 if (need_tuple_data)
2699
2701 XLogRegisterData(xlrec, tupledata - scratch.data);
2703 if (all_frozen_set)
2704 XLogRegisterBuffer(1, vmbuffer, 0);
2705
2706 XLogRegisterBufData(0, tupledata, totaldatalen);
2707
2708 /* filtering by origin on a row level is much more efficient */
2710
2711 recptr = XLogInsert(RM_HEAP2_ID, info);
2712
2713 PageSetLSN(page, recptr);
2714 if (all_frozen_set)
2715 {
2716 Assert(BufferIsDirty(vmbuffer));
2717 PageSetLSN(BufferGetPage(vmbuffer), recptr);
2718 }
2719 }
2720
2722
2723 if (all_frozen_set)
2724 LockBuffer(vmbuffer, BUFFER_LOCK_UNLOCK);
2725
2726 UnlockReleaseBuffer(buffer);
2727 ndone += nthispage;
2728
2729 /*
2730 * NB: Only release vmbuffer after inserting all tuples - it's fairly
2731 * likely that we'll insert into subsequent heap pages that are likely
2732 * to use the same vm page.
2733 */
2734 }
2735
2736 /* We're done with inserting all tuples, so release the last vmbuffer. */
2737 if (vmbuffer != InvalidBuffer)
2738 ReleaseBuffer(vmbuffer);
2739
2740 /*
2741 * We're done with the actual inserts. Check for conflicts again, to
2742 * ensure that all rw-conflicts in to these inserts are detected. Without
2743 * this final check, a sequential scan of the heap may have locked the
2744 * table after the "before" check, missing one opportunity to detect the
2745 * conflict, and then scanned the table before the new tuples were there,
2746 * missing the other chance to detect the conflict.
2747 *
2748 * For heap inserts, we only need to check for table-level SSI locks. Our
2749 * new tuples can't possibly conflict with existing tuple locks, and heap
2750 * page locks are only consolidated versions of tuple locks; they do not
2751 * lock "gaps" as index page locks do. So we don't need to specify a
2752 * buffer when making the call.
2753 */
2755
2756 /*
2757 * If tuples are cacheable, mark them for invalidation from the caches in
2758 * case we abort. Note it is OK to do this after releasing the buffer,
2759 * because the heaptuples data structure is all in local memory, not in
2760 * the shared buffer.
2761 */
2762 if (IsCatalogRelation(relation))
2763 {
2764 for (i = 0; i < ntuples; i++)
2766 }
2767
2768 /* copy t_self fields back to the caller's slots */
2769 for (i = 0; i < ntuples; i++)
2770 slots[i]->tts_tid = heaptuples[i]->t_self;
2771
2772 pgstat_count_heap_insert(relation, ntuples);
2773}

References Assert, AssertHasSnapshotForToast(), BUFFER_LOCK_EXCLUSIVE, BUFFER_LOCK_UNLOCK, BufferGetBlockNumber(), BufferGetPage(), BufferIsDirty(), CacheInvalidateHeapTuple(), CHECK_FOR_INTERRUPTS, CheckForSerializableConflictIn(), xl_multi_insert_tuple::datalen, END_CRIT_SECTION, ExecFetchSlotHeapTuple(), fb(), GetCurrentTransactionId(), HEAP_DEFAULT_FILLFACTOR, HEAP_INSERT_FROZEN, HEAP_INSERT_NO_LOGICAL, heap_multi_insert_pages(), heap_prepare_insert(), i, init, InvalidBlockNumber, InvalidBuffer, IsCatalogRelation(), ItemPointerGetOffsetNumber(), LockBuffer(), log_heap_new_cid(), MarkBufferDirty(), MAXALIGN, PageClearAllVisible(), PageGetHeapFreeSpace(), PageGetMaxOffsetNumber(), PageIsAllVisible(), PageSetAllVisible(), PageSetLSN(), palloc(), pgstat_count_heap_insert(), RelationData::rd_locator, REGBUF_KEEP_DATA, REGBUF_STANDARD, REGBUF_WILL_INIT, RelationGetBufferForTuple(), RelationGetRelid, RelationGetTargetPageFreeSpace, RelationIsAccessibleInLogicalDecoding, RelationIsLogicallyLogged, RelationNeedsWAL, RelationPutHeapTuple(), ReleaseBuffer(), SHORTALIGN, SizeOfHeapMultiInsert, SizeofHeapTupleHeader, SizeOfMultiInsertTuple, START_CRIT_SECTION, xl_multi_insert_tuple::t_hoff, xl_multi_insert_tuple::t_infomask, xl_multi_insert_tuple::t_infomask2, HeapTupleData::t_tableOid, TupleTableSlot::tts_tableOid, UnlockReleaseBuffer(), VISIBILITYMAP_ALL_FROZEN, VISIBILITYMAP_ALL_VISIBLE, visibilitymap_clear(), visibilitymap_set_vmbits(), VISIBILITYMAP_VALID_BITS, XLH_INSERT_ALL_FROZEN_SET, XLH_INSERT_ALL_VISIBLE_CLEARED, XLH_INSERT_CONTAINS_NEW_TUPLE, XLH_INSERT_LAST_IN_MULTI, XLOG_HEAP2_MULTI_INSERT, XLOG_HEAP_INIT_PAGE, XLOG_INCLUDE_ORIGIN, XLogBeginInsert(), XLogInsert(), XLogRegisterBufData(), XLogRegisterBuffer(), XLogRegisterData(), and XLogSetRecordFlags().

Referenced by CatalogTuplesMultiInsertWithInfo().

◆ heap_multi_insert_pages()

static int heap_multi_insert_pages ( HeapTuple heaptuples,
int  done,
int  ntuples,
Size  saveFreeSpace 
)
static

Definition at line 2381 of file heapam.c.

2382{
2384 int npages = 1;
2385
2386 for (int i = done; i < ntuples; i++)
2387 {
2388 size_t tup_sz = sizeof(ItemIdData) + MAXALIGN(heaptuples[i]->t_len);
2389
2390 if (page_avail < tup_sz)
2391 {
2392 npages++;
2394 }
2395 page_avail -= tup_sz;
2396 }
2397
2398 return npages;
2399}

References fb(), i, MAXALIGN, and SizeOfPageHeaderData.

Referenced by heap_multi_insert().

◆ heap_pre_freeze_checks()

void heap_pre_freeze_checks ( Buffer  buffer,
HeapTupleFreeze tuples,
int  ntuples 
)

Definition at line 7408 of file heapam.c.

7410{
7411 Page page = BufferGetPage(buffer);
7412
7413 for (int i = 0; i < ntuples; i++)
7414 {
7415 HeapTupleFreeze *frz = tuples + i;
7416 ItemId itemid = PageGetItemId(page, frz->offset);
7417 HeapTupleHeader htup;
7418
7419 htup = (HeapTupleHeader) PageGetItem(page, itemid);
7420
7421 /* Deliberately avoid relying on tuple hint bits here */
7422 if (frz->checkflags & HEAP_FREEZE_CHECK_XMIN_COMMITTED)
7423 {
7425
7427 if (unlikely(!TransactionIdDidCommit(xmin)))
7428 ereport(ERROR,
7430 errmsg_internal("uncommitted xmin %u needs to be frozen",
7431 xmin)));
7432 }
7433
7434 /*
7435 * TransactionIdDidAbort won't work reliably in the presence of XIDs
7436 * left behind by transactions that were in progress during a crash,
7437 * so we can only check that xmax didn't commit
7438 */
7439 if (frz->checkflags & HEAP_FREEZE_CHECK_XMAX_ABORTED)
7440 {
7442
7445 ereport(ERROR,
7447 errmsg_internal("cannot freeze committed xmax %u",
7448 xmax)));
7449 }
7450 }
7451}

References Assert, BufferGetPage(), ereport, errcode(), ERRCODE_DATA_CORRUPTED, errmsg_internal(), ERROR, fb(), HEAP_FREEZE_CHECK_XMAX_ABORTED, HEAP_FREEZE_CHECK_XMIN_COMMITTED, HeapTupleHeaderGetRawXmax(), HeapTupleHeaderGetRawXmin(), HeapTupleHeaderXminFrozen(), i, PageGetItem(), PageGetItemId(), TransactionIdDidCommit(), TransactionIdIsNormal, and unlikely.

Referenced by heap_page_will_freeze().

◆ heap_prepare_freeze_tuple()

bool heap_prepare_freeze_tuple ( HeapTupleHeader  tuple,
const struct VacuumCutoffs cutoffs,
HeapPageFreeze pagefrz,
HeapTupleFreeze frz,
bool totally_frozen 
)

Definition at line 7135 of file heapam.c.

7139{
7140 bool xmin_already_frozen = false,
7141 xmax_already_frozen = false;
7142 bool freeze_xmin = false,
7143 replace_xvac = false,
7144 replace_xmax = false,
7145 freeze_xmax = false;
7146 TransactionId xid;
7147
7148 frz->xmax = HeapTupleHeaderGetRawXmax(tuple);
7149 frz->t_infomask2 = tuple->t_infomask2;
7150 frz->t_infomask = tuple->t_infomask;
7151 frz->frzflags = 0;
7152 frz->checkflags = 0;
7153
7154 /*
7155 * Process xmin, while keeping track of whether it's already frozen, or
7156 * will become frozen iff our freeze plan is executed by caller (could be
7157 * neither).
7158 */
7159 xid = HeapTupleHeaderGetXmin(tuple);
7160 if (!TransactionIdIsNormal(xid))
7161 xmin_already_frozen = true;
7162 else
7163 {
7164 if (TransactionIdPrecedes(xid, cutoffs->relfrozenxid))
7165 ereport(ERROR,
7167 errmsg_internal("found xmin %u from before relfrozenxid %u",
7168 xid, cutoffs->relfrozenxid)));
7169
7170 /* Will set freeze_xmin flags in freeze plan below */
7172
7173 /* Verify that xmin committed if and when freeze plan is executed */
7174 if (freeze_xmin)
7176 }
7177
7178 /*
7179 * Old-style VACUUM FULL is gone, but we have to process xvac for as long
7180 * as we support having MOVED_OFF/MOVED_IN tuples in the database
7181 */
7182 xid = HeapTupleHeaderGetXvac(tuple);
7183 if (TransactionIdIsNormal(xid))
7184 {
7186 Assert(TransactionIdPrecedes(xid, cutoffs->OldestXmin));
7187
7188 /*
7189 * For Xvac, we always freeze proactively. This allows totally_frozen
7190 * tracking to ignore xvac.
7191 */
7192 replace_xvac = pagefrz->freeze_required = true;
7193
7194 /* Will set replace_xvac flags in freeze plan below */
7195 }
7196
7197 /* Now process xmax */
7198 xid = frz->xmax;
7199 if (tuple->t_infomask & HEAP_XMAX_IS_MULTI)
7200 {
7201 /* Raw xmax is a MultiXactId */
7203 uint16 flags;
7204
7205 /*
7206 * We will either remove xmax completely (in the "freeze_xmax" path),
7207 * process xmax by replacing it (in the "replace_xmax" path), or
7208 * perform no-op xmax processing. The only constraint is that the
7209 * FreezeLimit/MultiXactCutoff postcondition must never be violated.
7210 */
7211 newxmax = FreezeMultiXactId(xid, tuple->t_infomask, cutoffs,
7212 &flags, pagefrz);
7213
7214 if (flags & FRM_NOOP)
7215 {
7216 /*
7217 * xmax is a MultiXactId, and nothing about it changes for now.
7218 * This is the only case where 'freeze_required' won't have been
7219 * set for us by FreezeMultiXactId, as well as the only case where
7220 * neither freeze_xmax nor replace_xmax are set (given a multi).
7221 *
7222 * This is a no-op, but the call to FreezeMultiXactId might have
7223 * ratcheted back NewRelfrozenXid and/or NewRelminMxid trackers
7224 * for us (the "freeze page" variants, specifically). That'll
7225 * make it safe for our caller to freeze the page later on, while
7226 * leaving this particular xmax undisturbed.
7227 *
7228 * FreezeMultiXactId is _not_ responsible for the "no freeze"
7229 * NewRelfrozenXid/NewRelminMxid trackers, though -- that's our
7230 * job. A call to heap_tuple_should_freeze for this same tuple
7231 * will take place below if 'freeze_required' isn't set already.
7232 * (This repeats work from FreezeMultiXactId, but allows "no
7233 * freeze" tracker maintenance to happen in only one place.)
7234 */
7237 }
7238 else if (flags & FRM_RETURN_IS_XID)
7239 {
7240 /*
7241 * xmax will become an updater Xid (original MultiXact's updater
7242 * member Xid will be carried forward as a simple Xid in Xmax).
7243 */
7245
7246 /*
7247 * NB -- some of these transformations are only valid because we
7248 * know the return Xid is a tuple updater (i.e. not merely a
7249 * locker.) Also note that the only reason we don't explicitly
7250 * worry about HEAP_KEYS_UPDATED is because it lives in
7251 * t_infomask2 rather than t_infomask.
7252 */
7253 frz->t_infomask &= ~HEAP_XMAX_BITS;
7254 frz->xmax = newxmax;
7255 if (flags & FRM_MARK_COMMITTED)
7256 frz->t_infomask |= HEAP_XMAX_COMMITTED;
7257 replace_xmax = true;
7258 }
7259 else if (flags & FRM_RETURN_IS_MULTI)
7260 {
7263
7264 /*
7265 * xmax is an old MultiXactId that we have to replace with a new
7266 * MultiXactId, to carry forward two or more original member XIDs.
7267 */
7269
7270 /*
7271 * We can't use GetMultiXactIdHintBits directly on the new multi
7272 * here; that routine initializes the masks to all zeroes, which
7273 * would lose other bits we need. Doing it this way ensures all
7274 * unrelated bits remain untouched.
7275 */
7276 frz->t_infomask &= ~HEAP_XMAX_BITS;
7277 frz->t_infomask2 &= ~HEAP_KEYS_UPDATED;
7279 frz->t_infomask |= newbits;
7280 frz->t_infomask2 |= newbits2;
7281 frz->xmax = newxmax;
7282 replace_xmax = true;
7283 }
7284 else
7285 {
7286 /*
7287 * Freeze plan for tuple "freezes xmax" in the strictest sense:
7288 * it'll leave nothing in xmax (neither an Xid nor a MultiXactId).
7289 */
7290 Assert(flags & FRM_INVALIDATE_XMAX);
7292
7293 /* Will set freeze_xmax flags in freeze plan below */
7294 freeze_xmax = true;
7295 }
7296
7297 /* MultiXactId processing forces freezing (barring FRM_NOOP case) */
7298 Assert(pagefrz->freeze_required || (!freeze_xmax && !replace_xmax));
7299 }
7300 else if (TransactionIdIsNormal(xid))
7301 {
7302 /* Raw xmax is normal XID */
7303 if (TransactionIdPrecedes(xid, cutoffs->relfrozenxid))
7304 ereport(ERROR,
7306 errmsg_internal("found xmax %u from before relfrozenxid %u",
7307 xid, cutoffs->relfrozenxid)));
7308
7309 /* Will set freeze_xmax flags in freeze plan below */
7311
7312 /*
7313 * Verify that xmax aborted if and when freeze plan is executed,
7314 * provided it's from an update. (A lock-only xmax can be removed
7315 * independent of this, since the lock is released at xact end.)
7316 */
7318 frz->checkflags |= HEAP_FREEZE_CHECK_XMAX_ABORTED;
7319 }
7320 else if (!TransactionIdIsValid(xid))
7321 {
7322 /* Raw xmax is InvalidTransactionId XID */
7323 Assert((tuple->t_infomask & HEAP_XMAX_IS_MULTI) == 0);
7324 xmax_already_frozen = true;
7325 }
7326 else
7327 ereport(ERROR,
7329 errmsg_internal("found raw xmax %u (infomask 0x%04x) not invalid and not multi",
7330 xid, tuple->t_infomask)));
7331
7332 if (freeze_xmin)
7333 {
7335
7336 frz->t_infomask |= HEAP_XMIN_FROZEN;
7337 }
7338 if (replace_xvac)
7339 {
7340 /*
7341 * If a MOVED_OFF tuple is not dead, the xvac transaction must have
7342 * failed; whereas a non-dead MOVED_IN tuple must mean the xvac
7343 * transaction succeeded.
7344 */
7345 Assert(pagefrz->freeze_required);
7346 if (tuple->t_infomask & HEAP_MOVED_OFF)
7347 frz->frzflags |= XLH_INVALID_XVAC;
7348 else
7349 frz->frzflags |= XLH_FREEZE_XVAC;
7350 }
7351 if (replace_xmax)
7352 {
7354 Assert(pagefrz->freeze_required);
7355
7356 /* Already set replace_xmax flags in freeze plan earlier */
7357 }
7358 if (freeze_xmax)
7359 {
7361
7362 frz->xmax = InvalidTransactionId;
7363
7364 /*
7365 * The tuple might be marked either XMAX_INVALID or XMAX_COMMITTED +
7366 * LOCKED. Normalize to INVALID just to be sure no one gets confused.
7367 * Also get rid of the HEAP_KEYS_UPDATED bit.
7368 */
7369 frz->t_infomask &= ~HEAP_XMAX_BITS;
7370 frz->t_infomask |= HEAP_XMAX_INVALID;
7371 frz->t_infomask2 &= ~HEAP_HOT_UPDATED;
7372 frz->t_infomask2 &= ~HEAP_KEYS_UPDATED;
7373 }
7374
7375 /*
7376 * Determine if this tuple is already totally frozen, or will become
7377 * totally frozen (provided caller executes freeze plans for the page)
7378 */
7381
7382 if (!pagefrz->freeze_required && !(xmin_already_frozen &&
7384 {
7385 /*
7386 * So far no previous tuple from the page made freezing mandatory.
7387 * Does this tuple force caller to freeze the entire page?
7388 */
7389 pagefrz->freeze_required =
7390 heap_tuple_should_freeze(tuple, cutoffs,
7391 &pagefrz->NoFreezePageRelfrozenXid,
7392 &pagefrz->NoFreezePageRelminMxid);
7393 }
7394
7395 /* Tell caller if this tuple has a usable freeze plan set in *frz */
7397}

References Assert, ereport, errcode(), ERRCODE_DATA_CORRUPTED, errmsg_internal(), ERROR, fb(), HeapPageFreeze::freeze_required, FreezeMultiXactId(), FRM_INVALIDATE_XMAX, FRM_MARK_COMMITTED, FRM_NOOP, FRM_RETURN_IS_MULTI, FRM_RETURN_IS_XID, GetMultiXactIdHintBits(), HEAP_FREEZE_CHECK_XMAX_ABORTED, HEAP_FREEZE_CHECK_XMIN_COMMITTED, HEAP_MOVED_OFF, heap_tuple_should_freeze(), HEAP_XMAX_COMMITTED, HEAP_XMAX_INVALID, HEAP_XMAX_IS_LOCKED_ONLY(), HEAP_XMAX_IS_MULTI, HEAP_XMIN_FROZEN, HeapTupleHeaderGetRawXmax(), HeapTupleHeaderGetXmin(), HeapTupleHeaderGetXvac(), InvalidTransactionId, VacuumCutoffs::MultiXactCutoff, MultiXactIdIsValid, MultiXactIdPrecedes(), HeapPageFreeze::NoFreezePageRelfrozenXid, HeapPageFreeze::NoFreezePageRelminMxid, VacuumCutoffs::OldestMxact, VacuumCutoffs::OldestXmin, VacuumCutoffs::relfrozenxid, HeapTupleHeaderData::t_infomask, HeapTupleHeaderData::t_infomask2, TransactionIdIsNormal, TransactionIdIsValid, TransactionIdPrecedes(), TransactionIdPrecedesOrEquals(), XLH_FREEZE_XVAC, and XLH_INVALID_XVAC.

Referenced by heap_freeze_tuple(), and heap_prune_record_unchanged_lp_normal().

◆ heap_prepare_insert()

static HeapTuple heap_prepare_insert ( Relation  relation,
HeapTuple  tup,
TransactionId  xid,
CommandId  cid,
int  options 
)
static

Definition at line 2333 of file heapam.c.

2335{
2336 /*
2337 * To allow parallel inserts, we need to ensure that they are safe to be
2338 * performed in workers. We have the infrastructure to allow parallel
2339 * inserts in general except for the cases where inserts generate a new
2340 * CommandId (eg. inserts into a table having a foreign key column).
2341 */
2342 if (IsParallelWorker())
2343 ereport(ERROR,
2345 errmsg("cannot insert tuples in a parallel worker")));
2346
2347 tup->t_data->t_infomask &= ~(HEAP_XACT_MASK);
2348 tup->t_data->t_infomask2 &= ~(HEAP2_XACT_MASK);
2349 tup->t_data->t_infomask |= HEAP_XMAX_INVALID;
2350 HeapTupleHeaderSetXmin(tup->t_data, xid);
2353
2354 HeapTupleHeaderSetCmin(tup->t_data, cid);
2355 HeapTupleHeaderSetXmax(tup->t_data, 0); /* for cleanliness */
2356 tup->t_tableOid = RelationGetRelid(relation);
2357
2358 /*
2359 * If the new tuple is too big for storage or contains already toasted
2360 * out-of-line attributes from some other relation, invoke the toaster.
2361 */
2362 if (relation->rd_rel->relkind != RELKIND_RELATION &&
2363 relation->rd_rel->relkind != RELKIND_MATVIEW)
2364 {
2365 /* toast table entries should never be recursively toasted */
2367 return tup;
2368 }
2369 else if (HeapTupleHasExternal(tup) || tup->t_len > TOAST_TUPLE_THRESHOLD)
2370 return heap_toast_insert_or_update(relation, tup, NULL, options);
2371 else
2372 return tup;
2373}

References Assert, ereport, errcode(), errmsg(), ERROR, fb(), HEAP2_XACT_MASK, HEAP_INSERT_FROZEN, heap_toast_insert_or_update(), HEAP_XACT_MASK, HEAP_XMAX_INVALID, HeapTupleHasExternal(), HeapTupleHeaderSetCmin(), HeapTupleHeaderSetXmax(), HeapTupleHeaderSetXmin(), HeapTupleHeaderSetXminFrozen(), IsParallelWorker, RelationData::rd_rel, RelationGetRelid, and TOAST_TUPLE_THRESHOLD.

Referenced by heap_insert(), and heap_multi_insert().

◆ heap_prepare_pagescan()

void heap_prepare_pagescan ( TableScanDesc  sscan)

Definition at line 616 of file heapam.c.

617{
619 Buffer buffer = scan->rs_cbuf;
620 BlockNumber block = scan->rs_cblock;
621 Snapshot snapshot;
622 Page page;
623 int lines;
624 bool all_visible;
626
627 Assert(BufferGetBlockNumber(buffer) == block);
628
629 /* ensure we're not accidentally being used when not in pagemode */
631 snapshot = scan->rs_base.rs_snapshot;
632
633 /*
634 * Prune and repair fragmentation for the whole page, if possible.
635 */
636 heap_page_prune_opt(scan->rs_base.rs_rd, buffer);
637
638 /*
639 * We must hold share lock on the buffer content while examining tuple
640 * visibility. Afterwards, however, the tuples we have found to be
641 * visible are guaranteed good as long as we hold the buffer pin.
642 */
644
645 page = BufferGetPage(buffer);
646 lines = PageGetMaxOffsetNumber(page);
647
648 /*
649 * If the all-visible flag indicates that all tuples on the page are
650 * visible to everyone, we can skip the per-tuple visibility tests.
651 *
652 * Note: In hot standby, a tuple that's already visible to all
653 * transactions on the primary might still be invisible to a read-only
654 * transaction in the standby. We partly handle this problem by tracking
655 * the minimum xmin of visible tuples as the cut-off XID while marking a
656 * page all-visible on the primary and WAL log that along with the
657 * visibility map SET operation. In hot standby, we wait for (or abort)
 658 * all transactions that potentially may not see one or more tuples on
659 * the page. That's how index-only scans work fine in hot standby. A
660 * crucial difference between index-only scans and heap scans is that the
 661 * index-only scan completely relies on the visibility map whereas heap
662 * scan looks at the page-level PD_ALL_VISIBLE flag. We are not sure if
663 * the page-level flag can be trusted in the same way, because it might
664 * get propagated somehow without being explicitly WAL-logged, e.g. via a
665 * full page write. Until we can prove that beyond doubt, let's check each
666 * tuple for visibility the hard way.
667 */
668 all_visible = PageIsAllVisible(page) && !snapshot->takenDuringRecovery;
671
672 /*
673 * We call page_collect_tuples() with constant arguments, to get the
674 * compiler to constant fold the constant arguments. Separate calls with
675 * constant arguments, rather than variables, are needed on several
676 * compilers to actually perform constant folding.
677 */
678 if (likely(all_visible))
679 {
681 scan->rs_ntuples = page_collect_tuples(scan, snapshot, page, buffer,
682 block, lines, true, false);
683 else
684 scan->rs_ntuples = page_collect_tuples(scan, snapshot, page, buffer,
685 block, lines, true, true);
686 }
687 else
688 {
690 scan->rs_ntuples = page_collect_tuples(scan, snapshot, page, buffer,
691 block, lines, false, false);
692 else
693 scan->rs_ntuples = page_collect_tuples(scan, snapshot, page, buffer,
694 block, lines, false, true);
695 }
696
698}

References Assert, BUFFER_LOCK_SHARE, BUFFER_LOCK_UNLOCK, BufferGetBlockNumber(), BufferGetPage(), CheckForSerializableConflictOutNeeded(), fb(), heap_page_prune_opt(), likely, LockBuffer(), page_collect_tuples(), PageGetMaxOffsetNumber(), PageIsAllVisible(), HeapScanDescData::rs_base, HeapScanDescData::rs_cblock, HeapScanDescData::rs_cbuf, TableScanDescData::rs_flags, HeapScanDescData::rs_ntuples, TableScanDescData::rs_rd, TableScanDescData::rs_snapshot, SO_ALLOW_PAGEMODE, and SnapshotData::takenDuringRecovery.

Referenced by heapam_scan_sample_next_block(), and heapgettup_pagemode().

◆ heap_rescan()

void heap_rescan ( TableScanDesc  sscan,
ScanKey  key,
bool  set_params,
bool  allow_strat,
bool  allow_sync,
bool  allow_pagemode 
)

Definition at line 1318 of file heapam.c.

1320{
1322
1323 if (set_params)
1324 {
1325 if (allow_strat)
1327 else
1329
1330 if (allow_sync)
1332 else
1334
1335 if (allow_pagemode && scan->rs_base.rs_snapshot &&
1338 else
1340 }
1341
1342 /*
1343 * unpin scan buffers
1344 */
1345 if (BufferIsValid(scan->rs_cbuf))
1346 {
1347 ReleaseBuffer(scan->rs_cbuf);
1348 scan->rs_cbuf = InvalidBuffer;
1349 }
1350
1351 /*
1352 * SO_TYPE_BITMAPSCAN would be cleaned up here, but it does not hold any
1353 * additional data vs a normal HeapScan
1354 */
1355
1356 /*
1357 * The read stream is reset on rescan. This must be done before
1358 * initscan(), as some state referred to by read_stream_reset() is reset
1359 * in initscan().
1360 */
1361 if (scan->rs_read_stream)
1363
1364 /*
1365 * reinitialize scan descriptor
1366 */
1367 initscan(scan, key, true);
1368}

References BufferIsValid(), fb(), initscan(), InvalidBuffer, IsMVCCSnapshot, read_stream_reset(), ReleaseBuffer(), HeapScanDescData::rs_base, HeapScanDescData::rs_cbuf, TableScanDescData::rs_flags, HeapScanDescData::rs_read_stream, TableScanDescData::rs_snapshot, SO_ALLOW_PAGEMODE, SO_ALLOW_STRAT, and SO_ALLOW_SYNC.

◆ heap_scan_stream_read_next_parallel()

static BlockNumber heap_scan_stream_read_next_parallel ( ReadStream stream,
void callback_private_data,
void per_buffer_data 
)
static

◆ heap_scan_stream_read_next_serial()

static BlockNumber heap_scan_stream_read_next_serial ( ReadStream stream,
void callback_private_data,
void per_buffer_data 
)
static

Definition at line 292 of file heapam.c.

295{
296 HeapScanDesc scan = (HeapScanDesc) callback_private_data;
297
298 if (unlikely(!scan->rs_inited))
299 {
301 scan->rs_inited = true;
302 }
303 else
305 scan->rs_prefetch_block,
306 scan->rs_dir);
307
308 return scan->rs_prefetch_block;
309}

References heapgettup_advance_block(), heapgettup_initial_block(), HeapScanDescData::rs_dir, HeapScanDescData::rs_inited, HeapScanDescData::rs_prefetch_block, and unlikely.

Referenced by heap_beginscan().

◆ heap_set_tidrange()

void heap_set_tidrange ( TableScanDesc  sscan,
ItemPointer  mintid,
ItemPointer  maxtid 
)

Definition at line 1479 of file heapam.c.

1481{
1487
1488 /*
1489 * For relations without any pages, we can simply leave the TID range
1490 * unset. There will be no tuples to scan, therefore no tuples outside
1491 * the given TID range.
1492 */
1493 if (scan->rs_nblocks == 0)
1494 return;
1495
1496 /*
1497 * Set up some ItemPointers which point to the first and last possible
1498 * tuples in the heap.
1499 */
1502
1503 /*
1504 * If the given maximum TID is below the highest possible TID in the
1505 * relation, then restrict the range to that, otherwise we scan to the end
1506 * of the relation.
1507 */
1510
1511 /*
1512 * If the given minimum TID is above the lowest possible TID in the
1513 * relation, then restrict the range to only scan for TIDs above that.
1514 */
1517
1518 /*
1519 * Check for an empty range and protect from would-be negative results
1520 * from the numBlks calculation below.
1521 */
1523 {
1524 /* Set an empty range of blocks to scan */
1526 return;
1527 }
1528
1529 /*
1530 * Calculate the first block and the number of blocks we must scan. We
1531 * could be more aggressive here and perform some more validation to try
1532 * and further narrow the scope of blocks to scan by checking if the
1533 * lowestItem has an offset above MaxOffsetNumber. In this case, we could
1534 * advance startBlk by one. Likewise, if highestItem has an offset of 0
1535 * we could scan one fewer blocks. However, such an optimization does not
1536 * seem worth troubling over, currently.
1537 */
1539
1542
1543 /* Set the start block and number of blocks to scan */
1545
1546 /* Finally, set the TID range in sscan */
1547 ItemPointerCopy(&lowestItem, &sscan->st.tidrange.rs_mintid);
1548 ItemPointerCopy(&highestItem, &sscan->st.tidrange.rs_maxtid);
1549}

References fb(), FirstOffsetNumber, heap_setscanlimits(), ItemPointerCompare(), ItemPointerCopy(), ItemPointerGetBlockNumberNoCheck(), ItemPointerSet(), MaxOffsetNumber, and HeapScanDescData::rs_nblocks.

◆ heap_setscanlimits()

void heap_setscanlimits ( TableScanDesc  sscan,
BlockNumber  startBlk,
BlockNumber  numBlks 
)

Definition at line 500 of file heapam.c.

501{
503
504 Assert(!scan->rs_inited); /* else too late to change */
505 /* else rs_startblock is significant */
507
508 /* Check startBlk is valid (but allow case of zero blocks...) */
509 Assert(startBlk == 0 || startBlk < scan->rs_nblocks);
510
511 scan->rs_startblock = startBlk;
512 scan->rs_numblocks = numBlks;
513}

References Assert, fb(), HeapScanDescData::rs_base, TableScanDescData::rs_flags, HeapScanDescData::rs_inited, HeapScanDescData::rs_numblocks, HeapScanDescData::rs_startblock, and SO_ALLOW_SYNC.

Referenced by heap_set_tidrange(), and heapam_index_build_range_scan().

◆ heap_tuple_needs_eventual_freeze()

bool heap_tuple_needs_eventual_freeze ( HeapTupleHeader  tuple)

Definition at line 7891 of file heapam.c.

7892{
7893 TransactionId xid;
7894
7895 /*
7896 * If xmin is a normal transaction ID, this tuple is definitely not
7897 * frozen.
7898 */
7899 xid = HeapTupleHeaderGetXmin(tuple);
7900 if (TransactionIdIsNormal(xid))
7901 return true;
7902
7903 /*
7904 * If xmax is a valid xact or multixact, this tuple is also not frozen.
7905 */
7906 if (tuple->t_infomask & HEAP_XMAX_IS_MULTI)
7907 {
7908 MultiXactId multi;
7909
7910 multi = HeapTupleHeaderGetRawXmax(tuple);
7911 if (MultiXactIdIsValid(multi))
7912 return true;
7913 }
7914 else
7915 {
7916 xid = HeapTupleHeaderGetRawXmax(tuple);
7917 if (TransactionIdIsNormal(xid))
7918 return true;
7919 }
7920
7921 if (tuple->t_infomask & HEAP_MOVED)
7922 {
7923 xid = HeapTupleHeaderGetXvac(tuple);
7924 if (TransactionIdIsNormal(xid))
7925 return true;
7926 }
7927
7928 return false;
7929}

References HEAP_MOVED, HEAP_XMAX_IS_MULTI, HeapTupleHeaderGetRawXmax(), HeapTupleHeaderGetXmin(), HeapTupleHeaderGetXvac(), MultiXactIdIsValid, HeapTupleHeaderData::t_infomask, and TransactionIdIsNormal.

Referenced by collect_corrupt_items(), and heap_page_would_be_all_visible().

◆ heap_tuple_should_freeze()

bool heap_tuple_should_freeze ( HeapTupleHeader  tuple,
const struct VacuumCutoffs cutoffs,
TransactionId NoFreezePageRelfrozenXid,
MultiXactId NoFreezePageRelminMxid 
)

Definition at line 7946 of file heapam.c.

7950{
7951 TransactionId xid;
7952 MultiXactId multi;
7953 bool freeze = false;
7954
7955 /* First deal with xmin */
7956 xid = HeapTupleHeaderGetXmin(tuple);
7957 if (TransactionIdIsNormal(xid))
7958 {
7960 if (TransactionIdPrecedes(xid, *NoFreezePageRelfrozenXid))
7961 *NoFreezePageRelfrozenXid = xid;
7962 if (TransactionIdPrecedes(xid, cutoffs->FreezeLimit))
7963 freeze = true;
7964 }
7965
7966 /* Now deal with xmax */
7968 multi = InvalidMultiXactId;
7969 if (tuple->t_infomask & HEAP_XMAX_IS_MULTI)
7970 multi = HeapTupleHeaderGetRawXmax(tuple);
7971 else
7972 xid = HeapTupleHeaderGetRawXmax(tuple);
7973
7974 if (TransactionIdIsNormal(xid))
7975 {
7977 /* xmax is a non-permanent XID */
7978 if (TransactionIdPrecedes(xid, *NoFreezePageRelfrozenXid))
7979 *NoFreezePageRelfrozenXid = xid;
7980 if (TransactionIdPrecedes(xid, cutoffs->FreezeLimit))
7981 freeze = true;
7982 }
7983 else if (!MultiXactIdIsValid(multi))
7984 {
7985 /* xmax is a permanent XID or invalid MultiXactId/XID */
7986 }
7987 else if (HEAP_LOCKED_UPGRADED(tuple->t_infomask))
7988 {
7989 /* xmax is a pg_upgrade'd MultiXact, which can't have updater XID */
7990 if (MultiXactIdPrecedes(multi, *NoFreezePageRelminMxid))
7991 *NoFreezePageRelminMxid = multi;
7992 /* heap_prepare_freeze_tuple always freezes pg_upgrade'd xmax */
7993 freeze = true;
7994 }
7995 else
7996 {
7997 /* xmax is a MultiXactId that may have an updater XID */
7998 MultiXactMember *members;
7999 int nmembers;
8000
8002 if (MultiXactIdPrecedes(multi, *NoFreezePageRelminMxid))
8003 *NoFreezePageRelminMxid = multi;
8004 if (MultiXactIdPrecedes(multi, cutoffs->MultiXactCutoff))
8005 freeze = true;
8006
8007 /* need to check whether any member of the mxact is old */
8008 nmembers = GetMultiXactIdMembers(multi, &members, false,
8010
8011 for (int i = 0; i < nmembers; i++)
8012 {
8013 xid = members[i].xid;
8015 if (TransactionIdPrecedes(xid, *NoFreezePageRelfrozenXid))
8016 *NoFreezePageRelfrozenXid = xid;
8017 if (TransactionIdPrecedes(xid, cutoffs->FreezeLimit))
8018 freeze = true;
8019 }
8020 if (nmembers > 0)
8021 pfree(members);
8022 }
8023
8024 if (tuple->t_infomask & HEAP_MOVED)
8025 {
8026 xid = HeapTupleHeaderGetXvac(tuple);
8027 if (TransactionIdIsNormal(xid))
8028 {
8030 if (TransactionIdPrecedes(xid, *NoFreezePageRelfrozenXid))
8031 *NoFreezePageRelfrozenXid = xid;
8032 /* heap_prepare_freeze_tuple forces xvac freezing */
8033 freeze = true;
8034 }
8035 }
8036
8037 return freeze;
8038}

References Assert, VacuumCutoffs::FreezeLimit, GetMultiXactIdMembers(), HEAP_LOCKED_UPGRADED(), HEAP_MOVED, HEAP_XMAX_IS_LOCKED_ONLY(), HEAP_XMAX_IS_MULTI, HeapTupleHeaderGetRawXmax(), HeapTupleHeaderGetXmin(), HeapTupleHeaderGetXvac(), i, InvalidMultiXactId, InvalidTransactionId, VacuumCutoffs::MultiXactCutoff, MultiXactIdIsValid, MultiXactIdPrecedes(), MultiXactIdPrecedesOrEquals(), pfree(), VacuumCutoffs::relfrozenxid, VacuumCutoffs::relminmxid, HeapTupleHeaderData::t_infomask, TransactionIdIsNormal, TransactionIdPrecedes(), TransactionIdPrecedesOrEquals(), and MultiXactMember::xid.

Referenced by heap_prepare_freeze_tuple(), and lazy_scan_noprune().

◆ heap_update()

TM_Result heap_update ( Relation  relation,
const ItemPointerData otid,
HeapTuple  newtup,
CommandId  cid,
Snapshot  crosscheck,
bool  wait,
TM_FailureData tmfd,
LockTupleMode lockmode,
TU_UpdateIndexes update_indexes 
)

Definition at line 3312 of file heapam.c.

3316{
3317 TM_Result result;
3325 ItemId lp;
3329 bool old_key_copied = false;
3330 Page page;
3331 BlockNumber block;
3333 Buffer buffer,
3334 newbuf,
3335 vmbuffer = InvalidBuffer,
3337 bool need_toast;
3339 pagefree;
3340 bool have_tuple_lock = false;
3341 bool iscombo;
3342 bool use_hot_update = false;
3343 bool summarized_update = false;
3344 bool key_intact;
3345 bool all_visible_cleared = false;
3346 bool all_visible_cleared_new = false;
3347 bool checked_lockers;
3348 bool locker_remains;
3349 bool id_has_external = false;
3356
3358
3359 /* Cheap, simplistic check that the tuple matches the rel's rowtype. */
3362
3363 AssertHasSnapshotForToast(relation);
3364
3365 /*
3366 * Forbid this during a parallel operation, lest it allocate a combo CID.
3367 * Other workers might need that combo CID for visibility checks, and we
3368 * have no provision for broadcasting it to them.
3369 */
3370 if (IsInParallelMode())
3371 ereport(ERROR,
3373 errmsg("cannot update tuples during a parallel operation")));
3374
3375#ifdef USE_ASSERT_CHECKING
3377#endif
3378
3379 /*
3380 * Fetch the list of attributes to be checked for various operations.
3381 *
3382 * For HOT considerations, this is wasted effort if we fail to update or
3383 * have to put the new tuple on a different page. But we must compute the
3384 * list before obtaining buffer lock --- in the worst case, if we are
3385 * doing an update on one of the relevant system catalogs, we could
3386 * deadlock if we try to fetch the list later. In any case, the relcache
3387 * caches the data so this is usually pretty cheap.
3388 *
3389 * We also need columns used by the replica identity and columns that are
3390 * considered the "key" of rows in the table.
3391 *
3392 * Note that we get copies of each bitmap, so we need not worry about
3393 * relcache flush happening midway through.
3394 */
3407
3409 INJECTION_POINT("heap_update-before-pin", NULL);
3410 buffer = ReadBuffer(relation, block);
3411 page = BufferGetPage(buffer);
3412
3413 /*
3414 * Before locking the buffer, pin the visibility map page if it appears to
3415 * be necessary. Since we haven't got the lock yet, someone else might be
3416 * in the middle of changing this, so we'll need to recheck after we have
3417 * the lock.
3418 */
3419 if (PageIsAllVisible(page))
3420 visibilitymap_pin(relation, block, &vmbuffer);
3421
3423
3425
3426 /*
3427 * Usually, a buffer pin and/or snapshot blocks pruning of otid, ensuring
3428 * we see LP_NORMAL here. When the otid origin is a syscache, we may have
3429 * neither a pin nor a snapshot. Hence, we may see other LP_ states, each
3430 * of which indicates concurrent pruning.
3431 *
3432 * Failing with TM_Updated would be most accurate. However, unlike other
3433 * TM_Updated scenarios, we don't know the successor ctid in LP_UNUSED and
3434 * LP_DEAD cases. While the distinction between TM_Updated and TM_Deleted
3435 * does matter to SQL statements UPDATE and MERGE, those SQL statements
3436 * hold a snapshot that ensures LP_NORMAL. Hence, the choice between
3437 * TM_Updated and TM_Deleted affects only the wording of error messages.
3438 * Settle on TM_Deleted, for two reasons. First, it avoids complicating
3439 * the specification of when tmfd->ctid is valid. Second, it creates
3440 * error log evidence that we took this branch.
3441 *
3442 * Since it's possible to see LP_UNUSED at otid, it's also possible to see
3443 * LP_NORMAL for a tuple that replaced LP_UNUSED. If it's a tuple for an
3444 * unrelated row, we'll fail with "duplicate key value violates unique".
3445 * XXX if otid is the live, newer version of the newtup row, we'll discard
3446 * changes originating in versions of this catalog row after the version
3447 * the caller got from syscache. See syscache-update-pruned.spec.
3448 */
3449 if (!ItemIdIsNormal(lp))
3450 {
3452
3453 UnlockReleaseBuffer(buffer);
3455 if (vmbuffer != InvalidBuffer)
3456 ReleaseBuffer(vmbuffer);
3457 tmfd->ctid = *otid;
3458 tmfd->xmax = InvalidTransactionId;
3459 tmfd->cmax = InvalidCommandId;
3461
3466 /* modified_attrs not yet initialized */
3468 return TM_Deleted;
3469 }
3470
3471 /*
3472 * Fill in enough data in oldtup for HeapDetermineColumnsInfo to work
3473 * properly.
3474 */
3475 oldtup.t_tableOid = RelationGetRelid(relation);
3476 oldtup.t_data = (HeapTupleHeader) PageGetItem(page, lp);
3477 oldtup.t_len = ItemIdGetLength(lp);
3478 oldtup.t_self = *otid;
3479
3480 /* the new tuple is ready, except for this: */
3481 newtup->t_tableOid = RelationGetRelid(relation);
3482
3483 /*
3484 * Determine columns modified by the update. Additionally, identify
3485 * whether any of the unmodified replica identity key attributes in the
3486 * old tuple is externally stored or not. This is required because for
3487 * such attributes the flattened value won't be WAL logged as part of the
3488 * new tuple so we must include it as part of the old_key_tuple. See
3489 * ExtractReplicaIdentity.
3490 */
3492 id_attrs, &oldtup,
3494
3495 /*
3496 * If we're not updating any "key" column, we can grab a weaker lock type.
3497 * This allows for more concurrency when we are running simultaneously
3498 * with foreign key checks.
3499 *
3500 * Note that if a column gets detoasted while executing the update, but
3501 * the value ends up being the same, this test will fail and we will use
3502 * the stronger lock. This is acceptable; the important case to optimize
3503 * is updates that don't manipulate key columns, not those that
3504 * serendipitously arrive at the same key values.
3505 */
3507 {
3508 *lockmode = LockTupleNoKeyExclusive;
3510 key_intact = true;
3511
3512 /*
3513 * If this is the first possibly-multixact-able operation in the
3514 * current transaction, set my per-backend OldestMemberMXactId
3515 * setting. We can be certain that the transaction will never become a
3516 * member of any older MultiXactIds than that. (We have to do this
3517 * even if we end up just using our own TransactionId below, since
3518 * some other backend could incorporate our XID into a MultiXact
3519 * immediately afterwards.)
3520 */
3522 }
3523 else
3524 {
3525 *lockmode = LockTupleExclusive;
3527 key_intact = false;
3528 }
3529
3530 /*
3531 * Note: beyond this point, use oldtup not otid to refer to old tuple.
3532 * otid may very well point at newtup->t_self, which we will overwrite
3533 * with the new tuple's location, so there's great risk of confusion if we
3534 * use otid anymore.
3535 */
3536
3537l2:
3538 checked_lockers = false;
3539 locker_remains = false;
3540 result = HeapTupleSatisfiesUpdate(&oldtup, cid, buffer);
3541
3542 /* see below about the "no wait" case */
3543 Assert(result != TM_BeingModified || wait);
3544
3545 if (result == TM_Invisible)
3546 {
3547 UnlockReleaseBuffer(buffer);
3548 ereport(ERROR,
3550 errmsg("attempted to update invisible tuple")));
3551 }
3552 else if (result == TM_BeingModified && wait)
3553 {
3556 bool can_continue = false;
3557
3558 /*
3559 * XXX note that we don't consider the "no wait" case here. This
3560 * isn't a problem currently because no caller uses that case, but it
3561 * should be fixed if such a caller is introduced. It wasn't a
3562 * problem previously because this code would always wait, but now
3563 * that some tuple locks do not conflict with one of the lock modes we
3564 * use, it is possible that this case is interesting to handle
3565 * specially.
3566 *
3567 * This may cause failures with third-party code that calls
3568 * heap_update directly.
3569 */
3570
3571 /* must copy state data before unlocking buffer */
3573 infomask = oldtup.t_data->t_infomask;
3574
3575 /*
3576 * Now we have to do something about the existing locker. If it's a
3577 * multi, sleep on it; we might be awakened before it is completely
3578 * gone (or even not sleep at all in some cases); we need to preserve
3579 * it as locker, unless it is gone completely.
3580 *
3581 * If it's not a multi, we need to check for sleeping conditions
3582 * before actually going to sleep. If the update doesn't conflict
3583 * with the locks, we just continue without sleeping (but making sure
3584 * it is preserved).
3585 *
3586 * Before sleeping, we need to acquire tuple lock to establish our
3587 * priority for the tuple (see heap_lock_tuple). LockTuple will
3588 * release us when we are next-in-line for the tuple. Note we must
3589 * not acquire the tuple lock until we're sure we're going to sleep;
3590 * otherwise we're open for race conditions with other transactions
3591 * holding the tuple lock which sleep on us.
3592 *
3593 * If we are forced to "start over" below, we keep the tuple lock;
3594 * this arranges that we stay at the head of the line while rechecking
3595 * tuple state.
3596 */
3598 {
3600 int remain;
3601 bool current_is_member = false;
3602
3604 *lockmode, &current_is_member))
3605 {
3607
3608 /*
3609 * Acquire the lock, if necessary (but skip it when we're
3610 * requesting a lock and already have one; avoids deadlock).
3611 */
3612 if (!current_is_member)
3613 heap_acquire_tuplock(relation, &(oldtup.t_self), *lockmode,
3615
3616 /* wait for multixact */
3618 relation, &oldtup.t_self, XLTW_Update,
3619 &remain);
3620 checked_lockers = true;
3621 locker_remains = remain != 0;
3623
3624 /*
3625 * If xwait had just locked the tuple then some other xact
3626 * could update this tuple before we get to this point. Check
3627 * for xmax change, and start over if so.
3628 */
3629 if (xmax_infomask_changed(oldtup.t_data->t_infomask,
3630 infomask) ||
3632 xwait))
3633 goto l2;
3634 }
3635
3636 /*
3637 * Note that the multixact may not be done by now. It could have
3638 * surviving members; our own xact or other subxacts of this
3639 * backend, and also any other concurrent transaction that locked
3640 * the tuple with LockTupleKeyShare if we only got
3641 * LockTupleNoKeyExclusive. If this is the case, we have to be
3642 * careful to mark the updated tuple with the surviving members in
3643 * Xmax.
3644 *
3645 * Note that there could have been another update in the
3646 * MultiXact. In that case, we need to check whether it committed
3647 * or aborted. If it aborted we are safe to update it again;
3648 * otherwise there is an update conflict, and we have to return
3649 * TableTuple{Deleted, Updated} below.
3650 *
3651 * In the LockTupleExclusive case, we still need to preserve the
3652 * surviving members: those would include the tuple locks we had
3653 * before this one, which are important to keep in case this
3654 * subxact aborts.
3655 */
3656 if (!HEAP_XMAX_IS_LOCKED_ONLY(oldtup.t_data->t_infomask))
3658 else
3660
3661 /*
3662 * There was no UPDATE in the MultiXact; or it aborted. No
3663 * TransactionIdIsInProgress() call needed here, since we called
3664 * MultiXactIdWait() above.
3665 */
3668 can_continue = true;
3669 }
3671 {
3672 /*
3673 * The only locker is ourselves; we can avoid grabbing the tuple
3674 * lock here, but must preserve our locking information.
3675 */
3676 checked_lockers = true;
3677 locker_remains = true;
3678 can_continue = true;
3679 }
3681 {
3682 /*
3683 * If it's just a key-share locker, and we're not changing the key
3684 * columns, we don't need to wait for it to end; but we need to
3685 * preserve it as locker.
3686 */
3687 checked_lockers = true;
3688 locker_remains = true;
3689 can_continue = true;
3690 }
3691 else
3692 {
3693 /*
3694 * Wait for regular transaction to end; but first, acquire tuple
3695 * lock.
3696 */
3698 heap_acquire_tuplock(relation, &(oldtup.t_self), *lockmode,
3700 XactLockTableWait(xwait, relation, &oldtup.t_self,
3701 XLTW_Update);
3702 checked_lockers = true;
3704
3705 /*
3706 * xwait is done, but if xwait had just locked the tuple then some
3707 * other xact could update this tuple before we get to this point.
3708 * Check for xmax change, and start over if so.
3709 */
3710 if (xmax_infomask_changed(oldtup.t_data->t_infomask, infomask) ||
3713 goto l2;
3714
3715 /* Otherwise check if it committed or aborted */
3716 UpdateXmaxHintBits(oldtup.t_data, buffer, xwait);
3717 if (oldtup.t_data->t_infomask & HEAP_XMAX_INVALID)
3718 can_continue = true;
3719 }
3720
3721 if (can_continue)
3722 result = TM_Ok;
3723 else if (!ItemPointerEquals(&oldtup.t_self, &oldtup.t_data->t_ctid))
3724 result = TM_Updated;
3725 else
3726 result = TM_Deleted;
3727 }
3728
3729 /* Sanity check the result HeapTupleSatisfiesUpdate() and the logic above */
3730 if (result != TM_Ok)
3731 {
3732 Assert(result == TM_SelfModified ||
3733 result == TM_Updated ||
3734 result == TM_Deleted ||
3735 result == TM_BeingModified);
3736 Assert(!(oldtup.t_data->t_infomask & HEAP_XMAX_INVALID));
3737 Assert(result != TM_Updated ||
3738 !ItemPointerEquals(&oldtup.t_self, &oldtup.t_data->t_ctid));
3739 }
3740
3741 if (crosscheck != InvalidSnapshot && result == TM_Ok)
3742 {
3743 /* Perform additional check for transaction-snapshot mode RI updates */
3745 result = TM_Updated;
3746 }
3747
3748 if (result != TM_Ok)
3749 {
3750 tmfd->ctid = oldtup.t_data->t_ctid;
3751 tmfd->xmax = HeapTupleHeaderGetUpdateXid(oldtup.t_data);
3752 if (result == TM_SelfModified)
3753 tmfd->cmax = HeapTupleHeaderGetCmax(oldtup.t_data);
3754 else
3755 tmfd->cmax = InvalidCommandId;
3756 UnlockReleaseBuffer(buffer);
3757 if (have_tuple_lock)
3758 UnlockTupleTuplock(relation, &(oldtup.t_self), *lockmode);
3759 if (vmbuffer != InvalidBuffer)
3760 ReleaseBuffer(vmbuffer);
3762
3769 return result;
3770 }
3771
3772 /*
3773 * If we didn't pin the visibility map page and the page has become all
3774 * visible while we were busy locking the buffer, or during some
3775 * subsequent window during which we had it unlocked, we'll have to unlock
3776 * and re-lock, to avoid holding the buffer lock across an I/O. That's a
3777 * bit unfortunate, especially since we'll now have to recheck whether the
3778 * tuple has been locked or updated under us, but hopefully it won't
3779 * happen very often.
3780 */
3781 if (vmbuffer == InvalidBuffer && PageIsAllVisible(page))
3782 {
3784 visibilitymap_pin(relation, block, &vmbuffer);
3786 goto l2;
3787 }
3788
3789 /* Fill in transaction status data */
3790
3791 /*
3792 * If the tuple we're updating is locked, we need to preserve the locking
3793 * info in the old tuple's Xmax. Prepare a new Xmax value for this.
3794 */
3796 oldtup.t_data->t_infomask,
3797 oldtup.t_data->t_infomask2,
3798 xid, *lockmode, true,
3801
3802 /*
3803 * And also prepare an Xmax value for the new copy of the tuple. If there
3804 * was no xmax previously, or there was one but all lockers are now gone,
3805 * then use InvalidTransactionId; otherwise, get the xmax from the old
3806 * tuple. (In rare cases that might also be InvalidTransactionId and yet
3807 * not have the HEAP_XMAX_INVALID bit set; that's fine.)
3808 */
3809 if ((oldtup.t_data->t_infomask & HEAP_XMAX_INVALID) ||
3810 HEAP_LOCKED_UPGRADED(oldtup.t_data->t_infomask) ||
3813 else
3815
3817 {
3820 }
3821 else
3822 {
3823 /*
3824 * If we found a valid Xmax for the new tuple, then the infomask bits
3825 * to use on the new tuple depend on what was there on the old one.
3826 * Note that since we're doing an update, the only possibility is that
3827 * the lockers had FOR KEY SHARE lock.
3828 */
3829 if (oldtup.t_data->t_infomask & HEAP_XMAX_IS_MULTI)
3830 {
3833 }
3834 else
3835 {
3838 }
3839 }
3840
3841 /*
3842 * Prepare the new tuple with the appropriate initial values of Xmin and
3843 * Xmax, as well as initial infomask bits as computed above.
3844 */
3845 newtup->t_data->t_infomask &= ~(HEAP_XACT_MASK);
3846 newtup->t_data->t_infomask2 &= ~(HEAP2_XACT_MASK);
3847 HeapTupleHeaderSetXmin(newtup->t_data, xid);
3849 newtup->t_data->t_infomask |= HEAP_UPDATED | infomask_new_tuple;
3850 newtup->t_data->t_infomask2 |= infomask2_new_tuple;
3852
3853 /*
3854 * Replace cid with a combo CID if necessary. Note that we already put
3855 * the plain cid into the new tuple.
3856 */
3858
3859 /*
3860 * If the toaster needs to be activated, OR if the new tuple will not fit
3861 * on the same page as the old, then we need to release the content lock
3862 * (but not the pin!) on the old tuple's buffer while we are off doing
3863 * TOAST and/or table-file-extension work. We must mark the old tuple to
3864 * show that it's locked, else other processes may try to update it
3865 * themselves.
3866 *
3867 * We need to invoke the toaster if there are already any out-of-line
3868 * toasted values present, or if the new tuple is over-threshold.
3869 */
3870 if (relation->rd_rel->relkind != RELKIND_RELATION &&
3871 relation->rd_rel->relkind != RELKIND_MATVIEW)
3872 {
3873 /* toast table entries should never be recursively toasted */
3876 need_toast = false;
3877 }
3878 else
3881 newtup->t_len > TOAST_TUPLE_THRESHOLD);
3882
3884
3885 newtupsize = MAXALIGN(newtup->t_len);
3886
3888 {
3892 bool cleared_all_frozen = false;
3893
3894 /*
3895 * To prevent concurrent sessions from updating the tuple, we have to
3896 * temporarily mark it locked, while we release the page-level lock.
3897 *
3898 * To satisfy the rule that any xid potentially appearing in a buffer
3899 * written out to disk, we unfortunately have to WAL log this
3900 * temporary modification. We can reuse xl_heap_lock for this
3901 * purpose. If we crash/error before following through with the
3902 * actual update, xmax will be of an aborted transaction, allowing
3903 * other sessions to proceed.
3904 */
3905
3906 /*
3907 * Compute xmax / infomask appropriate for locking the tuple. This has
3908 * to be done separately from the combo that's going to be used for
3909 * updating, because the potentially created multixact would otherwise
3910 * be wrong.
3911 */
3913 oldtup.t_data->t_infomask,
3914 oldtup.t_data->t_infomask2,
3915 xid, *lockmode, false,
3918
3920
3922
3923 /* Clear obsolete visibility flags ... */
3924 oldtup.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
3925 oldtup.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
3927 /* ... and store info about transaction updating this tuple */
3930 oldtup.t_data->t_infomask |= infomask_lock_old_tuple;
3931 oldtup.t_data->t_infomask2 |= infomask2_lock_old_tuple;
3933
3934 /* temporarily make it look not-updated, but locked */
3935 oldtup.t_data->t_ctid = oldtup.t_self;
3936
3937 /*
3938 * Clear all-frozen bit on visibility map if needed. We could
3939 * immediately reset ALL_VISIBLE, but given that the WAL logging
3940 * overhead would be unchanged, that doesn't seem necessarily
3941 * worthwhile.
3942 */
3943 if (PageIsAllVisible(page) &&
3944 visibilitymap_clear(relation, block, vmbuffer,
3946 cleared_all_frozen = true;
3947
3948 MarkBufferDirty(buffer);
3949
3950 if (RelationNeedsWAL(relation))
3951 {
3954
3957
3958 xlrec.offnum = ItemPointerGetOffsetNumber(&oldtup.t_self);
3960 xlrec.infobits_set = compute_infobits(oldtup.t_data->t_infomask,
3961 oldtup.t_data->t_infomask2);
3962 xlrec.flags =
3966 PageSetLSN(page, recptr);
3967 }
3968
3970
3972
3973 /*
3974 * Let the toaster do its thing, if needed.
3975 *
3976 * Note: below this point, heaptup is the data we actually intend to
3977 * store into the relation; newtup is the caller's original untoasted
3978 * data.
3979 */
3980 if (need_toast)
3981 {
3982 /* Note we always use WAL and FSM during updates */
3984 newtupsize = MAXALIGN(heaptup->t_len);
3985 }
3986 else
3987 heaptup = newtup;
3988
3989 /*
3990 * Now, do we need a new page for the tuple, or not? This is a bit
3991 * tricky since someone else could have added tuples to the page while
3992 * we weren't looking. We have to recheck the available space after
3993 * reacquiring the buffer lock. But don't bother to do that if the
3994 * former amount of free space is still not enough; it's unlikely
3995 * there's more free now than before.
3996 *
3997 * What's more, if we need to get a new page, we will need to acquire
3998 * buffer locks on both old and new pages. To avoid deadlock against
3999 * some other backend trying to get the same two locks in the other
4000 * order, we must be consistent about the order we get the locks in.
4001 * We use the rule "lock the lower-numbered page of the relation
4002 * first". To implement this, we must do RelationGetBufferForTuple
4003 * while not holding the lock on the old page, and we must rely on it
4004 * to get the locks on both pages in the correct order.
4005 *
4006 * Another consideration is that we need visibility map page pin(s) if
4007 * we will have to clear the all-visible flag on either page. If we
4008 * call RelationGetBufferForTuple, we rely on it to acquire any such
4009 * pins; but if we don't, we have to handle that here. Hence we need
4010 * a loop.
4011 */
4012 for (;;)
4013 {
4014 if (newtupsize > pagefree)
4015 {
4016 /* It doesn't fit, must use RelationGetBufferForTuple. */
4017 newbuf = RelationGetBufferForTuple(relation, heaptup->t_len,
4018 buffer, 0, NULL,
4019 &vmbuffer_new, &vmbuffer,
4020 0);
4021 /* We're all done. */
4022 break;
4023 }
4024 /* Acquire VM page pin if needed and we don't have it. */
4025 if (vmbuffer == InvalidBuffer && PageIsAllVisible(page))
4026 visibilitymap_pin(relation, block, &vmbuffer);
4027 /* Re-acquire the lock on the old tuple's page. */
4029 /* Re-check using the up-to-date free space */
4031 if (newtupsize > pagefree ||
4032 (vmbuffer == InvalidBuffer && PageIsAllVisible(page)))
4033 {
4034 /*
4035 * Rats, it doesn't fit anymore, or somebody just now set the
4036 * all-visible flag. We must now unlock and loop to avoid
4037 * deadlock. Fortunately, this path should seldom be taken.
4038 */
4040 }
4041 else
4042 {
4043 /* We're all done. */
4044 newbuf = buffer;
4045 break;
4046 }
4047 }
4048 }
4049 else
4050 {
4051 /* No TOAST work needed, and it'll fit on same page */
4052 newbuf = buffer;
4053 heaptup = newtup;
4054 }
4055
4056 /*
4057 * We're about to do the actual update -- check for conflict first, to
4058 * avoid possibly having to roll back work we've just done.
4059 *
4060 * This is safe without a recheck as long as there is no possibility of
4061 * another process scanning the pages between this check and the update
4062 * being visible to the scan (i.e., exclusive buffer content lock(s) are
4063 * continuously held from this point until the tuple update is visible).
4064 *
4065 * For the new tuple the only check needed is at the relation level, but
4066 * since both tuples are in the same relation and the check for oldtup
4067 * will include checking the relation level, there is no benefit to a
4068 * separate check for the new tuple.
4069 */
4070 CheckForSerializableConflictIn(relation, &oldtup.t_self,
4071 BufferGetBlockNumber(buffer));
4072
4073 /*
4074 * At this point newbuf and buffer are both pinned and locked, and newbuf
4075 * has enough space for the new tuple. If they are the same buffer, only
4076 * one pin is held.
4077 */
4078
4079 if (newbuf == buffer)
4080 {
4081 /*
4082 * Since the new tuple is going into the same page, we might be able
4083 * to do a HOT update. Check if any of the index columns have been
4084 * changed.
4085 */
4087 {
4088 use_hot_update = true;
4089
4090 /*
4091 * If none of the columns that are used in hot-blocking indexes
4092 * were updated, we can apply HOT, but we do still need to check
4093 * if we need to update the summarizing indexes, and update those
4094 * indexes if the columns were updated, or we may fail to detect
4095 * e.g. value bound changes in BRIN minmax indexes.
4096 */
4098 summarized_update = true;
4099 }
4100 }
4101 else
4102 {
4103 /* Set a hint that the old page could use prune/defrag */
4104 PageSetFull(page);
4105 }
4106
4107 /*
4108 * Compute replica identity tuple before entering the critical section so
4109 * we don't PANIC upon a memory allocation failure.
4110 * ExtractReplicaIdentity() will return NULL if nothing needs to be
4111 * logged. Pass old key required as true only if the replica identity key
4112 * columns are modified or it has external data.
4113 */
4118
4119 /* NO EREPORT(ERROR) from here till changes are logged */
4121
4122 /*
4123 * If this transaction commits, the old tuple will become DEAD sooner or
4124 * later. Set flag that this page is a candidate for pruning once our xid
4125 * falls below the OldestXmin horizon. If the transaction finally aborts,
4126 * the subsequent page pruning will be a no-op and the hint will be
4127 * cleared.
4128 *
4129 * XXX Should we set hint on newbuf as well? If the transaction aborts,
4130 * there would be a prunable tuple in the newbuf; but for now we choose
4131 * not to optimize for aborts. Note that heap_xlog_update must be kept in
4132 * sync if this decision changes.
4133 */
4134 PageSetPrunable(page, xid);
4135
4136 if (use_hot_update)
4137 {
4138 /* Mark the old tuple as HOT-updated */
4140 /* And mark the new tuple as heap-only */
4142 /* Mark the caller's copy too, in case different from heaptup */
4144 }
4145 else
4146 {
4147 /* Make sure tuples are correctly marked as not-HOT */
4151 }
4152
4153 RelationPutHeapTuple(relation, newbuf, heaptup, false); /* insert new tuple */
4154
4155
4156 /* Clear obsolete visibility flags, possibly set by ourselves above... */
4157 oldtup.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
4158 oldtup.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
4159 /* ... and store info about transaction updating this tuple */
4162 oldtup.t_data->t_infomask |= infomask_old_tuple;
4163 oldtup.t_data->t_infomask2 |= infomask2_old_tuple;
4165
4166 /* record address of new tuple in t_ctid of old one */
4167 oldtup.t_data->t_ctid = heaptup->t_self;
4168
4169 /* clear PD_ALL_VISIBLE flags, reset all visibilitymap bits */
4170 if (PageIsAllVisible(BufferGetPage(buffer)))
4171 {
4172 all_visible_cleared = true;
4174 visibilitymap_clear(relation, BufferGetBlockNumber(buffer),
4175 vmbuffer, VISIBILITYMAP_VALID_BITS);
4176 }
4177 if (newbuf != buffer && PageIsAllVisible(BufferGetPage(newbuf)))
4178 {
4183 }
4184
4185 if (newbuf != buffer)
4187 MarkBufferDirty(buffer);
4188
4189 /* XLOG stuff */
4190 if (RelationNeedsWAL(relation))
4191 {
4193
4194 /*
4195 * For logical decoding we need combo CIDs to properly decode the
4196 * catalog.
4197 */
4199 {
4200 log_heap_new_cid(relation, &oldtup);
4201 log_heap_new_cid(relation, heaptup);
4202 }
4203
4204 recptr = log_heap_update(relation, buffer,
4209 if (newbuf != buffer)
4210 {
4212 }
4214 }
4215
4217
4218 if (newbuf != buffer)
4221
4222 /*
4223 * Mark old tuple for invalidation from system caches at next command
4224 * boundary, and mark the new tuple for invalidation in case we abort. We
4225 * have to do this before releasing the buffer because oldtup is in the
4226 * buffer. (heaptup is all in local memory, but it's necessary to process
4227 * both tuple versions in one call to inval.c so we can avoid redundant
4228 * sinval messages.)
4229 */
4231
4232 /* Now we can release the buffer(s) */
4233 if (newbuf != buffer)
4235 ReleaseBuffer(buffer);
4238 if (BufferIsValid(vmbuffer))
4239 ReleaseBuffer(vmbuffer);
4240
4241 /*
4242 * Release the lmgr tuple lock, if we had it.
4243 */
4244 if (have_tuple_lock)
4245 UnlockTupleTuplock(relation, &(oldtup.t_self), *lockmode);
4246
4247 pgstat_count_heap_update(relation, use_hot_update, newbuf != buffer);
4248
4249 /*
4250 * If heaptup is a private copy, release it. Don't forget to copy t_self
4251 * back to the caller's image, too.
4252 */
4253 if (heaptup != newtup)
4254 {
4255 newtup->t_self = heaptup->t_self;
4257 }
4258
4259 /*
4260 * If it is a HOT update, the update may still need to update summarized
4261 * indexes, lest we fail to update those summaries and get incorrect
4262 * results (for example, minmax bounds of the block may change with this
4263 * update).
4264 */
4265 if (use_hot_update)
4266 {
4269 else
4271 }
4272 else
4274
4277
4284
4285 return TM_Ok;
4286}

References Assert, AssertHasSnapshotForToast(), bms_add_members(), bms_free(), bms_overlap(), BUFFER_LOCK_EXCLUSIVE, BUFFER_LOCK_UNLOCK, BufferGetBlockNumber(), BufferGetPage(), BufferIsValid(), CacheInvalidateHeapTuple(), CheckForSerializableConflictIn(), TM_FailureData::cmax, compute_infobits(), compute_new_xmax_infomask(), TM_FailureData::ctid, DoesMultiXactIdConflict(), END_CRIT_SECTION, ereport, errcode(), errmsg(), ERROR, ExtractReplicaIdentity(), fb(), GetCurrentTransactionId(), GetMultiXactIdHintBits(), HEAP2_XACT_MASK, heap_acquire_tuplock(), heap_freetuple(), HEAP_LOCKED_UPGRADED(), HEAP_MOVED, heap_toast_insert_or_update(), HEAP_UPDATED, HEAP_XACT_MASK, HEAP_XMAX_BITS, HEAP_XMAX_INVALID, HEAP_XMAX_IS_KEYSHR_LOCKED(), HEAP_XMAX_IS_LOCKED_ONLY(), HEAP_XMAX_IS_MULTI, HEAP_XMAX_KEYSHR_LOCK, HEAP_XMAX_LOCK_ONLY, HeapDetermineColumnsInfo(), HeapTupleClearHeapOnly(), HeapTupleClearHotUpdated(), HeapTupleGetUpdateXid(), HeapTupleHasExternal(), HeapTupleHeaderAdjustCmax(), HeapTupleHeaderGetCmax(), HeapTupleHeaderGetNatts, HeapTupleHeaderGetRawXmax(), HeapTupleHeaderGetUpdateXid(), HeapTupleHeaderSetCmax(), HeapTupleHeaderSetCmin(), HeapTupleHeaderSetXmax(), HeapTupleHeaderSetXmin(), HeapTupleSatisfiesUpdate(), HeapTupleSatisfiesVisibility(), HeapTupleSetHeapOnly(), HeapTupleSetHotUpdated(), INDEX_ATTR_BITMAP_HOT_BLOCKING, INDEX_ATTR_BITMAP_IDENTITY_KEY, INDEX_ATTR_BITMAP_KEY, INDEX_ATTR_BITMAP_SUMMARIZED, INJECTION_POINT, InvalidBuffer, InvalidCommandId, InvalidSnapshot, InvalidTransactionId, IsInParallelMode(), ItemIdGetLength, ItemIdIsNormal, ItemPointerEquals(), ItemPointerGetBlockNumber(), ItemPointerGetOffsetNumber(), ItemPointerIsValid(), LockBuffer(), LockTupleExclusive, LockTupleNoKeyExclusive, LockWaitBlock, log_heap_new_cid(), log_heap_update(), MarkBufferDirty(), MAXALIGN, MultiXactIdSetOldestMember(), MultiXactIdWait(), MultiXactStatusNoKeyUpdate, MultiXactStatusUpdate, PageClearAllVisible(), PageGetHeapFreeSpace(), PageGetItem(), PageGetItemId(), 
PageIsAllVisible(), PageSetFull(), PageSetLSN(), PageSetPrunable, pgstat_count_heap_update(), RelationData::rd_rel, ReadBuffer(), REGBUF_STANDARD, RelationGetBufferForTuple(), RelationGetIndexAttrBitmap(), RelationGetNumberOfAttributes, RelationGetRelid, RelationIsAccessibleInLogicalDecoding, RelationNeedsWAL, RelationPutHeapTuple(), RelationSupportsSysCache(), ReleaseBuffer(), SizeOfHeapLock, START_CRIT_SECTION, TM_BeingModified, TM_Deleted, TM_Invisible, TM_Ok, TM_SelfModified, TM_Updated, TOAST_TUPLE_THRESHOLD, TransactionIdDidAbort(), TransactionIdEquals, TransactionIdIsCurrentTransactionId(), TransactionIdIsValid, TU_All, TU_None, TU_Summarizing, UnlockReleaseBuffer(), UnlockTupleTuplock, UpdateXmaxHintBits(), VISIBILITYMAP_ALL_FROZEN, visibilitymap_clear(), visibilitymap_pin(), VISIBILITYMAP_VALID_BITS, XactLockTableWait(), XLH_LOCK_ALL_FROZEN_CLEARED, XLOG_HEAP_LOCK, XLogBeginInsert(), XLogInsert(), XLogRegisterBuffer(), XLogRegisterData(), XLTW_Update, TM_FailureData::xmax, and xmax_infomask_changed().

Referenced by heapam_tuple_update(), and simple_heap_update().

◆ HeapCheckForSerializableConflictOut()

void HeapCheckForSerializableConflictOut ( bool  visible,
Relation  relation,
HeapTuple  tuple,
Buffer  buffer,
Snapshot  snapshot 
)

Definition at line 9326 of file heapam.c.

9329{
9330 TransactionId xid;
9332
9333 if (!CheckForSerializableConflictOutNeeded(relation, snapshot))
9334 return;
9335
9336 /*
9337 * Check to see whether the tuple has been written to by a concurrent
9338 * transaction, either to create it not visible to us, or to delete it
9339 * while it is visible to us. The "visible" bool indicates whether the
9340 * tuple is visible to us, while HeapTupleSatisfiesVacuum checks what else
9341 * is going on with it.
9342 *
9343 * In the event of a concurrently inserted tuple that also happens to have
9344 * been concurrently updated (by a separate transaction), the xmin of the
9345 * tuple will be used -- not the updater's xid.
9346 */
9348 switch (htsvResult)
9349 {
9350 case HEAPTUPLE_LIVE:
9351 if (visible)
9352 return;
9353 xid = HeapTupleHeaderGetXmin(tuple->t_data);
9354 break;
9357 if (visible)
9358 xid = HeapTupleHeaderGetUpdateXid(tuple->t_data);
9359 else
9360 xid = HeapTupleHeaderGetXmin(tuple->t_data);
9361
9363 {
9364 /* This is like the HEAPTUPLE_DEAD case */
9365 Assert(!visible);
9366 return;
9367 }
9368 break;
9370 xid = HeapTupleHeaderGetXmin(tuple->t_data);
9371 break;
9372 case HEAPTUPLE_DEAD:
9373 Assert(!visible);
9374 return;
9375 default:
9376
9377 /*
9378 * The only way to get to this default clause is if a new value is
9379 * added to the enum type without adding it to this switch
9380 * statement. That's a bug, so elog.
9381 */
9382 elog(ERROR, "unrecognized return value from HeapTupleSatisfiesVacuum: %u", htsvResult);
9383
9384 /*
9385 * In spite of having all enum values covered and calling elog on
9386 * this default, some compilers think this is a code path which
9387 * allows xid to be used below without initialization. Silence
9388 * that warning.
9389 */
9391 }
9392
9395
9396 /*
9397 * Find top level xid. Bail out if xid is too early to be a conflict, or
9398 * if it's our own xid.
9399 */
9401 return;
9404 return;
9405
9406 CheckForSerializableConflictOut(relation, xid, snapshot);
9407}

References Assert, CheckForSerializableConflictOut(), CheckForSerializableConflictOutNeeded(), elog, ERROR, fb(), GetTopTransactionIdIfAny(), HEAPTUPLE_DEAD, HEAPTUPLE_DELETE_IN_PROGRESS, HEAPTUPLE_INSERT_IN_PROGRESS, HEAPTUPLE_LIVE, HEAPTUPLE_RECENTLY_DEAD, HeapTupleHeaderGetUpdateXid(), HeapTupleHeaderGetXmin(), HeapTupleSatisfiesVacuum(), InvalidTransactionId, SubTransGetTopmostTransaction(), HeapTupleData::t_data, TransactionIdEquals, TransactionIdFollowsOrEquals(), TransactionIdIsValid, TransactionIdPrecedes(), and TransactionXmin.

Referenced by BitmapHeapScanNextBlock(), heap_fetch(), heap_get_latest_tid(), heap_hot_search_buffer(), heapam_scan_sample_next_tuple(), heapgettup(), and page_collect_tuples().

◆ HeapDetermineColumnsInfo()

static Bitmapset * HeapDetermineColumnsInfo ( Relation  relation,
Bitmapset *interesting_cols,
Bitmapset *external_cols,
HeapTuple  oldtup,
HeapTuple  newtup,
bool *has_external 
)
static

Definition at line 4466 of file heapam.c.

4471{
4472 int attidx;
4474 TupleDesc tupdesc = RelationGetDescr(relation);
4475
4476 attidx = -1;
4477 while ((attidx = bms_next_member(interesting_cols, attidx)) >= 0)
4478 {
4479 /* attidx is zero-based, attrnum is the normal attribute number */
4481 Datum value1,
4482 value2;
4483 bool isnull1,
4484 isnull2;
4485
4486 /*
4487 * If it's a whole-tuple reference, say "not equal". It's not really
4488 * worth supporting this case, since it could only succeed after a
4489 * no-op update, which is hardly a case worth optimizing for.
4490 */
4491 if (attrnum == 0)
4492 {
4493 modified = bms_add_member(modified, attidx);
4494 continue;
4495 }
4496
4497 /*
4498 * Likewise, automatically say "not equal" for any system attribute
4499 * other than tableOID; we cannot expect these to be consistent in a
4500 * HOT chain, or even to be set correctly yet in the new tuple.
4501 */
4502 if (attrnum < 0)
4503 {
4504 if (attrnum != TableOidAttributeNumber)
4505 {
4506 modified = bms_add_member(modified, attidx);
4507 continue;
4508 }
4509 }
4510
4511 /*
4512 * Extract the corresponding values. XXX this is pretty inefficient
4513 * if there are many indexed columns. Should we do a single
4514 * heap_deform_tuple call on each tuple, instead? But that doesn't
4515 * work for system columns ...
4516 */
4517 value1 = heap_getattr(oldtup, attrnum, tupdesc, &isnull1);
4518 value2 = heap_getattr(newtup, attrnum, tupdesc, &isnull2);
4519
4520 if (!heap_attr_equals(tupdesc, attrnum, value1,
4521 value2, isnull1, isnull2))
4522 {
4523 modified = bms_add_member(modified, attidx);
4524 continue;
4525 }
4526
4527 /*
4528 * No need to check attributes that can't be stored externally. Note
4529 * that system attributes can't be stored externally.
4530 */
4531 if (attrnum < 0 || isnull1 ||
4532 TupleDescCompactAttr(tupdesc, attrnum - 1)->attlen != -1)
4533 continue;
4534
4535 /*
4536 * Check if the old tuple's attribute is stored externally and is a
4537 * member of external_cols.
4538 */
4541 *has_external = true;
4542 }
4543
4544 return modified;
4545}

References attlen, bms_add_member(), bms_is_member(), bms_next_member(), DatumGetPointer(), fb(), FirstLowInvalidHeapAttributeNumber, heap_attr_equals(), heap_getattr(), RelationGetDescr, TableOidAttributeNumber, TupleDescCompactAttr(), and VARATT_IS_EXTERNAL().

Referenced by heap_update().

◆ heapgettup()

static void heapgettup ( HeapScanDesc  scan,
ScanDirection  dir,
int  nkeys,
ScanKey  key 
)
static

Definition at line 960 of file heapam.c.

964{
965 HeapTuple tuple = &(scan->rs_ctup);
966 Page page;
968 int linesleft;
969
970 if (likely(scan->rs_inited))
971 {
972 /* continue from previously returned page/tuple */
974 page = heapgettup_continue_page(scan, dir, &linesleft, &lineoff);
975 goto continue_page;
976 }
977
978 /*
979 * advance the scan until we find a qualifying tuple or run out of stuff
980 * to scan
981 */
982 while (true)
983 {
984 heap_fetch_next_buffer(scan, dir);
985
986 /* did we run out of blocks to scan? */
987 if (!BufferIsValid(scan->rs_cbuf))
988 break;
989
991
993 page = heapgettup_start_page(scan, dir, &linesleft, &lineoff);
995
996 /*
997 * Only continue scanning the page while we have lines left.
998 *
999 * Note that this protects us from accessing line pointers past
1000 * PageGetMaxOffsetNumber(); both for forward scans when we resume the
1001 * table scan, and for when we start scanning a new page.
1002 */
1003 for (; linesleft > 0; linesleft--, lineoff += dir)
1004 {
1005 bool visible;
1007
1008 if (!ItemIdIsNormal(lpp))
1009 continue;
1010
1011 tuple->t_data = (HeapTupleHeader) PageGetItem(page, lpp);
1012 tuple->t_len = ItemIdGetLength(lpp);
1013 ItemPointerSet(&(tuple->t_self), scan->rs_cblock, lineoff);
1014
1015 visible = HeapTupleSatisfiesVisibility(tuple,
1016 scan->rs_base.rs_snapshot,
1017 scan->rs_cbuf);
1018
1020 tuple, scan->rs_cbuf,
1021 scan->rs_base.rs_snapshot);
1022
1023 /* skip tuples not visible to this snapshot */
1024 if (!visible)
1025 continue;
1026
1027 /* skip any tuples that don't match the scan key */
1028 if (key != NULL &&
1030 nkeys, key))
1031 continue;
1032
1034 scan->rs_coffset = lineoff;
1035 return;
1036 }
1037
1038 /*
1039 * if we get here, it means we've exhausted the items on this page and
1040 * it's time to move to the next.
1041 */
1043 }
1044
1045 /* end of scan */
1046 if (BufferIsValid(scan->rs_cbuf))
1047 ReleaseBuffer(scan->rs_cbuf);
1048
1049 scan->rs_cbuf = InvalidBuffer;
1052 tuple->t_data = NULL;
1053 scan->rs_inited = false;
1054}

References Assert, BUFFER_LOCK_SHARE, BUFFER_LOCK_UNLOCK, BufferGetBlockNumber(), BufferIsValid(), fb(), heap_fetch_next_buffer(), HeapCheckForSerializableConflictOut(), heapgettup_continue_page(), heapgettup_start_page(), HeapKeyTest(), HeapTupleSatisfiesVisibility(), InvalidBlockNumber, InvalidBuffer, ItemIdGetLength, ItemIdIsNormal, ItemPointerSet(), likely, LockBuffer(), PageGetItem(), PageGetItemId(), RelationGetDescr, ReleaseBuffer(), HeapScanDescData::rs_base, HeapScanDescData::rs_cblock, HeapScanDescData::rs_cbuf, HeapScanDescData::rs_coffset, HeapScanDescData::rs_ctup, HeapScanDescData::rs_inited, HeapScanDescData::rs_prefetch_block, TableScanDescData::rs_rd, TableScanDescData::rs_snapshot, HeapTupleData::t_data, HeapTupleData::t_len, and HeapTupleData::t_self.

Referenced by heap_getnext(), heap_getnextslot(), and heap_getnextslot_tidrange().

◆ heapgettup_advance_block()

static BlockNumber heapgettup_advance_block ( HeapScanDesc  scan,
BlockNumber  block,
ScanDirection  dir 
)
inlinestatic

Definition at line 876 of file heapam.c.

877{
878 Assert(scan->rs_base.rs_parallel == NULL);
879
881 {
882 block++;
883
884 /* wrap back to the start of the heap */
885 if (block >= scan->rs_nblocks)
886 block = 0;
887
888 /*
889 * Report our new scan position for synchronization purposes. We don't
890 * do that when moving backwards, however. That would just mess up any
891 * other forward-moving scanners.
892 *
893 * Note: we do this before checking for end of scan so that the final
894 * state of the position hint is back at the start of the rel. That's
895 * not strictly necessary, but otherwise when you run the same query
896 * multiple times the starting position would shift a little bit
897 * backwards on every invocation, which is confusing. We don't
898 * guarantee any specific ordering in general, though.
899 */
900 if (scan->rs_base.rs_flags & SO_ALLOW_SYNC)
901 ss_report_location(scan->rs_base.rs_rd, block);
902
903 /* we're done if we're back at where we started */
904 if (block == scan->rs_startblock)
905 return InvalidBlockNumber;
906
907 /* check if the limit imposed by heap_setscanlimits() is met */
908 if (scan->rs_numblocks != InvalidBlockNumber)
909 {
910 if (--scan->rs_numblocks == 0)
911 return InvalidBlockNumber;
912 }
913
914 return block;
915 }
916 else
917 {
918 /* we're done if the last block is the start position */
919 if (block == scan->rs_startblock)
920 return InvalidBlockNumber;
921
922 /* check if the limit imposed by heap_setscanlimits() is met */
923 if (scan->rs_numblocks != InvalidBlockNumber)
924 {
925 if (--scan->rs_numblocks == 0)
926 return InvalidBlockNumber;
927 }
928
929 /* wrap to the end of the heap when the last page was page 0 */
930 if (block == 0)
931 block = scan->rs_nblocks;
932
933 block--;
934
935 return block;
936 }
937}

References Assert, fb(), InvalidBlockNumber, likely, HeapScanDescData::rs_base, TableScanDescData::rs_flags, HeapScanDescData::rs_nblocks, HeapScanDescData::rs_numblocks, TableScanDescData::rs_parallel, TableScanDescData::rs_rd, HeapScanDescData::rs_startblock, ScanDirectionIsForward, SO_ALLOW_SYNC, and ss_report_location().

Referenced by heap_scan_stream_read_next_serial().

◆ heapgettup_continue_page()

static Page heapgettup_continue_page ( HeapScanDesc  scan,
ScanDirection  dir,
int *linesleft,
OffsetNumber *lineoff 
)
inlinestatic

Definition at line 830 of file heapam.c.

832{
833 Page page;
834
835 Assert(scan->rs_inited);
837
838 /* Caller is responsible for ensuring buffer is locked if needed */
839 page = BufferGetPage(scan->rs_cbuf);
840
841 if (ScanDirectionIsForward(dir))
842 {
844 *linesleft = PageGetMaxOffsetNumber(page) - (*lineoff) + 1;
845 }
846 else
847 {
848 /*
849 * The previous returned tuple may have been vacuumed since the
850 * previous scan when we use a non-MVCC snapshot, so we must
851 * re-establish the lineoff <= PageGetMaxOffsetNumber(page) invariant
852 */
854 *linesleft = *lineoff;
855 }
856
857 /* lineoff now references the physically previous or next tid */
858 return page;
859}

References Assert, BufferGetPage(), BufferIsValid(), fb(), Min, OffsetNumberNext, OffsetNumberPrev, PageGetMaxOffsetNumber(), HeapScanDescData::rs_cbuf, HeapScanDescData::rs_coffset, HeapScanDescData::rs_inited, and ScanDirectionIsForward.

Referenced by heapgettup().

◆ heapgettup_initial_block()

static pg_noinline BlockNumber heapgettup_initial_block ( HeapScanDesc  scan,
ScanDirection  dir 
)
static

Definition at line 752 of file heapam.c.

753{
754 Assert(!scan->rs_inited);
755 Assert(scan->rs_base.rs_parallel == NULL);
756
757 /* When there are no pages to scan, return InvalidBlockNumber */
758 if (scan->rs_nblocks == 0 || scan->rs_numblocks == 0)
759 return InvalidBlockNumber;
760
761 if (ScanDirectionIsForward(dir))
762 {
763 return scan->rs_startblock;
764 }
765 else
766 {
767 /*
768 * Disable reporting to syncscan logic in a backwards scan; it's not
769 * very likely anyone else is doing the same thing at the same time,
770 * and much more likely that we'll just bollix things for forward
771 * scanners.
772 */
774
775 /*
776 * Start from last page of the scan. Ensure we take into account
777 * rs_numblocks if it's been adjusted by heap_setscanlimits().
778 */
779 if (scan->rs_numblocks != InvalidBlockNumber)
780 return (scan->rs_startblock + scan->rs_numblocks - 1) % scan->rs_nblocks;
781
782 if (scan->rs_startblock > 0)
783 return scan->rs_startblock - 1;
784
785 return scan->rs_nblocks - 1;
786 }
787}

References Assert, fb(), InvalidBlockNumber, HeapScanDescData::rs_base, TableScanDescData::rs_flags, HeapScanDescData::rs_inited, HeapScanDescData::rs_nblocks, HeapScanDescData::rs_numblocks, TableScanDescData::rs_parallel, HeapScanDescData::rs_startblock, and ScanDirectionIsForward.

Referenced by heap_scan_stream_read_next_serial().

◆ heapgettup_pagemode()

static void heapgettup_pagemode ( HeapScanDesc  scan,
ScanDirection  dir,
int  nkeys,
ScanKey  key 
)
static

Definition at line 1070 of file heapam.c.

1074{
1075 HeapTuple tuple = &(scan->rs_ctup);
1076 Page page;
1079
1080 if (likely(scan->rs_inited))
1081 {
1082 /* continue from previously returned page/tuple */
1083 page = BufferGetPage(scan->rs_cbuf);
1084
1085 lineindex = scan->rs_cindex + dir;
1086 if (ScanDirectionIsForward(dir))
1087 linesleft = scan->rs_ntuples - lineindex;
1088 else
1089 linesleft = scan->rs_cindex;
1090 /* lineindex now references the next or previous visible tid */
1091
1092 goto continue_page;
1093 }
1094
1095 /*
1096 * advance the scan until we find a qualifying tuple or run out of stuff
1097 * to scan
1098 */
1099 while (true)
1100 {
1101 heap_fetch_next_buffer(scan, dir);
1102
1103 /* did we run out of blocks to scan? */
1104 if (!BufferIsValid(scan->rs_cbuf))
1105 break;
1106
1108
1109 /* prune the page and determine visible tuple offsets */
1111 page = BufferGetPage(scan->rs_cbuf);
1112 linesleft = scan->rs_ntuples;
1114
1115 /* block is the same for all tuples, set it once outside the loop */
1117
1118 /* lineindex now references the next or previous visible tid */
1120
1121 for (; linesleft > 0; linesleft--, lineindex += dir)
1122 {
1123 ItemId lpp;
1125
1126 Assert(lineindex < scan->rs_ntuples);
1128 lpp = PageGetItemId(page, lineoff);
1130
1131 tuple->t_data = (HeapTupleHeader) PageGetItem(page, lpp);
1132 tuple->t_len = ItemIdGetLength(lpp);
1134
1135 /* skip any tuples that don't match the scan key */
1136 if (key != NULL &&
1138 nkeys, key))
1139 continue;
1140
1141 scan->rs_cindex = lineindex;
1142 return;
1143 }
1144 }
1145
1146 /* end of scan */
1147 if (BufferIsValid(scan->rs_cbuf))
1148 ReleaseBuffer(scan->rs_cbuf);
1149 scan->rs_cbuf = InvalidBuffer;
1152 tuple->t_data = NULL;
1153 scan->rs_inited = false;
1154}

References Assert, BufferGetBlockNumber(), BufferGetPage(), BufferIsValid(), fb(), heap_fetch_next_buffer(), heap_prepare_pagescan(), HeapKeyTest(), InvalidBlockNumber, InvalidBuffer, ItemIdGetLength, ItemIdIsNormal, ItemPointerSetBlockNumber(), ItemPointerSetOffsetNumber(), likely, PageGetItem(), PageGetItemId(), RelationGetDescr, ReleaseBuffer(), HeapScanDescData::rs_cblock, HeapScanDescData::rs_cbuf, HeapScanDescData::rs_cindex, HeapScanDescData::rs_ctup, HeapScanDescData::rs_inited, HeapScanDescData::rs_ntuples, HeapScanDescData::rs_prefetch_block, TableScanDescData::rs_rd, ScanDirectionIsForward, HeapTupleData::t_data, HeapTupleData::t_len, and HeapTupleData::t_self.

Referenced by heap_getnext(), heap_getnextslot(), and heap_getnextslot_tidrange().

◆ heapgettup_start_page()

static Page heapgettup_start_page ( HeapScanDesc  scan,
ScanDirection  dir,
int *linesleft,
OffsetNumber *lineoff 
)
static

Definition at line 799 of file heapam.c.

801{
802 Page page;
803
804 Assert(scan->rs_inited);
806
807 /* Caller is responsible for ensuring buffer is locked if needed */
808 page = BufferGetPage(scan->rs_cbuf);
809
811
812 if (ScanDirectionIsForward(dir))
814 else
816
817 /* lineoff now references the physically previous or next tid */
818 return page;
819}

References Assert, BufferGetPage(), BufferIsValid(), fb(), FirstOffsetNumber, PageGetMaxOffsetNumber(), HeapScanDescData::rs_cbuf, HeapScanDescData::rs_inited, and ScanDirectionIsForward.

Referenced by heapgettup().

◆ HeapTupleGetUpdateXid()

◆ HeapTupleHeaderAdvanceConflictHorizon()

void HeapTupleHeaderAdvanceConflictHorizon ( HeapTupleHeader  tuple,
TransactionId *snapshotConflictHorizon 
)

Definition at line 8054 of file heapam.c.

8056{
8060
8061 if (tuple->t_infomask & HEAP_MOVED)
8062 {
8063 if (TransactionIdPrecedes(*snapshotConflictHorizon, xvac))
8064 *snapshotConflictHorizon = xvac;
8065 }
8066
8067 /*
8068 * Ignore tuples inserted by an aborted transaction or if the tuple was
8069 * updated/deleted by the inserting transaction.
8070 *
8071 * Look for a committed hint bit, or if no xmin bit is set, check clog.
8072 */
8073 if (HeapTupleHeaderXminCommitted(tuple) ||
8075 {
8076 if (xmax != xmin &&
8077 TransactionIdFollows(xmax, *snapshotConflictHorizon))
8078 *snapshotConflictHorizon = xmax;
8079 }
8080}

References fb(), HEAP_MOVED, HeapTupleHeaderGetUpdateXid(), HeapTupleHeaderGetXmin(), HeapTupleHeaderGetXvac(), HeapTupleHeaderXminCommitted(), HeapTupleHeaderXminInvalid(), HeapTupleHeaderData::t_infomask, TransactionIdDidCommit(), TransactionIdFollows(), and TransactionIdPrecedes().

Referenced by heap_index_delete_tuples(), heap_prune_chain(), and prune_freeze_plan().

◆ index_delete_check_htid()

static void index_delete_check_htid ( TM_IndexDeleteOp delstate,
Page  page,
OffsetNumber  maxoff,
const ItemPointerData htid,
TM_IndexStatus istatus 
)
inlinestatic

Definition at line 8139 of file heapam.c.

8142{
8143 OffsetNumber indexpagehoffnum = ItemPointerGetOffsetNumber(htid);
8144 ItemId iid;
8145
8146 Assert(OffsetNumberIsValid(istatus->idxoffnum));
8147
8148 if (unlikely(indexpagehoffnum > maxoff))
8149 ereport(ERROR,
8151 errmsg_internal("heap tid from index tuple (%u,%u) points past end of heap page line pointer array at offset %u of block %u in index \"%s\"",
8154 istatus->idxoffnum, delstate->iblknum,
8156
8157 iid = PageGetItemId(page, indexpagehoffnum);
8158 if (unlikely(!ItemIdIsUsed(iid)))
8159 ereport(ERROR,
8161 errmsg_internal("heap tid from index tuple (%u,%u) points to unused heap page item at offset %u of block %u in index \"%s\"",
8164 istatus->idxoffnum, delstate->iblknum,
8166
8167 if (ItemIdHasStorage(iid))
8168 {
8169 HeapTupleHeader htup;
8170
8171 Assert(ItemIdIsNormal(iid));
8172 htup = (HeapTupleHeader) PageGetItem(page, iid);
8173
8174 if (unlikely(HeapTupleHeaderIsHeapOnly(htup)))
8175 ereport(ERROR,
8177 errmsg_internal("heap tid from index tuple (%u,%u) points to heap-only tuple at offset %u of block %u in index \"%s\"",
8180 istatus->idxoffnum, delstate->iblknum,
8182 }
8183}

References Assert, ereport, errcode(), errmsg_internal(), ERROR, fb(), HeapTupleHeaderIsHeapOnly(), ItemIdHasStorage, ItemIdIsNormal, ItemIdIsUsed, ItemPointerGetBlockNumber(), ItemPointerGetOffsetNumber(), OffsetNumberIsValid, PageGetItem(), PageGetItemId(), RelationGetRelationName, and unlikely.

Referenced by heap_index_delete_tuples().

◆ index_delete_sort()

static void index_delete_sort ( TM_IndexDeleteOp delstate)
static

Definition at line 8544 of file heapam.c.

8545{
8546 TM_IndexDelete *deltids = delstate->deltids;
8547 int ndeltids = delstate->ndeltids;
8548
8549 /*
8550 * Shellsort gap sequence (taken from Sedgewick-Incerpi paper).
8551 *
8552 * This implementation is fast with array sizes up to ~4500. This covers
8553 * all supported BLCKSZ values.
8554 */
8555 const int gaps[9] = {1968, 861, 336, 112, 48, 21, 7, 3, 1};
8556
8557 /* Think carefully before changing anything here -- keep swaps cheap */
8558 StaticAssertDecl(sizeof(TM_IndexDelete) <= 8,
8559 "element size exceeds 8 bytes");
8560
8561 for (int g = 0; g < lengthof(gaps); g++)
8562 {
8563 for (int hi = gaps[g], i = hi; i < ndeltids; i++)
8564 {
8565 TM_IndexDelete d = deltids[i];
8566 int j = i;
8567
8568 while (j >= hi && index_delete_sort_cmp(&deltids[j - hi], &d) >= 0)
8569 {
8570 deltids[j] = deltids[j - hi];
8571 j -= hi;
8572 }
8573 deltids[j] = d;
8574 }
8575 }
8576}

References fb(), i, index_delete_sort_cmp(), j, lengthof, and StaticAssertDecl.

Referenced by heap_index_delete_tuples().

◆ index_delete_sort_cmp()

static int index_delete_sort_cmp ( TM_IndexDelete deltid1,
TM_IndexDelete deltid2 
)
inlinestatic

Definition at line 8508 of file heapam.c.

8509{
8510 ItemPointer tid1 = &deltid1->tid;
8511 ItemPointer tid2 = &deltid2->tid;
8512
8513 {
8514 BlockNumber blk1 = ItemPointerGetBlockNumber(tid1);
8515 BlockNumber blk2 = ItemPointerGetBlockNumber(tid2);
8516
8517 if (blk1 != blk2)
8518 return (blk1 < blk2) ? -1 : 1;
8519 }
8520 {
8521 OffsetNumber pos1 = ItemPointerGetOffsetNumber(tid1);
8522 OffsetNumber pos2 = ItemPointerGetOffsetNumber(tid2);
8523
8524 if (pos1 != pos2)
8525 return (pos1 < pos2) ? -1 : 1;
8526 }
8527
8528 Assert(false);
8529
8530 return 0;
8531}

References Assert, fb(), ItemPointerGetBlockNumber(), and ItemPointerGetOffsetNumber().

Referenced by index_delete_sort().

◆ initscan()

static void initscan ( HeapScanDesc  scan,
ScanKey  key,
bool  keep_startblock 
)
static

Definition at line 357 of file heapam.c.

358{
360 bool allow_strat;
361 bool allow_sync;
362
363 /*
364 * Determine the number of blocks we have to scan.
365 *
366 * It is sufficient to do this once at scan start, since any tuples added
367 * while the scan is in progress will be invisible to my snapshot anyway.
368 * (That is not true when using a non-MVCC snapshot. However, we couldn't
369 * guarantee to return tuples added after scan start anyway, since they
370 * might go into pages we already scanned. To guarantee consistent
371 * results for a non-MVCC snapshot, the caller must hold some higher-level
372 * lock that ensures the interesting tuple(s) won't change.)
373 */
374 if (scan->rs_base.rs_parallel != NULL)
375 {
377 scan->rs_nblocks = bpscan->phs_nblocks;
378 }
379 else
381
382 /*
383 * If the table is large relative to NBuffers, use a bulk-read access
384 * strategy and enable synchronized scanning (see syncscan.c). Although
385 * the thresholds for these features could be different, we make them the
386 * same so that there are only two behaviors to tune rather than four.
387 * (However, some callers need to be able to disable one or both of these
388 * behaviors, independently of the size of the table; also there is a GUC
389 * variable that can disable synchronized scanning.)
390 *
391 * Note that table_block_parallelscan_initialize has a very similar test;
392 * if you change this, consider changing that one, too.
393 */
395 scan->rs_nblocks > NBuffers / 4)
396 {
398 allow_sync = (scan->rs_base.rs_flags & SO_ALLOW_SYNC) != 0;
399 }
400 else
401 allow_strat = allow_sync = false;
402
403 if (allow_strat)
404 {
405 /* During a rescan, keep the previous strategy object. */
406 if (scan->rs_strategy == NULL)
408 }
409 else
410 {
411 if (scan->rs_strategy != NULL)
413 scan->rs_strategy = NULL;
414 }
415
416 if (scan->rs_base.rs_parallel != NULL)
417 {
418 /* For parallel scan, believe whatever ParallelTableScanDesc says. */
421 else
423
424 /*
425 * If not rescanning, initialize the startblock. Finding the actual
426 * start location is done in table_block_parallelscan_startblock_init,
427 * based on whether an alternative start location has been set with
428 * heap_setscanlimits, or using the syncscan location, when syncscan
429 * is enabled.
430 */
431 if (!keep_startblock)
433 }
434 else
435 {
436 if (keep_startblock)
437 {
438 /*
439 * When rescanning, we want to keep the previous startblock
440 * setting, so that rewinding a cursor doesn't generate surprising
441 * results. Reset the active syncscan setting, though.
442 */
445 else
447 }
449 {
452 }
453 else
454 {
456 scan->rs_startblock = 0;
457 }
458 }
459
461 scan->rs_inited = false;
462 scan->rs_ctup.t_data = NULL;
464 scan->rs_cbuf = InvalidBuffer;
466 scan->rs_ntuples = 0;
467 scan->rs_cindex = 0;
468
469 /*
470 * Initialize to ForwardScanDirection because it is most common and
471 * because heap scans go forward before going backward (e.g. CURSORs).
472 */
475
476 /* page-at-a-time fields are always invalid when not rs_inited */
477
478 /*
479 * copy the scan key, if appropriate
480 */
481 if (key != NULL && scan->rs_base.rs_nkeys > 0)
482 memcpy(scan->rs_base.rs_key, key, scan->rs_base.rs_nkeys * sizeof(ScanKeyData));
483
484 /*
485 * Currently, we only have a stats counter for sequential heap scans (but
486 * e.g for bitmap scans the underlying bitmap index scans will be counted,
487 * and for sample scans we update stats for tuple fetches).
488 */
489 if (scan->rs_base.rs_flags & SO_TYPE_SEQSCAN)
491}

References BAS_BULKREAD, fb(), ForwardScanDirection, FreeAccessStrategy(), GetAccessStrategy(), InvalidBlockNumber, InvalidBuffer, ItemPointerSetInvalid(), NBuffers, pgstat_count_heap_scan, ParallelTableScanDescData::phs_syncscan, RelationGetNumberOfBlocks, RelationUsesLocalBuffers, HeapScanDescData::rs_base, HeapScanDescData::rs_cblock, HeapScanDescData::rs_cbuf, HeapScanDescData::rs_cindex, HeapScanDescData::rs_ctup, HeapScanDescData::rs_dir, TableScanDescData::rs_flags, HeapScanDescData::rs_inited, TableScanDescData::rs_key, HeapScanDescData::rs_nblocks, TableScanDescData::rs_nkeys, HeapScanDescData::rs_ntuples, HeapScanDescData::rs_numblocks, TableScanDescData::rs_parallel, HeapScanDescData::rs_prefetch_block, TableScanDescData::rs_rd, HeapScanDescData::rs_startblock, HeapScanDescData::rs_strategy, SO_ALLOW_STRAT, SO_ALLOW_SYNC, SO_TYPE_SEQSCAN, ss_get_location(), synchronize_seqscans, HeapTupleData::t_data, and HeapTupleData::t_self.

Referenced by heap_beginscan(), and heap_rescan().

◆ log_heap_new_cid()

static XLogRecPtr log_heap_new_cid ( Relation  relation,
HeapTuple  tup 
)
static

Definition at line 9141 of file heapam.c.

9142{
9143 xl_heap_new_cid xlrec;
9144
9145 XLogRecPtr recptr;
9146 HeapTupleHeader hdr = tup->t_data;
9147
9148 Assert(ItemPointerIsValid(&tup->t_self));
9149 Assert(tup->t_tableOid != InvalidOid);
9150
9151 xlrec.top_xid = GetTopTransactionId();
9152 xlrec.target_locator = relation->rd_locator;
9153 xlrec.target_tid = tup->t_self;
9154
9155 /*
9156 * If the tuple got inserted & deleted in the same TX we definitely have a
9157 * combo CID, set cmin and cmax.
9158 */
9159 if (hdr->t_infomask & HEAP_COMBOCID)
9160 {
9163 xlrec.cmin = HeapTupleHeaderGetCmin(hdr);
9164 xlrec.cmax = HeapTupleHeaderGetCmax(hdr);
9165 xlrec.combocid = HeapTupleHeaderGetRawCommandId(hdr);
9166 }
9167 /* No combo CID, so only cmin or cmax can be set by this TX */
9168 else
9169 {
9170 /*
9171 * Tuple inserted.
9172 *
9173 * We need to check for LOCK ONLY because multixacts might be
9174 * transferred to the new tuple in case of FOR KEY SHARE updates in
9175 * which case there will be an xmax, although the tuple just got
9176 * inserted.
9177 */
9178 if (hdr->t_infomask & HEAP_XMAX_INVALID ||
9180 {
9182 xlrec.cmax = InvalidCommandId;
9183 }
9184 /* Tuple from a different tx updated or deleted. */
9185 else
9186 {
9187 xlrec.cmin = InvalidCommandId;
9189 }
9190 xlrec.combocid = InvalidCommandId;
9191 }
9192
9193 /*
9194 * Note that we don't need to register the buffer here, because this
9195 * operation does not modify the page. The insert/update/delete that
9196 * called us certainly did, but that's WAL-logged separately.
9197 */
9200
9201 /* will be looked at irrespective of origin */
9202
9204
9205 return recptr;
9206}

References Assert, fb(), GetTopTransactionId(), HEAP_COMBOCID, HEAP_XMAX_INVALID, HEAP_XMAX_IS_LOCKED_ONLY(), HeapTupleHeaderGetCmax(), HeapTupleHeaderGetCmin(), HeapTupleHeaderGetRawCommandId(), HeapTupleHeaderXminInvalid(), InvalidCommandId, InvalidOid, ItemPointerIsValid(), RelationData::rd_locator, SizeOfHeapNewCid, HeapTupleHeaderData::t_infomask, XLOG_HEAP2_NEW_CID, XLogBeginInsert(), XLogInsert(), and XLogRegisterData().

Referenced by heap_delete(), heap_insert(), heap_multi_insert(), and heap_update().

◆ log_heap_update()

static XLogRecPtr log_heap_update ( Relation  reln,
Buffer  oldbuf,
Buffer  newbuf,
HeapTuple  oldtup,
HeapTuple  newtup,
HeapTuple  old_key_tuple,
bool  all_visible_cleared,
bool  new_all_visible_cleared 
)
static

Definition at line 8919 of file heapam.c.

8923{
8927 uint8 info;
8929 uint16 prefixlen = 0,
8930 suffixlen = 0;
8932 Page page = BufferGetPage(newbuf);
8934 bool init;
8935 int bufflags;
8936
8937 /* Caller should not call me on a non-WAL-logged relation */
8939
8941
8943 info = XLOG_HEAP_HOT_UPDATE;
8944 else
8945 info = XLOG_HEAP_UPDATE;
8946
8947 /*
8948 * If the old and new tuple are on the same page, we only need to log the
8949 * parts of the new tuple that were changed. That saves on the amount of
8950 * WAL we need to write. Currently, we just count any unchanged bytes in
8951 * the beginning and end of the tuple. That's quick to check, and
8952 * perfectly covers the common case that only one field is updated.
8953 *
8954 * We could do this even if the old and new tuple are on different pages,
8955 * but only if we don't make a full-page image of the old page, which is
8956 * difficult to know in advance. Also, if the old tuple is corrupt for
8957 * some reason, it would allow the corruption to propagate to the new page,
8958 * so it seems best to avoid. Under the general assumption that most
8959 * updates tend to create the new tuple version on the same page, there
8960 * isn't much to be gained by doing this across pages anyway.
8961 *
8962 * Skip this if we're taking a full-page image of the new page, as we
8963 * don't include the new tuple in the WAL record in that case. Also
8964 * disable if effective_wal_level='logical', as logical decoding needs to
8965 * be able to read the new tuple in whole from the WAL record alone.
8966 */
8967 if (oldbuf == newbuf && !need_tuple_data &&
8969 {
8970 char *oldp = (char *) oldtup->t_data + oldtup->t_data->t_hoff;
8971 char *newp = (char *) newtup->t_data + newtup->t_data->t_hoff;
8972 int oldlen = oldtup->t_len - oldtup->t_data->t_hoff;
8973 int newlen = newtup->t_len - newtup->t_data->t_hoff;
8974
8975 /* Check for common prefix between old and new tuple */
8976 for (prefixlen = 0; prefixlen < Min(oldlen, newlen); prefixlen++)
8977 {
8978 if (newp[prefixlen] != oldp[prefixlen])
8979 break;
8980 }
8981
8982 /*
8983 * Storing the length of the prefix takes 2 bytes, so we need to save
8984 * at least 3 bytes or there's no point.
8985 */
8986 if (prefixlen < 3)
8987 prefixlen = 0;
8988
8989 /* Same for suffix */
8991 {
8992 if (newp[newlen - suffixlen - 1] != oldp[oldlen - suffixlen - 1])
8993 break;
8994 }
8995 if (suffixlen < 3)
8996 suffixlen = 0;
8997 }
8998
8999 /* Prepare main WAL data chain */
9000 xlrec.flags = 0;
9005 if (prefixlen > 0)
9007 if (suffixlen > 0)
9009 if (need_tuple_data)
9010 {
9012 if (old_key_tuple)
9013 {
9014 if (reln->rd_rel->relreplident == REPLICA_IDENTITY_FULL)
9016 else
9018 }
9019 }
9020
9021 /* If new tuple is the single and first tuple on page... */
9024 {
9025 info |= XLOG_HEAP_INIT_PAGE;
9026 init = true;
9027 }
9028 else
9029 init = false;
9030
9031 /* Prepare WAL data for the old page */
9032 xlrec.old_offnum = ItemPointerGetOffsetNumber(&oldtup->t_self);
9033 xlrec.old_xmax = HeapTupleHeaderGetRawXmax(oldtup->t_data);
9034 xlrec.old_infobits_set = compute_infobits(oldtup->t_data->t_infomask,
9035 oldtup->t_data->t_infomask2);
9036
9037 /* Prepare WAL data for the new page */
9038 xlrec.new_offnum = ItemPointerGetOffsetNumber(&newtup->t_self);
9039 xlrec.new_xmax = HeapTupleHeaderGetRawXmax(newtup->t_data);
9040
9042 if (init)
9044 if (need_tuple_data)
9046
9048 if (oldbuf != newbuf)
9050
9052
9053 /*
9054 * Prepare WAL data for the new tuple.
9055 */
9056 if (prefixlen > 0 || suffixlen > 0)
9057 {
9058 if (prefixlen > 0 && suffixlen > 0)
9059 {
9062 XLogRegisterBufData(0, &prefix_suffix, sizeof(uint16) * 2);
9063 }
9064 else if (prefixlen > 0)
9065 {
9066 XLogRegisterBufData(0, &prefixlen, sizeof(uint16));
9067 }
9068 else
9069 {
9070 XLogRegisterBufData(0, &suffixlen, sizeof(uint16));
9071 }
9072 }
9073
9074 xlhdr.t_infomask2 = newtup->t_data->t_infomask2;
9075 xlhdr.t_infomask = newtup->t_data->t_infomask;
9076 xlhdr.t_hoff = newtup->t_data->t_hoff;
9078
9079 /*
9080 * PG73FORMAT: write bitmap [+ padding] [+ oid] + data
9081 *
9082 * The 'data' doesn't include the common prefix or suffix.
9083 */
9085 if (prefixlen == 0)
9086 {
9088 (char *) newtup->t_data + SizeofHeapTupleHeader,
9090 }
9091 else
9092 {
9093 /*
9094 * Have to write the null bitmap and data after the common prefix as
9095 * two separate rdata entries.
9096 */
9097 /* bitmap [+ padding] [+ oid] */
9098 if (newtup->t_data->t_hoff - SizeofHeapTupleHeader > 0)
9099 {
9101 (char *) newtup->t_data + SizeofHeapTupleHeader,
9102 newtup->t_data->t_hoff - SizeofHeapTupleHeader);
9103 }
9104
9105 /* data after common prefix */
9107 (char *) newtup->t_data + newtup->t_data->t_hoff + prefixlen,
9108 newtup->t_len - newtup->t_data->t_hoff - prefixlen - suffixlen);
9109 }
9110
9111 /* We need to log a tuple identity */
9113 {
9114 /* don't really need this, but it's more comfortable to decode */
9115 xlhdr_idx.t_infomask2 = old_key_tuple->t_data->t_infomask2;
9116 xlhdr_idx.t_infomask = old_key_tuple->t_data->t_infomask;
9117 xlhdr_idx.t_hoff = old_key_tuple->t_data->t_hoff;
9118
9120
9121 /* PG73FORMAT: write bitmap [+ padding] [+ oid] + data */
9124 }
9125
9126 /* filtering by origin on a row level is much more efficient */
9128
9129 recptr = XLogInsert(RM_HEAP_ID, info);
9130
9131 return recptr;
9132}

References Assert, BufferGetPage(), compute_infobits(), fb(), FirstOffsetNumber, HeapTupleHeaderGetRawXmax(), HeapTupleIsHeapOnly(), init, ItemPointerGetOffsetNumber(), Min, PageGetMaxOffsetNumber(), REGBUF_KEEP_DATA, REGBUF_STANDARD, REGBUF_WILL_INIT, RelationIsLogicallyLogged, RelationNeedsWAL, SizeOfHeapHeader, SizeofHeapTupleHeader, SizeOfHeapUpdate, XLH_UPDATE_CONTAINS_NEW_TUPLE, XLH_UPDATE_CONTAINS_OLD_KEY, XLH_UPDATE_CONTAINS_OLD_TUPLE, XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED, XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED, XLH_UPDATE_PREFIX_FROM_OLD, XLH_UPDATE_SUFFIX_FROM_OLD, XLOG_HEAP_HOT_UPDATE, XLOG_HEAP_INIT_PAGE, XLOG_HEAP_UPDATE, XLOG_INCLUDE_ORIGIN, XLogBeginInsert(), XLogCheckBufferNeedsBackup(), XLogInsert(), XLogRegisterBufData(), XLogRegisterBuffer(), XLogRegisterData(), and XLogSetRecordFlags().

Referenced by heap_update().

◆ log_heap_visible()

XLogRecPtr log_heap_visible ( Relation  rel,
Buffer  heap_buffer,
Buffer  vm_buffer,
TransactionId  snapshotConflictHorizon,
uint8  vmflags 
)

◆ MultiXactIdGetUpdateXid()

static TransactionId MultiXactIdGetUpdateXid ( TransactionId  xmax,
uint16  t_infomask 
)
static

Definition at line 7608 of file heapam.c.

7609{
7610 TransactionId update_xact = InvalidTransactionId;
7611 MultiXactMember *members;
7612 int nmembers;
7613
7614 Assert(!(t_infomask & HEAP_XMAX_LOCK_ONLY));
7615 Assert(t_infomask & HEAP_XMAX_IS_MULTI);
7616
7617 /*
7618 * Since we know the LOCK_ONLY bit is not set, this cannot be a multi from
7619 * pre-pg_upgrade.
7620 */
7621 nmembers = GetMultiXactIdMembers(xmax, &members, false, false);
7622
7623 if (nmembers > 0)
7624 {
7625 int i;
7626
7627 for (i = 0; i < nmembers; i++)
7628 {
7629 /* Ignore lockers */
7630 if (!ISUPDATE_from_mxstatus(members[i].status))
7631 continue;
7632
7633 /* there can be at most one updater */
7634 Assert(update_xact == InvalidTransactionId);
7635 update_xact = members[i].xid;
7636#ifndef USE_ASSERT_CHECKING
7637
7638 /*
7639 * in an assert-enabled build, walk the whole array to ensure
7640 * there's no other updater.
7641 */
7642 break;
7643#endif
7644 }
7645
7646 pfree(members);
7647 }
7648
7649 return update_xact;
7650}

References Assert, fb(), GetMultiXactIdMembers(), HEAP_XMAX_IS_MULTI, HEAP_XMAX_LOCK_ONLY, i, InvalidTransactionId, ISUPDATE_from_mxstatus, pfree(), and MultiXactMember::xid.

Referenced by compute_new_xmax_infomask(), FreezeMultiXactId(), heap_lock_updated_tuple(), and HeapTupleGetUpdateXid().

◆ MultiXactIdWait()

static void MultiXactIdWait ( MultiXactId  multi,
MultiXactStatus  status,
uint16  infomask,
Relation  rel,
const ItemPointerData ctid,
XLTW_Oper  oper,
int *  remaining 
)
static

Definition at line 7854 of file heapam.c.

7857{
7858 (void) Do_MultiXactIdWait(multi, status, infomask, false,
7859 rel, ctid, oper, remaining, false);
7860}

References Do_MultiXactIdWait(), fb(), oper(), and remaining.

Referenced by heap_delete(), heap_inplace_lock(), heap_lock_tuple(), and heap_update().

◆ page_collect_tuples()

static pg_attribute_always_inline int page_collect_tuples ( HeapScanDesc  scan,
Snapshot  snapshot,
Page  page,
Buffer  buffer,
BlockNumber  block,
int  lines,
bool  all_visible,
bool  check_serializable 
)
static

Definition at line 522 of file heapam.c.

526{
527 Oid relid = RelationGetRelid(scan->rs_base.rs_rd);
528 int ntup = 0;
529 int nvis = 0;
531
532 /* page at a time should have been disabled otherwise */
533 Assert(IsMVCCSnapshot(snapshot));
534
535 /* first find all tuples on the page */
537 {
540
542 continue;
543
544 /*
545 * If the page is not all-visible or we need to check serializability,
546 * maintain enough state to be able to refind the tuple efficiently,
547 * without again first needing to fetch the item and then via that the
548 * tuple.
549 */
550 if (!all_visible || check_serializable)
551 {
552 tup = &batchmvcc.tuples[ntup];
553
555 tup->t_len = ItemIdGetLength(lpp);
556 tup->t_tableOid = relid;
557 ItemPointerSet(&(tup->t_self), block, lineoff);
558 }
559
560 /*
561 * If the page is all visible, these fields otherwise won't be
562 * populated in loop below.
563 */
564 if (all_visible)
565 {
567 {
568 batchmvcc.visible[ntup] = true;
569 }
570 scan->rs_vistuples[ntup] = lineoff;
571 }
572
573 ntup++;
574 }
575
577
578 /*
579 * Unless the page is all visible, test visibility for all tuples in one go.
580 * That is considerably more efficient than calling
581 * HeapTupleSatisfiesMVCC() one-by-one.
582 */
583 if (all_visible)
584 nvis = ntup;
585 else
586 nvis = HeapTupleSatisfiesMVCCBatch(snapshot, buffer,
587 ntup,
588 &batchmvcc,
589 scan->rs_vistuples);
590
591 /*
592 * So far we don't have a batch API for testing serializability, so do so
593 * one-by-one.
594 */
596 {
597 for (int i = 0; i < ntup; i++)
598 {
600 scan->rs_base.rs_rd,
601 &batchmvcc.tuples[i],
602 buffer, snapshot);
603 }
604 }
605
606 return nvis;
607}

References Assert, fb(), FirstOffsetNumber, HeapCheckForSerializableConflictOut(), HeapTupleSatisfiesMVCCBatch(), i, IsMVCCSnapshot, ItemIdGetLength, ItemIdIsNormal, ItemPointerSet(), MaxHeapTuplesPerPage, PageGetItem(), PageGetItemId(), RelationGetRelid, HeapScanDescData::rs_base, TableScanDescData::rs_rd, HeapScanDescData::rs_vistuples, HeapTupleData::t_data, and unlikely.

Referenced by heap_prepare_pagescan().

◆ ReleaseBulkInsertStatePin()

void ReleaseBulkInsertStatePin ( BulkInsertState  bistate)

Definition at line 2104 of file heapam.c.

2105{
2106 if (bistate->current_buf != InvalidBuffer)
2107 ReleaseBuffer(bistate->current_buf);
2108 bistate->current_buf = InvalidBuffer;
2109
2110 /*
2111 * Despite the name, we also reset bulk relation extension state.
2112 * Otherwise we can end up erroring out due to looking for free space in
2113 * ->next_free of one partition, even though ->next_free was set when
2114 * extending another partition. It could obviously also be bad for
2115 * efficiency to look at existing blocks at offsets from another
2116 * partition, even if we don't error out.
2117 */
2118 bistate->next_free = InvalidBlockNumber;
2119 bistate->last_free = InvalidBlockNumber;
2120}

References BulkInsertStateData::current_buf, InvalidBlockNumber, InvalidBuffer, BulkInsertStateData::last_free, BulkInsertStateData::next_free, and ReleaseBuffer().

Referenced by CopyFrom().

◆ simple_heap_delete()

void simple_heap_delete ( Relation  relation,
const ItemPointerData tid 
)

Definition at line 3266 of file heapam.c.

3267{
3268 TM_Result result;
3269 TM_FailureData tmfd;
3270
3271 result = heap_delete(relation, tid,
3272 GetCurrentCommandId(true), InvalidSnapshot,
3273 true /* wait for commit */ ,
3274 &tmfd, false /* changingPart */ );
3275 switch (result)
3276 {
3277 case TM_SelfModified:
3278 /* Tuple was already updated in current command? */
3279 elog(ERROR, "tuple already updated by self");
3280 break;
3281
3282 case TM_Ok:
3283 /* done successfully */
3284 break;
3285
3286 case TM_Updated:
3287 elog(ERROR, "tuple concurrently updated");
3288 break;
3289
3290 case TM_Deleted:
3291 elog(ERROR, "tuple concurrently deleted");
3292 break;
3293
3294 default:
3295 elog(ERROR, "unrecognized heap_delete status: %u", result);
3296 break;
3297 }
3298}

References elog, ERROR, GetCurrentCommandId(), heap_delete(), InvalidSnapshot, TM_Deleted, TM_Ok, TM_SelfModified, and TM_Updated.

Referenced by CatalogTupleDelete(), and toast_delete_datum().

◆ simple_heap_insert()

void simple_heap_insert ( Relation  relation,
HeapTuple  tup 
)

Definition at line 2785 of file heapam.c.

2786{
2787 heap_insert(relation, tup, GetCurrentCommandId(true), 0, NULL);
2788}

References fb(), GetCurrentCommandId(), and heap_insert().

Referenced by CatalogTupleInsert(), CatalogTupleInsertWithInfo(), and InsertOneTuple().

◆ simple_heap_update()

void simple_heap_update ( Relation  relation,
const ItemPointerData otid,
HeapTuple  tup,
TU_UpdateIndexes update_indexes 
)

Definition at line 4556 of file heapam.c.

4558{
4559 TM_Result result;
4560 TM_FailureData tmfd;
4561 LockTupleMode lockmode;
4562
4563 result = heap_update(relation, otid, tup,
4564 GetCurrentCommandId(true), InvalidSnapshot,
4565 true /* wait for commit */ ,
4566 &tmfd, &lockmode, update_indexes);
4567 switch (result)
4568 {
4569 case TM_SelfModified:
4570 /* Tuple was already updated in current command? */
4571 elog(ERROR, "tuple already updated by self");
4572 break;
4573
4574 case TM_Ok:
4575 /* done successfully */
4576 break;
4577
4578 case TM_Updated:
4579 elog(ERROR, "tuple concurrently updated");
4580 break;
4581
4582 case TM_Deleted:
4583 elog(ERROR, "tuple concurrently deleted");
4584 break;
4585
4586 default:
4587 elog(ERROR, "unrecognized heap_update status: %u", result);
4588 break;
4589 }
4590}

References elog, ERROR, fb(), GetCurrentCommandId(), heap_update(), InvalidSnapshot, TM_Deleted, TM_Ok, TM_SelfModified, and TM_Updated.

Referenced by CatalogTupleUpdate(), and CatalogTupleUpdateWithInfo().

◆ test_lockmode_for_conflict()

static TM_Result test_lockmode_for_conflict ( MultiXactStatus  status,
TransactionId  xid,
LockTupleMode  mode,
HeapTuple  tup,
bool *  needwait 
)
static

Definition at line 5676 of file heapam.c.

5679{
5680 MultiXactStatus wantedstatus;
5681
5682 *needwait = false;
5683 wantedstatus = get_mxact_status_for_lock(mode, false);
5684
5685 /*
5686 * Note: we *must* check TransactionIdIsInProgress before
5687 * TransactionIdDidAbort/Commit; see comment at top of heapam_visibility.c
5688 * for an explanation.
5689 */
5690 if (TransactionIdIsCurrentTransactionId(xid))
5691 {
5692 /*
5693 * The tuple has already been locked by our own transaction. This is
5694 * very rare but can happen if multiple transactions are trying to
5695 * lock an ancient version of the same tuple.
5696 */
5697 return TM_SelfModified;
5698 }
5699 else if (TransactionIdIsInProgress(xid))
5700 {
5701 /*
5702 * If the locking transaction is running, what we do depends on
5703 * whether the lock modes conflict: if they do, then we must wait for
5704 * it to finish; otherwise we can fall through to lock this tuple
5705 * version without waiting.
5706 */
5707 if (DoLockModesConflict(LOCKMODE_from_mxstatus(status),
5708 LOCKMODE_from_mxstatus(wantedstatus)))
5709 {
5710 *needwait = true;
5711 }
5712
5713 /*
5714 * If we set needwait above, then this value doesn't matter;
5715 * otherwise, this value signals to caller that it's okay to proceed.
5716 */
5717 return TM_Ok;
5718 }
5719 else if (TransactionIdDidAbort(xid))
5720 return TM_Ok;
5721 else if (TransactionIdDidCommit(xid))
5722 {
5723 /*
5724 * The other transaction committed. If it was only a locker, then the
5725 * lock is completely gone now and we can return success; but if it
5726 * was an update, then what we do depends on whether the two lock
5727 * modes conflict. If they conflict, then we must report error to
5728 * caller. But if they don't, we can fall through to allow the current
5729 * transaction to lock the tuple.
5730 *
5731 * Note: the reason we worry about ISUPDATE here is because as soon as
5732 * a transaction ends, all its locks are gone and meaningless, and
5733 * thus we can ignore them; whereas its updates persist. In the
5734 * TransactionIdIsInProgress case, above, we don't need to check
5735 * because we know the lock is still "alive" and thus a conflict needs
5736 * always be checked.
5737 */
5738 if (!ISUPDATE_from_mxstatus(status))
5739 return TM_Ok;
5740
5741 if (DoLockModesConflict(LOCKMODE_from_mxstatus(status),
5742 LOCKMODE_from_mxstatus(wantedstatus)))
5743 {
5744 /* bummer */
5745 if (!ItemPointerEquals(&tup->t_self, &tup->t_data->t_ctid))
5746 return TM_Updated;
5747 else
5748 return TM_Deleted;
5749 }
5750
5751 return TM_Ok;
5752 }
5753
5754 /* Not in progress, not aborted, not committed -- must have crashed */
5755 return TM_Ok;
5756}

References DoLockModesConflict(), fb(), get_mxact_status_for_lock(), ISUPDATE_from_mxstatus, ItemPointerEquals(), LOCKMODE_from_mxstatus, mode, TM_Deleted, TM_Ok, TM_SelfModified, TM_Updated, TransactionIdDidAbort(), TransactionIdDidCommit(), TransactionIdIsCurrentTransactionId(), and TransactionIdIsInProgress().

Referenced by heap_lock_updated_tuple_rec().

◆ UpdateXmaxHintBits()

◆ xmax_infomask_changed()

static bool xmax_infomask_changed ( uint16  new_infomask,
uint16  old_infomask 
)
inlinestatic

Definition at line 2820 of file heapam.c.

2821{
2822 const uint16 interesting =
2823 HEAP_XMAX_IS_MULTI | HEAP_XMAX_LOCK_ONLY | HEAP_LOCK_MASK;
2824
2825 if ((new_infomask & interesting) != (old_infomask & interesting))
2826 return true;
2827
2828 return false;
2829}

References fb(), HEAP_LOCK_MASK, HEAP_XMAX_IS_MULTI, and HEAP_XMAX_LOCK_ONLY.

Referenced by heap_delete(), heap_lock_tuple(), and heap_update().

Variable Documentation

◆ hwlock

LOCKMODE hwlock

Definition at line 128 of file heapam.c.

◆ lockstatus

int lockstatus

Definition at line 129 of file heapam.c.

◆ MultiXactStatusLock

const int MultiXactStatusLock[MaxMultiXactStatus+1]
static
Initial value:

Definition at line 207 of file heapam.c.

208{
209 LockTupleKeyShare, /* ForKeyShare */
210 LockTupleShare, /* ForShare */
211 LockTupleNoKeyExclusive, /* ForNoKeyUpdate */
212 LockTupleExclusive, /* ForUpdate */
213 LockTupleNoKeyExclusive, /* NoKeyUpdate */
214 LockTupleExclusive /* Update */
215};

◆ [struct]

const struct { ... } tupleLockExtraInfo[]
Initial value:
=
{
.hwlock = AccessShareLock,
.updstatus = -1
},
.hwlock = RowShareLock,
.lockstatus = MultiXactStatusForShare,
.updstatus = -1
},
.hwlock = ExclusiveLock,
},
.lockstatus = MultiXactStatusForUpdate,
.updstatus = MultiXactStatusUpdate
}
}
#define AccessExclusiveLock
Definition lockdefs.h:43
#define ExclusiveLock
Definition lockdefs.h:42
#define RowShareLock
Definition lockdefs.h:37

Referenced by DoesMultiXactIdConflict(), and get_mxact_status_for_lock().

◆ updstatus

int updstatus

Definition at line 130 of file heapam.c.