PostgreSQL Source Code git master
Loading...
Searching...
No Matches
heapam.c File Reference
#include "postgres.h"
#include "access/heapam.h"
#include "access/heaptoast.h"
#include "access/hio.h"
#include "access/multixact.h"
#include "access/subtrans.h"
#include "access/syncscan.h"
#include "access/valid.h"
#include "access/visibilitymap.h"
#include "access/xloginsert.h"
#include "catalog/pg_database.h"
#include "catalog/pg_database_d.h"
#include "commands/vacuum.h"
#include "pgstat.h"
#include "port/pg_bitutils.h"
#include "storage/lmgr.h"
#include "storage/predicate.h"
#include "storage/proc.h"
#include "storage/procarray.h"
#include "utils/datum.h"
#include "utils/injection_point.h"
#include "utils/inval.h"
#include "utils/spccache.h"
#include "utils/syscache.h"
Include dependency graph for heapam.c:

Go to the source code of this file.

Data Structures

struct  IndexDeleteCounts
 

Macros

#define LOCKMODE_from_mxstatus(status)    (tupleLockExtraInfo[TUPLOCK_from_mxstatus((status))].hwlock)
 
#define LockTupleTuplock(rel, tup, mode)    LockTuple((rel), (tup), tupleLockExtraInfo[mode].hwlock)
 
#define UnlockTupleTuplock(rel, tup, mode)    UnlockTuple((rel), (tup), tupleLockExtraInfo[mode].hwlock)
 
#define ConditionalLockTupleTuplock(rel, tup, mode, log)    ConditionalLockTuple((rel), (tup), tupleLockExtraInfo[mode].hwlock, (log))
 
#define BOTTOMUP_MAX_NBLOCKS   6
 
#define BOTTOMUP_TOLERANCE_NBLOCKS   3
 
#define TUPLOCK_from_mxstatus(status)    (MultiXactStatusLock[(status)])
 
#define FRM_NOOP   0x0001
 
#define FRM_INVALIDATE_XMAX   0x0002
 
#define FRM_RETURN_IS_XID   0x0004
 
#define FRM_RETURN_IS_MULTI   0x0008
 
#define FRM_MARK_COMMITTED   0x0010
 

Typedefs

typedef struct IndexDeleteCounts IndexDeleteCounts
 

Functions

static HeapTuple heap_prepare_insert (Relation relation, HeapTuple tup, TransactionId xid, CommandId cid, int options)
 
static XLogRecPtr log_heap_update (Relation reln, Buffer oldbuf, Buffer newbuf, HeapTuple oldtup, HeapTuple newtup, HeapTuple old_key_tuple, bool all_visible_cleared, bool new_all_visible_cleared)
 
static Bitmapset * HeapDetermineColumnsInfo (Relation relation, Bitmapset *interesting_cols, Bitmapset *external_cols, HeapTuple oldtup, HeapTuple newtup, bool *has_external)
 
static bool heap_acquire_tuplock (Relation relation, const ItemPointerData *tid, LockTupleMode mode, LockWaitPolicy wait_policy, bool *have_tuple_lock)
 
static BlockNumber heapgettup_advance_block (HeapScanDesc scan, BlockNumber block, ScanDirection dir)
 
static pg_noinline BlockNumber heapgettup_initial_block (HeapScanDesc scan, ScanDirection dir)
 
static void compute_new_xmax_infomask (TransactionId xmax, uint16 old_infomask, uint16 old_infomask2, TransactionId add_to_xmax, LockTupleMode mode, bool is_update, TransactionId *result_xmax, uint16 *result_infomask, uint16 *result_infomask2)
 
static TM_Result heap_lock_updated_tuple (Relation rel, uint16 prior_infomask, TransactionId prior_raw_xmax, const ItemPointerData *prior_ctid, TransactionId xid, LockTupleMode mode)
 
static void GetMultiXactIdHintBits (MultiXactId multi, uint16 *new_infomask, uint16 *new_infomask2)
 
static TransactionId MultiXactIdGetUpdateXid (TransactionId xmax, uint16 t_infomask)
 
static bool DoesMultiXactIdConflict (MultiXactId multi, uint16 infomask, LockTupleMode lockmode, bool *current_is_member)
 
static void MultiXactIdWait (MultiXactId multi, MultiXactStatus status, uint16 infomask, Relation rel, const ItemPointerData *ctid, XLTW_Oper oper, int *remaining)
 
static bool ConditionalMultiXactIdWait (MultiXactId multi, MultiXactStatus status, uint16 infomask, Relation rel, int *remaining, bool logLockFailure)
 
static void index_delete_sort (TM_IndexDeleteOp *delstate)
 
static int bottomup_sort_and_shrink (TM_IndexDeleteOp *delstate)
 
static XLogRecPtr log_heap_new_cid (Relation relation, HeapTuple tup)
 
static HeapTuple ExtractReplicaIdentity (Relation relation, HeapTuple tp, bool key_required, bool *copy)
 
static void AssertHasSnapshotForToast (Relation rel)
 
static BlockNumber heap_scan_stream_read_next_parallel (ReadStream *stream, void *callback_private_data, void *per_buffer_data)
 
static BlockNumber heap_scan_stream_read_next_serial (ReadStream *stream, void *callback_private_data, void *per_buffer_data)
 
static BlockNumber bitmapheap_stream_read_next (ReadStream *pgsr, void *private_data, void *per_buffer_data)
 
static void initscan (HeapScanDesc scan, ScanKey key, bool keep_startblock)
 
void heap_setscanlimits (TableScanDesc sscan, BlockNumber startBlk, BlockNumber numBlks)
 
static pg_attribute_always_inline int page_collect_tuples (HeapScanDesc scan, Snapshot snapshot, Page page, Buffer buffer, BlockNumber block, int lines, bool all_visible, bool check_serializable)
 
void heap_prepare_pagescan (TableScanDesc sscan)
 
static void heap_fetch_next_buffer (HeapScanDesc scan, ScanDirection dir)
 
static Page heapgettup_start_page (HeapScanDesc scan, ScanDirection dir, int *linesleft, OffsetNumber *lineoff)
 
static Page heapgettup_continue_page (HeapScanDesc scan, ScanDirection dir, int *linesleft, OffsetNumber *lineoff)
 
static void heapgettup (HeapScanDesc scan, ScanDirection dir, int nkeys, ScanKey key)
 
static void heapgettup_pagemode (HeapScanDesc scan, ScanDirection dir, int nkeys, ScanKey key)
 
TableScanDesc heap_beginscan (Relation relation, Snapshot snapshot, int nkeys, ScanKey key, ParallelTableScanDesc parallel_scan, uint32 flags)
 
void heap_rescan (TableScanDesc sscan, ScanKey key, bool set_params, bool allow_strat, bool allow_sync, bool allow_pagemode)
 
void heap_endscan (TableScanDesc sscan)
 
HeapTuple heap_getnext (TableScanDesc sscan, ScanDirection direction)
 
bool heap_getnextslot (TableScanDesc sscan, ScanDirection direction, TupleTableSlot *slot)
 
void heap_set_tidrange (TableScanDesc sscan, ItemPointer mintid, ItemPointer maxtid)
 
bool heap_getnextslot_tidrange (TableScanDesc sscan, ScanDirection direction, TupleTableSlot *slot)
 
bool heap_fetch (Relation relation, Snapshot snapshot, HeapTuple tuple, Buffer *userbuf, bool keep_buf)
 
bool heap_hot_search_buffer (ItemPointer tid, Relation relation, Buffer buffer, Snapshot snapshot, HeapTuple heapTuple, bool *all_dead, bool first_call)
 
void heap_get_latest_tid (TableScanDesc sscan, ItemPointer tid)
 
static void UpdateXmaxHintBits (HeapTupleHeader tuple, Buffer buffer, TransactionId xid)
 
BulkInsertState GetBulkInsertState (void)
 
void FreeBulkInsertState (BulkInsertState bistate)
 
void ReleaseBulkInsertStatePin (BulkInsertState bistate)
 
void heap_insert (Relation relation, HeapTuple tup, CommandId cid, int options, BulkInsertState bistate)
 
static int heap_multi_insert_pages (HeapTuple *heaptuples, int done, int ntuples, Size saveFreeSpace)
 
void heap_multi_insert (Relation relation, TupleTableSlot **slots, int ntuples, CommandId cid, int options, BulkInsertState bistate)
 
void simple_heap_insert (Relation relation, HeapTuple tup)
 
static uint8 compute_infobits (uint16 infomask, uint16 infomask2)
 
static bool xmax_infomask_changed (uint16 new_infomask, uint16 old_infomask)
 
TM_Result heap_delete (Relation relation, const ItemPointerData *tid, CommandId cid, Snapshot crosscheck, bool wait, TM_FailureData *tmfd, bool changingPart)
 
void simple_heap_delete (Relation relation, const ItemPointerData *tid)
 
TM_Result heap_update (Relation relation, const ItemPointerData *otid, HeapTuple newtup, CommandId cid, Snapshot crosscheck, bool wait, TM_FailureData *tmfd, LockTupleMode *lockmode, TU_UpdateIndexes *update_indexes)
 
static bool heap_attr_equals (TupleDesc tupdesc, int attrnum, Datum value1, Datum value2, bool isnull1, bool isnull2)
 
void simple_heap_update (Relation relation, const ItemPointerData *otid, HeapTuple tup, TU_UpdateIndexes *update_indexes)
 
static MultiXactStatus get_mxact_status_for_lock (LockTupleMode mode, bool is_update)
 
TM_Result heap_lock_tuple (Relation relation, HeapTuple tuple, CommandId cid, LockTupleMode mode, LockWaitPolicy wait_policy, bool follow_updates, Buffer *buffer, TM_FailureData *tmfd)
 
static TM_Result test_lockmode_for_conflict (MultiXactStatus status, TransactionId xid, LockTupleMode mode, HeapTuple tup, bool *needwait)
 
static TM_Result heap_lock_updated_tuple_rec (Relation rel, TransactionId priorXmax, const ItemPointerData *tid, TransactionId xid, LockTupleMode mode)
 
void heap_finish_speculative (Relation relation, const ItemPointerData *tid)
 
void heap_abort_speculative (Relation relation, const ItemPointerData *tid)
 
bool heap_inplace_lock (Relation relation, HeapTuple oldtup_ptr, Buffer buffer, void(*release_callback)(void *), void *arg)
 
void heap_inplace_update_and_unlock (Relation relation, HeapTuple oldtup, HeapTuple tuple, Buffer buffer)
 
void heap_inplace_unlock (Relation relation, HeapTuple oldtup, Buffer buffer)
 
static TransactionId FreezeMultiXactId (MultiXactId multi, uint16 t_infomask, const struct VacuumCutoffs *cutoffs, uint16 *flags, HeapPageFreeze *pagefrz)
 
bool heap_prepare_freeze_tuple (HeapTupleHeader tuple, const struct VacuumCutoffs *cutoffs, HeapPageFreeze *pagefrz, HeapTupleFreeze *frz, bool *totally_frozen)
 
void heap_pre_freeze_checks (Buffer buffer, HeapTupleFreeze *tuples, int ntuples)
 
void heap_freeze_prepared_tuples (Buffer buffer, HeapTupleFreeze *tuples, int ntuples)
 
bool heap_freeze_tuple (HeapTupleHeader tuple, TransactionId relfrozenxid, TransactionId relminmxid, TransactionId FreezeLimit, TransactionId MultiXactCutoff)
 
TransactionId HeapTupleGetUpdateXid (const HeapTupleHeaderData *tup)
 
static bool Do_MultiXactIdWait (MultiXactId multi, MultiXactStatus status, uint16 infomask, bool nowait, Relation rel, const ItemPointerData *ctid, XLTW_Oper oper, int *remaining, bool logLockFailure)
 
bool heap_tuple_needs_eventual_freeze (HeapTupleHeader tuple)
 
bool heap_tuple_should_freeze (HeapTupleHeader tuple, const struct VacuumCutoffs *cutoffs, TransactionId *NoFreezePageRelfrozenXid, MultiXactId *NoFreezePageRelminMxid)
 
void HeapTupleHeaderAdvanceConflictHorizon (HeapTupleHeader tuple, TransactionId *snapshotConflictHorizon)
 
static void index_delete_check_htid (TM_IndexDeleteOp *delstate, Page page, OffsetNumber maxoff, const ItemPointerData *htid, TM_IndexStatus *istatus)
 
TransactionId heap_index_delete_tuples (Relation rel, TM_IndexDeleteOp *delstate)
 
static int index_delete_sort_cmp (TM_IndexDelete *deltid1, TM_IndexDelete *deltid2)
 
static int bottomup_nblocksfavorable (IndexDeleteCounts *blockgroups, int nblockgroups, TM_IndexDelete *deltids)
 
static int bottomup_sort_and_shrink_cmp (const void *arg1, const void *arg2)
 
XLogRecPtr log_heap_visible (Relation rel, Buffer heap_buffer, Buffer vm_buffer, TransactionId snapshotConflictHorizon, uint8 vmflags)
 
void HeapCheckForSerializableConflictOut (bool visible, Relation relation, HeapTuple tuple, Buffer buffer, Snapshot snapshot)
 

Variables

struct { 
 
   LOCKMODE   hwlock 
 
   int   lockstatus 
 
   int   updstatus 
 
}  tupleLockExtraInfo [] 
 
static const int MultiXactStatusLock [MaxMultiXactStatus+1]
 

Macro Definition Documentation

◆ BOTTOMUP_MAX_NBLOCKS

#define BOTTOMUP_MAX_NBLOCKS   6

Definition at line 189 of file heapam.c.

◆ BOTTOMUP_TOLERANCE_NBLOCKS

#define BOTTOMUP_TOLERANCE_NBLOCKS   3

Definition at line 190 of file heapam.c.

◆ ConditionalLockTupleTuplock

#define ConditionalLockTupleTuplock (   rel,
  tup,
  mode,
  log 
)     ConditionalLockTuple((rel), (tup), tupleLockExtraInfo[mode].hwlock, (log))

Definition at line 171 of file heapam.c.

179{
181 int next_item;
182 int ndeltids;
183 TM_IndexDelete *deltids;
185#endif
186
187/* heap_index_delete_tuples bottom-up index deletion costing constants */
188#define BOTTOMUP_MAX_NBLOCKS 6
189#define BOTTOMUP_TOLERANCE_NBLOCKS 3
190
191/*
192 * heap_index_delete_tuples uses this when determining which heap blocks it
193 * must visit to help its bottom-up index deletion caller
194 */
195typedef struct IndexDeleteCounts
196{
197 int16 npromisingtids; /* Number of "promising" TIDs in group */
198 int16 ntids; /* Number of TIDs in group */
199 int16 ifirsttid; /* Offset to group's first deltid */
201
202/*
203 * This table maps tuple lock strength values for each particular
204 * MultiXactStatus value.
205 */
206static const int MultiXactStatusLock[MaxMultiXactStatus + 1] =
207{
208 LockTupleKeyShare, /* ForKeyShare */
209 LockTupleShare, /* ForShare */
210 LockTupleNoKeyExclusive, /* ForNoKeyUpdate */
211 LockTupleExclusive, /* ForUpdate */
212 LockTupleNoKeyExclusive, /* NoKeyUpdate */
213 LockTupleExclusive /* Update */
214};
215
216/* Get the LockTupleMode for a given MultiXactStatus */
217#define TUPLOCK_from_mxstatus(status) \
218 (MultiXactStatusLock[(status)])
219
220/*
221 * Check that we have a valid snapshot if we might need TOAST access.
222 */
223static inline void
225{
226#ifdef USE_ASSERT_CHECKING
227
228 /* bootstrap mode in particular breaks this rule */
230 return;
231
232 /* if the relation doesn't have a TOAST table, we are good */
233 if (!OidIsValid(rel->rd_rel->reltoastrelid))
234 return;
235
237
238#endif /* USE_ASSERT_CHECKING */
239}
240
241/* ----------------------------------------------------------------
242 * heap support routines
243 * ----------------------------------------------------------------
244 */
245
246/*
247 * Streaming read API callback for parallel sequential scans. Returns the next
248 * block the caller wants from the read stream or InvalidBlockNumber when done.
249 */
250static BlockNumber
252 void *callback_private_data,
253 void *per_buffer_data)
254{
255 HeapScanDesc scan = (HeapScanDesc) callback_private_data;
256
259
260 if (unlikely(!scan->rs_inited))
261 {
262 /* parallel scan */
266 scan->rs_startblock,
267 scan->rs_numblocks);
268
269 /* may return InvalidBlockNumber if there are no more blocks */
273 scan->rs_inited = true;
274 }
275 else
276 {
279 scan->rs_base.rs_parallel);
280 }
281
282 return scan->rs_prefetch_block;
283}
284
285/*
286 * Streaming read API callback for serial sequential and TID range scans.
287 * Returns the next block the caller wants from the read stream or
288 * InvalidBlockNumber when done.
289 */
290static BlockNumber
292 void *callback_private_data,
293 void *per_buffer_data)
294{
295 HeapScanDesc scan = (HeapScanDesc) callback_private_data;
296
297 if (unlikely(!scan->rs_inited))
298 {
300 scan->rs_inited = true;
301 }
302 else
304 scan->rs_prefetch_block,
305 scan->rs_dir);
306
307 return scan->rs_prefetch_block;
308}
309
310/*
311 * Read stream API callback for bitmap heap scans.
312 * Returns the next block the caller wants from the read stream or
313 * InvalidBlockNumber when done.
314 */
315static BlockNumber
316bitmapheap_stream_read_next(ReadStream *pgsr, void *private_data,
317 void *per_buffer_data)
318{
319 TBMIterateResult *tbmres = per_buffer_data;
322 TableScanDesc sscan = &hscan->rs_base;
323
324 for (;;)
325 {
327
328 /* no more entries in the bitmap */
329 if (!tbm_iterate(&sscan->st.rs_tbmiterator, tbmres))
330 return InvalidBlockNumber;
331
332 /*
333 * Ignore any claimed entries past what we think is the end of the
334 * relation. It may have been extended after the start of our scan (we
335 * only hold an AccessShareLock, and it could be inserts from this
336 * backend). We don't take this optimization in SERIALIZABLE
337 * isolation though, as we need to examine all invisible tuples
338 * reachable by the index.
339 */
341 tbmres->blockno >= hscan->rs_nblocks)
342 continue;
343
344 return tbmres->blockno;
345 }
346
347 /* not reachable */
348 Assert(false);
349}
350
351/* ----------------
352 * initscan - scan code common to heap_beginscan and heap_rescan
353 * ----------------
354 */
355static void
357{
359 bool allow_strat;
360 bool allow_sync;
361
362 /*
363 * Determine the number of blocks we have to scan.
364 *
365 * It is sufficient to do this once at scan start, since any tuples added
366 * while the scan is in progress will be invisible to my snapshot anyway.
367 * (That is not true when using a non-MVCC snapshot. However, we couldn't
368 * guarantee to return tuples added after scan start anyway, since they
369 * might go into pages we already scanned. To guarantee consistent
370 * results for a non-MVCC snapshot, the caller must hold some higher-level
371 * lock that ensures the interesting tuple(s) won't change.)
372 */
373 if (scan->rs_base.rs_parallel != NULL)
374 {
376 scan->rs_nblocks = bpscan->phs_nblocks;
377 }
378 else
380
381 /*
382 * If the table is large relative to NBuffers, use a bulk-read access
383 * strategy and enable synchronized scanning (see syncscan.c). Although
384 * the thresholds for these features could be different, we make them the
385 * same so that there are only two behaviors to tune rather than four.
386 * (However, some callers need to be able to disable one or both of these
387 * behaviors, independently of the size of the table; also there is a GUC
388 * variable that can disable synchronized scanning.)
389 *
390 * Note that table_block_parallelscan_initialize has a very similar test;
391 * if you change this, consider changing that one, too.
392 */
394 scan->rs_nblocks > NBuffers / 4)
395 {
397 allow_sync = (scan->rs_base.rs_flags & SO_ALLOW_SYNC) != 0;
398 }
399 else
400 allow_strat = allow_sync = false;
401
402 if (allow_strat)
403 {
404 /* During a rescan, keep the previous strategy object. */
405 if (scan->rs_strategy == NULL)
407 }
408 else
409 {
410 if (scan->rs_strategy != NULL)
412 scan->rs_strategy = NULL;
413 }
414
415 if (scan->rs_base.rs_parallel != NULL)
416 {
417 /* For parallel scan, believe whatever ParallelTableScanDesc says. */
420 else
422
423 /*
424 * If not rescanning, initialize the startblock. Finding the actual
425 * start location is done in table_block_parallelscan_startblock_init,
426 * based on whether an alternative start location has been set with
427 * heap_setscanlimits, or using the syncscan location, when syncscan
428 * is enabled.
429 */
430 if (!keep_startblock)
432 }
433 else
434 {
435 if (keep_startblock)
436 {
437 /*
438 * When rescanning, we want to keep the previous startblock
439 * setting, so that rewinding a cursor doesn't generate surprising
440 * results. Reset the active syncscan setting, though.
441 */
444 else
446 }
448 {
451 }
452 else
453 {
455 scan->rs_startblock = 0;
456 }
457 }
458
460 scan->rs_inited = false;
461 scan->rs_ctup.t_data = NULL;
463 scan->rs_cbuf = InvalidBuffer;
465 scan->rs_ntuples = 0;
466 scan->rs_cindex = 0;
467
468 /*
469 * Initialize to ForwardScanDirection because it is most common and
470 * because heap scans go forward before going backward (e.g. CURSORs).
471 */
474
475 /* page-at-a-time fields are always invalid when not rs_inited */
476
477 /*
478 * copy the scan key, if appropriate
479 */
480 if (key != NULL && scan->rs_base.rs_nkeys > 0)
481 memcpy(scan->rs_base.rs_key, key, scan->rs_base.rs_nkeys * sizeof(ScanKeyData));
482
483 /*
484 * Currently, we only have a stats counter for sequential heap scans (but
485 * e.g for bitmap scans the underlying bitmap index scans will be counted,
486 * and for sample scans we update stats for tuple fetches).
487 */
488 if (scan->rs_base.rs_flags & SO_TYPE_SEQSCAN)
490}
491
492/*
493 * heap_setscanlimits - restrict range of a heapscan
494 *
495 * startBlk is the page to start at
496 * numBlks is number of pages to scan (InvalidBlockNumber means "all")
497 */
498void
500{
502
503 Assert(!scan->rs_inited); /* else too late to change */
504 /* else rs_startblock is significant */
506
507 /* Check startBlk is valid (but allow case of zero blocks...) */
508 Assert(startBlk == 0 || startBlk < scan->rs_nblocks);
509
510 scan->rs_startblock = startBlk;
511 scan->rs_numblocks = numBlks;
512}
513
514/*
515 * Per-tuple loop for heap_prepare_pagescan(). Pulled out so it can be called
516 * multiple times, with constant arguments for all_visible,
517 * check_serializable.
518 */
520static int
522 Page page, Buffer buffer,
523 BlockNumber block, int lines,
524 bool all_visible, bool check_serializable)
525{
526 Oid relid = RelationGetRelid(scan->rs_base.rs_rd);
527 int ntup = 0;
528 int nvis = 0;
530
531 /* page at a time should have been disabled otherwise */
532 Assert(IsMVCCSnapshot(snapshot));
533
534 /* first find all tuples on the page */
536 {
539
541 continue;
542
543 /*
544 * If the page is not all-visible or we need to check serializability,
545 * maintain enough state to be able to refind the tuple efficiently,
546 * without again first needing to fetch the item and then via that the
547 * tuple.
548 */
549 if (!all_visible || check_serializable)
550 {
551 tup = &batchmvcc.tuples[ntup];
552
554 tup->t_len = ItemIdGetLength(lpp);
555 tup->t_tableOid = relid;
556 ItemPointerSet(&(tup->t_self), block, lineoff);
557 }
558
559 /*
560 * If the page is all visible, these fields otherwise won't be
561 * populated in loop below.
562 */
563 if (all_visible)
564 {
566 {
567 batchmvcc.visible[ntup] = true;
568 }
569 scan->rs_vistuples[ntup] = lineoff;
570 }
571
572 ntup++;
573 }
574
576
577 /*
578 * Unless the page is all visible, test visibility for all tuples one go.
579 * That is considerably more efficient than calling
580 * HeapTupleSatisfiesMVCC() one-by-one.
581 */
582 if (all_visible)
583 nvis = ntup;
584 else
585 nvis = HeapTupleSatisfiesMVCCBatch(snapshot, buffer,
586 ntup,
587 &batchmvcc,
588 scan->rs_vistuples);
589
590 /*
591 * So far we don't have batch API for testing serializabilty, so do so
592 * one-by-one.
593 */
595 {
596 for (int i = 0; i < ntup; i++)
597 {
599 scan->rs_base.rs_rd,
600 &batchmvcc.tuples[i],
601 buffer, snapshot);
602 }
603 }
604
605 return nvis;
606}
607
608/*
609 * heap_prepare_pagescan - Prepare current scan page to be scanned in pagemode
610 *
611 * Preparation currently consists of 1. prune the scan's rs_cbuf page, and 2.
612 * fill the rs_vistuples[] array with the OffsetNumbers of visible tuples.
613 */
614void
616{
618 Buffer buffer = scan->rs_cbuf;
619 BlockNumber block = scan->rs_cblock;
620 Snapshot snapshot;
621 Page page;
622 int lines;
623 bool all_visible;
625
626 Assert(BufferGetBlockNumber(buffer) == block);
627
628 /* ensure we're not accidentally being used when not in pagemode */
630 snapshot = scan->rs_base.rs_snapshot;
631
632 /*
633 * Prune and repair fragmentation for the whole page, if possible.
634 */
635 heap_page_prune_opt(scan->rs_base.rs_rd, buffer);
636
637 /*
638 * We must hold share lock on the buffer content while examining tuple
639 * visibility. Afterwards, however, the tuples we have found to be
640 * visible are guaranteed good as long as we hold the buffer pin.
641 */
643
644 page = BufferGetPage(buffer);
645 lines = PageGetMaxOffsetNumber(page);
646
647 /*
648 * If the all-visible flag indicates that all tuples on the page are
649 * visible to everyone, we can skip the per-tuple visibility tests.
650 *
651 * Note: In hot standby, a tuple that's already visible to all
652 * transactions on the primary might still be invisible to a read-only
653 * transaction in the standby. We partly handle this problem by tracking
654 * the minimum xmin of visible tuples as the cut-off XID while marking a
655 * page all-visible on the primary and WAL log that along with the
656 * visibility map SET operation. In hot standby, we wait for (or abort)
657 * all transactions that can potentially may not see one or more tuples on
658 * the page. That's how index-only scans work fine in hot standby. A
659 * crucial difference between index-only scans and heap scans is that the
660 * index-only scan completely relies on the visibility map where as heap
661 * scan looks at the page-level PD_ALL_VISIBLE flag. We are not sure if
662 * the page-level flag can be trusted in the same way, because it might
663 * get propagated somehow without being explicitly WAL-logged, e.g. via a
664 * full page write. Until we can prove that beyond doubt, let's check each
665 * tuple for visibility the hard way.
666 */
667 all_visible = PageIsAllVisible(page) && !snapshot->takenDuringRecovery;
670
671 /*
672 * We call page_collect_tuples() with constant arguments, to get the
673 * compiler to constant fold the constant arguments. Separate calls with
674 * constant arguments, rather than variables, are needed on several
675 * compilers to actually perform constant folding.
676 */
677 if (likely(all_visible))
678 {
680 scan->rs_ntuples = page_collect_tuples(scan, snapshot, page, buffer,
681 block, lines, true, false);
682 else
683 scan->rs_ntuples = page_collect_tuples(scan, snapshot, page, buffer,
684 block, lines, true, true);
685 }
686 else
687 {
689 scan->rs_ntuples = page_collect_tuples(scan, snapshot, page, buffer,
690 block, lines, false, false);
691 else
692 scan->rs_ntuples = page_collect_tuples(scan, snapshot, page, buffer,
693 block, lines, false, true);
694 }
695
697}
698
699/*
700 * heap_fetch_next_buffer - read and pin the next block from MAIN_FORKNUM.
701 *
702 * Read the next block of the scan relation from the read stream and save it
703 * in the scan descriptor. It is already pinned.
704 */
705static inline void
707{
708 Assert(scan->rs_read_stream);
709
710 /* release previous scan buffer, if any */
711 if (BufferIsValid(scan->rs_cbuf))
712 {
713 ReleaseBuffer(scan->rs_cbuf);
714 scan->rs_cbuf = InvalidBuffer;
715 }
716
717 /*
718 * Be sure to check for interrupts at least once per page. Checks at
719 * higher code levels won't be able to stop a seqscan that encounters many
720 * pages' worth of consecutive dead tuples.
721 */
723
724 /*
725 * If the scan direction is changing, reset the prefetch block to the
726 * current block. Otherwise, we will incorrectly prefetch the blocks
727 * between the prefetch block and the current block again before
728 * prefetching blocks in the new, correct scan direction.
729 */
730 if (unlikely(scan->rs_dir != dir))
731 {
732 scan->rs_prefetch_block = scan->rs_cblock;
734 }
735
736 scan->rs_dir = dir;
737
739 if (BufferIsValid(scan->rs_cbuf))
741}
742
743/*
744 * heapgettup_initial_block - return the first BlockNumber to scan
745 *
746 * Returns InvalidBlockNumber when there are no blocks to scan. This can
747 * occur with empty tables and in parallel scans when parallel workers get all
748 * of the pages before we can get a chance to get our first page.
749 */
752{
753 Assert(!scan->rs_inited);
754 Assert(scan->rs_base.rs_parallel == NULL);
755
756 /* When there are no pages to scan, return InvalidBlockNumber */
757 if (scan->rs_nblocks == 0 || scan->rs_numblocks == 0)
758 return InvalidBlockNumber;
759
760 if (ScanDirectionIsForward(dir))
761 {
762 return scan->rs_startblock;
763 }
764 else
765 {
766 /*
767 * Disable reporting to syncscan logic in a backwards scan; it's not
768 * very likely anyone else is doing the same thing at the same time,
769 * and much more likely that we'll just bollix things for forward
770 * scanners.
771 */
773
774 /*
775 * Start from last page of the scan. Ensure we take into account
776 * rs_numblocks if it's been adjusted by heap_setscanlimits().
777 */
778 if (scan->rs_numblocks != InvalidBlockNumber)
779 return (scan->rs_startblock + scan->rs_numblocks - 1) % scan->rs_nblocks;
780
781 if (scan->rs_startblock > 0)
782 return scan->rs_startblock - 1;
783
784 return scan->rs_nblocks - 1;
785 }
786}
787
788
789/*
790 * heapgettup_start_page - helper function for heapgettup()
791 *
792 * Return the next page to scan based on the scan->rs_cbuf and set *linesleft
793 * to the number of tuples on this page. Also set *lineoff to the first
794 * offset to scan with forward scans getting the first offset and backward
795 * getting the final offset on the page.
796 */
797static Page
800{
801 Page page;
802
803 Assert(scan->rs_inited);
805
806 /* Caller is responsible for ensuring buffer is locked if needed */
807 page = BufferGetPage(scan->rs_cbuf);
808
810
811 if (ScanDirectionIsForward(dir))
813 else
815
816 /* lineoff now references the physically previous or next tid */
817 return page;
818}
819
820
821/*
822 * heapgettup_continue_page - helper function for heapgettup()
823 *
824 * Return the next page to scan based on the scan->rs_cbuf and set *linesleft
825 * to the number of tuples left to scan on this page. Also set *lineoff to
826 * the next offset to scan according to the ScanDirection in 'dir'.
827 */
828static inline Page
831{
832 Page page;
833
834 Assert(scan->rs_inited);
836
837 /* Caller is responsible for ensuring buffer is locked if needed */
838 page = BufferGetPage(scan->rs_cbuf);
839
840 if (ScanDirectionIsForward(dir))
841 {
843 *linesleft = PageGetMaxOffsetNumber(page) - (*lineoff) + 1;
844 }
845 else
846 {
847 /*
848 * The previous returned tuple may have been vacuumed since the
849 * previous scan when we use a non-MVCC snapshot, so we must
850 * re-establish the lineoff <= PageGetMaxOffsetNumber(page) invariant
851 */
853 *linesleft = *lineoff;
854 }
855
856 /* lineoff now references the physically previous or next tid */
857 return page;
858}
859
860/*
861 * heapgettup_advance_block - helper for heap_fetch_next_buffer()
862 *
863 * Given the current block number, the scan direction, and various information
864 * contained in the scan descriptor, calculate the BlockNumber to scan next
865 * and return it. If there are no further blocks to scan, return
866 * InvalidBlockNumber to indicate this fact to the caller.
867 *
868 * This should not be called to determine the initial block number -- only for
869 * subsequent blocks.
870 *
871 * This also adjusts rs_numblocks when a limit has been imposed by
872 * heap_setscanlimits().
873 */
874static inline BlockNumber
876{
877 Assert(scan->rs_base.rs_parallel == NULL);
878
880 {
881 block++;
882
883 /* wrap back to the start of the heap */
884 if (block >= scan->rs_nblocks)
885 block = 0;
886
887 /*
888 * Report our new scan position for synchronization purposes. We don't
889 * do that when moving backwards, however. That would just mess up any
890 * other forward-moving scanners.
891 *
892 * Note: we do this before checking for end of scan so that the final
893 * state of the position hint is back at the start of the rel. That's
894 * not strictly necessary, but otherwise when you run the same query
895 * multiple times the starting position would shift a little bit
896 * backwards on every invocation, which is confusing. We don't
897 * guarantee any specific ordering in general, though.
898 */
899 if (scan->rs_base.rs_flags & SO_ALLOW_SYNC)
900 ss_report_location(scan->rs_base.rs_rd, block);
901
902 /* we're done if we're back at where we started */
903 if (block == scan->rs_startblock)
904 return InvalidBlockNumber;
905
906 /* check if the limit imposed by heap_setscanlimits() is met */
907 if (scan->rs_numblocks != InvalidBlockNumber)
908 {
909 if (--scan->rs_numblocks == 0)
910 return InvalidBlockNumber;
911 }
912
913 return block;
914 }
915 else
916 {
917 /* we're done if the last block is the start position */
918 if (block == scan->rs_startblock)
919 return InvalidBlockNumber;
920
921 /* check if the limit imposed by heap_setscanlimits() is met */
922 if (scan->rs_numblocks != InvalidBlockNumber)
923 {
924 if (--scan->rs_numblocks == 0)
925 return InvalidBlockNumber;
926 }
927
928 /* wrap to the end of the heap when the last page was page 0 */
929 if (block == 0)
930 block = scan->rs_nblocks;
931
932 block--;
933
934 return block;
935 }
936}
937
938/* ----------------
939 * heapgettup - fetch next heap tuple
940 *
941 * Initialize the scan if not already done; then advance to the next
942 * tuple as indicated by "dir"; return the next tuple in scan->rs_ctup,
943 * or set scan->rs_ctup.t_data = NULL if no more tuples.
944 *
945 * Note: the reason nkeys/key are passed separately, even though they are
946 * kept in the scan descriptor, is that the caller may not want us to check
947 * the scankeys.
948 *
949 * Note: when we fall off the end of the scan in either direction, we
950 * reset rs_inited. This means that a further request with the same
951 * scan direction will restart the scan, which is a bit odd, but a
952 * request with the opposite scan direction will start a fresh scan
953 * in the proper direction. The latter is required behavior for cursors,
954 * while the former case is generally undefined behavior in Postgres
955 * so we don't care too much.
956 * ----------------
957 */
958static void
960 ScanDirection dir,
961 int nkeys,
962 ScanKey key)
963{
964 HeapTuple tuple = &(scan->rs_ctup);
965 Page page;
967 int linesleft;
968
969 if (likely(scan->rs_inited))
970 {
971 /* continue from previously returned page/tuple */
973 page = heapgettup_continue_page(scan, dir, &linesleft, &lineoff);
974 goto continue_page;
975 }
976
977 /*
978 * advance the scan until we find a qualifying tuple or run out of stuff
979 * to scan
980 */
981 while (true)
982 {
983 heap_fetch_next_buffer(scan, dir);
984
985 /* did we run out of blocks to scan? */
986 if (!BufferIsValid(scan->rs_cbuf))
987 break;
988
990
992 page = heapgettup_start_page(scan, dir, &linesleft, &lineoff);
994
995 /*
996 * Only continue scanning the page while we have lines left.
997 *
998 * Note that this protects us from accessing line pointers past
999 * PageGetMaxOffsetNumber(); both for forward scans when we resume the
1000 * table scan, and for when we start scanning a new page.
1001 */
1002 for (; linesleft > 0; linesleft--, lineoff += dir)
1003 {
1004 bool visible;
1006
1007 if (!ItemIdIsNormal(lpp))
1008 continue;
1009
1010 tuple->t_data = (HeapTupleHeader) PageGetItem(page, lpp);
1011 tuple->t_len = ItemIdGetLength(lpp);
1012 ItemPointerSet(&(tuple->t_self), scan->rs_cblock, lineoff);
1013
1014 visible = HeapTupleSatisfiesVisibility(tuple,
1015 scan->rs_base.rs_snapshot,
1016 scan->rs_cbuf);
1017
1019 tuple, scan->rs_cbuf,
1020 scan->rs_base.rs_snapshot);
1021
1022 /* skip tuples not visible to this snapshot */
1023 if (!visible)
1024 continue;
1025
1026 /* skip any tuples that don't match the scan key */
1027 if (key != NULL &&
1029 nkeys, key))
1030 continue;
1031
1033 scan->rs_coffset = lineoff;
1034 return;
1035 }
1036
1037 /*
1038 * if we get here, it means we've exhausted the items on this page and
1039 * it's time to move to the next.
1040 */
1042 }
1043
1044 /* end of scan */
1045 if (BufferIsValid(scan->rs_cbuf))
1046 ReleaseBuffer(scan->rs_cbuf);
1047
1048 scan->rs_cbuf = InvalidBuffer;
1051 tuple->t_data = NULL;
1052 scan->rs_inited = false;
1053}
1054
1055/* ----------------
1056 * heapgettup_pagemode - fetch next heap tuple in page-at-a-time mode
1057 *
1058 * Same API as heapgettup, but used in page-at-a-time mode
1059 *
1060 * The internal logic is much the same as heapgettup's too, but there are some
1061 * differences: we do not take the buffer content lock (that only needs to
1062 * happen inside heap_prepare_pagescan), and we iterate through just the
1063 * tuples listed in rs_vistuples[] rather than all tuples on the page. Notice
1064 * that lineindex is 0-based, where the corresponding loop variable lineoff in
1065 * heapgettup is 1-based.
1066 * ----------------
1067 */
1068static void
1070 ScanDirection dir,
1071 int nkeys,
1072 ScanKey key)
1073{
1074 HeapTuple tuple = &(scan->rs_ctup);
1075 Page page;
1078
1079 if (likely(scan->rs_inited))
1080 {
1081 /* continue from previously returned page/tuple */
1082 page = BufferGetPage(scan->rs_cbuf);
1083
1084 lineindex = scan->rs_cindex + dir;
1085 if (ScanDirectionIsForward(dir))
1086 linesleft = scan->rs_ntuples - lineindex;
1087 else
1088 linesleft = scan->rs_cindex;
1089 /* lineindex now references the next or previous visible tid */
1090
1091 goto continue_page;
1092 }
1093
1094 /*
1095 * advance the scan until we find a qualifying tuple or run out of stuff
1096 * to scan
1097 */
1098 while (true)
1099 {
1100 heap_fetch_next_buffer(scan, dir);
1101
1102 /* did we run out of blocks to scan? */
1103 if (!BufferIsValid(scan->rs_cbuf))
1104 break;
1105
1107
1108 /* prune the page and determine visible tuple offsets */
1110 page = BufferGetPage(scan->rs_cbuf);
1111 linesleft = scan->rs_ntuples;
1113
1114 /* block is the same for all tuples, set it once outside the loop */
1116
1117 /* lineindex now references the next or previous visible tid */
1119
1120 for (; linesleft > 0; linesleft--, lineindex += dir)
1121 {
1122 ItemId lpp;
1124
1125 Assert(lineindex < scan->rs_ntuples);
1127 lpp = PageGetItemId(page, lineoff);
1129
1130 tuple->t_data = (HeapTupleHeader) PageGetItem(page, lpp);
1131 tuple->t_len = ItemIdGetLength(lpp);
1133
1134 /* skip any tuples that don't match the scan key */
1135 if (key != NULL &&
1137 nkeys, key))
1138 continue;
1139
1140 scan->rs_cindex = lineindex;
1141 return;
1142 }
1143 }
1144
1145 /* end of scan */
1146 if (BufferIsValid(scan->rs_cbuf))
1147 ReleaseBuffer(scan->rs_cbuf);
1148 scan->rs_cbuf = InvalidBuffer;
1151 tuple->t_data = NULL;
1152 scan->rs_inited = false;
1153}
1154
1155
1156/* ----------------------------------------------------------------
1157 * heap access method interface
1158 * ----------------------------------------------------------------
1159 */
1160
1161
1163heap_beginscan(Relation relation, Snapshot snapshot,
1164 int nkeys, ScanKey key,
1165 ParallelTableScanDesc parallel_scan,
1166 uint32 flags)
1167{
1168 HeapScanDesc scan;
1169
1170 /*
1171 * increment relation ref count while scanning relation
1172 *
1173 * This is just to make really sure the relcache entry won't go away while
1174 * the scan has a pointer to it. Caller should be holding the rel open
1175 * anyway, so this is redundant in all normal scenarios...
1176 */
1178
1179 /*
1180 * allocate and initialize scan descriptor
1181 */
1182 if (flags & SO_TYPE_BITMAPSCAN)
1183 {
1185
1186 /*
1187 * Bitmap Heap scans do not have any fields that a normal Heap Scan
1188 * does not have, so no special initializations required here.
1189 */
1190 scan = (HeapScanDesc) bscan;
1191 }
1192 else
1194
1195 scan->rs_base.rs_rd = relation;
1196 scan->rs_base.rs_snapshot = snapshot;
1197 scan->rs_base.rs_nkeys = nkeys;
1198 scan->rs_base.rs_flags = flags;
1199 scan->rs_base.rs_parallel = parallel_scan;
1200 scan->rs_strategy = NULL; /* set in initscan */
1201 scan->rs_cbuf = InvalidBuffer;
1202
1203 /*
1204 * Disable page-at-a-time mode if it's not a MVCC-safe snapshot.
1205 */
1206 if (!(snapshot && IsMVCCSnapshot(snapshot)))
1208
1209 /* Check that a historic snapshot is not used for non-catalog tables */
1210 if (snapshot &&
1211 IsHistoricMVCCSnapshot(snapshot) &&
1213 {
1214 ereport(ERROR,
1216 errmsg("cannot query non-catalog table \"%s\" during logical decoding",
1217 RelationGetRelationName(relation))));
1218 }
1219
1220 /*
1221 * For seqscan and sample scans in a serializable transaction, acquire a
1222 * predicate lock on the entire relation. This is required not only to
1223 * lock all the matching tuples, but also to conflict with new insertions
1224 * into the table. In an indexscan, we take page locks on the index pages
1225 * covering the range specified in the scan qual, but in a heap scan there
1226 * is nothing more fine-grained to lock. A bitmap scan is a different
1227 * story, there we have already scanned the index and locked the index
1228 * pages covering the predicate. But in that case we still have to lock
1229 * any matching heap tuples. For sample scan we could optimize the locking
1230 * to be at least page-level granularity, but we'd need to add per-tuple
1231 * locking for that.
1232 */
1234 {
1235 /*
1236 * Ensure a missing snapshot is noticed reliably, even if the
1237 * isolation mode means predicate locking isn't performed (and
1238 * therefore the snapshot isn't used here).
1239 */
1240 Assert(snapshot);
1241 PredicateLockRelation(relation, snapshot);
1242 }
1243
1244 /* we only need to set this up once */
1245 scan->rs_ctup.t_tableOid = RelationGetRelid(relation);
1246
1247 /*
1248 * Allocate memory to keep track of page allocation for parallel workers
1249 * when doing a parallel scan.
1250 */
1251 if (parallel_scan != NULL)
1253 else
1255
1256 /*
1257 * we do this here instead of in initscan() because heap_rescan also calls
1258 * initscan() and we don't want to allocate memory again
1259 */
1260 if (nkeys > 0)
1261 scan->rs_base.rs_key = palloc_array(ScanKeyData, nkeys);
1262 else
1263 scan->rs_base.rs_key = NULL;
1264
1265 initscan(scan, key, false);
1266
1267 scan->rs_read_stream = NULL;
1268
1269 /*
1270 * Set up a read stream for sequential scans and TID range scans. This
1271 * should be done after initscan() because initscan() allocates the
1272 * BufferAccessStrategy object passed to the read stream API.
1273 */
1274 if (scan->rs_base.rs_flags & SO_TYPE_SEQSCAN ||
1276 {
1278
1279 if (scan->rs_base.rs_parallel)
1281 else
1283
1284 /* ---
1285 * It is safe to use batchmode as the only locks taken by `cb`
1286 * are never taken while waiting for IO:
1287 * - SyncScanLock is used in the non-parallel case
1288 * - in the parallel case, only spinlocks and atomics are used
1289 * ---
1290 */
1293 scan->rs_strategy,
1294 scan->rs_base.rs_rd,
1296 cb,
1297 scan,
1298 0);
1299 }
1300 else if (scan->rs_base.rs_flags & SO_TYPE_BITMAPSCAN)
1301 {
1304 scan->rs_strategy,
1305 scan->rs_base.rs_rd,
1308 scan,
1309 sizeof(TBMIterateResult));
1310 }
1311
1312
1313 return (TableScanDesc) scan;
1314}
1315
1316void
1318 bool allow_strat, bool allow_sync, bool allow_pagemode)
1319{
1321
1322 if (set_params)
1323 {
1324 if (allow_strat)
1326 else
1328
1329 if (allow_sync)
1331 else
1333
1334 if (allow_pagemode && scan->rs_base.rs_snapshot &&
1337 else
1339 }
1340
1341 /*
1342 * unpin scan buffers
1343 */
1344 if (BufferIsValid(scan->rs_cbuf))
1345 {
1346 ReleaseBuffer(scan->rs_cbuf);
1347 scan->rs_cbuf = InvalidBuffer;
1348 }
1349
1350 /*
1351 * SO_TYPE_BITMAPSCAN would be cleaned up here, but it does not hold any
1352 * additional data vs a normal HeapScan
1353 */
1354
1355 /*
1356 * The read stream is reset on rescan. This must be done before
1357 * initscan(), as some state referred to by read_stream_reset() is reset
1358 * in initscan().
1359 */
1360 if (scan->rs_read_stream)
1362
1363 /*
1364 * reinitialize scan descriptor
1365 */
1366 initscan(scan, key, true);
1367}
1368
1369void
1371{
1373
1374 /* Note: no locking manipulations needed */
1375
1376 /*
1377 * unpin scan buffers
1378 */
1379 if (BufferIsValid(scan->rs_cbuf))
1380 ReleaseBuffer(scan->rs_cbuf);
1381
1382 /*
1383 * Must free the read stream before freeing the BufferAccessStrategy.
1384 */
1385 if (scan->rs_read_stream)
1387
1388 /*
1389 * decrement relation reference count and free scan descriptor storage
1390 */
1392
1393 if (scan->rs_base.rs_key)
1394 pfree(scan->rs_base.rs_key);
1395
1396 if (scan->rs_strategy != NULL)
1398
1399 if (scan->rs_parallelworkerdata != NULL)
1401
1402 if (scan->rs_base.rs_flags & SO_TEMP_SNAPSHOT)
1404
1405 pfree(scan);
1406}
1407
1410{
1412
1413 /*
1414 * This is still widely used directly, without going through table AM, so
1415 * add a safety check. It's possible we should, at a later point,
1416 * downgrade this to an assert. The reason for checking the AM routine,
1417 * rather than the AM oid, is that this allows to write regression tests
1418 * that create another AM reusing the heap handler.
1419 */
1420 if (unlikely(sscan->rs_rd->rd_tableam != GetHeapamTableAmRoutine()))
1421 ereport(ERROR,
1423 errmsg_internal("only heap AM is supported")));
1424
1425 /* Note: no locking manipulations needed */
1426
1428 heapgettup_pagemode(scan, direction,
1429 scan->rs_base.rs_nkeys, scan->rs_base.rs_key);
1430 else
1431 heapgettup(scan, direction,
1432 scan->rs_base.rs_nkeys, scan->rs_base.rs_key);
1433
1434 if (scan->rs_ctup.t_data == NULL)
1435 return NULL;
1436
1437 /*
1438 * if we get here it means we have a new current scan tuple, so point to
1439 * the proper return buffer and return the tuple.
1440 */
1441
1443
1444 return &scan->rs_ctup;
1445}
1446
1447bool
1449{
1451
1452 /* Note: no locking manipulations needed */
1453
1454 if (sscan->rs_flags & SO_ALLOW_PAGEMODE)
1455 heapgettup_pagemode(scan, direction, sscan->rs_nkeys, sscan->rs_key);
1456 else
1457 heapgettup(scan, direction, sscan->rs_nkeys, sscan->rs_key);
1458
1459 if (scan->rs_ctup.t_data == NULL)
1460 {
1461 ExecClearTuple(slot);
1462 return false;
1463 }
1464
1465 /*
1466 * if we get here it means we have a new current scan tuple, so point to
1467 * the proper return buffer and return the tuple.
1468 */
1469
1471
1472 ExecStoreBufferHeapTuple(&scan->rs_ctup, slot,
1473 scan->rs_cbuf);
1474 return true;
1475}
1476
1477void
1480{
1486
1487 /*
1488 * For relations without any pages, we can simply leave the TID range
1489 * unset. There will be no tuples to scan, therefore no tuples outside
1490 * the given TID range.
1491 */
1492 if (scan->rs_nblocks == 0)
1493 return;
1494
1495 /*
1496 * Set up some ItemPointers which point to the first and last possible
1497 * tuples in the heap.
1498 */
1501
1502 /*
1503 * If the given maximum TID is below the highest possible TID in the
1504 * relation, then restrict the range to that, otherwise we scan to the end
1505 * of the relation.
1506 */
1509
1510 /*
1511 * If the given minimum TID is above the lowest possible TID in the
1512 * relation, then restrict the range to only scan for TIDs above that.
1513 */
1516
1517 /*
1518 * Check for an empty range and protect from would be negative results
1519 * from the numBlks calculation below.
1520 */
1522 {
1523 /* Set an empty range of blocks to scan */
1525 return;
1526 }
1527
1528 /*
1529 * Calculate the first block and the number of blocks we must scan. We
1530 * could be more aggressive here and perform some more validation to try
1531 * and further narrow the scope of blocks to scan by checking if the
1532 * lowestItem has an offset above MaxOffsetNumber. In this case, we could
1533 * advance startBlk by one. Likewise, if highestItem has an offset of 0
1534 * we could scan one fewer blocks. However, such an optimization does not
1535 * seem worth troubling over, currently.
1536 */
1538
1541
1542 /* Set the start block and number of blocks to scan */
1544
1545 /* Finally, set the TID range in sscan */
1546 ItemPointerCopy(&lowestItem, &sscan->st.tidrange.rs_mintid);
1547 ItemPointerCopy(&highestItem, &sscan->st.tidrange.rs_maxtid);
1548}
1549
1550bool
1552 TupleTableSlot *slot)
1553{
1555 ItemPointer mintid = &sscan->st.tidrange.rs_mintid;
1556 ItemPointer maxtid = &sscan->st.tidrange.rs_maxtid;
1557
1558 /* Note: no locking manipulations needed */
1559 for (;;)
1560 {
1561 if (sscan->rs_flags & SO_ALLOW_PAGEMODE)
1562 heapgettup_pagemode(scan, direction, sscan->rs_nkeys, sscan->rs_key);
1563 else
1564 heapgettup(scan, direction, sscan->rs_nkeys, sscan->rs_key);
1565
1566 if (scan->rs_ctup.t_data == NULL)
1567 {
1568 ExecClearTuple(slot);
1569 return false;
1570 }
1571
1572 /*
1573 * heap_set_tidrange will have used heap_setscanlimits to limit the
1574 * range of pages we scan to only ones that can contain the TID range
1575 * we're scanning for. Here we must filter out any tuples from these
1576 * pages that are outside of that range.
1577 */
1578 if (ItemPointerCompare(&scan->rs_ctup.t_self, mintid) < 0)
1579 {
1580 ExecClearTuple(slot);
1581
1582 /*
1583 * When scanning backwards, the TIDs will be in descending order.
1584 * Future tuples in this direction will be lower still, so we can
1585 * just return false to indicate there will be no more tuples.
1586 */
1587 if (ScanDirectionIsBackward(direction))
1588 return false;
1589
1590 continue;
1591 }
1592
1593 /*
1594 * Likewise for the final page, we must filter out TIDs greater than
1595 * maxtid.
1596 */
1597 if (ItemPointerCompare(&scan->rs_ctup.t_self, maxtid) > 0)
1598 {
1599 ExecClearTuple(slot);
1600
1601 /*
1602 * When scanning forward, the TIDs will be in ascending order.
1603 * Future tuples in this direction will be higher still, so we can
1604 * just return false to indicate there will be no more tuples.
1605 */
1606 if (ScanDirectionIsForward(direction))
1607 return false;
1608 continue;
1609 }
1610
1611 break;
1612 }
1613
1614 /*
1615 * if we get here it means we have a new current scan tuple, so point to
1616 * the proper return buffer and return the tuple.
1617 */
1619
1620 ExecStoreBufferHeapTuple(&scan->rs_ctup, slot, scan->rs_cbuf);
1621 return true;
1622}
1623
1624/*
1625 * heap_fetch - retrieve tuple with given tid
1626 *
1627 * On entry, tuple->t_self is the TID to fetch. We pin the buffer holding
1628 * the tuple, fill in the remaining fields of *tuple, and check the tuple
1629 * against the specified snapshot.
1630 *
1631 * If successful (tuple found and passes snapshot time qual), then *userbuf
1632 * is set to the buffer holding the tuple and true is returned. The caller
1633 * must unpin the buffer when done with the tuple.
1634 *
1635 * If the tuple is not found (ie, item number references a deleted slot),
1636 * then tuple->t_data is set to NULL, *userbuf is set to InvalidBuffer,
1637 * and false is returned.
1638 *
1639 * If the tuple is found but fails the time qual check, then the behavior
1640 * depends on the keep_buf parameter. If keep_buf is false, the results
1641 * are the same as for the tuple-not-found case. If keep_buf is true,
1642 * then tuple->t_data and *userbuf are returned as for the success case,
1643 * and again the caller must unpin the buffer; but false is returned.
1644 *
1645 * heap_fetch does not follow HOT chains: only the exact TID requested will
1646 * be fetched.
1647 *
1648 * It is somewhat inconsistent that we ereport() on invalid block number but
1649 * return false on invalid item number. There are a couple of reasons though.
1650 * One is that the caller can relatively easily check the block number for
1651 * validity, but cannot check the item number without reading the page
1652 * himself. Another is that when we are following a t_ctid link, we can be
1653 * reasonably confident that the page number is valid (since VACUUM shouldn't
1654 * truncate off the destination page without having killed the referencing
1655 * tuple first), but the item number might well not be good.
1656 */
1657bool
1658heap_fetch(Relation relation,
1659 Snapshot snapshot,
1660 HeapTuple tuple,
1661 Buffer *userbuf,
1662 bool keep_buf)
1663{
1664 ItemPointer tid = &(tuple->t_self);
1665 ItemId lp;
1666 Buffer buffer;
1667 Page page;
1668 OffsetNumber offnum;
1669 bool valid;
1670
1671 /*
1672 * Fetch and pin the appropriate page of the relation.
1673 */
1674 buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid));
1675
1676 /*
1677 * Need share lock on buffer to examine tuple commit status.
1678 */
1680 page = BufferGetPage(buffer);
1681
1682 /*
1683 * We'd better check for out-of-range offnum in case of VACUUM since the
1684 * TID was obtained.
1685 */
1686 offnum = ItemPointerGetOffsetNumber(tid);
1688 {
1690 ReleaseBuffer(buffer);
1692 tuple->t_data = NULL;
1693 return false;
1694 }
1695
1696 /*
1697 * get the item line pointer corresponding to the requested tid
1698 */
1699 lp = PageGetItemId(page, offnum);
1700
1701 /*
1702 * Must check for deleted tuple.
1703 */
1704 if (!ItemIdIsNormal(lp))
1705 {
1707 ReleaseBuffer(buffer);
1709 tuple->t_data = NULL;
1710 return false;
1711 }
1712
1713 /*
1714 * fill in *tuple fields
1715 */
1716 tuple->t_data = (HeapTupleHeader) PageGetItem(page, lp);
1717 tuple->t_len = ItemIdGetLength(lp);
1718 tuple->t_tableOid = RelationGetRelid(relation);
1719
1720 /*
1721 * check tuple visibility, then release lock
1722 */
1723 valid = HeapTupleSatisfiesVisibility(tuple, snapshot, buffer);
1724
1725 if (valid)
1726 PredicateLockTID(relation, &(tuple->t_self), snapshot,
1728
1729 HeapCheckForSerializableConflictOut(valid, relation, tuple, buffer, snapshot);
1730
1732
1733 if (valid)
1734 {
1735 /*
1736 * All checks passed, so return the tuple as valid. Caller is now
1737 * responsible for releasing the buffer.
1738 */
1739 *userbuf = buffer;
1740
1741 return true;
1742 }
1743
1744 /* Tuple failed time qual, but maybe caller wants to see it anyway. */
1745 if (keep_buf)
1746 *userbuf = buffer;
1747 else
1748 {
1749 ReleaseBuffer(buffer);
1751 tuple->t_data = NULL;
1752 }
1753
1754 return false;
1755}
1756
1757/*
1758 * heap_hot_search_buffer - search HOT chain for tuple satisfying snapshot
1759 *
1760 * On entry, *tid is the TID of a tuple (either a simple tuple, or the root
1761 * of a HOT chain), and buffer is the buffer holding this tuple. We search
1762 * for the first chain member satisfying the given snapshot. If one is
1763 * found, we update *tid to reference that tuple's offset number, and
1764 * return true. If no match, return false without modifying *tid.
1765 *
1766 * heapTuple is a caller-supplied buffer. When a match is found, we return
1767 * the tuple here, in addition to updating *tid. If no match is found, the
1768 * contents of this buffer on return are undefined.
1769 *
1770 * If all_dead is not NULL, we check non-visible tuples to see if they are
1771 * globally dead; *all_dead is set true if all members of the HOT chain
1772 * are vacuumable, false if not.
1773 *
1774 * Unlike heap_fetch, the caller must already have pin and (at least) share
1775 * lock on the buffer; it is still pinned/locked at exit.
1776 */
1777bool
1779 Snapshot snapshot, HeapTuple heapTuple,
1780 bool *all_dead, bool first_call)
1781{
1782 Page page = BufferGetPage(buffer);
1784 BlockNumber blkno;
1785 OffsetNumber offnum;
1786 bool at_chain_start;
1787 bool valid;
1788 bool skip;
1789 GlobalVisState *vistest = NULL;
1790
1791 /* If this is not the first call, previous call returned a (live!) tuple */
1792 if (all_dead)
1794
1795 blkno = ItemPointerGetBlockNumber(tid);
1796 offnum = ItemPointerGetOffsetNumber(tid);
1798 skip = !first_call;
1799
1800 /* XXX: we should assert that a snapshot is pushed or registered */
1802 Assert(BufferGetBlockNumber(buffer) == blkno);
1803
1804 /* Scan through possible multiple members of HOT-chain */
1805 for (;;)
1806 {
1807 ItemId lp;
1808
1809 /* check for bogus TID */
1811 break;
1812
1813 lp = PageGetItemId(page, offnum);
1814
1815 /* check for unused, dead, or redirected items */
1816 if (!ItemIdIsNormal(lp))
1817 {
1818 /* We should only see a redirect at start of chain */
1820 {
1821 /* Follow the redirect */
1822 offnum = ItemIdGetRedirect(lp);
1823 at_chain_start = false;
1824 continue;
1825 }
1826 /* else must be end of chain */
1827 break;
1828 }
1829
1830 /*
1831 * Update heapTuple to point to the element of the HOT chain we're
1832 * currently investigating. Having t_self set correctly is important
1833 * because the SSI checks and the *Satisfies routine for historical
1834 * MVCC snapshots need the correct tid to decide about the visibility.
1835 */
1836 heapTuple->t_data = (HeapTupleHeader) PageGetItem(page, lp);
1837 heapTuple->t_len = ItemIdGetLength(lp);
1838 heapTuple->t_tableOid = RelationGetRelid(relation);
1839 ItemPointerSet(&heapTuple->t_self, blkno, offnum);
1840
1841 /*
1842 * Shouldn't see a HEAP_ONLY tuple at chain start.
1843 */
1845 break;
1846
1847 /*
1848 * The xmin should match the previous xmax value, else chain is
1849 * broken.
1850 */
1854 break;
1855
1856 /*
1857 * When first_call is true (and thus, skip is initially false) we'll
1858 * return the first tuple we find. But on later passes, heapTuple
1859 * will initially be pointing to the tuple we returned last time.
1860 * Returning it again would be incorrect (and would loop forever), so
1861 * we skip it and return the next match we find.
1862 */
1863 if (!skip)
1864 {
1865 /* If it's visible per the snapshot, we must return it */
1866 valid = HeapTupleSatisfiesVisibility(heapTuple, snapshot, buffer);
1868 buffer, snapshot);
1869
1870 if (valid)
1871 {
1872 ItemPointerSetOffsetNumber(tid, offnum);
1873 PredicateLockTID(relation, &heapTuple->t_self, snapshot,
1875 if (all_dead)
1876 *all_dead = false;
1877 return true;
1878 }
1879 }
1880 skip = false;
1881
1882 /*
1883 * If we can't see it, maybe no one else can either. At caller
1884 * request, check whether all chain members are dead to all
1885 * transactions.
1886 *
1887 * Note: if you change the criterion here for what is "dead", fix the
1888 * planner's get_actual_variable_range() function to match.
1889 */
1890 if (all_dead && *all_dead)
1891 {
1892 if (!vistest)
1893 vistest = GlobalVisTestFor(relation);
1894
1895 if (!HeapTupleIsSurelyDead(heapTuple, vistest))
1896 *all_dead = false;
1897 }
1898
1899 /*
1900 * Check to see if HOT chain continues past this tuple; if so fetch
1901 * the next offnum and loop around.
1902 */
1904 {
1905 Assert(ItemPointerGetBlockNumber(&heapTuple->t_data->t_ctid) ==
1906 blkno);
1907 offnum = ItemPointerGetOffsetNumber(&heapTuple->t_data->t_ctid);
1908 at_chain_start = false;
1910 }
1911 else
1912 break; /* end of chain */
1913 }
1914
1915 return false;
1916}
1917
1918/*
1919 * heap_get_latest_tid - get the latest tid of a specified tuple
1920 *
1921 * Actually, this gets the latest version that is visible according to the
1922 * scan's snapshot. Create a scan using SnapshotDirty to get the very latest,
1923 * possibly uncommitted version.
1924 *
1925 * *tid is both an input and an output parameter: it is updated to
1926 * show the latest version of the row. Note that it will not be changed
1927 * if no version of the row passes the snapshot test.
1928 */
1929void
1931 ItemPointer tid)
1932{
1933 Relation relation = sscan->rs_rd;
1934 Snapshot snapshot = sscan->rs_snapshot;
1935 ItemPointerData ctid;
1937
1938 /*
1939 * table_tuple_get_latest_tid() verified that the passed in tid is valid.
1940 * Assume that t_ctid links are valid however - there shouldn't be invalid
1941 * ones in the table.
1942 */
1944
1945 /*
1946 * Loop to chase down t_ctid links. At top of loop, ctid is the tuple we
1947 * need to examine, and *tid is the TID we will return if ctid turns out
1948 * to be bogus.
1949 *
1950 * Note that we will loop until we reach the end of the t_ctid chain.
1951 * Depending on the snapshot passed, there might be at most one visible
1952 * version of the row, but we don't try to optimize for that.
1953 */
1954 ctid = *tid;
1955 priorXmax = InvalidTransactionId; /* cannot check first XMIN */
1956 for (;;)
1957 {
1958 Buffer buffer;
1959 Page page;
1960 OffsetNumber offnum;
1961 ItemId lp;
1962 HeapTupleData tp;
1963 bool valid;
1964
1965 /*
1966 * Read, pin, and lock the page.
1967 */
1968 buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(&ctid));
1970 page = BufferGetPage(buffer);
1971
1972 /*
1973 * Check for bogus item number. This is not treated as an error
1974 * condition because it can happen while following a t_ctid link. We
1975 * just assume that the prior tid is OK and return it unchanged.
1976 */
1977 offnum = ItemPointerGetOffsetNumber(&ctid);
1979 {
1980 UnlockReleaseBuffer(buffer);
1981 break;
1982 }
1983 lp = PageGetItemId(page, offnum);
1984 if (!ItemIdIsNormal(lp))
1985 {
1986 UnlockReleaseBuffer(buffer);
1987 break;
1988 }
1989
1990 /* OK to access the tuple */
1991 tp.t_self = ctid;
1992 tp.t_data = (HeapTupleHeader) PageGetItem(page, lp);
1993 tp.t_len = ItemIdGetLength(lp);
1994 tp.t_tableOid = RelationGetRelid(relation);
1995
1996 /*
1997 * After following a t_ctid link, we might arrive at an unrelated
1998 * tuple. Check for XMIN match.
1999 */
2002 {
2003 UnlockReleaseBuffer(buffer);
2004 break;
2005 }
2006
2007 /*
2008 * Check tuple visibility; if visible, set it as the new result
2009 * candidate.
2010 */
2011 valid = HeapTupleSatisfiesVisibility(&tp, snapshot, buffer);
2012 HeapCheckForSerializableConflictOut(valid, relation, &tp, buffer, snapshot);
2013 if (valid)
2014 *tid = ctid;
2015
2016 /*
2017 * If there's a valid t_ctid link, follow it, else we're done.
2018 */
2019 if ((tp.t_data->t_infomask & HEAP_XMAX_INVALID) ||
2023 {
2024 UnlockReleaseBuffer(buffer);
2025 break;
2026 }
2027
2028 ctid = tp.t_data->t_ctid;
2030 UnlockReleaseBuffer(buffer);
2031 } /* end of loop */
2032}
2033
2034
2035/*
2036 * UpdateXmaxHintBits - update tuple hint bits after xmax transaction ends
2037 *
2038 * This is called after we have waited for the XMAX transaction to terminate.
2039 * If the transaction aborted, we guarantee the XMAX_INVALID hint bit will
2040 * be set on exit. If the transaction committed, we set the XMAX_COMMITTED
2041 * hint bit if possible --- but beware that that may not yet be possible,
2042 * if the transaction committed asynchronously.
2043 *
2044 * Note that if the transaction was a locker only, we set HEAP_XMAX_INVALID
2045 * even if it commits.
2046 *
2047 * Hence callers should look only at XMAX_INVALID.
2048 *
2049 * Note this is not allowed for tuples whose xmax is a multixact.
2050 */
2051static void
2053{
2056
2058 {
2059 if (!HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask) &&
2062 xid);
2063 else
2066 }
2067}
2068
2069
2070/*
2071 * GetBulkInsertState - prepare status object for a bulk insert
2072 */
2075{
2076 BulkInsertState bistate;
2077
2080 bistate->current_buf = InvalidBuffer;
2081 bistate->next_free = InvalidBlockNumber;
2082 bistate->last_free = InvalidBlockNumber;
2083 bistate->already_extended_by = 0;
2084 return bistate;
2085}
2086
2087/*
2088 * FreeBulkInsertState - clean up after finishing a bulk insert
2089 */
2090void
2092{
2093 if (bistate->current_buf != InvalidBuffer)
2094 ReleaseBuffer(bistate->current_buf);
2095 FreeAccessStrategy(bistate->strategy);
2096 pfree(bistate);
2097}
2098
2099/*
2100 * ReleaseBulkInsertStatePin - release a buffer currently held in bistate
2101 */
2102void
2104{
2105 if (bistate->current_buf != InvalidBuffer)
2106 ReleaseBuffer(bistate->current_buf);
2107 bistate->current_buf = InvalidBuffer;
2108
2109 /*
2110 * Despite the name, we also reset bulk relation extension state.
2111 * Otherwise we can end up erroring out due to looking for free space in
2112 * ->next_free of one partition, even though ->next_free was set when
2113 * extending another partition. It could obviously also be bad for
2114 * efficiency to look at existing blocks at offsets from another
2115 * partition, even if we don't error out.
2116 */
2117 bistate->next_free = InvalidBlockNumber;
2118 bistate->last_free = InvalidBlockNumber;
2119}
2120
2121
2122/*
2123 * heap_insert - insert tuple into a heap
2124 *
2125 * The new tuple is stamped with current transaction ID and the specified
2126 * command ID.
2127 *
2128 * See table_tuple_insert for comments about most of the input flags, except
2129 * that this routine directly takes a tuple rather than a slot.
2130 *
2131 * There's corresponding HEAP_INSERT_ options to all the TABLE_INSERT_
2132 * options, and there additionally is HEAP_INSERT_SPECULATIVE which is used to
2133 * implement table_tuple_insert_speculative().
2134 *
2135 * On return the header fields of *tup are updated to match the stored tuple;
2136 * in particular tup->t_self receives the actual TID where the tuple was
2137 * stored. But note that any toasting of fields within the tuple data is NOT
2138 * reflected into *tup.
2139 */
/*
 * NOTE(review): this extract has dropped many interior source lines (the
 * embedded upstream line numbers jump, e.g. 2143 -> 2146, 2187 -> 2189,
 * 2281 -> 2284).  Declarations (heaptup, xid), the serializable-conflict
 * check, the critical-section begin/end, and the XLog register calls are
 * missing as shown.  Compare against the upstream file before editing;
 * do not treat the text below as compilable.
 */
2140void
2142 int options, BulkInsertState bistate)
2143{
2146 Buffer buffer;
2147 Buffer vmbuffer = InvalidBuffer;
2148 bool all_visible_cleared = false;
2149
2150 /* Cheap, simplistic check that the tuple matches the rel's rowtype. */
2153
2154 AssertHasSnapshotForToast(relation);
2155
2156 /*
2157 * Fill in tuple header fields and toast the tuple if necessary.
2158 *
2159 * Note: below this point, heaptup is the data we actually intend to store
2160 * into the relation; tup is the caller's original untoasted data.
2161 */
2162 heaptup = heap_prepare_insert(relation, tup, xid, cid, options);
2163
2164 /*
2165 * Find buffer to insert this tuple into. If the page is all visible,
2166 * this will also pin the requisite visibility map page.
2167 */
2168 buffer = RelationGetBufferForTuple(relation, heaptup->t_len,
2169 InvalidBuffer, options, bistate,
2170 &vmbuffer, NULL,
2171 0);
2172
2173 /*
2174 * We're about to do the actual insert -- but check for conflict first, to
2175 * avoid possibly having to roll back work we've just done.
2176 *
2177 * This is safe without a recheck as long as there is no possibility of
2178 * another process scanning the page between this check and the insert
2179 * being visible to the scan (i.e., an exclusive buffer content lock is
2180 * continuously held from this point until the tuple insert is visible).
2181 *
2182 * For a heap insert, we only need to check for table-level SSI locks. Our
2183 * new tuple can't possibly conflict with existing tuple locks, and heap
2184 * page locks are only consolidated versions of tuple locks; they do not
2185 * lock "gaps" as index page locks do. So we don't need to specify a
2186 * buffer when making the call, which makes for a faster check.
2187 */
2189
2190 /* NO EREPORT(ERROR) from here till changes are logged */
2192
2193 RelationPutHeapTuple(relation, buffer, heaptup,
2195
2196 if (PageIsAllVisible(BufferGetPage(buffer)))
2197 {
2198 all_visible_cleared = true;
2200 visibilitymap_clear(relation,
2202 vmbuffer, VISIBILITYMAP_VALID_BITS);
2203 }
2204
2205 /*
2206 * XXX Should we set PageSetPrunable on this page ?
2207 *
2208 * The inserting transaction may eventually abort thus making this tuple
2209 * DEAD and hence available for pruning. Though we don't want to optimize
2210 * for aborts, if no other tuple in this page is UPDATEd/DELETEd, the
2211 * aborted tuple will never be pruned until next vacuum is triggered.
2212 *
2213 * If you do add PageSetPrunable here, add it in heap_xlog_insert too.
2214 */
2215
2216 MarkBufferDirty(buffer);
2217
2218 /* XLOG stuff */
2219 if (RelationNeedsWAL(relation))
2220 {
2224 Page page = BufferGetPage(buffer);
2225 uint8 info = XLOG_HEAP_INSERT;
2226 int bufflags = 0;
2227
2228 /*
2229 * If this is a catalog, we need to transmit combo CIDs to properly
2230 * decode, so log that as well.
2231 */
2233 log_heap_new_cid(relation, heaptup);
2234
2235 /*
2236 * If this is the single and first tuple on page, we can reinit the
2237 * page instead of restoring the whole thing. Set flag, and hide
2238 * buffer references from XLogInsert.
2239 */
2242 {
2243 info |= XLOG_HEAP_INIT_PAGE;
2245 }
2246
2247 xlrec.offnum = ItemPointerGetOffsetNumber(&heaptup->t_self);
2248 xlrec.flags = 0;
2254
2255 /*
2256 * For logical decoding, we need the tuple even if we're doing a full
2257 * page write, so make sure it's included even if we take a full-page
2258 * image. (XXX We could alternatively store a pointer into the FPW).
2259 */
2260 if (RelationIsLogicallyLogged(relation) &&
2262 {
2265
2266 if (IsToastRelation(relation))
2268 }
2269
2272
2273 xlhdr.t_infomask2 = heaptup->t_data->t_infomask2;
2274 xlhdr.t_infomask = heaptup->t_data->t_infomask;
2275 xlhdr.t_hoff = heaptup->t_data->t_hoff;
2276
2277 /*
2278 * note we mark xlhdr as belonging to buffer; if XLogInsert decides to
2279 * write the whole page to the xlog, we don't need to store
2280 * xl_heap_header in the xlog.
2281 */
2284 /* PG73FORMAT: write bitmap [+ padding] [+ oid] + data */
2286 (char *) heaptup->t_data + SizeofHeapTupleHeader,
2288
2289 /* filtering by origin on a row level is much more efficient */
2291
2292 recptr = XLogInsert(RM_HEAP_ID, info);
2293
2294 PageSetLSN(page, recptr);
2295 }
2296
2298
2299 UnlockReleaseBuffer(buffer);
2300 if (vmbuffer != InvalidBuffer)
2301 ReleaseBuffer(vmbuffer);
2302
2303 /*
2304 * If tuple is cacheable, mark it for invalidation from the caches in case
2305 * we abort. Note it is OK to do this after releasing the buffer, because
2306 * the heaptup data structure is all in local memory, not in the shared
2307 * buffer.
2308 */
2310
2311 /* Note: speculative insertions are counted too, even if aborted later */
2312 pgstat_count_heap_insert(relation, 1);
2313
2314 /*
2315 * If heaptup is a private copy, release it. Don't forget to copy t_self
2316 * back to the caller's image, too.
2317 */
2318 if (heaptup != tup)
2319 {
2320 tup->t_self = heaptup->t_self;
2322 }
2323}
2324
2325/*
2326 * Subroutine for heap_insert(). Prepares a tuple for insertion. This sets the
2327 * tuple header fields and toasts the tuple if necessary. Returns a toasted
2328 * version of the tuple if it was toasted, or the original tuple if not. Note
2329 * that in any case, the header fields are also set in the original tuple.
2330 */
2331static HeapTuple
2333 CommandId cid, int options)
2334{
2335 /*
2336 * To allow parallel inserts, we need to ensure that they are safe to be
2337 * performed in workers. We have the infrastructure to allow parallel
2338 * inserts in general except for the cases where inserts generate a new
2339 * CommandId (eg. inserts into a table having a foreign key column).
2340 */
2341 if (IsParallelWorker())
2342 ereport(ERROR,
2344 errmsg("cannot insert tuples in a parallel worker")));
2345
2346 tup->t_data->t_infomask &= ~(HEAP_XACT_MASK);
2347 tup->t_data->t_infomask2 &= ~(HEAP2_XACT_MASK);
2348 tup->t_data->t_infomask |= HEAP_XMAX_INVALID;
2349 HeapTupleHeaderSetXmin(tup->t_data, xid);
2352
2353 HeapTupleHeaderSetCmin(tup->t_data, cid);
2354 HeapTupleHeaderSetXmax(tup->t_data, 0); /* for cleanliness */
2355 tup->t_tableOid = RelationGetRelid(relation);
2356
2357 /*
2358 * If the new tuple is too big for storage or contains already toasted
2359 * out-of-line attributes from some other relation, invoke the toaster.
2360 */
2361 if (relation->rd_rel->relkind != RELKIND_RELATION &&
2362 relation->rd_rel->relkind != RELKIND_MATVIEW)
2363 {
2364 /* toast table entries should never be recursively toasted */
2366 return tup;
2367 }
2368 else if (HeapTupleHasExternal(tup) || tup->t_len > TOAST_TUPLE_THRESHOLD)
2369 return heap_toast_insert_or_update(relation, tup, NULL, options);
2370 else
2371 return tup;
2372}
2373
2374/*
2375 * Helper for heap_multi_insert() that computes the number of entire pages
2376 * that inserting the remaining heaptuples requires. Used to determine how
2377 * much the relation needs to be extended by.
2378 */
2379static int
2381{
2383 int npages = 1;
2384
2385 for (int i = done; i < ntuples; i++)
2386 {
2387 size_t tup_sz = sizeof(ItemIdData) + MAXALIGN(heaptuples[i]->t_len);
2388
2389 if (page_avail < tup_sz)
2390 {
2391 npages++;
2393 }
2394 page_avail -= tup_sz;
2395 }
2396
2397 return npages;
2398}
2399
2400/*
2401 * heap_multi_insert - insert multiple tuples into a heap
2402 *
2403 * This is like heap_insert(), but inserts multiple tuples in one operation.
2404 * That's faster than calling heap_insert() in a loop, because when multiple
2405 * tuples can be inserted on a single page, we can write just a single WAL
2406 * record covering all of them, and only need to lock/unlock the page once.
2407 *
2408 * Note: this leaks memory into the current memory context. You can create a
2409 * temporary context before calling this, if that's a problem.
2410 */
/*
 * NOTE(review): this extract has dropped many interior source lines (the
 * embedded upstream line numbers jump, e.g. 2414 -> 2417, 2597 -> 2600,
 * 2656 -> 2659).  Several declarations (heaptuples, xid, saveFreeSpace,
 * need_tuple_data, need_cids), critical-section begin/end, and some XLog
 * register calls are missing as shown.  Compare against the upstream file
 * before editing; do not treat the text below as compilable.
 */
2411void
2412heap_multi_insert(Relation relation, TupleTableSlot **slots, int ntuples,
2413 CommandId cid, int options, BulkInsertState bistate)
2414{
2417 int i;
2418 int ndone;
2420 Page page;
2421 Buffer vmbuffer = InvalidBuffer;
2422 bool needwal;
2426 bool starting_with_empty_page = false;
2427 int npages = 0;
2428 int npages_used = 0;
2429
2430 /* currently not needed (thus unsupported) for heap_multi_insert() */
2432
2433 AssertHasSnapshotForToast(relation);
2434
2435 needwal = RelationNeedsWAL(relation);
2438
2439 /* Toast and set header data in all the slots */
2440 heaptuples = palloc(ntuples * sizeof(HeapTuple));
2441 for (i = 0; i < ntuples; i++)
2442 {
2443 HeapTuple tuple;
2444
2445 tuple = ExecFetchSlotHeapTuple(slots[i], true, NULL);
2446 slots[i]->tts_tableOid = RelationGetRelid(relation);
2447 tuple->t_tableOid = slots[i]->tts_tableOid;
2448 heaptuples[i] = heap_prepare_insert(relation, tuple, xid, cid,
2449 options);
2450 }
2451
2452 /*
2453 * We're about to do the actual inserts -- but check for conflict first,
2454 * to minimize the possibility of having to roll back work we've just
2455 * done.
2456 *
2457 * A check here does not definitively prevent a serialization anomaly;
2458 * that check MUST be done at least past the point of acquiring an
2459 * exclusive buffer content lock on every buffer that will be affected,
2460 * and MAY be done after all inserts are reflected in the buffers and
2461 * those locks are released; otherwise there is a race condition. Since
2462 * multiple buffers can be locked and unlocked in the loop below, and it
2463 * would not be feasible to identify and lock all of those buffers before
2464 * the loop, we must do a final check at the end.
2465 *
2466 * The check here could be omitted with no loss of correctness; it is
2467 * present strictly as an optimization.
2468 *
2469 * For heap inserts, we only need to check for table-level SSI locks. Our
2470 * new tuples can't possibly conflict with existing tuple locks, and heap
2471 * page locks are only consolidated versions of tuple locks; they do not
2472 * lock "gaps" as index page locks do. So we don't need to specify a
2473 * buffer when making the call, which makes for a faster check.
2474 */
2476
2477 ndone = 0;
2478 while (ndone < ntuples)
2479 {
2480 Buffer buffer;
2481 bool all_visible_cleared = false;
2482 bool all_frozen_set = false;
2483 int nthispage;
2484
2486
2487 /*
2488 * Compute number of pages needed to fit the to-be-inserted tuples in
2489 * the worst case. This will be used to determine how much to extend
2490 * the relation by in RelationGetBufferForTuple(), if needed. If we
2491 * filled a prior page from scratch, we can just update our last
2492 * computation, but if we started with a partially filled page,
2493 * recompute from scratch, the number of potentially required pages
2494 * can vary due to tuples needing to fit onto the page, page headers
2495 * etc.
2496 */
2497 if (ndone == 0 || !starting_with_empty_page)
2498 {
2499 npages = heap_multi_insert_pages(heaptuples, ndone, ntuples,
2501 npages_used = 0;
2502 }
2503 else
2504 npages_used++;
2505
2506 /*
2507 * Find buffer where at least the next tuple will fit. If the page is
2508 * all-visible, this will also pin the requisite visibility map page.
2509 *
2510 * Also pin visibility map page if COPY FREEZE inserts tuples into an
2511 * empty page. See all_frozen_set below.
2512 */
2513 buffer = RelationGetBufferForTuple(relation, heaptuples[ndone]->t_len,
2514 InvalidBuffer, options, bistate,
2515 &vmbuffer, NULL,
2516 npages - npages_used);
2517 page = BufferGetPage(buffer);
2518
2520
2522 {
2523 all_frozen_set = true;
2524 /* Lock the vmbuffer before entering the critical section */
2526 }
2527
2528 /* NO EREPORT(ERROR) from here till changes are logged */
2530
2531 /*
2532 * RelationGetBufferForTuple has ensured that the first tuple fits.
2533 * Put that on the page, and then as many other tuples as fit.
2534 */
2535 RelationPutHeapTuple(relation, buffer, heaptuples[ndone], false);
2536
2537 /*
2538 * For logical decoding we need combo CIDs to properly decode the
2539 * catalog.
2540 */
2541 if (needwal && need_cids)
2542 log_heap_new_cid(relation, heaptuples[ndone]);
2543
2544 for (nthispage = 1; ndone + nthispage < ntuples; nthispage++)
2545 {
2547
2548 if (PageGetHeapFreeSpace(page) < MAXALIGN(heaptup->t_len) + saveFreeSpace)
2549 break;
2550
2551 RelationPutHeapTuple(relation, buffer, heaptup, false);
2552
2553 /*
2554 * For logical decoding we need combo CIDs to properly decode the
2555 * catalog.
2556 */
2557 if (needwal && need_cids)
2558 log_heap_new_cid(relation, heaptup);
2559 }
2560
2561 /*
2562 * If the page is all visible, need to clear that, unless we're only
2563 * going to add further frozen rows to it.
2564 *
2565 * If we're only adding already frozen rows to a previously empty
2566 * page, mark it as all-frozen and update the visibility map. We're
2567 * already holding a pin on the vmbuffer.
2568 */
2570 {
2571 all_visible_cleared = true;
2572 PageClearAllVisible(page);
2573 visibilitymap_clear(relation,
2574 BufferGetBlockNumber(buffer),
2575 vmbuffer, VISIBILITYMAP_VALID_BITS);
2576 }
2577 else if (all_frozen_set)
2578 {
2579 PageSetAllVisible(page);
2580 PageClearPrunable(page);
2582 vmbuffer,
2585 relation->rd_locator);
2586 }
2587
2588 /*
2589 * XXX Should we set PageSetPrunable on this page ? See heap_insert()
2590 */
2591
2592 MarkBufferDirty(buffer);
2593
2594 /* XLOG stuff */
2595 if (needwal)
2596 {
2600 char *tupledata;
2601 int totaldatalen;
2602 char *scratchptr = scratch.data;
2603 bool init;
2604 int bufflags = 0;
2605
2606 /*
2607 * If the page was previously empty, we can reinit the page
2608 * instead of restoring the whole thing.
2609 */
2611
2612 /* allocate xl_heap_multi_insert struct from the scratch area */
2615
2616 /*
2617 * Allocate offsets array. Unless we're reinitializing the page,
2618 * in that case the tuples are stored in order starting at
2619 * FirstOffsetNumber and we don't need to store the offsets
2620 * explicitly.
2621 */
2622 if (!init)
2623 scratchptr += nthispage * sizeof(OffsetNumber);
2624
2625 /* the rest of the scratch space is used for tuple data */
2626 tupledata = scratchptr;
2627
2628 /* check that the mutually exclusive flags are not both set */
2630
2631 xlrec->flags = 0;
2634
2635 /*
2636 * We don't have to worry about including a conflict xid in the
2637 * WAL record, as HEAP_INSERT_FROZEN intentionally violates
2638 * visibility rules.
2639 */
2640 if (all_frozen_set)
2642
2643 xlrec->ntuples = nthispage;
2644
2645 /*
2646 * Write out an xl_multi_insert_tuple and the tuple data itself
2647 * for each tuple.
2648 */
2649 for (i = 0; i < nthispage; i++)
2650 {
2652 xl_multi_insert_tuple *tuphdr;
2653 int datalen;
2654
2655 if (!init)
2656 xlrec->offsets[i] = ItemPointerGetOffsetNumber(&heaptup->t_self);
2657 /* xl_multi_insert_tuple needs two-byte alignment. */
2659 scratchptr = ((char *) tuphdr) + SizeOfMultiInsertTuple;
2660
2661 tuphdr->t_infomask2 = heaptup->t_data->t_infomask2;
2662 tuphdr->t_infomask = heaptup->t_data->t_infomask;
2663 tuphdr->t_hoff = heaptup->t_data->t_hoff;
2664
2665 /* write bitmap [+ padding] [+ oid] + data */
2666 datalen = heaptup->t_len - SizeofHeapTupleHeader;
2668 (char *) heaptup->t_data + SizeofHeapTupleHeader,
2669 datalen);
2670 tuphdr->datalen = datalen;
2671 scratchptr += datalen;
2672 }
2673 totaldatalen = scratchptr - tupledata;
2674 Assert((scratchptr - scratch.data) < BLCKSZ);
2675
2676 if (need_tuple_data)
2678
2679 /*
2680 * Signal that this is the last xl_heap_multi_insert record
2681 * emitted by this call to heap_multi_insert(). Needed for logical
2682 * decoding so it knows when to cleanup temporary data.
2683 */
2684 if (ndone + nthispage == ntuples)
2686
2687 if (init)
2688 {
2689 info |= XLOG_HEAP_INIT_PAGE;
2691 }
2692
2693 /*
2694 * If we're doing logical decoding, include the new tuple data
2695 * even if we take a full-page image of the page.
2696 */
2697 if (need_tuple_data)
2699
2701 XLogRegisterData(xlrec, tupledata - scratch.data);
2703 if (all_frozen_set)
2704 XLogRegisterBuffer(1, vmbuffer, 0);
2705
2706 XLogRegisterBufData(0, tupledata, totaldatalen);
2707
2708 /* filtering by origin on a row level is much more efficient */
2710
2711 recptr = XLogInsert(RM_HEAP2_ID, info);
2712
2713 PageSetLSN(page, recptr);
2714 if (all_frozen_set)
2715 {
2716 Assert(BufferIsDirty(vmbuffer));
2717 PageSetLSN(BufferGetPage(vmbuffer), recptr);
2718 }
2719 }
2720
2722
2723 if (all_frozen_set)
2724 LockBuffer(vmbuffer, BUFFER_LOCK_UNLOCK);
2725
2726 UnlockReleaseBuffer(buffer);
2727 ndone += nthispage;
2728
2729 /*
2730 * NB: Only release vmbuffer after inserting all tuples - it's fairly
2731 * likely that we'll insert into subsequent heap pages that are likely
2732 * to use the same vm page.
2733 */
2734 }
2735
2736 /* We're done with inserting all tuples, so release the last vmbuffer. */
2737 if (vmbuffer != InvalidBuffer)
2738 ReleaseBuffer(vmbuffer);
2739
2740 /*
2741 * We're done with the actual inserts. Check for conflicts again, to
2742 * ensure that all rw-conflicts in to these inserts are detected. Without
2743 * this final check, a sequential scan of the heap may have locked the
2744 * table after the "before" check, missing one opportunity to detect the
2745 * conflict, and then scanned the table before the new tuples were there,
2746 * missing the other chance to detect the conflict.
2747 *
2748 * For heap inserts, we only need to check for table-level SSI locks. Our
2749 * new tuples can't possibly conflict with existing tuple locks, and heap
2750 * page locks are only consolidated versions of tuple locks; they do not
2751 * lock "gaps" as index page locks do. So we don't need to specify a
2752 * buffer when making the call.
2753 */
2755
2756 /*
2757 * If tuples are cacheable, mark them for invalidation from the caches in
2758 * case we abort. Note it is OK to do this after releasing the buffer,
2759 * because the heaptuples data structure is all in local memory, not in
2760 * the shared buffer.
2761 */
2762 if (IsCatalogRelation(relation))
2763 {
2764 for (i = 0; i < ntuples; i++)
2766 }
2767
2768 /* copy t_self fields back to the caller's slots */
2769 for (i = 0; i < ntuples; i++)
2770 slots[i]->tts_tid = heaptuples[i]->t_self;
2771
2772 pgstat_count_heap_insert(relation, ntuples);
2773}
2774
2775/*
2776 * simple_heap_insert - insert a tuple
2777 *
2778 * Currently, this routine differs from heap_insert only in supplying
2779 * a default command ID and not allowing access to the speedup options.
2780 *
2781 * This should be used rather than using heap_insert directly in most places
2782 * where we are modifying system catalogs.
2783 */
2784void
2786{
2787 heap_insert(relation, tup, GetCurrentCommandId(true), 0, NULL);
2788}
2789
2790/*
2791 * Given infomask/infomask2, compute the bits that must be saved in the
2792 * "infobits" field of xl_heap_delete, xl_heap_update, xl_heap_lock,
2793 * xl_heap_lock_updated WAL records.
2794 *
2795 * See fix_infomask_from_infobits.
2796 */
2797static uint8
2799{
2800 return
2804 /* note we ignore HEAP_XMAX_SHR_LOCK here */
2806 ((infomask2 & HEAP_KEYS_UPDATED) != 0 ?
2807 XLHL_KEYS_UPDATED : 0);
2808}
2809
2810/*
2811 * Given two versions of the same t_infomask for a tuple, compare them and
2812 * return whether the relevant status for a tuple Xmax has changed. This is
2813 * used after a buffer lock has been released and reacquired: we want to ensure
2814 * that the tuple state continues to be the same it was when we previously
2815 * examined it.
2816 *
2817 * Note the Xmax field itself must be compared separately.
2818 */
2819static inline bool
2821{
2822 const uint16 interesting =
2824
2825 if ((new_infomask & interesting) != (old_infomask & interesting))
2826 return true;
2827
2828 return false;
2829}
2830
2831/*
2832 * heap_delete - delete a tuple
2833 *
2834 * See table_tuple_delete() for an explanation of the parameters, except that
2835 * this routine directly takes a tuple rather than a slot.
2836 *
2837 * In the failure cases, the routine fills *tmfd with the tuple's t_ctid,
2838 * t_xmax (resolving a possible MultiXact, if necessary), and t_cmax (the last
2839 * only for TM_SelfModified, since we cannot obtain cmax from a combo CID
2840 * generated by another transaction).
2841 */
/*
 * NOTE(review): this extract has dropped many interior source lines (the
 * embedded upstream line numbers jump, e.g. 2846 -> 2849, 2890 -> 2892,
 * 3082 -> 3084).  The TM_Result return-type line, several declarations
 * (xid, infomask saves), buffer lock/unlock calls, the multixact wait
 * calls, critical-section begin/end, and XLog register calls are missing
 * as shown.  Compare against the upstream file before editing; do not
 * treat the text below as compilable.
 */
2843heap_delete(Relation relation, const ItemPointerData *tid,
2844 CommandId cid, Snapshot crosscheck, bool wait,
2845 TM_FailureData *tmfd, bool changingPart)
2846{
2847 TM_Result result;
2849 ItemId lp;
2850 HeapTupleData tp;
2851 Page page;
2852 BlockNumber block;
2853 Buffer buffer;
2854 Buffer vmbuffer = InvalidBuffer;
2855 TransactionId new_xmax;
2858 bool have_tuple_lock = false;
2859 bool iscombo;
2860 bool all_visible_cleared = false;
2861 HeapTuple old_key_tuple = NULL; /* replica identity of the tuple */
2862 bool old_key_copied = false;
2863
2865
2866 AssertHasSnapshotForToast(relation);
2867
2868 /*
2869 * Forbid this during a parallel operation, lest it allocate a combo CID.
2870 * Other workers might need that combo CID for visibility checks, and we
2871 * have no provision for broadcasting it to them.
2872 */
2873 if (IsInParallelMode())
2874 ereport(ERROR,
2876 errmsg("cannot delete tuples during a parallel operation")));
2877
2878 block = ItemPointerGetBlockNumber(tid);
2879 buffer = ReadBuffer(relation, block);
2880 page = BufferGetPage(buffer);
2881
2882 /*
2883 * Before locking the buffer, pin the visibility map page if it appears to
2884 * be necessary. Since we haven't got the lock yet, someone else might be
2885 * in the middle of changing this, so we'll need to recheck after we have
2886 * the lock.
2887 */
2888 if (PageIsAllVisible(page))
2889 visibilitymap_pin(relation, block, &vmbuffer);
2890
2892
2895
2896 tp.t_tableOid = RelationGetRelid(relation);
2897 tp.t_data = (HeapTupleHeader) PageGetItem(page, lp);
2898 tp.t_len = ItemIdGetLength(lp);
2899 tp.t_self = *tid;
2900
2901l1:
2902
2903 /*
2904 * If we didn't pin the visibility map page and the page has become all
2905 * visible while we were busy locking the buffer, we'll have to unlock and
2906 * re-lock, to avoid holding the buffer lock across an I/O. That's a bit
2907 * unfortunate, but hopefully shouldn't happen often.
2908 */
2909 if (vmbuffer == InvalidBuffer && PageIsAllVisible(page))
2910 {
2912 visibilitymap_pin(relation, block, &vmbuffer);
2914 }
2915
2916 result = HeapTupleSatisfiesUpdate(&tp, cid, buffer);
2917
2918 if (result == TM_Invisible)
2919 {
2920 UnlockReleaseBuffer(buffer);
2921 ereport(ERROR,
2923 errmsg("attempted to delete invisible tuple")));
2924 }
2925 else if (result == TM_BeingModified && wait)
2926 {
2929
2930 /* must copy state data before unlocking buffer */
2933
2934 /*
2935 * Sleep until concurrent transaction ends -- except when there's a
2936 * single locker and it's our own transaction. Note we don't care
2937 * which lock mode the locker has, because we need the strongest one.
2938 *
2939 * Before sleeping, we need to acquire tuple lock to establish our
2940 * priority for the tuple (see heap_lock_tuple). LockTuple will
2941 * release us when we are next-in-line for the tuple.
2942 *
2943 * If we are forced to "start over" below, we keep the tuple lock;
2944 * this arranges that we stay at the head of the line while rechecking
2945 * tuple state.
2946 */
2948 {
2949 bool current_is_member = false;
2950
2953 {
2955
2956 /*
2957 * Acquire the lock, if necessary (but skip it when we're
2958 * requesting a lock and already have one; avoids deadlock).
2959 */
2960 if (!current_is_member)
2963
2964 /* wait for multixact */
2966 relation, &(tp.t_self), XLTW_Delete,
2967 NULL);
2969
2970 /*
2971 * If xwait had just locked the tuple then some other xact
2972 * could update this tuple before we get to this point. Check
2973 * for xmax change, and start over if so.
2974 *
2975 * We also must start over if we didn't pin the VM page, and
2976 * the page has become all visible.
2977 */
2978 if ((vmbuffer == InvalidBuffer && PageIsAllVisible(page)) ||
2981 xwait))
2982 goto l1;
2983 }
2984
2985 /*
2986 * You might think the multixact is necessarily done here, but not
2987 * so: it could have surviving members, namely our own xact or
2988 * other subxacts of this backend. It is legal for us to delete
2989 * the tuple in either case, however (the latter case is
2990 * essentially a situation of upgrading our former shared lock to
2991 * exclusive). We don't bother changing the on-disk hint bits
2992 * since we are about to overwrite the xmax altogether.
2993 */
2994 }
2996 {
2997 /*
2998 * Wait for regular transaction to end; but first, acquire tuple
2999 * lock.
3000 */
3004 XactLockTableWait(xwait, relation, &(tp.t_self), XLTW_Delete);
3006
3007 /*
3008 * xwait is done, but if xwait had just locked the tuple then some
3009 * other xact could update this tuple before we get to this point.
3010 * Check for xmax change, and start over if so.
3011 *
3012 * We also must start over if we didn't pin the VM page, and the
3013 * page has become all visible.
3014 */
3015 if ((vmbuffer == InvalidBuffer && PageIsAllVisible(page)) ||
3018 xwait))
3019 goto l1;
3020
3021 /* Otherwise check if it committed or aborted */
3022 UpdateXmaxHintBits(tp.t_data, buffer, xwait);
3023 }
3024
3025 /*
3026 * We may overwrite if previous xmax aborted, or if it committed but
3027 * only locked the tuple without updating it.
3028 */
3029 if ((tp.t_data->t_infomask & HEAP_XMAX_INVALID) ||
3032 result = TM_Ok;
3033 else if (!ItemPointerEquals(&tp.t_self, &tp.t_data->t_ctid))
3034 result = TM_Updated;
3035 else
3036 result = TM_Deleted;
3037 }
3038
3039 /* sanity check the result HeapTupleSatisfiesUpdate() and the logic above */
3040 if (result != TM_Ok)
3041 {
3042 Assert(result == TM_SelfModified ||
3043 result == TM_Updated ||
3044 result == TM_Deleted ||
3045 result == TM_BeingModified);
3047 Assert(result != TM_Updated ||
3049 }
3050
3051 if (crosscheck != InvalidSnapshot && result == TM_Ok)
3052 {
3053 /* Perform additional check for transaction-snapshot mode RI updates */
3054 if (!HeapTupleSatisfiesVisibility(&tp, crosscheck, buffer))
3055 result = TM_Updated;
3056 }
3057
3058 if (result != TM_Ok)
3059 {
3060 tmfd->ctid = tp.t_data->t_ctid;
3062 if (result == TM_SelfModified)
3064 else
3065 tmfd->cmax = InvalidCommandId;
3066 UnlockReleaseBuffer(buffer);
3067 if (have_tuple_lock)
3069 if (vmbuffer != InvalidBuffer)
3070 ReleaseBuffer(vmbuffer);
3071 return result;
3072 }
3073
3074 /*
3075 * We're about to do the actual delete -- check for conflict first, to
3076 * avoid possibly having to roll back work we've just done.
3077 *
3078 * This is safe without a recheck as long as there is no possibility of
3079 * another process scanning the page between this check and the delete
3080 * being visible to the scan (i.e., an exclusive buffer content lock is
3081 * continuously held from this point until the tuple delete is visible).
3082 */
3084
3085 /* replace cid with a combo CID if necessary */
3087
3088 /*
3089 * Compute replica identity tuple before entering the critical section so
3090 * we don't PANIC upon a memory allocation failure.
3091 */
3092 old_key_tuple = ExtractReplicaIdentity(relation, &tp, true, &old_key_copied);
3093
3094 /*
3095 * If this is the first possibly-multixact-able operation in the current
3096 * transaction, set my per-backend OldestMemberMXactId setting. We can be
3097 * certain that the transaction will never become a member of any older
3098 * MultiXactIds than that. (We have to do this even if we end up just
3099 * using our own TransactionId below, since some other backend could
3100 * incorporate our XID into a MultiXact immediately afterwards.)
3101 */
3103
3106 xid, LockTupleExclusive, true,
3107 &new_xmax, &new_infomask, &new_infomask2);
3108
3110
3111 /*
3112 * If this transaction commits, the tuple will become DEAD sooner or
3113 * later. Set flag that this page is a candidate for pruning once our xid
3114 * falls below the OldestXmin horizon. If the transaction finally aborts,
3115 * the subsequent page pruning will be a no-op and the hint will be
3116 * cleared.
3117 */
3118 PageSetPrunable(page, xid);
3119
3120 if (PageIsAllVisible(page))
3121 {
3122 all_visible_cleared = true;
3123 PageClearAllVisible(page);
3124 visibilitymap_clear(relation, BufferGetBlockNumber(buffer),
3125 vmbuffer, VISIBILITYMAP_VALID_BITS);
3126 }
3127
3128 /* store transaction information of xact deleting the tuple */
3134 HeapTupleHeaderSetXmax(tp.t_data, new_xmax);
3136 /* Make sure there is no forward chain link in t_ctid */
3137 tp.t_data->t_ctid = tp.t_self;
3138
3139 /* Signal that this is actually a move into another partition */
3140 if (changingPart)
3142
3143 MarkBufferDirty(buffer);
3144
3145 /*
3146 * XLOG stuff
3147 *
3148 * NB: heap_abort_speculative() uses the same xlog record and replay
3149 * routines.
3150 */
3151 if (RelationNeedsWAL(relation))
3152 {
3156
3157 /*
3158 * For logical decode we need combo CIDs to properly decode the
3159 * catalog
3160 */
3162 log_heap_new_cid(relation, &tp);
3163
3164 xlrec.flags = 0;
3167 if (changingPart)
3169 xlrec.infobits_set = compute_infobits(tp.t_data->t_infomask,
3170 tp.t_data->t_infomask2);
3172 xlrec.xmax = new_xmax;
3173
3174 if (old_key_tuple != NULL)
3175 {
3176 if (relation->rd_rel->relreplident == REPLICA_IDENTITY_FULL)
3178 else
3180 }
3181
3184
3186
3187 /*
3188 * Log replica identity of the deleted tuple if there is one
3189 */
3190 if (old_key_tuple != NULL)
3191 {
3192 xlhdr.t_infomask2 = old_key_tuple->t_data->t_infomask2;
3193 xlhdr.t_infomask = old_key_tuple->t_data->t_infomask;
3194 xlhdr.t_hoff = old_key_tuple->t_data->t_hoff;
3195
3197 XLogRegisterData((char *) old_key_tuple->t_data
3199 old_key_tuple->t_len
3201 }
3202
3203 /* filtering by origin on a row level is much more efficient */
3205
3207
3208 PageSetLSN(page, recptr);
3209 }
3210
3212
3214
3215 if (vmbuffer != InvalidBuffer)
3216 ReleaseBuffer(vmbuffer);
3217
3218 /*
3219 * If the tuple has toasted out-of-line attributes, we need to delete
3220 * those items too. We have to do this before releasing the buffer
3221 * because we need to look at the contents of the tuple, but it's OK to
3222 * release the content lock on the buffer first.
3223 */
3224 if (relation->rd_rel->relkind != RELKIND_RELATION &&
3225 relation->rd_rel->relkind != RELKIND_MATVIEW)
3226 {
3227 /* toast table entries should never be recursively toasted */
3229 }
3230 else if (HeapTupleHasExternal(&tp))
3231 heap_toast_delete(relation, &tp, false);
3232
3233 /*
3234 * Mark tuple for invalidation from system caches at next command
3235 * boundary. We have to do this before releasing the buffer because we
3236 * need to look at the contents of the tuple.
3237 */
3238 CacheInvalidateHeapTuple(relation, &tp, NULL);
3239
3240 /* Now we can release the buffer */
3241 ReleaseBuffer(buffer);
3242
3243 /*
3244 * Release the lmgr tuple lock, if we had it.
3245 */
3246 if (have_tuple_lock)
3248
3249 pgstat_count_heap_delete(relation);
3250
3253
3254 return TM_Ok;
3255}
3256
3257/*
3258 * simple_heap_delete - delete a tuple
3259 *
3260 * This routine may be used to delete a tuple when concurrent updates of
3261 * the target tuple are not expected (for example, because we have a lock
3262 * on the relation associated with the tuple). Any failure is reported
3263 * via ereport().
3264 */
3265void
3266simple_heap_delete(Relation relation, const ItemPointerData *tid)
3267{
3268 TM_Result result;
3269 TM_FailureData tmfd;
3270
3271 result = heap_delete(relation, tid,
3273 true /* wait for commit */ ,
3274 &tmfd, false /* changingPart */ );
3275 switch (result)
3276 {
3277 case TM_SelfModified:
3278 /* Tuple was already updated in current command? */
3279 elog(ERROR, "tuple already updated by self");
3280 break;
3281
3282 case TM_Ok:
3283 /* done successfully */
3284 break;
3285
3286 case TM_Updated:
3287 elog(ERROR, "tuple concurrently updated");
3288 break;
3289
3290 case TM_Deleted:
3291 elog(ERROR, "tuple concurrently deleted");
3292 break;
3293
3294 default:
3295 elog(ERROR, "unrecognized heap_delete status: %u", result);
3296 break;
3297 }
3298}
3299
3300/*
3301 * heap_update - replace a tuple
3302 *
3303 * See table_tuple_update() for an explanation of the parameters, except that
3304 * this routine directly takes a tuple rather than a slot.
3305 *
3306 * In the failure cases, the routine fills *tmfd with the tuple's t_ctid,
3307 * t_xmax (resolving a possible MultiXact, if necessary), and t_cmax (the last
3308 * only for TM_SelfModified, since we cannot obtain cmax from a combo CID
3309 * generated by another transaction).
3310 */
3313 CommandId cid, Snapshot crosscheck, bool wait,
3314 TM_FailureData *tmfd, LockTupleMode *lockmode,
3316{
3317 TM_Result result;
3325 ItemId lp;
3329 bool old_key_copied = false;
3330 Page page;
3331 BlockNumber block;
3333 Buffer buffer,
3334 newbuf,
3335 vmbuffer = InvalidBuffer,
3337 bool need_toast;
3339 pagefree;
3340 bool have_tuple_lock = false;
3341 bool iscombo;
3342 bool use_hot_update = false;
3343 bool summarized_update = false;
3344 bool key_intact;
3345 bool all_visible_cleared = false;
3346 bool all_visible_cleared_new = false;
3347 bool checked_lockers;
3348 bool locker_remains;
3349 bool id_has_external = false;
3356
3358
3359 /* Cheap, simplistic check that the tuple matches the rel's rowtype. */
3362
3363 AssertHasSnapshotForToast(relation);
3364
3365 /*
3366 * Forbid this during a parallel operation, lest it allocate a combo CID.
3367 * Other workers might need that combo CID for visibility checks, and we
3368 * have no provision for broadcasting it to them.
3369 */
3370 if (IsInParallelMode())
3371 ereport(ERROR,
3373 errmsg("cannot update tuples during a parallel operation")));
3374
3375#ifdef USE_ASSERT_CHECKING
3377#endif
3378
3379 /*
3380 * Fetch the list of attributes to be checked for various operations.
3381 *
3382 * For HOT considerations, this is wasted effort if we fail to update or
3383 * have to put the new tuple on a different page. But we must compute the
3384 * list before obtaining buffer lock --- in the worst case, if we are
3385 * doing an update on one of the relevant system catalogs, we could
3386 * deadlock if we try to fetch the list later. In any case, the relcache
3387 * caches the data so this is usually pretty cheap.
3388 *
3389 * We also need columns used by the replica identity and columns that are
3390 * considered the "key" of rows in the table.
3391 *
3392 * Note that we get copies of each bitmap, so we need not worry about
3393 * relcache flush happening midway through.
3394 */
3407
3409 INJECTION_POINT("heap_update-before-pin", NULL);
3410 buffer = ReadBuffer(relation, block);
3411 page = BufferGetPage(buffer);
3412
3413 /*
3414 * Before locking the buffer, pin the visibility map page if it appears to
3415 * be necessary. Since we haven't got the lock yet, someone else might be
3416 * in the middle of changing this, so we'll need to recheck after we have
3417 * the lock.
3418 */
3419 if (PageIsAllVisible(page))
3420 visibilitymap_pin(relation, block, &vmbuffer);
3421
3423
3425
3426 /*
3427 * Usually, a buffer pin and/or snapshot blocks pruning of otid, ensuring
3428 * we see LP_NORMAL here. When the otid origin is a syscache, we may have
3429 * neither a pin nor a snapshot. Hence, we may see other LP_ states, each
3430 * of which indicates concurrent pruning.
3431 *
3432 * Failing with TM_Updated would be most accurate. However, unlike other
3433 * TM_Updated scenarios, we don't know the successor ctid in LP_UNUSED and
3434 * LP_DEAD cases. While the distinction between TM_Updated and TM_Deleted
3435 * does matter to SQL statements UPDATE and MERGE, those SQL statements
3436 * hold a snapshot that ensures LP_NORMAL. Hence, the choice between
3437 * TM_Updated and TM_Deleted affects only the wording of error messages.
3438 * Settle on TM_Deleted, for two reasons. First, it avoids complicating
3439 * the specification of when tmfd->ctid is valid. Second, it creates
3440 * error log evidence that we took this branch.
3441 *
3442 * Since it's possible to see LP_UNUSED at otid, it's also possible to see
3443 * LP_NORMAL for a tuple that replaced LP_UNUSED. If it's a tuple for an
3444 * unrelated row, we'll fail with "duplicate key value violates unique".
3445 * XXX if otid is the live, newer version of the newtup row, we'll discard
3446 * changes originating in versions of this catalog row after the version
3447 * the caller got from syscache. See syscache-update-pruned.spec.
3448 */
3449 if (!ItemIdIsNormal(lp))
3450 {
3452
3453 UnlockReleaseBuffer(buffer);
3455 if (vmbuffer != InvalidBuffer)
3456 ReleaseBuffer(vmbuffer);
3457 tmfd->ctid = *otid;
3458 tmfd->xmax = InvalidTransactionId;
3459 tmfd->cmax = InvalidCommandId;
3461
3466 /* modified_attrs not yet initialized */
3468 return TM_Deleted;
3469 }
3470
3471 /*
3472 * Fill in enough data in oldtup for HeapDetermineColumnsInfo to work
3473 * properly.
3474 */
3475 oldtup.t_tableOid = RelationGetRelid(relation);
3476 oldtup.t_data = (HeapTupleHeader) PageGetItem(page, lp);
3477 oldtup.t_len = ItemIdGetLength(lp);
3478 oldtup.t_self = *otid;
3479
3480 /* the new tuple is ready, except for this: */
3481 newtup->t_tableOid = RelationGetRelid(relation);
3482
3483 /*
3484 * Determine columns modified by the update. Additionally, identify
3485 * whether any of the unmodified replica identity key attributes in the
3486 * old tuple is externally stored or not. This is required because for
3487 * such attributes the flattened value won't be WAL logged as part of the
3488 * new tuple so we must include it as part of the old_key_tuple. See
3489 * ExtractReplicaIdentity.
3490 */
3492 id_attrs, &oldtup,
3494
3495 /*
3496 * If we're not updating any "key" column, we can grab a weaker lock type.
3497 * This allows for more concurrency when we are running simultaneously
3498 * with foreign key checks.
3499 *
3500 * Note that if a column gets detoasted while executing the update, but
3501 * the value ends up being the same, this test will fail and we will use
3502 * the stronger lock. This is acceptable; the important case to optimize
3503 * is updates that don't manipulate key columns, not those that
3504 * serendipitously arrive at the same key values.
3505 */
3507 {
3508 *lockmode = LockTupleNoKeyExclusive;
3510 key_intact = true;
3511
3512 /*
3513 * If this is the first possibly-multixact-able operation in the
3514 * current transaction, set my per-backend OldestMemberMXactId
3515 * setting. We can be certain that the transaction will never become a
3516 * member of any older MultiXactIds than that. (We have to do this
3517 * even if we end up just using our own TransactionId below, since
3518 * some other backend could incorporate our XID into a MultiXact
3519 * immediately afterwards.)
3520 */
3522 }
3523 else
3524 {
3525 *lockmode = LockTupleExclusive;
3527 key_intact = false;
3528 }
3529
3530 /*
3531 * Note: beyond this point, use oldtup not otid to refer to old tuple.
3532 * otid may very well point at newtup->t_self, which we will overwrite
3533 * with the new tuple's location, so there's great risk of confusion if we
3534 * use otid anymore.
3535 */
3536
3537l2:
3538 checked_lockers = false;
3539 locker_remains = false;
3540 result = HeapTupleSatisfiesUpdate(&oldtup, cid, buffer);
3541
3542 /* see below about the "no wait" case */
3543 Assert(result != TM_BeingModified || wait);
3544
3545 if (result == TM_Invisible)
3546 {
3547 UnlockReleaseBuffer(buffer);
3548 ereport(ERROR,
3550 errmsg("attempted to update invisible tuple")));
3551 }
3552 else if (result == TM_BeingModified && wait)
3553 {
3556 bool can_continue = false;
3557
3558 /*
3559 * XXX note that we don't consider the "no wait" case here. This
3560 * isn't a problem currently because no caller uses that case, but it
3561 * should be fixed if such a caller is introduced. It wasn't a
3562 * problem previously because this code would always wait, but now
3563 * that some tuple locks do not conflict with one of the lock modes we
3564 * use, it is possible that this case is interesting to handle
3565 * specially.
3566 *
3567 * This may cause failures with third-party code that calls
3568 * heap_update directly.
3569 */
3570
3571 /* must copy state data before unlocking buffer */
3573 infomask = oldtup.t_data->t_infomask;
3574
3575 /*
3576 * Now we have to do something about the existing locker. If it's a
3577 * multi, sleep on it; we might be awakened before it is completely
3578 * gone (or even not sleep at all in some cases); we need to preserve
3579 * it as locker, unless it is gone completely.
3580 *
3581 * If it's not a multi, we need to check for sleeping conditions
3582 * before actually going to sleep. If the update doesn't conflict
3583 * with the locks, we just continue without sleeping (but making sure
3584 * it is preserved).
3585 *
3586 * Before sleeping, we need to acquire tuple lock to establish our
3587 * priority for the tuple (see heap_lock_tuple). LockTuple will
3588 * release us when we are next-in-line for the tuple. Note we must
3589 * not acquire the tuple lock until we're sure we're going to sleep;
3590 * otherwise we're open for race conditions with other transactions
3591 * holding the tuple lock which sleep on us.
3592 *
3593 * If we are forced to "start over" below, we keep the tuple lock;
3594 * this arranges that we stay at the head of the line while rechecking
3595 * tuple state.
3596 */
3598 {
3600 int remain;
3601 bool current_is_member = false;
3602
3604 *lockmode, &current_is_member))
3605 {
3607
3608 /*
3609 * Acquire the lock, if necessary (but skip it when we're
3610 * requesting a lock and already have one; avoids deadlock).
3611 */
3612 if (!current_is_member)
3613 heap_acquire_tuplock(relation, &(oldtup.t_self), *lockmode,
3615
3616 /* wait for multixact */
3618 relation, &oldtup.t_self, XLTW_Update,
3619 &remain);
3620 checked_lockers = true;
3621 locker_remains = remain != 0;
3623
3624 /*
3625 * If xwait had just locked the tuple then some other xact
3626 * could update this tuple before we get to this point. Check
3627 * for xmax change, and start over if so.
3628 */
3629 if (xmax_infomask_changed(oldtup.t_data->t_infomask,
3630 infomask) ||
3632 xwait))
3633 goto l2;
3634 }
3635
3636 /*
3637 * Note that the multixact may not be done by now. It could have
3638 * surviving members; our own xact or other subxacts of this
3639 * backend, and also any other concurrent transaction that locked
3640 * the tuple with LockTupleKeyShare if we only got
3641 * LockTupleNoKeyExclusive. If this is the case, we have to be
3642 * careful to mark the updated tuple with the surviving members in
3643 * Xmax.
3644 *
3645 * Note that there could have been another update in the
3646 * MultiXact. In that case, we need to check whether it committed
3647 * or aborted. If it aborted we are safe to update it again;
3648 * otherwise there is an update conflict, and we have to return
3649 * TableTuple{Deleted, Updated} below.
3650 *
3651 * In the LockTupleExclusive case, we still need to preserve the
3652 * surviving members: those would include the tuple locks we had
3653 * before this one, which are important to keep in case this
3654 * subxact aborts.
3655 */
3656 if (!HEAP_XMAX_IS_LOCKED_ONLY(oldtup.t_data->t_infomask))
3658 else
3660
3661 /*
3662 * There was no UPDATE in the MultiXact; or it aborted. No
3663 * TransactionIdIsInProgress() call needed here, since we called
3664 * MultiXactIdWait() above.
3665 */
3668 can_continue = true;
3669 }
3671 {
3672 /*
3673 * The only locker is ourselves; we can avoid grabbing the tuple
3674 * lock here, but must preserve our locking information.
3675 */
3676 checked_lockers = true;
3677 locker_remains = true;
3678 can_continue = true;
3679 }
3681 {
3682 /*
3683 * If it's just a key-share locker, and we're not changing the key
3684 * columns, we don't need to wait for it to end; but we need to
3685 * preserve it as locker.
3686 */
3687 checked_lockers = true;
3688 locker_remains = true;
3689 can_continue = true;
3690 }
3691 else
3692 {
3693 /*
3694 * Wait for regular transaction to end; but first, acquire tuple
3695 * lock.
3696 */
3698 heap_acquire_tuplock(relation, &(oldtup.t_self), *lockmode,
3700 XactLockTableWait(xwait, relation, &oldtup.t_self,
3701 XLTW_Update);
3702 checked_lockers = true;
3704
3705 /*
3706 * xwait is done, but if xwait had just locked the tuple then some
3707 * other xact could update this tuple before we get to this point.
3708 * Check for xmax change, and start over if so.
3709 */
3710 if (xmax_infomask_changed(oldtup.t_data->t_infomask, infomask) ||
3713 goto l2;
3714
3715 /* Otherwise check if it committed or aborted */
3716 UpdateXmaxHintBits(oldtup.t_data, buffer, xwait);
3717 if (oldtup.t_data->t_infomask & HEAP_XMAX_INVALID)
3718 can_continue = true;
3719 }
3720
3721 if (can_continue)
3722 result = TM_Ok;
3723 else if (!ItemPointerEquals(&oldtup.t_self, &oldtup.t_data->t_ctid))
3724 result = TM_Updated;
3725 else
3726 result = TM_Deleted;
3727 }
3728
3729 /* Sanity check the result HeapTupleSatisfiesUpdate() and the logic above */
3730 if (result != TM_Ok)
3731 {
3732 Assert(result == TM_SelfModified ||
3733 result == TM_Updated ||
3734 result == TM_Deleted ||
3735 result == TM_BeingModified);
3736 Assert(!(oldtup.t_data->t_infomask & HEAP_XMAX_INVALID));
3737 Assert(result != TM_Updated ||
3738 !ItemPointerEquals(&oldtup.t_self, &oldtup.t_data->t_ctid));
3739 }
3740
3741 if (crosscheck != InvalidSnapshot && result == TM_Ok)
3742 {
3743 /* Perform additional check for transaction-snapshot mode RI updates */
3745 result = TM_Updated;
3746 }
3747
3748 if (result != TM_Ok)
3749 {
3750 tmfd->ctid = oldtup.t_data->t_ctid;
3751 tmfd->xmax = HeapTupleHeaderGetUpdateXid(oldtup.t_data);
3752 if (result == TM_SelfModified)
3753 tmfd->cmax = HeapTupleHeaderGetCmax(oldtup.t_data);
3754 else
3755 tmfd->cmax = InvalidCommandId;
3756 UnlockReleaseBuffer(buffer);
3757 if (have_tuple_lock)
3758 UnlockTupleTuplock(relation, &(oldtup.t_self), *lockmode);
3759 if (vmbuffer != InvalidBuffer)
3760 ReleaseBuffer(vmbuffer);
3762
3769 return result;
3770 }
3771
3772 /*
3773 * If we didn't pin the visibility map page and the page has become all
3774 * visible while we were busy locking the buffer, or during some
3775 * subsequent window during which we had it unlocked, we'll have to unlock
3776 * and re-lock, to avoid holding the buffer lock across an I/O. That's a
3777 * bit unfortunate, especially since we'll now have to recheck whether the
3778 * tuple has been locked or updated under us, but hopefully it won't
3779 * happen very often.
3780 */
3781 if (vmbuffer == InvalidBuffer && PageIsAllVisible(page))
3782 {
3784 visibilitymap_pin(relation, block, &vmbuffer);
3786 goto l2;
3787 }
3788
3789 /* Fill in transaction status data */
3790
3791 /*
3792 * If the tuple we're updating is locked, we need to preserve the locking
3793 * info in the old tuple's Xmax. Prepare a new Xmax value for this.
3794 */
3796 oldtup.t_data->t_infomask,
3797 oldtup.t_data->t_infomask2,
3798 xid, *lockmode, true,
3801
3802 /*
3803 * And also prepare an Xmax value for the new copy of the tuple. If there
3804 * was no xmax previously, or there was one but all lockers are now gone,
3805 * then use InvalidTransactionId; otherwise, get the xmax from the old
3806 * tuple. (In rare cases that might also be InvalidTransactionId and yet
3807 * not have the HEAP_XMAX_INVALID bit set; that's fine.)
3808 */
3809 if ((oldtup.t_data->t_infomask & HEAP_XMAX_INVALID) ||
3810 HEAP_LOCKED_UPGRADED(oldtup.t_data->t_infomask) ||
3813 else
3815
3817 {
3820 }
3821 else
3822 {
3823 /*
3824 * If we found a valid Xmax for the new tuple, then the infomask bits
3825 * to use on the new tuple depend on what was there on the old one.
3826 * Note that since we're doing an update, the only possibility is that
3827 * the lockers had FOR KEY SHARE lock.
3828 */
3829 if (oldtup.t_data->t_infomask & HEAP_XMAX_IS_MULTI)
3830 {
3833 }
3834 else
3835 {
3838 }
3839 }
3840
3841 /*
3842 * Prepare the new tuple with the appropriate initial values of Xmin and
3843 * Xmax, as well as initial infomask bits as computed above.
3844 */
3845 newtup->t_data->t_infomask &= ~(HEAP_XACT_MASK);
3846 newtup->t_data->t_infomask2 &= ~(HEAP2_XACT_MASK);
3847 HeapTupleHeaderSetXmin(newtup->t_data, xid);
3849 newtup->t_data->t_infomask |= HEAP_UPDATED | infomask_new_tuple;
3850 newtup->t_data->t_infomask2 |= infomask2_new_tuple;
3852
3853 /*
3854 * Replace cid with a combo CID if necessary. Note that we already put
3855 * the plain cid into the new tuple.
3856 */
3858
3859 /*
3860 * If the toaster needs to be activated, OR if the new tuple will not fit
3861 * on the same page as the old, then we need to release the content lock
3862 * (but not the pin!) on the old tuple's buffer while we are off doing
3863 * TOAST and/or table-file-extension work. We must mark the old tuple to
3864 * show that it's locked, else other processes may try to update it
3865 * themselves.
3866 *
3867 * We need to invoke the toaster if there are already any out-of-line
3868 * toasted values present, or if the new tuple is over-threshold.
3869 */
3870 if (relation->rd_rel->relkind != RELKIND_RELATION &&
3871 relation->rd_rel->relkind != RELKIND_MATVIEW)
3872 {
3873 /* toast table entries should never be recursively toasted */
3876 need_toast = false;
3877 }
3878 else
3881 newtup->t_len > TOAST_TUPLE_THRESHOLD);
3882
3884
3885 newtupsize = MAXALIGN(newtup->t_len);
3886
3888 {
3892 bool cleared_all_frozen = false;
3893
3894 /*
3895 * To prevent concurrent sessions from updating the tuple, we have to
3896 * temporarily mark it locked, while we release the page-level lock.
3897 *
3898 * To satisfy the rule that any xid potentially appearing in a buffer
3899 * written out to disk, we unfortunately have to WAL log this
3900 * temporary modification. We can reuse xl_heap_lock for this
3901 * purpose. If we crash/error before following through with the
3902 * actual update, xmax will be of an aborted transaction, allowing
3903 * other sessions to proceed.
3904 */
3905
3906 /*
3907 * Compute xmax / infomask appropriate for locking the tuple. This has
3908 * to be done separately from the combo that's going to be used for
3909 * updating, because the potentially created multixact would otherwise
3910 * be wrong.
3911 */
3913 oldtup.t_data->t_infomask,
3914 oldtup.t_data->t_infomask2,
3915 xid, *lockmode, false,
3918
3920
3922
3923 /* Clear obsolete visibility flags ... */
3924 oldtup.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
3925 oldtup.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
3927 /* ... and store info about transaction updating this tuple */
3930 oldtup.t_data->t_infomask |= infomask_lock_old_tuple;
3931 oldtup.t_data->t_infomask2 |= infomask2_lock_old_tuple;
3933
3934 /* temporarily make it look not-updated, but locked */
3935 oldtup.t_data->t_ctid = oldtup.t_self;
3936
3937 /*
3938 * Clear all-frozen bit on visibility map if needed. We could
3939 * immediately reset ALL_VISIBLE, but given that the WAL logging
3940 * overhead would be unchanged, that doesn't seem necessarily
3941 * worthwhile.
3942 */
3943 if (PageIsAllVisible(page) &&
3944 visibilitymap_clear(relation, block, vmbuffer,
3946 cleared_all_frozen = true;
3947
3948 MarkBufferDirty(buffer);
3949
3950 if (RelationNeedsWAL(relation))
3951 {
3954
3957
3958 xlrec.offnum = ItemPointerGetOffsetNumber(&oldtup.t_self);
3960 xlrec.infobits_set = compute_infobits(oldtup.t_data->t_infomask,
3961 oldtup.t_data->t_infomask2);
3962 xlrec.flags =
3966 PageSetLSN(page, recptr);
3967 }
3968
3970
3972
3973 /*
3974 * Let the toaster do its thing, if needed.
3975 *
3976 * Note: below this point, heaptup is the data we actually intend to
3977 * store into the relation; newtup is the caller's original untoasted
3978 * data.
3979 */
3980 if (need_toast)
3981 {
3982 /* Note we always use WAL and FSM during updates */
3984 newtupsize = MAXALIGN(heaptup->t_len);
3985 }
3986 else
3987 heaptup = newtup;
3988
3989 /*
3990 * Now, do we need a new page for the tuple, or not? This is a bit
3991 * tricky since someone else could have added tuples to the page while
3992 * we weren't looking. We have to recheck the available space after
3993 * reacquiring the buffer lock. But don't bother to do that if the
3994 * former amount of free space is still not enough; it's unlikely
3995 * there's more free now than before.
3996 *
3997 * What's more, if we need to get a new page, we will need to acquire
3998 * buffer locks on both old and new pages. To avoid deadlock against
3999 * some other backend trying to get the same two locks in the other
4000 * order, we must be consistent about the order we get the locks in.
4001 * We use the rule "lock the lower-numbered page of the relation
4002 * first". To implement this, we must do RelationGetBufferForTuple
4003 * while not holding the lock on the old page, and we must rely on it
4004 * to get the locks on both pages in the correct order.
4005 *
4006 * Another consideration is that we need visibility map page pin(s) if
4007 * we will have to clear the all-visible flag on either page. If we
4008 * call RelationGetBufferForTuple, we rely on it to acquire any such
4009 * pins; but if we don't, we have to handle that here. Hence we need
4010 * a loop.
4011 */
4012 for (;;)
4013 {
4014 if (newtupsize > pagefree)
4015 {
4016 /* It doesn't fit, must use RelationGetBufferForTuple. */
4017 newbuf = RelationGetBufferForTuple(relation, heaptup->t_len,
4018 buffer, 0, NULL,
4019 &vmbuffer_new, &vmbuffer,
4020 0);
4021 /* We're all done. */
4022 break;
4023 }
4024 /* Acquire VM page pin if needed and we don't have it. */
4025 if (vmbuffer == InvalidBuffer && PageIsAllVisible(page))
4026 visibilitymap_pin(relation, block, &vmbuffer);
4027 /* Re-acquire the lock on the old tuple's page. */
4029 /* Re-check using the up-to-date free space */
4031 if (newtupsize > pagefree ||
4032 (vmbuffer == InvalidBuffer && PageIsAllVisible(page)))
4033 {
4034 /*
4035 * Rats, it doesn't fit anymore, or somebody just now set the
4036 * all-visible flag. We must now unlock and loop to avoid
4037 * deadlock. Fortunately, this path should seldom be taken.
4038 */
4040 }
4041 else
4042 {
4043 /* We're all done. */
4044 newbuf = buffer;
4045 break;
4046 }
4047 }
4048 }
4049 else
4050 {
4051 /* No TOAST work needed, and it'll fit on same page */
4052 newbuf = buffer;
4053 heaptup = newtup;
4054 }
4055
4056 /*
4057 * We're about to do the actual update -- check for conflict first, to
4058 * avoid possibly having to roll back work we've just done.
4059 *
4060 * This is safe without a recheck as long as there is no possibility of
4061 * another process scanning the pages between this check and the update
4062 * being visible to the scan (i.e., exclusive buffer content lock(s) are
4063 * continuously held from this point until the tuple update is visible).
4064 *
4065 * For the new tuple the only check needed is at the relation level, but
4066 * since both tuples are in the same relation and the check for oldtup
4067 * will include checking the relation level, there is no benefit to a
4068 * separate check for the new tuple.
4069 */
4070 CheckForSerializableConflictIn(relation, &oldtup.t_self,
4071 BufferGetBlockNumber(buffer));
4072
4073 /*
4074 * At this point newbuf and buffer are both pinned and locked, and newbuf
4075 * has enough space for the new tuple. If they are the same buffer, only
4076 * one pin is held.
4077 */
4078
4079 if (newbuf == buffer)
4080 {
4081 /*
4082 * Since the new tuple is going into the same page, we might be able
4083 * to do a HOT update. Check if any of the index columns have been
4084 * changed.
4085 */
4087 {
4088 use_hot_update = true;
4089
4090 /*
4091 * If none of the columns that are used in hot-blocking indexes
4092 * were updated, we can apply HOT, but we do still need to check
4093 * if we need to update the summarizing indexes, and update those
4094 * indexes if the columns were updated, or we may fail to detect
4095 * e.g. value bound changes in BRIN minmax indexes.
4096 */
4098 summarized_update = true;
4099 }
4100 }
4101 else
4102 {
4103 /* Set a hint that the old page could use prune/defrag */
4104 PageSetFull(page);
4105 }
4106
4107 /*
4108 * Compute replica identity tuple before entering the critical section so
4109 * we don't PANIC upon a memory allocation failure.
4110 * ExtractReplicaIdentity() will return NULL if nothing needs to be
4111 * logged. Pass old key required as true only if the replica identity key
4112 * columns are modified or it has external data.
4113 */
4118
4119 /* NO EREPORT(ERROR) from here till changes are logged */
4121
4122 /*
4123 * If this transaction commits, the old tuple will become DEAD sooner or
4124 * later. Set flag that this page is a candidate for pruning once our xid
4125 * falls below the OldestXmin horizon. If the transaction finally aborts,
4126 * the subsequent page pruning will be a no-op and the hint will be
4127 * cleared.
4128 *
4129 * XXX Should we set hint on newbuf as well? If the transaction aborts,
4130 * there would be a prunable tuple in the newbuf; but for now we choose
4131 * not to optimize for aborts. Note that heap_xlog_update must be kept in
4132 * sync if this decision changes.
4133 */
4134 PageSetPrunable(page, xid);
4135
4136 if (use_hot_update)
4137 {
4138 /* Mark the old tuple as HOT-updated */
4140 /* And mark the new tuple as heap-only */
4142 /* Mark the caller's copy too, in case different from heaptup */
4144 }
4145 else
4146 {
4147 /* Make sure tuples are correctly marked as not-HOT */
4151 }
4152
4153 RelationPutHeapTuple(relation, newbuf, heaptup, false); /* insert new tuple */
4154
4155
4156 /* Clear obsolete visibility flags, possibly set by ourselves above... */
4157 oldtup.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
4158 oldtup.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
4159 /* ... and store info about transaction updating this tuple */
4162 oldtup.t_data->t_infomask |= infomask_old_tuple;
4163 oldtup.t_data->t_infomask2 |= infomask2_old_tuple;
4165
4166 /* record address of new tuple in t_ctid of old one */
4167 oldtup.t_data->t_ctid = heaptup->t_self;
4168
4169 /* clear PD_ALL_VISIBLE flags, reset all visibilitymap bits */
4170 if (PageIsAllVisible(BufferGetPage(buffer)))
4171 {
4172 all_visible_cleared = true;
4174 visibilitymap_clear(relation, BufferGetBlockNumber(buffer),
4175 vmbuffer, VISIBILITYMAP_VALID_BITS);
4176 }
4177 if (newbuf != buffer && PageIsAllVisible(BufferGetPage(newbuf)))
4178 {
4183 }
4184
4185 if (newbuf != buffer)
4187 MarkBufferDirty(buffer);
4188
4189 /* XLOG stuff */
4190 if (RelationNeedsWAL(relation))
4191 {
4193
4194 /*
4195 * For logical decoding we need combo CIDs to properly decode the
4196 * catalog.
4197 */
4199 {
4200 log_heap_new_cid(relation, &oldtup);
4201 log_heap_new_cid(relation, heaptup);
4202 }
4203
4204 recptr = log_heap_update(relation, buffer,
4209 if (newbuf != buffer)
4210 {
4212 }
4214 }
4215
4217
4218 if (newbuf != buffer)
4221
4222 /*
4223 * Mark old tuple for invalidation from system caches at next command
4224 * boundary, and mark the new tuple for invalidation in case we abort. We
4225 * have to do this before releasing the buffer because oldtup is in the
4226 * buffer. (heaptup is all in local memory, but it's necessary to process
4227 * both tuple versions in one call to inval.c so we can avoid redundant
4228 * sinval messages.)
4229 */
4231
4232 /* Now we can release the buffer(s) */
4233 if (newbuf != buffer)
4235 ReleaseBuffer(buffer);
4238 if (BufferIsValid(vmbuffer))
4239 ReleaseBuffer(vmbuffer);
4240
4241 /*
4242 * Release the lmgr tuple lock, if we had it.
4243 */
4244 if (have_tuple_lock)
4245 UnlockTupleTuplock(relation, &(oldtup.t_self), *lockmode);
4246
4247 pgstat_count_heap_update(relation, use_hot_update, newbuf != buffer);
4248
4249 /*
4250 * If heaptup is a private copy, release it. Don't forget to copy t_self
4251 * back to the caller's image, too.
4252 */
4253 if (heaptup != newtup)
4254 {
4255 newtup->t_self = heaptup->t_self;
4257 }
4258
4259 /*
4260 * If it is a HOT update, the update may still need to update summarized
4261 * indexes, lest we fail to update those summaries and get incorrect
4262 * results (for example, minmax bounds of the block may change with this
4263 * update).
4264 */
4265 if (use_hot_update)
4266 {
4269 else
4271 }
4272 else
4274
4277
4284
4285 return TM_Ok;
4286}
4287
4288#ifdef USE_ASSERT_CHECKING
4289/*
4290 * Confirm adequate lock held during heap_update(), per rules from
4291 * README.tuplock section "Locking to write inplace-updated tables".
4292 */
4293static void
4295 const ItemPointerData *otid,
4297{
4298 /* LOCKTAG_TUPLE acceptable for any catalog */
4299 switch (RelationGetRelid(relation))
4300 {
4301 case RelationRelationId:
4302 case DatabaseRelationId:
4303 {
4305
4307 relation->rd_lockInfo.lockRelId.dbId,
4308 relation->rd_lockInfo.lockRelId.relId,
4312 return;
4313 }
4314 break;
4315 default:
4316 Assert(!IsInplaceUpdateRelation(relation));
4317 return;
4318 }
4319
4320 switch (RelationGetRelid(relation))
4321 {
4322 case RelationRelationId:
4323 {
4324 /* LOCKTAG_TUPLE or LOCKTAG_RELATION ok */
4326 Oid relid = classForm->oid;
4327 Oid dbid;
4328 LOCKTAG tag;
4329
4330 if (IsSharedRelation(relid))
4331 dbid = InvalidOid;
4332 else
4333 dbid = MyDatabaseId;
4334
4335 if (classForm->relkind == RELKIND_INDEX)
4336 {
4337 Relation irel = index_open(relid, AccessShareLock);
4338
4339 SET_LOCKTAG_RELATION(tag, dbid, irel->rd_index->indrelid);
4341 }
4342 else
4343 SET_LOCKTAG_RELATION(tag, dbid, relid);
4344
4345 if (!LockHeldByMe(&tag, ShareUpdateExclusiveLock, false) &&
4346 !LockHeldByMe(&tag, ShareRowExclusiveLock, true))
4347 elog(WARNING,
4348 "missing lock for relation \"%s\" (OID %u, relkind %c) @ TID (%u,%u)",
4349 NameStr(classForm->relname),
4350 relid,
4351 classForm->relkind,
4354 }
4355 break;
4356 case DatabaseRelationId:
4357 {
4358 /* LOCKTAG_TUPLE required */
4360
4361 elog(WARNING,
4362 "missing lock on database \"%s\" (OID %u) @ TID (%u,%u)",
4363 NameStr(dbForm->datname),
4364 dbForm->oid,
4367 }
4368 break;
4369 }
4370}
4371
4372/*
4373 * Confirm adequate relation lock held, per rules from README.tuplock section
4374 * "Locking to write inplace-updated tables".
4375 */
4376static void
4378{
4380 Oid relid = classForm->oid;
4381 Oid dbid;
4382 LOCKTAG tag;
4383
4384 if (IsSharedRelation(relid))
4385 dbid = InvalidOid;
4386 else
4387 dbid = MyDatabaseId;
4388
4389 if (classForm->relkind == RELKIND_INDEX)
4390 {
4391 Relation irel = index_open(relid, AccessShareLock);
4392
4393 SET_LOCKTAG_RELATION(tag, dbid, irel->rd_index->indrelid);
4395 }
4396 else
4397 SET_LOCKTAG_RELATION(tag, dbid, relid);
4398
4399 if (!LockHeldByMe(&tag, ShareUpdateExclusiveLock, true))
4400 elog(WARNING,
4401 "missing lock for relation \"%s\" (OID %u, relkind %c) @ TID (%u,%u)",
4402 NameStr(classForm->relname),
4403 relid,
4404 classForm->relkind,
4407}
4408#endif
4409
4410/*
4411 * Check if the specified attribute's values are the same. Subroutine for
4412 * HeapDetermineColumnsInfo.
4413 */
4414static bool
4415heap_attr_equals(TupleDesc tupdesc, int attrnum, Datum value1, Datum value2,
4416 bool isnull1, bool isnull2)
4417{
4418 /*
4419 * If one value is NULL and other is not, then they are certainly not
4420 * equal
4421 */
4422 if (isnull1 != isnull2)
4423 return false;
4424
4425 /*
4426 * If both are NULL, they can be considered equal.
4427 */
4428 if (isnull1)
4429 return true;
4430
4431 /*
4432 * We do simple binary comparison of the two datums. This may be overly
4433 * strict because there can be multiple binary representations for the
4434 * same logical value. But we should be OK as long as there are no false
4435 * positives. Using a type-specific equality operator is messy because
4436 * there could be multiple notions of equality in different operator
4437 * classes; furthermore, we cannot safely invoke user-defined functions
4438 * while holding exclusive buffer lock.
4439 */
4440 if (attrnum <= 0)
4441 {
4442 /* The only allowed system columns are OIDs, so do this */
4444 }
4445 else
4446 {
4448
4450 att = TupleDescCompactAttr(tupdesc, attrnum - 1);
4451 return datumIsEqual(value1, value2, att->attbyval, att->attlen);
4452 }
4453}
4454
4455/*
4456 * Check which columns are being updated.
4457 *
4458 * Given an updated tuple, determine (and return into the output bitmapset),
4459 * from those listed as interesting, the set of columns that changed.
4460 *
4461 * has_external indicates if any of the unmodified attributes (from those
4462 * listed as interesting) of the old tuple is a member of external_cols and is
4463 * stored externally.
4464 */
4465static Bitmapset *
4470 bool *has_external)
4471{
4472 int attidx;
4474 TupleDesc tupdesc = RelationGetDescr(relation);
4475
4476 attidx = -1;
4477 while ((attidx = bms_next_member(interesting_cols, attidx)) >= 0)
4478 {
4479 /* attidx is zero-based, attrnum is the normal attribute number */
4481 Datum value1,
4482 value2;
4483 bool isnull1,
4484 isnull2;
4485
4486 /*
4487 * If it's a whole-tuple reference, say "not equal". It's not really
4488 * worth supporting this case, since it could only succeed after a
4489 * no-op update, which is hardly a case worth optimizing for.
4490 */
4491 if (attrnum == 0)
4492 {
4493 modified = bms_add_member(modified, attidx);
4494 continue;
4495 }
4496
4497 /*
4498 * Likewise, automatically say "not equal" for any system attribute
4499 * other than tableOID; we cannot expect these to be consistent in a
4500 * HOT chain, or even to be set correctly yet in the new tuple.
4501 */
4502 if (attrnum < 0)
4503 {
4504 if (attrnum != TableOidAttributeNumber)
4505 {
4506 modified = bms_add_member(modified, attidx);
4507 continue;
4508 }
4509 }
4510
4511 /*
4512 * Extract the corresponding values. XXX this is pretty inefficient
4513 * if there are many indexed columns. Should we do a single
4514 * heap_deform_tuple call on each tuple, instead? But that doesn't
4515 * work for system columns ...
4516 */
4517 value1 = heap_getattr(oldtup, attrnum, tupdesc, &isnull1);
4518 value2 = heap_getattr(newtup, attrnum, tupdesc, &isnull2);
4519
4520 if (!heap_attr_equals(tupdesc, attrnum, value1,
4521 value2, isnull1, isnull2))
4522 {
4523 modified = bms_add_member(modified, attidx);
4524 continue;
4525 }
4526
4527 /*
4528 * No need to check attributes that can't be stored externally. Note
4529 * that system attributes can't be stored externally.
4530 */
4531 if (attrnum < 0 || isnull1 ||
4532 TupleDescCompactAttr(tupdesc, attrnum - 1)->attlen != -1)
4533 continue;
4534
4535 /*
4536 * Check if the old tuple's attribute is stored externally and is a
4537 * member of external_cols.
4538 */
4541 *has_external = true;
4542 }
4543
4544 return modified;
4545}
4546
4547/*
4548 * simple_heap_update - replace a tuple
4549 *
4550 * This routine may be used to update a tuple when concurrent updates of
4551 * the target tuple are not expected (for example, because we have a lock
4552 * on the relation associated with the tuple). Any failure is reported
4553 * via ereport().
4554 */
4555void
4558{
4559 TM_Result result;
4560 TM_FailureData tmfd;
4561 LockTupleMode lockmode;
4562
4563 result = heap_update(relation, otid, tup,
4565 true /* wait for commit */ ,
4566 &tmfd, &lockmode, update_indexes);
4567 switch (result)
4568 {
4569 case TM_SelfModified:
4570 /* Tuple was already updated in current command? */
4571 elog(ERROR, "tuple already updated by self");
4572 break;
4573
4574 case TM_Ok:
4575 /* done successfully */
4576 break;
4577
4578 case TM_Updated:
4579 elog(ERROR, "tuple concurrently updated");
4580 break;
4581
4582 case TM_Deleted:
4583 elog(ERROR, "tuple concurrently deleted");
4584 break;
4585
4586 default:
4587 elog(ERROR, "unrecognized heap_update status: %u", result);
4588 break;
4589 }
4590}
4591
4592
4593/*
4594 * Return the MultiXactStatus corresponding to the given tuple lock mode.
4595 */
4596static MultiXactStatus
4598{
4599 int retval;
4600
4601 if (is_update)
4602 retval = tupleLockExtraInfo[mode].updstatus;
4603 else
4604 retval = tupleLockExtraInfo[mode].lockstatus;
4605
4606 if (retval == -1)
4607 elog(ERROR, "invalid lock tuple mode %d/%s", mode,
4608 is_update ? "true" : "false");
4609
4610 return (MultiXactStatus) retval;
4611}
4612
4613/*
4614 * heap_lock_tuple - lock a tuple in shared or exclusive mode
4615 *
4616 * Note that this acquires a buffer pin, which the caller must release.
4617 *
4618 * Input parameters:
4619 * relation: relation containing tuple (caller must hold suitable lock)
4620 * cid: current command ID (used for visibility test, and stored into
4621 * tuple's cmax if lock is successful)
4622 * mode: indicates if shared or exclusive tuple lock is desired
4623 * wait_policy: what to do if tuple lock is not available
4624 * follow_updates: if true, follow the update chain to also lock descendant
4625 * tuples.
4626 *
4627 * Output parameters:
4628 * *tuple: all fields filled in
4629 * *buffer: set to buffer holding tuple (pinned but not locked at exit)
4630 * *tmfd: filled in failure cases (see below)
4631 *
4632 * Function results are the same as the ones for table_tuple_lock().
4633 *
4634 * In the failure cases other than TM_Invisible, the routine fills
4635 * *tmfd with the tuple's t_ctid, t_xmax (resolving a possible MultiXact,
4636 * if necessary), and t_cmax (the last only for TM_SelfModified,
4637 * since we cannot obtain cmax from a combo CID generated by another
4638 * transaction).
4639 * See comments for struct TM_FailureData for additional info.
4640 *
4641 * See README.tuplock for a thorough explanation of this mechanism.
4642 */
4644heap_lock_tuple(Relation relation, HeapTuple tuple,
4646 bool follow_updates,
4647 Buffer *buffer, TM_FailureData *tmfd)
4648{
4649 TM_Result result;
4650 ItemPointer tid = &(tuple->t_self);
4651 ItemId lp;
4652 Page page;
4653 Buffer vmbuffer = InvalidBuffer;
4654 BlockNumber block;
4655 TransactionId xid,
4656 xmax;
4660 bool first_time = true;
4661 bool skip_tuple_lock = false;
4662 bool have_tuple_lock = false;
4663 bool cleared_all_frozen = false;
4664
4665 *buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid));
4666 block = ItemPointerGetBlockNumber(tid);
4667
4668 /*
4669 * Before locking the buffer, pin the visibility map page if it appears to
4670 * be necessary. Since we haven't got the lock yet, someone else might be
4671 * in the middle of changing this, so we'll need to recheck after we have
4672 * the lock.
4673 */
4674 if (PageIsAllVisible(BufferGetPage(*buffer)))
4675 visibilitymap_pin(relation, block, &vmbuffer);
4676
4678
4679 page = BufferGetPage(*buffer);
4682
4683 tuple->t_data = (HeapTupleHeader) PageGetItem(page, lp);
4684 tuple->t_len = ItemIdGetLength(lp);
4685 tuple->t_tableOid = RelationGetRelid(relation);
4686
4687l3:
4688 result = HeapTupleSatisfiesUpdate(tuple, cid, *buffer);
4689
4690 if (result == TM_Invisible)
4691 {
4692 /*
4693 * This is possible, but only when locking a tuple for ON CONFLICT DO
4694 * SELECT/UPDATE. We return this value here rather than throwing an
4695 * error in order to give that case the opportunity to throw a more
4696 * specific error.
4697 */
4698 result = TM_Invisible;
4699 goto out_locked;
4700 }
4701 else if (result == TM_BeingModified ||
4702 result == TM_Updated ||
4703 result == TM_Deleted)
4704 {
4708 bool require_sleep;
4709 ItemPointerData t_ctid;
4710
4711 /* must copy state data before unlocking buffer */
4713 infomask = tuple->t_data->t_infomask;
4714 infomask2 = tuple->t_data->t_infomask2;
4715 ItemPointerCopy(&tuple->t_data->t_ctid, &t_ctid);
4716
4718
4719 /*
4720 * If any subtransaction of the current top transaction already holds
4721 * a lock as strong as or stronger than what we're requesting, we
4722 * effectively hold the desired lock already. We *must* succeed
4723 * without trying to take the tuple lock, else we will deadlock
4724 * against anyone wanting to acquire a stronger lock.
4725 *
4726 * Note we only do this the first time we loop on the HTSU result;
4727 * there is no point in testing in subsequent passes, because
4728 * evidently our own transaction cannot have acquired a new lock after
4729 * the first time we checked.
4730 */
4731 if (first_time)
4732 {
4733 first_time = false;
4734
4736 {
4737 int i;
4738 int nmembers;
4739 MultiXactMember *members;
4740
4741 /*
4742 * We don't need to allow old multixacts here; if that had
4743 * been the case, HeapTupleSatisfiesUpdate would have returned
4744 * MayBeUpdated and we wouldn't be here.
4745 */
4746 nmembers =
4747 GetMultiXactIdMembers(xwait, &members, false,
4749
4750 for (i = 0; i < nmembers; i++)
4751 {
4752 /* only consider members of our own transaction */
4753 if (!TransactionIdIsCurrentTransactionId(members[i].xid))
4754 continue;
4755
4756 if (TUPLOCK_from_mxstatus(members[i].status) >= mode)
4757 {
4758 pfree(members);
4759 result = TM_Ok;
4760 goto out_unlocked;
4761 }
4762 else
4763 {
4764 /*
4765 * Disable acquisition of the heavyweight tuple lock.
4766 * Otherwise, when promoting a weaker lock, we might
4767 * deadlock with another locker that has acquired the
4768 * heavyweight tuple lock and is waiting for our
4769 * transaction to finish.
4770 *
4771 * Note that in this case we still need to wait for
4772 * the multixact if required, to avoid acquiring
4773 * conflicting locks.
4774 */
4775 skip_tuple_lock = true;
4776 }
4777 }
4778
4779 if (members)
4780 pfree(members);
4781 }
4783 {
4784 switch (mode)
4785 {
4786 case LockTupleKeyShare:
4790 result = TM_Ok;
4791 goto out_unlocked;
4792 case LockTupleShare:
4795 {
4796 result = TM_Ok;
4797 goto out_unlocked;
4798 }
4799 break;
4802 {
4803 result = TM_Ok;
4804 goto out_unlocked;
4805 }
4806 break;
4807 case LockTupleExclusive:
4810 {
4811 result = TM_Ok;
4812 goto out_unlocked;
4813 }
4814 break;
4815 }
4816 }
4817 }
4818
4819 /*
4820 * Initially assume that we will have to wait for the locking
4821 * transaction(s) to finish. We check various cases below in which
4822 * this can be turned off.
4823 */
4824 require_sleep = true;
4825 if (mode == LockTupleKeyShare)
4826 {
4827 /*
4828 * If we're requesting KeyShare, and there's no update present, we
4829 * don't need to wait. Even if there is an update, we can still
4830 * continue if the key hasn't been modified.
4831 *
4832 * However, if there are updates, we need to walk the update chain
4833 * to mark future versions of the row as locked, too. That way,
4834 * if somebody deletes that future version, we're protected
4835 * against the key going away. This locking of future versions
4836 * could block momentarily, if a concurrent transaction is
4837 * deleting a key; or it could return a value to the effect that
4838 * the transaction deleting the key has already committed. So we
4839 * do this before re-locking the buffer; otherwise this would be
4840 * prone to deadlocks.
4841 *
4842 * Note that the TID we're locking was grabbed before we unlocked
4843 * the buffer. For it to change while we're not looking, the
4844 * other properties we're testing for below after re-locking the
4845 * buffer would also change, in which case we would restart this
4846 * loop above.
4847 */
4849 {
4850 bool updated;
4851
4853
4854 /*
4855 * If there are updates, follow the update chain; bail out if
4856 * that cannot be done.
4857 */
4858 if (follow_updates && updated &&
4859 !ItemPointerEquals(&tuple->t_self, &t_ctid))
4860 {
4861 TM_Result res;
4862
4863 res = heap_lock_updated_tuple(relation,
4864 infomask, xwait, &t_ctid,
4866 mode);
4867 if (res != TM_Ok)
4868 {
4869 result = res;
4870 /* recovery code expects to have buffer lock held */
4872 goto failed;
4873 }
4874 }
4875
4877
4878 /*
4879 * Make sure it's still an appropriate lock, else start over.
4880 * Also, if it wasn't updated before we released the lock, but
4881 * is updated now, we start over too; the reason is that we
4882 * now need to follow the update chain to lock the new
4883 * versions.
4884 */
4885 if (!HeapTupleHeaderIsOnlyLocked(tuple->t_data) &&
4886 ((tuple->t_data->t_infomask2 & HEAP_KEYS_UPDATED) ||
4887 !updated))
4888 goto l3;
4889
4890 /* Things look okay, so we can skip sleeping */
4891 require_sleep = false;
4892
4893 /*
4894 * Note we allow Xmax to change here; other updaters/lockers
4895 * could have modified it before we grabbed the buffer lock.
4896 * However, this is not a problem, because with the recheck we
4897 * just did we ensure that they still don't conflict with the
4898 * lock we want.
4899 */
4900 }
4901 }
4902 else if (mode == LockTupleShare)
4903 {
4904 /*
4905 * If we're requesting Share, we can similarly avoid sleeping if
4906 * there's no update and no exclusive lock present.
4907 */
4910 {
4912
4913 /*
4914 * Make sure it's still an appropriate lock, else start over.
4915 * See above about allowing xmax to change.
4916 */
4919 goto l3;
4920 require_sleep = false;
4921 }
4922 }
4923 else if (mode == LockTupleNoKeyExclusive)
4924 {
4925 /*
4926 * If we're requesting NoKeyExclusive, we might also be able to
4927 * avoid sleeping; just ensure that there no conflicting lock
4928 * already acquired.
4929 */
4931 {
4933 mode, NULL))
4934 {
4935 /*
4936 * No conflict, but if the xmax changed under us in the
4937 * meantime, start over.
4938 */
4942 xwait))
4943 goto l3;
4944
4945 /* otherwise, we're good */
4946 require_sleep = false;
4947 }
4948 }
4950 {
4952
4953 /* if the xmax changed in the meantime, start over */
4956 xwait))
4957 goto l3;
4958 /* otherwise, we're good */
4959 require_sleep = false;
4960 }
4961 }
4962
4963 /*
4964 * As a check independent from those above, we can also avoid sleeping
4965 * if the current transaction is the sole locker of the tuple. Note
4966 * that the strength of the lock already held is irrelevant; this is
4967 * not about recording the lock in Xmax (which will be done regardless
4968 * of this optimization, below). Also, note that the cases where we
4969 * hold a lock stronger than we are requesting are already handled
4970 * above by not doing anything.
4971 *
4972 * Note we only deal with the non-multixact case here; MultiXactIdWait
4973 * is well equipped to deal with this situation on its own.
4974 */
4977 {
4978 /* ... but if the xmax changed in the meantime, start over */
4982 xwait))
4983 goto l3;
4985 require_sleep = false;
4986 }
4987
4988 /*
4989 * Time to sleep on the other transaction/multixact, if necessary.
4990 *
4991 * If the other transaction is an update/delete that's already
4992 * committed, then sleeping cannot possibly do any good: if we're
4993 * required to sleep, get out to raise an error instead.
4994 *
4995 * By here, we either have already acquired the buffer exclusive lock,
4996 * or we must wait for the locking transaction or multixact; so below
4997 * we ensure that we grab buffer lock after the sleep.
4998 */
4999 if (require_sleep && (result == TM_Updated || result == TM_Deleted))
5000 {
5002 goto failed;
5003 }
5004 else if (require_sleep)
5005 {
5006 /*
5007 * Acquire tuple lock to establish our priority for the tuple, or
5008 * die trying. LockTuple will release us when we are next-in-line
5009 * for the tuple. We must do this even if we are share-locking,
5010 * but not if we already have a weaker lock on the tuple.
5011 *
5012 * If we are forced to "start over" below, we keep the tuple lock;
5013 * this arranges that we stay at the head of the line while
5014 * rechecking tuple state.
5015 */
5016 if (!skip_tuple_lock &&
5017 !heap_acquire_tuplock(relation, tid, mode, wait_policy,
5019 {
5020 /*
5021 * This can only happen if wait_policy is Skip and the lock
5022 * couldn't be obtained.
5023 */
5024 result = TM_WouldBlock;
5025 /* recovery code expects to have buffer lock held */
5027 goto failed;
5028 }
5029
5031 {
5033
5034 /* We only ever lock tuples, never update them */
5035 if (status >= MultiXactStatusNoKeyUpdate)
5036 elog(ERROR, "invalid lock mode in heap_lock_tuple");
5037
5038 /* wait for multixact to end, or die trying */
5039 switch (wait_policy)
5040 {
5041 case LockWaitBlock:
5043 relation, &tuple->t_self, XLTW_Lock, NULL);
5044 break;
5045 case LockWaitSkip:
5047 status, infomask, relation,
5048 NULL, false))
5049 {
5050 result = TM_WouldBlock;
5051 /* recovery code expects to have buffer lock held */
5053 goto failed;
5054 }
5055 break;
5056 case LockWaitError:
5058 status, infomask, relation,
5060 ereport(ERROR,
5062 errmsg("could not obtain lock on row in relation \"%s\"",
5063 RelationGetRelationName(relation))));
5064
5065 break;
5066 }
5067
5068 /*
5069 * Of course, the multixact might not be done here: if we're
5070 * requesting a light lock mode, other transactions with light
5071 * locks could still be alive, as well as locks owned by our
5072 * own xact or other subxacts of this backend. We need to
5073 * preserve the surviving MultiXact members. Note that it
5074 * isn't absolutely necessary in the latter case, but doing so
5075 * is simpler.
5076 */
5077 }
5078 else
5079 {
5080 /* wait for regular transaction to end, or die trying */
5081 switch (wait_policy)
5082 {
5083 case LockWaitBlock:
5084 XactLockTableWait(xwait, relation, &tuple->t_self,
5085 XLTW_Lock);
5086 break;
5087 case LockWaitSkip:
5089 {
5090 result = TM_WouldBlock;
5091 /* recovery code expects to have buffer lock held */
5093 goto failed;
5094 }
5095 break;
5096 case LockWaitError:
5098 ereport(ERROR,
5100 errmsg("could not obtain lock on row in relation \"%s\"",
5101 RelationGetRelationName(relation))));
5102 break;
5103 }
5104 }
5105
5106 /* if there are updates, follow the update chain */
5108 !ItemPointerEquals(&tuple->t_self, &t_ctid))
5109 {
5110 TM_Result res;
5111
5112 res = heap_lock_updated_tuple(relation,
5113 infomask, xwait, &t_ctid,
5115 mode);
5116 if (res != TM_Ok)
5117 {
5118 result = res;
5119 /* recovery code expects to have buffer lock held */
5121 goto failed;
5122 }
5123 }
5124
5126
5127 /*
5128 * xwait is done, but if xwait had just locked the tuple then some
5129 * other xact could update this tuple before we get to this point.
5130 * Check for xmax change, and start over if so.
5131 */
5134 xwait))
5135 goto l3;
5136
5138 {
5139 /*
5140 * Otherwise check if it committed or aborted. Note we cannot
5141 * be here if the tuple was only locked by somebody who didn't
5142 * conflict with us; that would have been handled above. So
5143 * that transaction must necessarily be gone by now. But
5144 * don't check for this in the multixact case, because some
5145 * locker transactions might still be running.
5146 */
5147 UpdateXmaxHintBits(tuple->t_data, *buffer, xwait);
5148 }
5149 }
5150
5151 /* By here, we're certain that we hold buffer exclusive lock again */
5152
5153 /*
5154 * We may lock if previous xmax aborted, or if it committed but only
5155 * locked the tuple without updating it; or if we didn't have to wait
5156 * at all for whatever reason.
5157 */
5158 if (!require_sleep ||
5159 (tuple->t_data->t_infomask & HEAP_XMAX_INVALID) ||
5162 result = TM_Ok;
5163 else if (!ItemPointerEquals(&tuple->t_self, &tuple->t_data->t_ctid))
5164 result = TM_Updated;
5165 else
5166 result = TM_Deleted;
5167 }
5168
5169failed:
5170 if (result != TM_Ok)
5171 {
5172 Assert(result == TM_SelfModified || result == TM_Updated ||
5173 result == TM_Deleted || result == TM_WouldBlock);
5174
5175 /*
5176 * When locking a tuple under LockWaitSkip semantics and we fail with
5177 * TM_WouldBlock above, it's possible for concurrent transactions to
5178 * release the lock and set HEAP_XMAX_INVALID in the meantime. So
5179 * this assert is slightly different from the equivalent one in
5180 * heap_delete and heap_update.
5181 */
5182 Assert((result == TM_WouldBlock) ||
5183 !(tuple->t_data->t_infomask & HEAP_XMAX_INVALID));
5184 Assert(result != TM_Updated ||
5185 !ItemPointerEquals(&tuple->t_self, &tuple->t_data->t_ctid));
5186 tmfd->ctid = tuple->t_data->t_ctid;
5187 tmfd->xmax = HeapTupleHeaderGetUpdateXid(tuple->t_data);
5188 if (result == TM_SelfModified)
5189 tmfd->cmax = HeapTupleHeaderGetCmax(tuple->t_data);
5190 else
5191 tmfd->cmax = InvalidCommandId;
5192 goto out_locked;
5193 }
5194
5195 /*
5196 * If we didn't pin the visibility map page and the page has become all
5197 * visible while we were busy locking the buffer, or during some
5198 * subsequent window during which we had it unlocked, we'll have to unlock
5199 * and re-lock, to avoid holding the buffer lock across I/O. That's a bit
5200 * unfortunate, especially since we'll now have to recheck whether the
5201 * tuple has been locked or updated under us, but hopefully it won't
5202 * happen very often.
5203 */
5204 if (vmbuffer == InvalidBuffer && PageIsAllVisible(page))
5205 {
5207 visibilitymap_pin(relation, block, &vmbuffer);
5209 goto l3;
5210 }
5211
5212 xmax = HeapTupleHeaderGetRawXmax(tuple->t_data);
5213 old_infomask = tuple->t_data->t_infomask;
5214
5215 /*
5216 * If this is the first possibly-multixact-able operation in the current
5217 * transaction, set my per-backend OldestMemberMXactId setting. We can be
5218 * certain that the transaction will never become a member of any older
5219 * MultiXactIds than that. (We have to do this even if we end up just
5220 * using our own TransactionId below, since some other backend could
5221 * incorporate our XID into a MultiXact immediately afterwards.)
5222 */
5224
5225 /*
5226 * Compute the new xmax and infomask to store into the tuple. Note we do
5227 * not modify the tuple just yet, because that would leave it in the wrong
5228 * state if multixact.c elogs.
5229 */
5231 GetCurrentTransactionId(), mode, false,
5232 &xid, &new_infomask, &new_infomask2);
5233
5235
5236 /*
5237 * Store transaction information of xact locking the tuple.
5238 *
5239 * Note: Cmax is meaningless in this context, so don't set it; this avoids
5240 * possibly generating a useless combo CID. Moreover, if we're locking a
5241 * previously updated tuple, it's important to preserve the Cmax.
5242 *
5243 * Also reset the HOT UPDATE bit, but only if there's no update; otherwise
5244 * we would break the HOT chain.
5245 */
5248 tuple->t_data->t_infomask |= new_infomask;
5249 tuple->t_data->t_infomask2 |= new_infomask2;
5252 HeapTupleHeaderSetXmax(tuple->t_data, xid);
5253
5254 /*
5255 * Make sure there is no forward chain link in t_ctid. Note that in the
5256 * cases where the tuple has been updated, we must not overwrite t_ctid,
5257 * because it was set by the updater. Moreover, if the tuple has been
5258 * updated, we need to follow the update chain to lock the new versions of
5259 * the tuple as well.
5260 */
5262 tuple->t_data->t_ctid = *tid;
5263
5264 /* Clear only the all-frozen bit on visibility map if needed */
5265 if (PageIsAllVisible(page) &&
5266 visibilitymap_clear(relation, block, vmbuffer,
5268 cleared_all_frozen = true;
5269
5270
5271 MarkBufferDirty(*buffer);
5272
5273 /*
5274 * XLOG stuff. You might think that we don't need an XLOG record because
5275 * there is no state change worth restoring after a crash. You would be
5276 * wrong however: we have just written either a TransactionId or a
5277 * MultiXactId that may never have been seen on disk before, and we need
5278 * to make sure that there are XLOG entries covering those ID numbers.
5279 * Else the same IDs might be re-used after a crash, which would be
5280 * disastrous if this page made it to disk before the crash. Essentially
5281 * we have to enforce the WAL log-before-data rule even in this case.
5282 * (Also, in a PITR log-shipping or 2PC environment, we have to have XLOG
5283 * entries for everything anyway.)
5284 */
5285 if (RelationNeedsWAL(relation))
5286 {
5289
5292
5293 xlrec.offnum = ItemPointerGetOffsetNumber(&tuple->t_self);
5294 xlrec.xmax = xid;
5295 xlrec.infobits_set = compute_infobits(new_infomask,
5296 tuple->t_data->t_infomask2);
5299
5300 /* we don't decode row locks atm, so no need to log the origin */
5301
5303
5304 PageSetLSN(page, recptr);
5305 }
5306
5308
5309 result = TM_Ok;
5310
5313
5315 if (BufferIsValid(vmbuffer))
5316 ReleaseBuffer(vmbuffer);
5317
5318 /*
5319 * Don't update the visibility map here. Locking a tuple doesn't change
5320 * visibility info.
5321 */
5322
5323 /*
5324 * Now that we have successfully marked the tuple as locked, we can
5325 * release the lmgr tuple lock, if we had it.
5326 */
5327 if (have_tuple_lock)
5328 UnlockTupleTuplock(relation, tid, mode);
5329
5330 return result;
5331}
5332
5333/*
5334 * Acquire heavyweight lock on the given tuple, in preparation for acquiring
5335 * its normal, Xmax-based tuple lock.
5336 *
5337 * have_tuple_lock is an input and output parameter: on input, it indicates
5338 * whether the lock has previously been acquired (and this function does
5339 * nothing in that case). If this function returns success, have_tuple_lock
5340 * has been flipped to true.
5341 *
5342 * Returns false if it was unable to obtain the lock; this can only happen if
5343 * wait_policy is Skip.
5344 */
5345static bool
5348{
5349 if (*have_tuple_lock)
5350 return true;
5351
5352 switch (wait_policy)
5353 {
5354 case LockWaitBlock:
5355 LockTupleTuplock(relation, tid, mode);
5356 break;
5357
5358 case LockWaitSkip:
5359 if (!ConditionalLockTupleTuplock(relation, tid, mode, false))
5360 return false;
5361 break;
5362
5363 case LockWaitError:
5365 ereport(ERROR,
5367 errmsg("could not obtain lock on row in relation \"%s\"",
5368 RelationGetRelationName(relation))));
5369 break;
5370 }
5371 *have_tuple_lock = true;
5372
5373 return true;
5374}
5375
5376/*
5377 * Given an original set of Xmax and infomask, and a transaction (identified by
5378 * add_to_xmax) acquiring a new lock of some mode, compute the new Xmax and
5379 * corresponding infomasks to use on the tuple.
5380 *
5381 * Note that this might have side effects such as creating a new MultiXactId.
5382 *
5383 * Most callers will have called HeapTupleSatisfiesUpdate before this function;
5384 * that will have set the HEAP_XMAX_INVALID bit if the xmax was a MultiXactId
5385 * but it was not running anymore. There is a race condition, which is that the
5386 * MultiXactId may have finished since then, but that uncommon case is handled
5387 * either here, or within MultiXactIdExpand.
5388 *
5389 * There is a similar race condition possible when the old xmax was a regular
5390 * TransactionId. We test TransactionIdIsInProgress again just to narrow the
5391 * window, but it's still possible to end up creating an unnecessary
5392 * MultiXactId. Fortunately this is harmless.
5393 */
5394static void
5400{
5401 TransactionId new_xmax;
5404
5406
5407l5:
5408 new_infomask = 0;
5409 new_infomask2 = 0;
5411 {
5412 /*
5413 * No previous locker; we just insert our own TransactionId.
5414 *
5415 * Note that it's critical that this case be the first one checked,
5416 * because there are several blocks below that come back to this one
5417 * to implement certain optimizations; old_infomask might contain
5418 * other dirty bits in those cases, but we don't really care.
5419 */
5420 if (is_update)
5421 {
5422 new_xmax = add_to_xmax;
5423 if (mode == LockTupleExclusive)
5425 }
5426 else
5427 {
5429 switch (mode)
5430 {
5431 case LockTupleKeyShare:
5432 new_xmax = add_to_xmax;
5434 break;
5435 case LockTupleShare:
5436 new_xmax = add_to_xmax;
5438 break;
5440 new_xmax = add_to_xmax;
5442 break;
5443 case LockTupleExclusive:
5444 new_xmax = add_to_xmax;
5447 break;
5448 default:
5449 new_xmax = InvalidTransactionId; /* silence compiler */
5450 elog(ERROR, "invalid lock mode");
5451 }
5452 }
5453 }
5455 {
5457
5458 /*
5459 * Currently we don't allow XMAX_COMMITTED to be set for multis, so
5460 * cross-check.
5461 */
5463
5464 /*
5465 * A multixact together with LOCK_ONLY set but neither lock bit set
5466 * (i.e. a pg_upgraded share locked tuple) cannot possibly be running
5467 * anymore. This check is critical for databases upgraded by
5468 * pg_upgrade; both MultiXactIdIsRunning and MultiXactIdExpand assume
5469 * that such multis are never passed.
5470 */
5472 {
5475 goto l5;
5476 }
5477
5478 /*
5479 * If the XMAX is already a MultiXactId, then we need to expand it to
5480 * include add_to_xmax; but if all the members were lockers and are
5481 * all gone, we can do away with the IS_MULTI bit and just set
5482 * add_to_xmax as the only locker/updater. If all lockers are gone
5483 * and we have an updater that aborted, we can also do without a
5484 * multi.
5485 *
5486 * The cost of doing GetMultiXactIdMembers would be paid by
5487 * MultiXactIdExpand if we weren't to do this, so this check is not
5488 * incurring extra work anyhow.
5489 */
5491 {
5494 old_infomask)))
5495 {
5496 /*
5497 * Reset these bits and restart; otherwise fall through to
5498 * create a new multi below.
5499 */
5502 goto l5;
5503 }
5504 }
5505
5507
5508 new_xmax = MultiXactIdExpand((MultiXactId) xmax, add_to_xmax,
5509 new_status);
5511 }
5513 {
5514 /*
5515 * It's a committed update, so we need to preserve him as updater of
5516 * the tuple.
5517 */
5518 MultiXactStatus status;
5520
5522 status = MultiXactStatusUpdate;
5523 else
5525
5527
5528 /*
5529 * since it's not running, it's obviously impossible for the old
5530 * updater to be identical to the current one, so we need not check
5531 * for that case as we do in the block above.
5532 */
5533 new_xmax = MultiXactIdCreate(xmax, status, add_to_xmax, new_status);
5535 }
5536 else if (TransactionIdIsInProgress(xmax))
5537 {
5538 /*
5539 * If the XMAX is a valid, in-progress TransactionId, then we need to
5540 * create a new MultiXactId that includes both the old locker or
5541 * updater and our own TransactionId.
5542 */
5546
5548 {
5554 {
5557 else
5559 }
5560 else
5561 {
5562 /*
5563 * LOCK_ONLY can be present alone only when a page has been
5564 * upgraded by pg_upgrade. But in that case,
5565 * TransactionIdIsInProgress() should have returned false. We
5566 * assume it's no longer locked in this case.
5567 */
5568 elog(WARNING, "LOCK_ONLY found for Xid in progress %u", xmax);
5571 goto l5;
5572 }
5573 }
5574 else
5575 {
5576 /* it's an update, but which kind? */
5579 else
5581 }
5582
5584
5585 /*
5586 * If the lock to be acquired is for the same TransactionId as the
5587 * existing lock, there's an optimization possible: consider only the
5588 * strongest of both locks as the only one present, and restart.
5589 */
5590 if (xmax == add_to_xmax)
5591 {
5592 /*
5593 * Note that it's not possible for the original tuple to be
5594 * updated: we wouldn't be here because the tuple would have been
5595 * invisible and we wouldn't try to update it. As a subtlety,
5596 * this code can also run when traversing an update chain to lock
5597 * future versions of a tuple. But we wouldn't be here either,
5598 * because the add_to_xmax would be different from the original
5599 * updater.
5600 */
5602
5603 /* acquire the strongest of both */
5604 if (mode < old_mode)
5605 mode = old_mode;
5606 /* mustn't touch is_update */
5607
5609 goto l5;
5610 }
5611
5612 /* otherwise, just fall back to creating a new multixact */
5614 new_xmax = MultiXactIdCreate(xmax, old_status,
5617 }
5620 {
5621 /*
5622 * It's a committed update, so we gotta preserve him as updater of the
5623 * tuple.
5624 */
5625 MultiXactStatus status;
5627
5629 status = MultiXactStatusUpdate;
5630 else
5632
5634
5635 /*
5636 * since it's not running, it's obviously impossible for the old
5637 * updater to be identical to the current one, so we need not check
5638 * for that case as we do in the block above.
5639 */
5640 new_xmax = MultiXactIdCreate(xmax, status, add_to_xmax, new_status);
5642 }
5643 else
5644 {
5645 /*
5646 * Can get here iff the locking/updating transaction was running when
5647 * the infomask was extracted from the tuple, but finished before
5648 * TransactionIdIsInProgress got to run. Deal with it as if there was
5649 * no locker at all in the first place.
5650 */
5652 goto l5;
5653 }
5654
5657 *result_xmax = new_xmax;
5658}
5659
5660/*
5661 * Subroutine for heap_lock_updated_tuple_rec.
5662 *
5663 * Given a hypothetical multixact status held by the transaction identified
5664 * with the given xid, does the current transaction need to wait, fail, or can
5665 * it continue if it wanted to acquire a lock of the given mode? "needwait"
5666 * is set to true if waiting is necessary; if it can continue, then TM_Ok is
5667 * returned. If the lock is already held by the current transaction, return
5668 * TM_SelfModified. In case of a conflict with another transaction, a
5669 * different HeapTupleSatisfiesUpdate return code is returned.
5670 *
5671 * The held status is said to be hypothetical because it might correspond to a
5672 * lock held by a single Xid, i.e. not a real MultiXactId; we express it this
5673 * way for simplicity of API.
5674 */
/*
 * NOTE(review): this listing is a lossy extraction -- the embedded line
 * numbers skip (e.g. 5675 -> 5678), so the full parameter list and some
 * statements (the TransactionIdIsCurrentTransactionId check guarding the
 * first branch, the lock-mode conflict tests) are missing here.  Verify any
 * change against the complete upstream heapam.c.
 */
5675static TM_Result
5678 bool *needwait)
5679{
5681
5682 *needwait = false;
5684
5685 /*
5686 * Note: we *must* check TransactionIdIsInProgress before
5687 * TransactionIdDidAbort/Commit; see comment at top of heapam_visibility.c
5688 * for an explanation.
5689 */
5691 {
5692 /*
5693 * The tuple has already been locked by our own transaction. This is
5694 * very rare but can happen if multiple transactions are trying to
5695 * lock an ancient version of the same tuple.
5696 */
5697 return TM_SelfModified;
5698 }
5699 else if (TransactionIdIsInProgress(xid))
5700 {
5701 /*
5702 * If the locking transaction is running, what we do depends on
5703 * whether the lock modes conflict: if they do, then we must wait for
5704 * it to finish; otherwise we can fall through to lock this tuple
5705 * version without waiting.
5706 */
5709 {
5710 *needwait = true;
5711 }
5712
5713 /*
5714 * If we set needwait above, then this value doesn't matter;
5715 * otherwise, this value signals to caller that it's okay to proceed.
5716 */
5717 return TM_Ok;
5718 }
5719 else if (TransactionIdDidAbort(xid))
5720 return TM_Ok;
5721 else if (TransactionIdDidCommit(xid))
5722 {
5723 /*
5724 * The other transaction committed. If it was only a locker, then the
5725 * lock is completely gone now and we can return success; but if it
5726 * was an update, then what we do depends on whether the two lock
5727 * modes conflict. If they conflict, then we must report error to
5728 * caller. But if they don't, we can fall through to allow the current
5729 * transaction to lock the tuple.
5730 *
5731 * Note: the reason we worry about ISUPDATE here is because as soon as
5732 * a transaction ends, all its locks are gone and meaningless, and
5733 * thus we can ignore them; whereas its updates persist. In the
5734 * TransactionIdIsInProgress case, above, we don't need to check
5735 * because we know the lock is still "alive" and thus a conflict needs
5736 * always be checked.
5737 */
5738 if (!ISUPDATE_from_mxstatus(status))
5739 return TM_Ok;
5740
5743 {
5744 /* bummer */
 /* ctid pointing elsewhere: the committed update produced a newer
  * version (TM_Updated); ctid pointing at itself: the row is gone
  * (TM_Deleted). */
5745 if (!ItemPointerEquals(&tup->t_self, &tup->t_data->t_ctid))
5746 return TM_Updated;
5747 else
5748 return TM_Deleted;
5749 }
5750
5751 return TM_Ok;
5752 }
5753
5754 /* Not in progress, not aborted, not committed -- must have crashed */
5755 return TM_Ok;
5756}
5757
5758
5759/*
5760 * Recursive part of heap_lock_updated_tuple
5761 *
5762 * Fetch the tuple pointed to by tid in rel, and mark it as locked by the given
5763 * xid with the given mode; if this tuple is updated, recurse to lock the new
5764 * version as well.
5765 */
/*
 * NOTE(review): lossy extraction -- the embedded line numbers skip
 * throughout, so locals (mytup, priorXmax, rawxmax, infomask variables,
 * pinned_desired_page), several condition expressions, buffer lock/unlock
 * calls, and parts of the WAL-logging section are missing from this text.
 * Despite the "recursive" naming, the visible structure is an iterative
 * for(;;) loop that follows t_ctid ("tail recursion" per the comment at the
 * bottom).  Verify against complete upstream heapam.c before editing.
 */
5766static TM_Result
5768 const ItemPointerData *tid, TransactionId xid,
5770{
5771 TM_Result result;
5774 Buffer buf;
5779 TransactionId xmax,
5780 new_xmax;
5781 bool cleared_all_frozen = false;
5783 Buffer vmbuffer = InvalidBuffer;
5784 BlockNumber block;
5785
5786 ItemPointerCopy(tid, &tupid);
5787
5788 for (;;)
5789 {
5790 new_infomask = 0;
5791 new_xmax = InvalidTransactionId;
5793 ItemPointerCopy(&tupid, &(mytup.t_self));
5794
5795 if (!heap_fetch(rel, SnapshotAny, &mytup, &buf, false))
5796 {
5797 /*
5798 * if we fail to find the updated version of the tuple, it's
5799 * because it was vacuumed/pruned away after its creator
5800 * transaction aborted. So behave as if we got to the end of the
5801 * chain, and there's no further tuple to lock: return success to
5802 * caller.
5803 */
5804 result = TM_Ok;
5805 goto out_unlocked;
5806 }
5807
5808l4:
5810
5811 /*
5812 * Before locking the buffer, pin the visibility map page if it
5813 * appears to be necessary. Since we haven't got the lock yet,
5814 * someone else might be in the middle of changing this, so we'll need
5815 * to recheck after we have the lock.
5816 */
5818 {
5819 visibilitymap_pin(rel, block, &vmbuffer);
5820 pinned_desired_page = true;
5821 }
5822 else
5823 pinned_desired_page = false;
5824
5826
5827 /*
5828 * If we didn't pin the visibility map page and the page has become
5829 * all visible while we were busy locking the buffer, we'll have to
5830 * unlock and re-lock, to avoid holding the buffer lock across I/O.
5831 * That's a bit unfortunate, but hopefully shouldn't happen often.
5832 *
5833 * Note: in some paths through this function, we will reach here
5834 * holding a pin on a vm page that may or may not be the one matching
5835 * this page. If this page isn't all-visible, we won't use the vm
5836 * page, but we hold onto such a pin till the end of the function.
5837 */
5839 {
5841 visibilitymap_pin(rel, block, &vmbuffer);
5843 }
5844
5845 /*
5846 * Check the tuple XMIN against prior XMAX, if any. If we reached the
5847 * end of the chain, we're done, so return success.
5848 */
5851 priorXmax))
5852 {
5853 result = TM_Ok;
5854 goto out_locked;
5855 }
5856
5857 /*
5858 * Also check Xmin: if this tuple was created by an aborted
5859 * (sub)transaction, then we already locked the last live one in the
5860 * chain, thus we're done, so return success.
5861 */
5863 {
5864 result = TM_Ok;
5865 goto out_locked;
5866 }
5867
5868 old_infomask = mytup.t_data->t_infomask;
5869 old_infomask2 = mytup.t_data->t_infomask2;
5870 xmax = HeapTupleHeaderGetRawXmax(mytup.t_data);
5871
5872 /*
5873 * If this tuple version has been updated or locked by some concurrent
5874 * transaction(s), what we do depends on whether our lock mode
5875 * conflicts with what those other transactions hold, and also on the
5876 * status of them.
5877 */
5879 {
5881 bool needwait;
5882
 /* Multixact xmax: test every member for conflict individually. */
5885 {
5886 int nmembers;
5887 int i;
5888 MultiXactMember *members;
5889
5890 /*
5891 * We don't need a test for pg_upgrade'd tuples: this is only
5892 * applied to tuples after the first in an update chain. Said
5893 * first tuple in the chain may well be locked-in-9.2-and-
5894 * pg_upgraded, but that one was already locked by our caller,
5895 * not us; and any subsequent ones cannot be because our
5896 * caller must necessarily have obtained a snapshot later than
5897 * the pg_upgrade itself.
5898 */
5899 Assert(!HEAP_LOCKED_UPGRADED(mytup.t_data->t_infomask));
5900
5901 nmembers = GetMultiXactIdMembers(rawxmax, &members, false,
5903 for (i = 0; i < nmembers; i++)
5904 {
5905 result = test_lockmode_for_conflict(members[i].status,
5906 members[i].xid,
5907 mode,
5908 &mytup,
5909 &needwait);
5910
5911 /*
5912 * If the tuple was already locked by ourselves in a
5913 * previous iteration of this (say heap_lock_tuple was
5914 * forced to restart the locking loop because of a change
5915 * in xmax), then we hold the lock already on this tuple
5916 * version and we don't need to do anything; and this is
5917 * not an error condition either. We just need to skip
5918 * this tuple and continue locking the next version in the
5919 * update chain.
5920 */
5921 if (result == TM_SelfModified)
5922 {
5923 pfree(members);
5924 goto next;
5925 }
5926
5927 if (needwait)
5928 {
 /* Must sleep on this member's xid; buffer lock is dropped
  * (in lines missing here), then retry from l4. */
5930 XactLockTableWait(members[i].xid, rel,
5931 &mytup.t_self,
5933 pfree(members);
5934 goto l4;
5935 }
5936 if (result != TM_Ok)
5937 {
5938 pfree(members);
5939 goto out_locked;
5940 }
5941 }
5942 if (members)
5943 pfree(members);
5944 }
5945 else
5946 {
5947 MultiXactStatus status;
5948
5949 /*
5950 * For a non-multi Xmax, we first need to compute the
5951 * corresponding MultiXactStatus by using the infomask bits.
5952 */
5954 {
5958 status = MultiXactStatusForShare;
5960 {
5962 status = MultiXactStatusForUpdate;
5963 else
5965 }
5966 else
5967 {
5968 /*
5969 * LOCK_ONLY present alone (a pg_upgraded tuple marked
5970 * as share-locked in the old cluster) shouldn't be
5971 * seen in the middle of an update chain.
5972 */
5973 elog(ERROR, "invalid lock status in tuple");
5974 }
5975 }
5976 else
5977 {
5978 /* it's an update, but which kind? */
5980 status = MultiXactStatusUpdate;
5981 else
5983 }
5984
5985 result = test_lockmode_for_conflict(status, rawxmax, mode,
5986 &mytup, &needwait);
5987
5988 /*
5989 * If the tuple was already locked by ourselves in a previous
5990 * iteration of this (say heap_lock_tuple was forced to
5991 * restart the locking loop because of a change in xmax), then
5992 * we hold the lock already on this tuple version and we don't
5993 * need to do anything; and this is not an error condition
5994 * either. We just need to skip this tuple and continue
5995 * locking the next version in the update chain.
5996 */
5997 if (result == TM_SelfModified)
5998 goto next;
5999
6000 if (needwait)
6001 {
6003 XactLockTableWait(rawxmax, rel, &mytup.t_self,
6005 goto l4;
6006 }
6007 if (result != TM_Ok)
6008 {
6009 goto out_locked;
6010 }
6011 }
6012 }
6013
6014 /* compute the new Xmax and infomask values for the tuple ... */
6015 compute_new_xmax_infomask(xmax, old_infomask, mytup.t_data->t_infomask2,
6016 xid, mode, false,
6017 &new_xmax, &new_infomask, &new_infomask2);
6018
6020 visibilitymap_clear(rel, block, vmbuffer,
6022 cleared_all_frozen = true;
6023
6025
6026 /* ... and set them */
6027 HeapTupleHeaderSetXmax(mytup.t_data, new_xmax);
6028 mytup.t_data->t_infomask &= ~HEAP_XMAX_BITS;
6029 mytup.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
6030 mytup.t_data->t_infomask |= new_infomask;
6031 mytup.t_data->t_infomask2 |= new_infomask2;
6032
6034
6035 /* XLOG stuff */
6036 if (RelationNeedsWAL(rel))
6037 {
6040 Page page = BufferGetPage(buf);
6041
6044
6045 xlrec.offnum = ItemPointerGetOffsetNumber(&mytup.t_self);
6046 xlrec.xmax = new_xmax;
6048 xlrec.flags =
6050
6052
6054
6055 PageSetLSN(page, recptr);
6056 }
6057
6059
6060next:
6061 /* if we find the end of update chain, we're done. */
6062 if (mytup.t_data->t_infomask & HEAP_XMAX_INVALID ||
6064 ItemPointerEquals(&mytup.t_self, &mytup.t_data->t_ctid) ||
6066 {
6067 result = TM_Ok;
6068 goto out_locked;
6069 }
6070
6071 /* tail recursion */
6073 ItemPointerCopy(&(mytup.t_data->t_ctid), &tupid);
6075 }
6076
6077 result = TM_Ok;
6078
6081
6083 if (vmbuffer != InvalidBuffer)
6084 ReleaseBuffer(vmbuffer);
6085
6086 return result;
6087}
6088
6089/*
6090 * heap_lock_updated_tuple
6091 * Follow update chain when locking an updated tuple, acquiring locks (row
6092 * marks) on the updated versions.
6093 *
6094 * 'prior_infomask', 'prior_raw_xmax' and 'prior_ctid' are the corresponding
6095 * fields from the initial tuple. We will lock the tuples starting from the
6096 * one that 'prior_ctid' points to. Note: This function does not lock the
6097 * initial tuple itself.
6098 *
6099 * This function doesn't check visibility, it just unconditionally marks the
6100 * tuple(s) as locked. If any tuple in the updated chain is being deleted
6101 * concurrently (or updated with the key being modified), sleep until the
6102 * transaction doing it is finished.
6103 *
6104 * Note that we don't acquire heavyweight tuple locks on the tuples we walk
6105 * when we have to wait for other transactions to release them, as opposed to
6106 * what heap_lock_tuple does. The reason is that having more than one
6107 * transaction walking the chain is probably uncommon enough that risk of
6108 * starvation is not likely: one of the preconditions for being here is that
6109 * the snapshot in use predates the update that created this tuple (because we
6110 * started at an earlier version of the tuple), but at the same time such a
6111 * transaction cannot be using repeatable read or serializable isolation
6112 * levels, because that would lead to a serializability failure.
6113 */
/*
 * NOTE(review): lossy extraction -- the parameter list (lines 6115-6119) and
 * the condition guarding the moved-into-another-partition check (line 6127),
 * plus the MultiXactIdSetOldestMember / heap_lock_updated_tuple_rec calls
 * inside the braces, are missing from this text.  Verify against complete
 * upstream heapam.c.
 */
6114static TM_Result
6120{
6121 INJECTION_POINT("heap_lock_updated_tuple", NULL);
6122
6123 /*
6124 * If the tuple has moved into another partition (effectively a delete)
6125 * stop here.
6126 */
6128 {
6130
6131 /*
6132 * If this is the first possibly-multixact-able operation in the
6133 * current transaction, set my per-backend OldestMemberMXactId
6134 * setting. We can be certain that the transaction will never become a
6135 * member of any older MultiXactIds than that. (We have to do this
6136 * even if we end up just using our own TransactionId below, since
6137 * some other backend could incorporate our XID into a MultiXact
6138 * immediately afterwards.)
6139 */
6141
6145 }
6146
6147 /* nothing to lock */
6148 return TM_Ok;
6149}
6150
6151/*
6152 * heap_finish_speculative - mark speculative insertion as successful
6153 *
6154 * To successfully finish a speculative insertion we have to clear speculative
6155 * token from tuple. To do so the t_ctid field, which will contain a
6156 * speculative token value, is modified in place to point to the tuple itself,
6157 * which is characteristic of a newly inserted ordinary tuple.
6158 *
6159 * NB: It is not ok to commit without either finishing or aborting a
6160 * speculative insertion. We could treat speculative tuples of committed
6161 * transactions implicitly as completed, but then we would have to be prepared
6162 * to deal with speculative tokens on committed tuples. That wouldn't be
6163 * difficult - no-one looks at the ctid field of a tuple with invalid xmax -
6164 * but clearing the token at completion isn't very expensive either.
6165 * An explicit confirmation WAL record also makes logical decoding simpler.
6166 */
/*
 * NOTE(review): lossy extraction -- the parameter list (6168), the buffer
 * LockBuffer call (6177), the offnum range check condition (6181), the
 * critical-section START/END calls, and the body of the WAL-record
 * construction (6205-6218) are missing from this text.  Verify against
 * complete upstream heapam.c.
 */
6167void
6169{
6170 Buffer buffer;
6171 Page page;
6172 OffsetNumber offnum;
6173 ItemId lp;
6174 HeapTupleHeader htup;
6175
6176 buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid));
6178 page = BufferGetPage(buffer);
6179
6180 offnum = ItemPointerGetOffsetNumber(tid);
6182 elog(ERROR, "offnum out of range");
6183 lp = PageGetItemId(page, offnum);
6184 if (!ItemIdIsNormal(lp))
6185 elog(ERROR, "invalid lp");
6186
6187 htup = (HeapTupleHeader) PageGetItem(page, lp);
6188
6189 /* NO EREPORT(ERROR) from here till changes are logged */
6191
6193
6194 MarkBufferDirty(buffer);
6195
6196 /*
6197 * Replace the speculative insertion token with a real t_ctid, pointing to
6198 * itself like it does on regular tuples.
6199 */
6200 htup->t_ctid = *tid;
6201
6202 /* XLOG stuff */
6203 if (RelationNeedsWAL(relation))
6204 {
6207
6209
6211
6212 /* We want the same filtering on this as on a plain insert */
6214
6217
6219
6220 PageSetLSN(page, recptr);
6221 }
6222
6224
6225 UnlockReleaseBuffer(buffer);
6226}
6227
6228/*
6229 * heap_abort_speculative - kill a speculatively inserted tuple
6230 *
6231 * Marks a tuple that was speculatively inserted in the same command as dead,
6232 * by setting its xmin as invalid. That makes it immediately appear as dead
6233 * to all transactions, including our own. In particular, it makes
6234 * HeapTupleSatisfiesDirty() regard the tuple as dead, so that another backend
6235 * inserting a duplicate key value won't unnecessarily wait for our whole
6236 * transaction to finish (it'll just wait for our speculative insertion to
6237 * finish).
6238 *
6239 * Killing the tuple prevents "unprincipled deadlocks", which are deadlocks
6240 * that arise due to a mutual dependency that is not user visible. By
6241 * definition, unprincipled deadlocks cannot be prevented by the user
6242 * reordering lock acquisition in client code, because the implementation level
6243 * lock acquisitions are not under the user's direct control. If speculative
6244 * inserters did not take this precaution, then under high concurrency they
6245 * could deadlock with each other, which would not be acceptable.
6246 *
6247 * This is somewhat redundant with heap_delete, but we prefer to have a
6248 * dedicated routine with stripped down requirements. Note that this is also
6249 * used to delete the TOAST tuples created during speculative insertion.
6250 *
6251 * This routine does not affect logical decoding as it only looks at
6252 * confirmation records.
6253 */
/*
 * NOTE(review): lossy extraction -- the parameter list (6255), the xid
 * declaration (6257), the buffer LockBuffer call (6270), the lp lookup
 * (6278-6279), the infomask/xmax stores (6327-6328), the xmin invalidation
 * call (6335), the critical-section markers, and parts of WAL-record
 * assembly are missing from this text.  Verify against complete upstream
 * heapam.c.
 */
6254void
6256{
6258 ItemId lp;
6259 HeapTupleData tp;
6260 Page page;
6261 BlockNumber block;
6262 Buffer buffer;
6263
6265
6266 block = ItemPointerGetBlockNumber(tid);
6267 buffer = ReadBuffer(relation, block);
6268 page = BufferGetPage(buffer);
6269
6271
6272 /*
6273 * Page can't be all visible, we just inserted into it, and are still
6274 * running.
6275 */
6276 Assert(!PageIsAllVisible(page));
6277
6280
6281 tp.t_tableOid = RelationGetRelid(relation);
6282 tp.t_data = (HeapTupleHeader) PageGetItem(page, lp);
6283 tp.t_len = ItemIdGetLength(lp);
6284 tp.t_self = *tid;
6285
6286 /*
6287 * Sanity check that the tuple really is a speculatively inserted tuple,
6288 * inserted by us.
6289 */
6290 if (tp.t_data->t_choice.t_heap.t_xmin != xid)
6291 elog(ERROR, "attempted to kill a tuple inserted by another transaction");
6292 if (!(IsToastRelation(relation) || HeapTupleHeaderIsSpeculative(tp.t_data)))
6293 elog(ERROR, "attempted to kill a non-speculative tuple");
6295
6296 /*
6297 * No need to check for serializable conflicts here. There is never a
6298 * need for a combo CID, either. No need to extract replica identity, or
6299 * do anything special with infomask bits.
6300 */
6301
6303
6304 /*
6305 * The tuple will become DEAD immediately. Flag that this page is a
6306 * candidate for pruning by setting xmin to TransactionXmin. While not
6307 * immediately prunable, it is the oldest xid we can cheaply determine
6308 * that's safe against wraparound / being older than the table's
6309 * relfrozenxid. To defend against the unlikely case of a new relation
6310 * having a newer relfrozenxid than our TransactionXmin, use relfrozenxid
6311 * if so (vacuum can't subsequently move relfrozenxid to beyond
6312 * TransactionXmin, so there's no race here).
6313 */
6315 {
6316 TransactionId relfrozenxid = relation->rd_rel->relfrozenxid;
6318
6319 if (TransactionIdPrecedes(TransactionXmin, relfrozenxid))
6320 prune_xid = relfrozenxid;
6321 else
6324 }
6325
6326 /* store transaction information of xact deleting the tuple */
6329
6330 /*
6331 * Set the tuple header xmin to InvalidTransactionId. This makes the
6332 * tuple immediately invisible everyone. (In particular, to any
6333 * transactions waiting on the speculative token, woken up later.)
6334 */
6336
6337 /* Clear the speculative insertion token too */
6338 tp.t_data->t_ctid = tp.t_self;
6339
6340 MarkBufferDirty(buffer);
6341
6342 /*
6343 * XLOG stuff
6344 *
6345 * The WAL records generated here match heap_delete(). The same recovery
6346 * routines are used.
6347 */
6348 if (RelationNeedsWAL(relation))
6349 {
6352
6354 xlrec.infobits_set = compute_infobits(tp.t_data->t_infomask,
6355 tp.t_data->t_infomask2);
6357 xlrec.xmax = xid;
6358
6362
6363 /* No replica identity & replication origin logged */
6364
6366
6367 PageSetLSN(page, recptr);
6368 }
6369
6371
6373
 /* Speculatively-inserted TOAST values must be cleaned up too; the
  * recursive heap_toast_delete cannot itself run on a TOAST relation. */
6374 if (HeapTupleHasExternal(&tp))
6375 {
6376 Assert(!IsToastRelation(relation));
6377 heap_toast_delete(relation, &tp, true);
6378 }
6379
6380 /*
6381 * Never need to mark tuple for invalidation, since catalogs don't support
6382 * speculative insertion
6383 */
6384
6385 /* Now we can release the buffer */
6386 ReleaseBuffer(buffer);
6387
6388 /* count deletion, as we counted the insertion too */
6389 pgstat_count_heap_delete(relation);
6390}
6391
6392/*
6393 * heap_inplace_lock - protect inplace update from concurrent heap_update()
6394 *
6395 * Evaluate whether the tuple's state is compatible with a no-key update.
6396 * Current transaction rowmarks are fine, as is KEY SHARE from any
6397 * transaction. If compatible, return true with the buffer exclusive-locked,
6398 * and the caller must release that by calling
6399 * heap_inplace_update_and_unlock(), calling heap_inplace_unlock(), or raising
6400 * an error. Otherwise, call release_callback(arg), wait for blocking
6401 * transactions to end, and return false.
6402 *
6403 * Since this is intended for system catalogs and SERIALIZABLE doesn't cover
6404 * DDL, this doesn't guarantee any particular predicate locking.
6405 *
6406 * heap_delete() is a rarer source of blocking transactions (xwait). We'll
6407 * wait for such a transaction just like for the normal heap_update() case.
6408 * Normal concurrent DROP commands won't cause that, because all inplace
6409 * updaters take some lock that conflicts with DROP. An explicit SQL "DELETE
6410 * FROM pg_class" can cause it. By waiting, if the concurrent transaction
6411 * executed both "DELETE FROM pg_class" and "INSERT INTO pg_class", our caller
6412 * can find the successor tuple.
6413 *
6414 * Readers of inplace-updated fields expect changes to those fields are
6415 * durable. For example, vac_truncate_clog() reads datfrozenxid from
6416 * pg_database tuples via catalog snapshots. A future snapshot must not
6417 * return a lower datfrozenxid for the same database OID (lower in the
6418 * FullTransactionIdPrecedes() sense). We achieve that since no update of a
6419 * tuple can start while we hold a lock on its buffer. In cases like
6420 * BEGIN;GRANT;CREATE INDEX;COMMIT we're inplace-updating a tuple visible only
6421 * to this transaction. ROLLBACK then is one case where it's okay to lose
6422 * inplace updates. (Restoring relhasindex=false on ROLLBACK is fine, since
6423 * any concurrent CREATE INDEX would have blocked, then inplace-updated the
6424 * committed tuple.)
6425 *
6426 * In principle, we could avoid waiting by overwriting every tuple in the
6427 * updated tuple chain. Reader expectations permit updating a tuple only if
6428 * it's aborted, is the tail of the chain, or we already updated the tuple
6429 * referenced in its t_ctid. Hence, we would need to overwrite the tuples in
6430 * order from tail to head. That would imply either (a) mutating all tuples
6431 * in one critical section or (b) accepting a chance of partial completion.
6432 * Partial completion of a relfrozenxid update would have the weird
6433 * consequence that the table's next VACUUM could see the table's relfrozenxid
6434 * move forward between vacuum_get_cutoffs() and finishing.
6435 */
/*
 * NOTE(review): lossy extraction -- the function name line (6437), the inval
 * registration call (6463), LockBuffer (6466), the HeapTupleSatisfiesUpdate
 * call (6477), xwait/infomask declarations and extraction (6501-6505), the
 * multixact vs. plain-xid branch conditions, the release_callback/LockBuffer
 * unlock sequences before waiting, and the InvalidateCatalogSnapshot call
 * (6560-6561) are missing from this text.  Verify against complete upstream
 * heapam.c.
 */
6436bool
6438 HeapTuple oldtup_ptr, Buffer buffer,
6439 void (*release_callback) (void *), void *arg)
6440{
6441 HeapTupleData oldtup = *oldtup_ptr; /* minimize diff vs. heap_update() */
6442 TM_Result result;
6443 bool ret;
6444
6445#ifdef USE_ASSERT_CHECKING
6446 if (RelationGetRelid(relation) == RelationRelationId)
6448#endif
6449
6450 Assert(BufferIsValid(buffer));
6451
6452 /*
6453 * Register shared cache invals if necessary. Other sessions may finish
6454 * inplace updates of this tuple between this step and LockTuple(). Since
6455 * inplace updates don't change cache keys, that's harmless.
6456 *
6457 * While it's tempting to register invals only after confirming we can
6458 * return true, the following obstacle precludes reordering steps that
6459 * way. Registering invals might reach a CatalogCacheInitializeCache()
6460 * that locks "buffer". That would hang indefinitely if running after our
6461 * own LockBuffer(). Hence, we must register invals before LockBuffer().
6462 */
6464
6465 LockTuple(relation, &oldtup.t_self, InplaceUpdateTupleLock);
6467
6468 /*----------
6469 * Interpret HeapTupleSatisfiesUpdate() like heap_update() does, except:
6470 *
6471 * - wait unconditionally
6472 * - already locked tuple above, since inplace needs that unconditionally
6473 * - don't recheck header after wait: simpler to defer to next iteration
6474 * - don't try to continue even if the updater aborts: likewise
6475 * - no crosscheck
6476 */
6478 buffer);
6479
6480 if (result == TM_Invisible)
6481 {
6482 /* no known way this can happen */
6483 ereport(ERROR,
6485 errmsg_internal("attempted to overwrite invisible tuple")));
6486 }
6487 else if (result == TM_SelfModified)
6488 {
6489 /*
6490 * CREATE INDEX might reach this if an expression is silly enough to
6491 * call e.g. SELECT ... FROM pg_class FOR SHARE. C code of other SQL
6492 * statements might get here after a heap_update() of the same row, in
6493 * the absence of an intervening CommandCounterIncrement().
6494 */
6495 ereport(ERROR,
6497 errmsg("tuple to be updated was already modified by an operation triggered by the current command")));
6498 }
6499 else if (result == TM_BeingModified)
6500 {
6503
6505 infomask = oldtup.t_data->t_infomask;
6506
6508 {
6511 int remain;
6512
6514 lockmode, NULL))
6515 {
 /* Conflicting multixact member(s): give up the buffer via
  * release_callback (missing lines) and wait them out. */
6518 ret = false;
6520 relation, &oldtup.t_self, XLTW_Update,
6521 &remain);
6522 }
6523 else
6524 ret = true;
6525 }
6527 ret = true;
6529 ret = true;
6530 else
6531 {
 /* Single conflicting xid: release and wait for it to end. */
6534 ret = false;
6535 XactLockTableWait(xwait, relation, &oldtup.t_self,
6536 XLTW_Update);
6537 }
6538 }
6539 else
6540 {
6541 ret = (result == TM_Ok);
6542 if (!ret)
6543 {
6546 }
6547 }
6548
6549 /*
6550 * GetCatalogSnapshot() relies on invalidation messages to know when to
6551 * take a new snapshot. COMMIT of xwait is responsible for sending the
6552 * invalidation. We're not acquiring heavyweight locks sufficient to
6553 * block if not yet sent, so we must take a new snapshot to ensure a later
6554 * attempt has a fair chance. While we don't need this if xwait aborted,
6555 * don't bother optimizing that.
6556 */
6557 if (!ret)
6558 {
6559 UnlockTuple(relation, &oldtup.t_self, InplaceUpdateTupleLock);
6562 }
6563 return ret;
6564}
6565
6566/*
6567 * heap_inplace_update_and_unlock - core of systable_inplace_update_finish
6568 *
6569 * The tuple cannot change size, and therefore its header fields and null
6570 * bitmap (if any) don't change either.
6571 *
6572 * Since we hold LOCKTAG_TUPLE, no updater has a local copy of this tuple.
6573 */
/*
 * NOTE(review): lossy extraction -- the function name line (6575), the
 * invalMessages declaration (6585), the inplaceGetInvalidationMessages call
 * (6598-6599), PreInplace_Inval (6610), the critical-section START (6637),
 * the copied_buffer declaration and temporary-copy memcpy (6644-6645,
 * 6669-6672), the xl_heap_inplace registration and XLogInsert calls, the
 * critical-section END (6688), AtInplace_Inval (6694), the buffer unlock
 * (6696), and the XLogStandbyInfoActive condition (6707) are missing from
 * this text.  Verify against complete upstream heapam.c.
 */
6574void
6576 HeapTuple oldtup, HeapTuple tuple,
6577 Buffer buffer)
6578{
6579 HeapTupleHeader htup = oldtup->t_data;
6580 uint32 oldlen;
6581 uint32 newlen;
6582 char *dst;
6583 char *src;
6584 int nmsgs = 0;
6586 bool RelcacheInitFileInval = false;
6587
6588 Assert(ItemPointerEquals(&oldtup->t_self, &tuple->t_self));
 /* In-place update requires identical layout: same data length and same
  * header offset, enforced here before any byte is copied. */
6589 oldlen = oldtup->t_len - htup->t_hoff;
6590 newlen = tuple->t_len - tuple->t_data->t_hoff;
6591 if (oldlen != newlen || htup->t_hoff != tuple->t_data->t_hoff)
6592 elog(ERROR, "wrong tuple length");
6593
6594 dst = (char *) htup + htup->t_hoff;
6595 src = (char *) tuple->t_data + tuple->t_data->t_hoff;
6596
6597 /* Like RecordTransactionCommit(), log only if needed */
6600 &RelcacheInitFileInval);
6601
6602 /*
6603 * Unlink relcache init files as needed. If unlinking, acquire
6604 * RelCacheInitLock until after associated invalidations. By doing this
6605 * in advance, if we checkpoint and then crash between inplace
6606 * XLogInsert() and inval, we don't rely on StartupXLOG() ->
6607 * RelationCacheInitFileRemove(). That uses elevel==LOG, so replay would
6608 * neglect to PANIC on EIO.
6609 */
6611
6612 /*----------
6613 * NO EREPORT(ERROR) from here till changes are complete
6614 *
6615 * Our exclusive buffer lock won't stop a reader having already pinned and
6616 * checked visibility for this tuple. With the usual order of changes
6617 * (i.e. updating the buffer contents before WAL logging), a reader could
6618 * observe our not-yet-persistent update to relfrozenxid and update
6619 * datfrozenxid based on that. A crash in that moment could allow
6620 * datfrozenxid to overtake relfrozenxid:
6621 *
6622 * ["D" is a VACUUM (ONLY_DATABASE_STATS)]
6623 * ["R" is a VACUUM tbl]
6624 * D: vac_update_datfrozenxid() -> systable_beginscan(pg_class)
6625 * D: systable_getnext() returns pg_class tuple of tbl
6626 * R: memcpy() into pg_class tuple of tbl
6627 * D: raise pg_database.datfrozenxid, XLogInsert(), finish
6628 * [crash]
6629 * [recovery restores datfrozenxid w/o relfrozenxid]
6630 *
6631 * We avoid that by using a temporary copy of the buffer to hide our
6632 * change from other backends until the change has been WAL-logged. We
6633 * apply our change to the temporary copy and WAL-log it, before modifying
6634 * the real page. That way any action a reader of the in-place-updated
6635 * value takes will be WAL logged after this change.
6636 */
6638
6639 MarkBufferDirty(buffer);
6640
6641 /* XLOG stuff */
6642 if (RelationNeedsWAL(relation))
6643 {
6646 char *origdata = (char *) BufferGetBlock(buffer);
6647 Page page = BufferGetPage(buffer);
6648 uint16 lower = ((PageHeader) page)->pd_lower;
6649 uint16 upper = ((PageHeader) page)->pd_upper;
6651 RelFileLocator rlocator;
6652 ForkNumber forkno;
6653 BlockNumber blkno;
6655
6656 xlrec.offnum = ItemPointerGetOffsetNumber(&tuple->t_self);
6657 xlrec.dbId = MyDatabaseId;
6659 xlrec.relcacheInitFileInval = RelcacheInitFileInval;
6660 xlrec.nmsgs = nmsgs;
6661
6664 if (nmsgs != 0)
6666 nmsgs * sizeof(SharedInvalidationMessage));
6667
6668 /* register block matching what buffer will look like after changes */
6673 BufferGetTag(buffer, &rlocator, &forkno, &blkno);
6674 Assert(forkno == MAIN_FORKNUM);
6675 XLogRegisterBlock(0, &rlocator, forkno, blkno, copied_buffer.data,
6677 XLogRegisterBufData(0, src, newlen);
6678
6679 /* inplace updates aren't decoded atm, don't log the origin */
6680
6682
6683 PageSetLSN(page, recptr);
6684 }
6685
 /* Only after WAL-logging do we mutate the shared buffer (see the
  * ordering rationale in the comment block above). */
6686 memcpy(dst, src, newlen);
6687
6689
6690 /*
6691 * Send invalidations to shared queue. SearchSysCacheLocked1() assumes we
6692 * do this before UnlockTuple().
6693 */
6695
6697 UnlockTuple(relation, &tuple->t_self, InplaceUpdateTupleLock);
6698
6699 AcceptInvalidationMessages(); /* local processing of just-sent inval */
6700
6701 /*
6702 * Queue a transactional inval, for logical decoding and for third-party
6703 * code that might have been relying on it since long before inplace
6704 * update adopted immediate invalidation. See README.tuplock section
6705 * "Reading inplace-updated columns" for logical decoding details.
6706 */
6708 CacheInvalidateHeapTuple(relation, tuple, NULL);
6709}
6710
6711/*
6712 * heap_inplace_unlock - reverse of heap_inplace_lock
6713 */
/*
 * NOTE(review): lossy extraction -- the function name line (6715), the
 * buffer-lock release (6718), and a trailing call (6720) are missing here;
 * only the heavyweight tuple-lock release is visible.  Verify against
 * complete upstream heapam.c.
 */
6714void
6716 HeapTuple oldtup, Buffer buffer)
6717{
6719 UnlockTuple(relation, &oldtup->t_self, InplaceUpdateTupleLock);
6721}
6722
/*
 * Output flag bits for FreezeMultiXactId(); each bit's meaning is documented
 * in that function's header comment below.
 */
6723#define FRM_NOOP 0x0001
6724#define FRM_INVALIDATE_XMAX 0x0002
6725#define FRM_RETURN_IS_XID 0x0004
6726#define FRM_RETURN_IS_MULTI 0x0008
6727#define FRM_MARK_COMMITTED 0x0010
6728
6729/*
6730 * FreezeMultiXactId
6731 * Determine what to do during freezing when a tuple is marked by a
6732 * MultiXactId.
6733 *
6734 * "flags" is an output value; it's used to tell caller what to do on return.
6735 * "pagefrz" is an input/output value, used to manage page level freezing.
6736 *
6737 * Possible values that we can set in "flags":
6738 * FRM_NOOP
6739 * don't do anything -- keep existing Xmax
6740 * FRM_INVALIDATE_XMAX
6741 * mark Xmax as InvalidTransactionId and set XMAX_INVALID flag.
6742 * FRM_RETURN_IS_XID
6743 * The Xid return value is a single update Xid to set as xmax.
6744 * FRM_MARK_COMMITTED
6745 * Xmax can be marked as HEAP_XMAX_COMMITTED
6746 * FRM_RETURN_IS_MULTI
6747 * The return value is a new MultiXactId to set as new Xmax.
6748 * (caller must obtain proper infomask bits using GetMultiXactIdHintBits)
6749 *
6750 * Caller delegates control of page freezing to us. In practice we always
6751 * force freezing of caller's page unless FRM_NOOP processing is indicated.
6752 * We help caller ensure that XIDs < FreezeLimit and MXIDs < MultiXactCutoff
6753 * can never be left behind. We freely choose when and how to process each
6754 * Multi, without ever violating the cutoff postconditions for freezing.
6755 *
6756 * It's useful to remove Multis on a proactive timeline (relative to freezing
6757 * XIDs) to keep MultiXact member SLRU buffer misses to a minimum. It can also
6758 * be cheaper in the short run, for us, since we too can avoid SLRU buffer
6759 * misses through eager processing.
6760 *
6761 * NB: Creates a _new_ MultiXactId when FRM_RETURN_IS_MULTI is set, though only
6762 * when FreezeLimit and/or MultiXactCutoff cutoffs leave us with no choice.
6763 * This can usually be put off, which is usually enough to avoid it altogether.
6764 * Allocating new multis during VACUUM should be avoided on general principle;
6765 * only VACUUM can advance relminmxid, so allocating new Multis here comes with
6766 * its own special risks.
6767 *
6768 * NB: Caller must maintain "no freeze" NewRelfrozenXid/NewRelminMxid trackers
6769 * using heap_tuple_should_freeze when we haven't forced page-level freezing.
6770 *
6771 * NB: Caller should avoid needlessly calling heap_tuple_should_freeze when we
6772 * have already forced page-level freezing, since that might incur the same
6773 * SLRU buffer misses that we specifically intended to avoid by freezing.
6774 */
6775static TransactionId
6776FreezeMultiXactId(MultiXactId multi, uint16 t_infomask,
6777 const struct VacuumCutoffs *cutoffs, uint16 *flags,
6778 HeapPageFreeze *pagefrz)
6779{
6781 MultiXactMember *members;
6782 int nmembers;
6783 bool need_replace;
6784 int nnewmembers;
6786 bool has_lockers;
6788 bool update_committed;
6789 TransactionId FreezePageRelfrozenXid;
6790
6791 *flags = 0;
6792
6793 /* We should only be called in Multis */
6794 Assert(t_infomask & HEAP_XMAX_IS_MULTI);
6795
6796 if (!MultiXactIdIsValid(multi) ||
6797 HEAP_LOCKED_UPGRADED(t_infomask))
6798 {
6799 *flags |= FRM_INVALIDATE_XMAX;
6800 pagefrz->freeze_required = true;
6801 return InvalidTransactionId;
6802 }
6803 else if (MultiXactIdPrecedes(multi, cutoffs->relminmxid))
6804 ereport(ERROR,
6806 errmsg_internal("found multixact %u from before relminmxid %u",
6807 multi, cutoffs->relminmxid)));
6808 else if (MultiXactIdPrecedes(multi, cutoffs->OldestMxact))
6809 {
6811
6812 /*
6813 * This old multi cannot possibly have members still running, but
6814 * verify just in case. If it was a locker only, it can be removed
6815 * without any further consideration; but if it contained an update,
6816 * we might need to preserve it.
6817 */
6818 if (MultiXactIdIsRunning(multi,
6819 HEAP_XMAX_IS_LOCKED_ONLY(t_infomask)))
6820 ereport(ERROR,
6822 errmsg_internal("multixact %u from before multi freeze cutoff %u found to be still running",
6823 multi, cutoffs->OldestMxact)));
6824
6825 if (HEAP_XMAX_IS_LOCKED_ONLY(t_infomask))
6826 {
6827 *flags |= FRM_INVALIDATE_XMAX;
6828 pagefrz->freeze_required = true;
6829 return InvalidTransactionId;
6830 }
6831
6832 /* replace multi with single XID for its updater? */
6833 update_xact = MultiXactIdGetUpdateXid(multi, t_infomask);
6835 ereport(ERROR,
6837 errmsg_internal("multixact %u contains update XID %u from before relfrozenxid %u",
6838 multi, update_xact,
6839 cutoffs->relfrozenxid)));
6840 else if (TransactionIdPrecedes(update_xact, cutoffs->OldestXmin))
6841 {
6842 /*
6843 * Updater XID has to have aborted (otherwise the tuple would have
6844 * been pruned away instead, since updater XID is < OldestXmin).
6845 * Just remove xmax.
6846 */
6848 ereport(ERROR,
6850 errmsg_internal("multixact %u contains committed update XID %u from before removable cutoff %u",
6851 multi, update_xact,
6852 cutoffs->OldestXmin)));
6853 *flags |= FRM_INVALIDATE_XMAX;
6854 pagefrz->freeze_required = true;
6855 return InvalidTransactionId;
6856 }
6857
6858 /* Have to keep updater XID as new xmax */
6859 *flags |= FRM_RETURN_IS_XID;
6860 pagefrz->freeze_required = true;
6861 return update_xact;
6862 }
6863
6864 /*
6865 * Some member(s) of this Multi may be below FreezeLimit xid cutoff, so we
6866 * need to walk the whole members array to figure out what to do, if
6867 * anything.
6868 */
6869 nmembers =
6870 GetMultiXactIdMembers(multi, &members, false,
6871 HEAP_XMAX_IS_LOCKED_ONLY(t_infomask));
6872 if (nmembers <= 0)
6873 {
6874 /* Nothing worth keeping */
6875 *flags |= FRM_INVALIDATE_XMAX;
6876 pagefrz->freeze_required = true;
6877 return InvalidTransactionId;
6878 }
6879
6880 /*
6881 * The FRM_NOOP case is the only case where we might need to ratchet back
6882 * FreezePageRelfrozenXid or FreezePageRelminMxid. It is also the only
6883 * case where our caller might ratchet back its NoFreezePageRelfrozenXid
6884 * or NoFreezePageRelminMxid "no freeze" trackers to deal with a multi.
6885 * FRM_NOOP handling should result in the NewRelfrozenXid/NewRelminMxid
6886 * trackers managed by VACUUM being ratcheting back by xmax to the degree
6887 * required to make it safe to leave xmax undisturbed, independent of
6888 * whether or not page freezing is triggered somewhere else.
6889 *
6890 * Our policy is to force freezing in every case other than FRM_NOOP,
6891 * which obviates the need to maintain either set of trackers, anywhere.
6892 * Every other case will reliably execute a freeze plan for xmax that
6893 * either replaces xmax with an XID/MXID >= OldestXmin/OldestMxact, or
6894 * sets xmax to an InvalidTransactionId XID, rendering xmax fully frozen.
6895 * (VACUUM's NewRelfrozenXid/NewRelminMxid trackers are initialized with
6896 * OldestXmin/OldestMxact, so later values never need to be tracked here.)
6897 */
6898 need_replace = false;
6899 FreezePageRelfrozenXid = pagefrz->FreezePageRelfrozenXid;
6900 for (int i = 0; i < nmembers; i++)
6901 {
6902 TransactionId xid = members[i].xid;
6903
6904 Assert(!TransactionIdPrecedes(xid, cutoffs->relfrozenxid));
6905
6906 if (TransactionIdPrecedes(xid, cutoffs->FreezeLimit))
6907 {
6908 /* Can't violate the FreezeLimit postcondition */
6909 need_replace = true;
6910 break;
6911 }
6912 if (TransactionIdPrecedes(xid, FreezePageRelfrozenXid))
6913 FreezePageRelfrozenXid = xid;
6914 }
6915
6916 /* Can't violate the MultiXactCutoff postcondition, either */
6917 if (!need_replace)
6919
6920 if (!need_replace)
6921 {
6922 /*
6923 * vacuumlazy.c might ratchet back NewRelminMxid, NewRelfrozenXid, or
6924 * both together to make it safe to retain this particular multi after
6925 * freezing its page
6926 */
6927 *flags |= FRM_NOOP;
6928 pagefrz->FreezePageRelfrozenXid = FreezePageRelfrozenXid;
6929 if (MultiXactIdPrecedes(multi, pagefrz->FreezePageRelminMxid))
6930 pagefrz->FreezePageRelminMxid = multi;
6931 pfree(members);
6932 return multi;
6933 }
6934
6935 /*
6936 * Do a more thorough second pass over the multi to figure out which
6937 * member XIDs actually need to be kept. Checking the precise status of
6938 * individual members might even show that we don't need to keep anything.
6939 * That is quite possible even though the Multi must be >= OldestMxact,
6940 * since our second pass only keeps member XIDs when it's truly necessary;
6941 * even member XIDs >= OldestXmin often won't be kept by second pass.
6942 */
6943 nnewmembers = 0;
6945 has_lockers = false;
6947 update_committed = false;
6948
6949 /*
6950 * Determine whether to keep each member xid, or to ignore it instead
6951 */
6952 for (int i = 0; i < nmembers; i++)
6953 {
6954 TransactionId xid = members[i].xid;
6955 MultiXactStatus mstatus = members[i].status;
6956
6957 Assert(!TransactionIdPrecedes(xid, cutoffs->relfrozenxid));
6958
6959 if (!ISUPDATE_from_mxstatus(mstatus))
6960 {
6961 /*
6962 * Locker XID (not updater XID). We only keep lockers that are
6963 * still running.
6964 */
6967 {
6968 if (TransactionIdPrecedes(xid, cutoffs->OldestXmin))
6969 ereport(ERROR,
6971 errmsg_internal("multixact %u contains running locker XID %u from before removable cutoff %u",
6972 multi, xid,
6973 cutoffs->OldestXmin)));
6974 newmembers[nnewmembers++] = members[i];
6975 has_lockers = true;
6976 }
6977
6978 continue;
6979 }
6980
6981 /*
6982 * Updater XID (not locker XID). Should we keep it?
6983 *
6984 * Since the tuple wasn't totally removed when vacuum pruned, the
6985 * update Xid cannot possibly be older than OldestXmin cutoff unless
6986 * the updater XID aborted. If the updater transaction is known
6987 * aborted or crashed then it's okay to ignore it, otherwise not.
6988 *
6989 * In any case the Multi should never contain two updaters, whatever
6990 * their individual commit status. Check for that first, in passing.
6991 */
6993 ereport(ERROR,
6995 errmsg_internal("multixact %u has two or more updating members",
6996 multi),
6997 errdetail_internal("First updater XID=%u second updater XID=%u.",
6998 update_xid, xid)));
6999
7000 /*
7001 * As with all tuple visibility routines, it's critical to test
7002 * TransactionIdIsInProgress before TransactionIdDidCommit, because of
7003 * race conditions explained in detail in heapam_visibility.c.
7004 */
7007 update_xid = xid;
7008 else if (TransactionIdDidCommit(xid))
7009 {
7010 /*
7011 * The transaction committed, so we can tell caller to set
7012 * HEAP_XMAX_COMMITTED. (We can only do this because we know the
7013 * transaction is not running.)
7014 */
7015 update_committed = true;
7016 update_xid = xid;
7017 }
7018 else
7019 {
7020 /*
7021 * Not in progress, not committed -- must be aborted or crashed;
7022 * we can ignore it.
7023 */
7024 continue;
7025 }
7026
7027 /*
7028 * We determined that updater must be kept -- add it to pending new
7029 * members list
7030 */
7031 if (TransactionIdPrecedes(xid, cutoffs->OldestXmin))
7032 ereport(ERROR,
7034 errmsg_internal("multixact %u contains committed update XID %u from before removable cutoff %u",
7035 multi, xid, cutoffs->OldestXmin)));
7036 newmembers[nnewmembers++] = members[i];
7037 }
7038
7039 pfree(members);
7040
7041 /*
7042 * Determine what to do with caller's multi based on information gathered
7043 * during our second pass
7044 */
7045 if (nnewmembers == 0)
7046 {
7047 /* Nothing worth keeping */
7048 *flags |= FRM_INVALIDATE_XMAX;
7050 }
7052 {
7053 /*
7054 * If there's a single member and it's an update, pass it back alone
7055 * without creating a new Multi. (XXX we could do this when there's a
7056 * single remaining locker, too, but that would complicate the API too
7057 * much; moreover, the case with the single updater is more
7058 * interesting, because those are longer-lived.)
7059 */
7060 Assert(nnewmembers == 1);
7061 *flags |= FRM_RETURN_IS_XID;
7062 if (update_committed)
7063 *flags |= FRM_MARK_COMMITTED;
7065 }
7066 else
7067 {
7068 /*
7069 * Create a new multixact with the surviving members of the previous
7070 * one, to set as new Xmax in the tuple
7071 */
7073 *flags |= FRM_RETURN_IS_MULTI;
7074 }
7075
7077
7078 pagefrz->freeze_required = true;
7079 return newxmax;
7080}
7081
7082/*
7083 * heap_prepare_freeze_tuple
7084 *
7085 * Check to see whether any of the XID fields of a tuple (xmin, xmax, xvac)
7086 * are older than the OldestXmin and/or OldestMxact freeze cutoffs. If so,
7087 * setup enough state (in the *frz output argument) to enable caller to
7088 * process this tuple as part of freezing its page, and return true. Return
7089 * false if nothing can be changed about the tuple right now.
7090 *
7091 * FreezePageConflictXid is advanced only for xmin/xvac freezing, not for xmax
7092 * changes. We only remove xmax state here when it is lock-only, or when the
7093 * updater XID (including an updater member of a MultiXact) must be aborted;
7094 * otherwise, the tuple would already be removable. Neither case affects
7095 * visibility on a standby.
7096 *
7097 * Also sets *totally_frozen to true if the tuple will be totally frozen once
7098 * caller executes returned freeze plan (or if the tuple was already totally
7099 * frozen by an earlier VACUUM). This indicates that there are no remaining
7100 * XIDs or MultiXactIds that will need to be processed by a future VACUUM.
7101 *
7102 * VACUUM caller must assemble HeapTupleFreeze freeze plan entries for every
7103 * tuple that we returned true for, and then execute freezing. Caller must
7104 * initialize pagefrz fields for page as a whole before first call here for
7105 * each heap page.
7106 *
7107 * VACUUM caller decides on whether or not to freeze the page as a whole.
7108 * We'll often prepare freeze plans for a page that caller just discards.
7109 * However, VACUUM doesn't always get to make a choice; it must freeze when
7110 * pagefrz.freeze_required is set, to ensure that any XIDs < FreezeLimit (and
7111 * MXIDs < MultiXactCutoff) can never be left behind. We help to make sure
7112 * that VACUUM always follows that rule.
7113 *
7114 * We sometimes force freezing of xmax MultiXactId values long before it is
7115 * strictly necessary to do so just to ensure the FreezeLimit postcondition.
7116 * It's worth processing MultiXactIds proactively when it is cheap to do so,
7117 * and it's convenient to make that happen by piggy-backing it on the "force
7118 * freezing" mechanism. Conversely, we sometimes delay freezing MultiXactIds
7119 * because it is expensive right now (though only when it's still possible to
7120 * do so without violating the FreezeLimit/MultiXactCutoff postcondition).
7121 *
7122 * It is assumed that the caller has checked the tuple with
7123 * HeapTupleSatisfiesVacuum() and determined that it is not HEAPTUPLE_DEAD
7124 * (else we should be removing the tuple, not freezing it).
7125 *
7126 * NB: This function has side effects: it might allocate a new MultiXactId.
7127 * It will be set as tuple's new xmax when our *frz output is processed within
7128 * heap_execute_freeze_tuple later on. If the tuple is in a shared buffer
7129 * then caller had better have an exclusive lock on it already.
7130 */
7131bool
7133 const struct VacuumCutoffs *cutoffs,
7134 HeapPageFreeze *pagefrz,
7136{
7137 bool xmin_already_frozen = false,
7138 xmax_already_frozen = false;
7139 bool freeze_xmin = false,
7140 replace_xvac = false,
7141 replace_xmax = false,
7142 freeze_xmax = false;
7143 TransactionId xid;
7144
7145 frz->xmax = HeapTupleHeaderGetRawXmax(tuple);
7146 frz->t_infomask2 = tuple->t_infomask2;
7147 frz->t_infomask = tuple->t_infomask;
7148 frz->frzflags = 0;
7149 frz->checkflags = 0;
7150
7151 /*
7152 * Process xmin, while keeping track of whether it's already frozen, or
7153 * will become frozen iff our freeze plan is executed by caller (could be
7154 * neither).
7155 */
7156 xid = HeapTupleHeaderGetXmin(tuple);
7157 if (!TransactionIdIsNormal(xid))
7158 xmin_already_frozen = true;
7159 else
7160 {
7161 if (TransactionIdPrecedes(xid, cutoffs->relfrozenxid))
7162 ereport(ERROR,
7164 errmsg_internal("found xmin %u from before relfrozenxid %u",
7165 xid, cutoffs->relfrozenxid)));
7166
7167 /* Will set freeze_xmin flags in freeze plan below */
7169
7170 /* Verify that xmin committed if and when freeze plan is executed */
7171 if (freeze_xmin)
7172 {
7175 pagefrz->FreezePageConflictXid = xid;
7176 }
7177 }
7178
7179 /*
7180 * Old-style VACUUM FULL is gone, but we have to process xvac for as long
7181 * as we support having MOVED_OFF/MOVED_IN tuples in the database
7182 */
7183 xid = HeapTupleHeaderGetXvac(tuple);
7184 if (TransactionIdIsNormal(xid))
7185 {
7187 Assert(TransactionIdPrecedes(xid, cutoffs->OldestXmin));
7188
7189 /*
7190 * For Xvac, we always freeze proactively. This allows totally_frozen
7191 * tracking to ignore xvac.
7192 */
7193 replace_xvac = pagefrz->freeze_required = true;
7194
7196 pagefrz->FreezePageConflictXid = xid;
7197
7198 /* Will set replace_xvac flags in freeze plan below */
7199 }
7200
7201 /* Now process xmax */
7202 xid = frz->xmax;
7203 if (tuple->t_infomask & HEAP_XMAX_IS_MULTI)
7204 {
7205 /* Raw xmax is a MultiXactId */
7207 uint16 flags;
7208
7209 /*
7210 * We will either remove xmax completely (in the "freeze_xmax" path),
7211 * process xmax by replacing it (in the "replace_xmax" path), or
7212 * perform no-op xmax processing. The only constraint is that the
7213 * FreezeLimit/MultiXactCutoff postcondition must never be violated.
7214 */
7215 newxmax = FreezeMultiXactId(xid, tuple->t_infomask, cutoffs,
7216 &flags, pagefrz);
7217
7218 if (flags & FRM_NOOP)
7219 {
7220 /*
7221 * xmax is a MultiXactId, and nothing about it changes for now.
7222 * This is the only case where 'freeze_required' won't have been
7223 * set for us by FreezeMultiXactId, as well as the only case where
7224 * neither freeze_xmax nor replace_xmax are set (given a multi).
7225 *
7226 * This is a no-op, but the call to FreezeMultiXactId might have
7227 * ratcheted back NewRelfrozenXid and/or NewRelminMxid trackers
7228 * for us (the "freeze page" variants, specifically). That'll
7229 * make it safe for our caller to freeze the page later on, while
7230 * leaving this particular xmax undisturbed.
7231 *
7232 * FreezeMultiXactId is _not_ responsible for the "no freeze"
7233 * NewRelfrozenXid/NewRelminMxid trackers, though -- that's our
7234 * job. A call to heap_tuple_should_freeze for this same tuple
7235 * will take place below if 'freeze_required' isn't set already.
7236 * (This repeats work from FreezeMultiXactId, but allows "no
7237 * freeze" tracker maintenance to happen in only one place.)
7238 */
7241 }
7242 else if (flags & FRM_RETURN_IS_XID)
7243 {
7244 /*
7245 * xmax will become an updater Xid (original MultiXact's updater
7246 * member Xid will be carried forward as a simple Xid in Xmax).
7247 */
7249
7250 /*
7251 * NB -- some of these transformations are only valid because we
7252 * know the return Xid is a tuple updater (i.e. not merely a
7253 * locker.) Also note that the only reason we don't explicitly
7254 * worry about HEAP_KEYS_UPDATED is because it lives in
7255 * t_infomask2 rather than t_infomask.
7256 */
7257 frz->t_infomask &= ~HEAP_XMAX_BITS;
7258 frz->xmax = newxmax;
7259 if (flags & FRM_MARK_COMMITTED)
7260 frz->t_infomask |= HEAP_XMAX_COMMITTED;
7261 replace_xmax = true;
7262 }
7263 else if (flags & FRM_RETURN_IS_MULTI)
7264 {
7267
7268 /*
7269 * xmax is an old MultiXactId that we have to replace with a new
7270 * MultiXactId, to carry forward two or more original member XIDs.
7271 */
7273
7274 /*
7275 * We can't use GetMultiXactIdHintBits directly on the new multi
7276 * here; that routine initializes the masks to all zeroes, which
7277 * would lose other bits we need. Doing it this way ensures all
7278 * unrelated bits remain untouched.
7279 */
7280 frz->t_infomask &= ~HEAP_XMAX_BITS;
7281 frz->t_infomask2 &= ~HEAP_KEYS_UPDATED;
7283 frz->t_infomask |= newbits;
7284 frz->t_infomask2 |= newbits2;
7285 frz->xmax = newxmax;
7286 replace_xmax = true;
7287 }
7288 else
7289 {
7290 /*
7291 * Freeze plan for tuple "freezes xmax" in the strictest sense:
7292 * it'll leave nothing in xmax (neither an Xid nor a MultiXactId).
7293 */
7294 Assert(flags & FRM_INVALIDATE_XMAX);
7296
7297 /* Will set freeze_xmax flags in freeze plan below */
7298 freeze_xmax = true;
7299 }
7300
7301 /* MultiXactId processing forces freezing (barring FRM_NOOP case) */
7302 Assert(pagefrz->freeze_required || (!freeze_xmax && !replace_xmax));
7303 }
7304 else if (TransactionIdIsNormal(xid))
7305 {
7306 /* Raw xmax is normal XID */
7307 if (TransactionIdPrecedes(xid, cutoffs->relfrozenxid))
7308 ereport(ERROR,
7310 errmsg_internal("found xmax %u from before relfrozenxid %u",
7311 xid, cutoffs->relfrozenxid)));
7312
7313 /* Will set freeze_xmax flags in freeze plan below */
7315
7316 /*
7317 * Verify that xmax aborted if and when freeze plan is executed,
7318 * provided it's from an update. (A lock-only xmax can be removed
7319 * independent of this, since the lock is released at xact end.)
7320 */
7322 frz->checkflags |= HEAP_FREEZE_CHECK_XMAX_ABORTED;
7323 }
7324 else if (!TransactionIdIsValid(xid))
7325 {
7326 /* Raw xmax is InvalidTransactionId XID */
7327 Assert((tuple->t_infomask & HEAP_XMAX_IS_MULTI) == 0);
7328 xmax_already_frozen = true;
7329 }
7330 else
7331 ereport(ERROR,
7333 errmsg_internal("found raw xmax %u (infomask 0x%04x) not invalid and not multi",
7334 xid, tuple->t_infomask)));
7335
7336 if (freeze_xmin)
7337 {
7339
7340 frz->t_infomask |= HEAP_XMIN_FROZEN;
7341 }
7342 if (replace_xvac)
7343 {
7344 /*
7345 * If a MOVED_OFF tuple is not dead, the xvac transaction must have
7346 * failed; whereas a non-dead MOVED_IN tuple must mean the xvac
7347 * transaction succeeded.
7348 */
7349 Assert(pagefrz->freeze_required);
7350 if (tuple->t_infomask & HEAP_MOVED_OFF)
7351 frz->frzflags |= XLH_INVALID_XVAC;
7352 else
7353 frz->frzflags |= XLH_FREEZE_XVAC;
7354 }
7355 if (replace_xmax)
7356 {
7358 Assert(pagefrz->freeze_required);
7359
7360 /* Already set replace_xmax flags in freeze plan earlier */
7361 }
7362 if (freeze_xmax)
7363 {
7365
7366 frz->xmax = InvalidTransactionId;
7367
7368 /*
7369 * The tuple might be marked either XMAX_INVALID or XMAX_COMMITTED +
7370 * LOCKED. Normalize to INVALID just to be sure no one gets confused.
7371 * Also get rid of the HEAP_KEYS_UPDATED bit.
7372 */
7373 frz->t_infomask &= ~HEAP_XMAX_BITS;
7374 frz->t_infomask |= HEAP_XMAX_INVALID;
7375 frz->t_infomask2 &= ~HEAP_HOT_UPDATED;
7376 frz->t_infomask2 &= ~HEAP_KEYS_UPDATED;
7377 }
7378
7379 /*
7380 * Determine if this tuple is already totally frozen, or will become
7381 * totally frozen (provided caller executes freeze plans for the page)
7382 */
7385
7386 if (!pagefrz->freeze_required && !(xmin_already_frozen &&
7388 {
7389 /*
7390 * So far no previous tuple from the page made freezing mandatory.
7391 * Does this tuple force caller to freeze the entire page?
7392 */
7393 pagefrz->freeze_required =
7394 heap_tuple_should_freeze(tuple, cutoffs,
7395 &pagefrz->NoFreezePageRelfrozenXid,
7396 &pagefrz->NoFreezePageRelminMxid);
7397 }
7398
7399 /* Tell caller if this tuple has a usable freeze plan set in *frz */
7401}
7402
7403/*
7404 * Perform xmin/xmax XID status sanity checks before actually executing freeze
7405 * plans.
7406 *
7407 * heap_prepare_freeze_tuple doesn't perform these checks directly because
7408 * pg_xact lookups are relatively expensive. They shouldn't be repeated by
7409 * successive VACUUMs that each decide against freezing the same page.
7410 */
7411void
7413 HeapTupleFreeze *tuples, int ntuples)
7414{
7415 Page page = BufferGetPage(buffer);
7416
7417 for (int i = 0; i < ntuples; i++)
7418 {
7419 HeapTupleFreeze *frz = tuples + i;
7420 ItemId itemid = PageGetItemId(page, frz->offset);
7421 HeapTupleHeader htup;
7422
7423 htup = (HeapTupleHeader) PageGetItem(page, itemid);
7424
7425 /* Deliberately avoid relying on tuple hint bits here */
7426 if (frz->checkflags & HEAP_FREEZE_CHECK_XMIN_COMMITTED)
7427 {
7429
7431 if (unlikely(!TransactionIdDidCommit(xmin)))
7432 ereport(ERROR,
7434 errmsg_internal("uncommitted xmin %u needs to be frozen",
7435 xmin)));
7436 }
7437
7438 /*
7439 * TransactionIdDidAbort won't work reliably in the presence of XIDs
7440 * left behind by transactions that were in progress during a crash,
7441 * so we can only check that xmax didn't commit
7442 */
7443 if (frz->checkflags & HEAP_FREEZE_CHECK_XMAX_ABORTED)
7444 {
7446
7449 ereport(ERROR,
7451 errmsg_internal("cannot freeze committed xmax %u",
7452 xmax)));
7453 }
7454 }
7455}
7456
7457/*
7458 * Helper which executes freezing of one or more heap tuples on a page on
7459 * behalf of caller. Caller passes an array of tuple plans from
7460 * heap_prepare_freeze_tuple. Caller must set 'offset' in each plan for us.
7461 * Must be called in a critical section that also marks the buffer dirty and,
7462 * if needed, emits WAL.
7463 */
7464void
7465heap_freeze_prepared_tuples(Buffer buffer, HeapTupleFreeze *tuples, int ntuples)
7466{
7467 Page page = BufferGetPage(buffer);
7468
7469 for (int i = 0; i < ntuples; i++)
7470 {
7471 HeapTupleFreeze *frz = tuples + i;
7472 ItemId itemid = PageGetItemId(page, frz->offset);
7473 HeapTupleHeader htup;
7474
7475 htup = (HeapTupleHeader) PageGetItem(page, itemid);
7477 }
7478}
7479
7480/*
7481 * heap_freeze_tuple
7482 * Freeze tuple in place, without WAL logging.
7483 *
7484 * Useful for callers like CLUSTER that perform their own WAL logging.
7485 */
7486bool
7488 TransactionId relfrozenxid, TransactionId relminmxid,
7489 TransactionId FreezeLimit, TransactionId MultiXactCutoff)
7490{
7492 bool do_freeze;
7493 bool totally_frozen;
7494 struct VacuumCutoffs cutoffs;
7495 HeapPageFreeze pagefrz;
7496
7497 cutoffs.relfrozenxid = relfrozenxid;
7498 cutoffs.relminmxid = relminmxid;
7499 cutoffs.OldestXmin = FreezeLimit;
7500 cutoffs.OldestMxact = MultiXactCutoff;
7501 cutoffs.FreezeLimit = FreezeLimit;
7503
7504 pagefrz.freeze_required = true;
7505 pagefrz.FreezePageRelfrozenXid = FreezeLimit;
7506 pagefrz.FreezePageRelminMxid = MultiXactCutoff;
7507 pagefrz.FreezePageConflictXid = InvalidTransactionId;
7508 pagefrz.NoFreezePageRelfrozenXid = FreezeLimit;
7509 pagefrz.NoFreezePageRelminMxid = MultiXactCutoff;
7510
7511 do_freeze = heap_prepare_freeze_tuple(tuple, &cutoffs,
7512 &pagefrz, &frz, &totally_frozen);
7513
7514 /*
7515 * Note that because this is not a WAL-logged operation, we don't need to
7516 * fill in the offset in the freeze record.
7517 */
7518
7519 if (do_freeze)
7521 return do_freeze;
7522}
7523
7524/*
7525 * For a given MultiXactId, return the hint bits that should be set in the
7526 * tuple's infomask.
7527 *
7528 * Normally this should be called for a multixact that was just created, and
7529 * so is on our local cache, so the GetMembers call is fast.
7530 */
7531static void
7534{
7535 int nmembers;
7536 MultiXactMember *members;
7537 int i;
7539 uint16 bits2 = 0;
7540 bool has_update = false;
7542
7543 /*
7544 * We only use this in multis we just created, so they cannot be values
7545 * pre-pg_upgrade.
7546 */
7547 nmembers = GetMultiXactIdMembers(multi, &members, false, false);
7548
7549 for (i = 0; i < nmembers; i++)
7550 {
7552
7553 /*
7554 * Remember the strongest lock mode held by any member of the
7555 * multixact.
7556 */
7557 mode = TUPLOCK_from_mxstatus(members[i].status);
7558 if (mode > strongest)
7559 strongest = mode;
7560
7561 /* See what other bits we need */
7562 switch (members[i].status)
7563 {
7567 break;
7568
7571 break;
7572
7574 has_update = true;
7575 break;
7576
7579 has_update = true;
7580 break;
7581 }
7582 }
7583
7586 bits |= HEAP_XMAX_EXCL_LOCK;
7587 else if (strongest == LockTupleShare)
7588 bits |= HEAP_XMAX_SHR_LOCK;
7589 else if (strongest == LockTupleKeyShare)
7590 bits |= HEAP_XMAX_KEYSHR_LOCK;
7591
7592 if (!has_update)
7593 bits |= HEAP_XMAX_LOCK_ONLY;
7594
7595 if (nmembers > 0)
7596 pfree(members);
7597
7598 *new_infomask = bits;
7600}
7601
7602/*
7603 * MultiXactIdGetUpdateXid
7604 *
7605 * Given a multixact Xmax and corresponding infomask, which does not have the
7606 * HEAP_XMAX_LOCK_ONLY bit set, obtain and return the Xid of the updating
7607 * transaction.
7608 *
7609 * Caller is expected to check the status of the updating transaction, if
7610 * necessary.
7611 */
7612static TransactionId
7614{
7616 MultiXactMember *members;
7617 int nmembers;
7618
7619 Assert(!(t_infomask & HEAP_XMAX_LOCK_ONLY));
7620 Assert(t_infomask & HEAP_XMAX_IS_MULTI);
7621
7622 /*
7623 * Since we know the LOCK_ONLY bit is not set, this cannot be a multi from
7624 * pre-pg_upgrade.
7625 */
7626 nmembers = GetMultiXactIdMembers(xmax, &members, false, false);
7627
7628 if (nmembers > 0)
7629 {
7630 int i;
7631
7632 for (i = 0; i < nmembers; i++)
7633 {
7634 /* Ignore lockers */
7635 if (!ISUPDATE_from_mxstatus(members[i].status))
7636 continue;
7637
7638 /* there can be at most one updater */
7640 update_xact = members[i].xid;
7641#ifndef USE_ASSERT_CHECKING
7642
7643 /*
7644 * in an assert-enabled build, walk the whole array to ensure
7645 * there's no other updater.
7646 */
7647 break;
7648#endif
7649 }
7650
7651 pfree(members);
7652 }
7653
7654 return update_xact;
7655}
7656
7657/*
7658 * HeapTupleGetUpdateXid
7659 * As above, but use a HeapTupleHeader
7660 *
7661 * See also HeapTupleHeaderGetUpdateXid, which can be used without previously
7662 * checking the hint bits.
7663 */
7666{
7668 tup->t_infomask);
7669}
7670
7671/*
7672 * Does the given multixact conflict with the current transaction grabbing a
7673 * tuple lock of the given strength?
7674 *
7675 * The passed infomask pairs up with the given multixact in the tuple header.
7676 *
7677 * If current_is_member is not NULL, it is set to 'true' if the current
7678 * transaction is a member of the given multixact.
7679 */
7680static bool
7682 LockTupleMode lockmode, bool *current_is_member)
7683{
7684 int nmembers;
7685 MultiXactMember *members;
7686 bool result = false;
7687 LOCKMODE wanted = tupleLockExtraInfo[lockmode].hwlock;
7688
7690 return false;
7691
7692 nmembers = GetMultiXactIdMembers(multi, &members, false,
7694 if (nmembers >= 0)
7695 {
7696 int i;
7697
7698 for (i = 0; i < nmembers; i++)
7699 {
7702
7703 if (result && (current_is_member == NULL || *current_is_member))
7704 break;
7705
7706 memlockmode = LOCKMODE_from_mxstatus(members[i].status);
7707
7708 /* ignore members from current xact (but track their presence) */
7709 memxid = members[i].xid;
7711 {
7712 if (current_is_member != NULL)
7713 *current_is_member = true;
7714 continue;
7715 }
7716 else if (result)
7717 continue;
7718
7719 /* ignore members that don't conflict with the lock we want */
7721 continue;
7722
7723 if (ISUPDATE_from_mxstatus(members[i].status))
7724 {
7725 /* ignore aborted updaters */
7727 continue;
7728 }
7729 else
7730 {
7731 /* ignore lockers-only that are no longer in progress */
7733 continue;
7734 }
7735
7736 /*
7737 * Whatever remains are either live lockers that conflict with our
7738 * wanted lock, and updaters that are not aborted. Those conflict
7739 * with what we want. Set up to return true, but keep going to
7740 * look for the current transaction among the multixact members,
7741 * if needed.
7742 */
7743 result = true;
7744 }
7745 pfree(members);
7746 }
7747
7748 return result;
7749}
7750
7751/*
7752 * Do_MultiXactIdWait
7753 * Actual implementation for the two functions below.
7754 *
7755 * 'multi', 'status' and 'infomask' indicate what to sleep on (the status is
7756 * needed to ensure we only sleep on conflicting members, and the infomask is
7757 * used to optimize multixact access in case it's a lock-only multi); 'nowait'
7758 * indicates whether to use conditional lock acquisition, to allow callers to
7759 * fail if lock is unavailable. 'rel', 'ctid' and 'oper' are used to set up
7760 * context information for error messages. 'remaining', if not NULL, receives
7761 * the number of members that are still running, including any (non-aborted)
7762 * subtransactions of our own transaction. 'logLockFailure' indicates whether
7763 * to log details when a lock acquisition fails with 'nowait' enabled.
7764 *
7765 * We do this by sleeping on each member using XactLockTableWait. Any
7766 * members that belong to the current backend are *not* waited for, however;
7767 * this would not merely be useless but would lead to Assert failure inside
7768 * XactLockTableWait. By the time this returns, it is certain that all
7769 * transactions *of other backends* that were members of the MultiXactId
7770 * that conflict with the requested status are dead (and no new ones can have
7771 * been added, since it is not legal to add members to an existing
7772 * MultiXactId).
7773 *
7774 * But by the time we finish sleeping, someone else may have changed the Xmax
7775 * of the containing tuple, so the caller needs to iterate on us somehow.
7776 *
7777 * Note that in case we return false, the number of remaining members is
7778 * not to be trusted.
7779 */
7780static bool
7782 uint16 infomask, bool nowait,
7783 Relation rel, const ItemPointerData *ctid, XLTW_Oper oper,
7784 int *remaining, bool logLockFailure)
7785{
7786 bool result = true;
7787 MultiXactMember *members;
7788 int nmembers;
7789 int remain = 0;
7790
7791 /* for pre-pg_upgrade tuples, no need to sleep at all */
7792 nmembers = HEAP_LOCKED_UPGRADED(infomask) ? -1 :
7793 GetMultiXactIdMembers(multi, &members, false,
7795
7796 if (nmembers >= 0)
7797 {
7798 int i;
7799
7800 for (i = 0; i < nmembers; i++)
7801 {
7802 TransactionId memxid = members[i].xid;
7803 MultiXactStatus memstatus = members[i].status;
7804
7806 {
7807 remain++;
7808 continue;
7809 }
7810
7812 LOCKMODE_from_mxstatus(status)))
7813 {
7815 remain++;
7816 continue;
7817 }
7818
7819 /*
7820 * This member conflicts with our multi, so we have to sleep (or
7821 * return failure, if asked to avoid waiting.)
7822 *
7823 * Note that we don't set up an error context callback ourselves,
7824 * but instead we pass the info down to XactLockTableWait. This
7825 * might seem a bit wasteful because the context is set up and
7826 * tore down for each member of the multixact, but in reality it
7827 * should be barely noticeable, and it avoids duplicate code.
7828 */
7829 if (nowait)
7830 {
7832 if (!result)
7833 break;
7834 }
7835 else
7836 XactLockTableWait(memxid, rel, ctid, oper);
7837 }
7838
7839 pfree(members);
7840 }
7841
7842 if (remaining)
7843 *remaining = remain;
7844
7845 return result;
7846}
7847
7848/*
7849 * MultiXactIdWait
7850 * Sleep on a MultiXactId.
7851 *
7852 * By the time we finish sleeping, someone else may have changed the Xmax
7853 * of the containing tuple, so the caller needs to iterate on us somehow.
7854 *
7855 * We return (in *remaining, if not NULL) the number of members that are still
7856 * running, including any (non-aborted) subtransactions of our own transaction.
7857 */
/* Unconditional wrapper: blocks until conflicting members are gone. */
7858static void
 /* NOTE(review): function name line (MultiXactIdWait signature start) elided
  * by this extraction. */
 7860 Relation rel, const ItemPointerData *ctid, XLTW_Oper oper,
 7861 int *remaining)
 7862{
 /* nowait = false, logLockFailure = false: sleep until we can proceed */
 7863 (void) Do_MultiXactIdWait(multi, status, infomask, false,
 7864 rel, ctid, oper, remaining, false);
 7865}
7866
7867/*
7868 * ConditionalMultiXactIdWait
7869 * As above, but only lock if we can get the lock without blocking.
7870 *
7871 * By the time we finish sleeping, someone else may have changed the Xmax
7872 * of the containing tuple, so the caller needs to iterate on us somehow.
7873 *
7874 * If the multixact is now all gone, return true. Returns false if some
7875 * transactions might still be running.
7876 *
7877 * We return (in *remaining, if not NULL) the number of members that are still
7878 * running, including any (non-aborted) subtransactions of our own transaction.
7879 */
/* Conditional wrapper: nowait = true, so it never blocks. */
7880static bool
 /* NOTE(review): function name line elided by this extraction. */
 7882 uint16 infomask, Relation rel, int *remaining,
 7883 bool logLockFailure)
 7884{
 7885 return Do_MultiXactIdWait(multi, status, infomask, true,
 /* NOTE(review): remaining call arguments elided by this extraction. */
 7887}
7888
7889/*
7890 * heap_tuple_needs_eventual_freeze
7891 *
7892 * Check to see whether any of the XID fields of a tuple (xmin, xmax, xvac)
7893 * will eventually require freezing (if tuple isn't removed by pruning first).
7894 */
7895bool
7897{
7898 TransactionId xid;
7899
7900 /*
7901 * If xmin is a normal transaction ID, this tuple is definitely not
7902 * frozen.
7903 */
7904 xid = HeapTupleHeaderGetXmin(tuple);
7905 if (TransactionIdIsNormal(xid))
7906 return true;
7907
7908 /*
7909 * If xmax is a valid xact or multixact, this tuple is also not frozen.
7910 */
7911 if (tuple->t_infomask & HEAP_XMAX_IS_MULTI)
7912 {
7913 MultiXactId multi;
7914
7915 multi = HeapTupleHeaderGetRawXmax(tuple);
7916 if (MultiXactIdIsValid(multi))
7917 return true;
7918 }
7919 else
7920 {
7921 xid = HeapTupleHeaderGetRawXmax(tuple);
7922 if (TransactionIdIsNormal(xid))
7923 return true;
7924 }
7925
7926 if (tuple->t_infomask & HEAP_MOVED)
7927 {
7928 xid = HeapTupleHeaderGetXvac(tuple);
7929 if (TransactionIdIsNormal(xid))
7930 return true;
7931 }
7932
7933 return false;
7934}
7935
7936/*
7937 * heap_tuple_should_freeze
7938 *
7939 * Return value indicates if heap_prepare_freeze_tuple sibling function would
7940 * (or should) force freezing of the heap page that contains caller's tuple.
7941 * Tuple header XIDs/MXIDs < FreezeLimit/MultiXactCutoff trigger freezing.
7942 * This includes (xmin, xmax, xvac) fields, as well as MultiXact member XIDs.
7943 *
7944 * The *NoFreezePageRelfrozenXid and *NoFreezePageRelminMxid input/output
7945 * arguments help VACUUM track the oldest extant XID/MXID remaining in rel.
7946 * Our working assumption is that caller won't decide to freeze this tuple.
7947 * It's up to caller to only ratchet back its own top-level trackers after the
7948 * point that it fully commits to not freezing the tuple/page in question.
7949 */
/*
 * NOTE(review): this extraction elides several source lines (the function
 * name line, assertions, and the xid reset before the xmax branch). Code
 * tokens are preserved as-is; confirm against upstream before editing.
 */
7950bool
 7952 const struct VacuumCutoffs *cutoffs,
 7953 TransactionId *NoFreezePageRelfrozenXid,
 7954 MultiXactId *NoFreezePageRelminMxid)
 7955{
 7956 TransactionId xid;
 7957 MultiXactId multi;
 7958 bool freeze = false;
 7959
 7960 /* First deal with xmin */
 7961 xid = HeapTupleHeaderGetXmin(tuple);
 7962 if (TransactionIdIsNormal(xid))
 7963 {
 7965 if (TransactionIdPrecedes(xid, *NoFreezePageRelfrozenXid))
 7966 *NoFreezePageRelfrozenXid = xid;
 7967 if (TransactionIdPrecedes(xid, cutoffs->FreezeLimit))
 7968 freeze = true;
 7969 }
 7970
 7971 /* Now deal with xmax */
 /* NOTE(review): line elided here -- presumably resets xid before the
  * branch below; without it xid would retain xmin's value. TODO confirm. */
 7973 multi = InvalidMultiXactId;
 7974 if (tuple->t_infomask & HEAP_XMAX_IS_MULTI)
 7975 multi = HeapTupleHeaderGetRawXmax(tuple);
 7976 else
 7977 xid = HeapTupleHeaderGetRawXmax(tuple);
 7978
 7979 if (TransactionIdIsNormal(xid))
 7980 {
 7982 /* xmax is a non-permanent XID */
 7983 if (TransactionIdPrecedes(xid, *NoFreezePageRelfrozenXid))
 7984 *NoFreezePageRelfrozenXid = xid;
 7985 if (TransactionIdPrecedes(xid, cutoffs->FreezeLimit))
 7986 freeze = true;
 7987 }
 7988 else if (!MultiXactIdIsValid(multi))
 7989 {
 7990 /* xmax is a permanent XID or invalid MultiXactId/XID */
 7991 }
 7992 else if (HEAP_LOCKED_UPGRADED(tuple->t_infomask))
 7993 {
 7994 /* xmax is a pg_upgrade'd MultiXact, which can't have updater XID */
 7995 if (MultiXactIdPrecedes(multi, *NoFreezePageRelminMxid))
 7996 *NoFreezePageRelminMxid = multi;
 7997 /* heap_prepare_freeze_tuple always freezes pg_upgrade'd xmax */
 7998 freeze = true;
 7999 }
 8000 else
 8001 {
 8002 /* xmax is a MultiXactId that may have an updater XID */
 8003 MultiXactMember *members;
 8004 int nmembers;
 8005
 8007 if (MultiXactIdPrecedes(multi, *NoFreezePageRelminMxid))
 8008 *NoFreezePageRelminMxid = multi;
 8009 if (MultiXactIdPrecedes(multi, cutoffs->MultiXactCutoff))
 8010 freeze = true;
 8011
 8012 /* need to check whether any member of the mxact is old */
 8013 nmembers = GetMultiXactIdMembers(multi, &members, false,
 8015
 8016 for (int i = 0; i < nmembers; i++)
 8017 {
 8018 xid = members[i].xid;
 8020 if (TransactionIdPrecedes(xid, *NoFreezePageRelfrozenXid))
 8021 *NoFreezePageRelfrozenXid = xid;
 8022 if (TransactionIdPrecedes(xid, cutoffs->FreezeLimit))
 8023 freeze = true;
 8024 }
 8025 if (nmembers > 0)
 8026 pfree(members);
 8027 }
 8028
 8029 if (tuple->t_infomask & HEAP_MOVED)
 8030 {
 8031 xid = HeapTupleHeaderGetXvac(tuple);
 8032 if (TransactionIdIsNormal(xid))
 8033 {
 8035 if (TransactionIdPrecedes(xid, *NoFreezePageRelfrozenXid))
 8036 *NoFreezePageRelfrozenXid = xid;
 8037 /* heap_prepare_freeze_tuple forces xvac freezing */
 8038 freeze = true;
 8039 }
 8040 }
 8041
 8042 return freeze;
 8043}
8044
8045/*
8046 * Maintain snapshotConflictHorizon for caller by ratcheting forward its value
8047 * using any committed XIDs contained in 'tuple', an obsolescent heap tuple
8048 * that caller is in the process of physically removing, e.g. via HOT pruning
8049 * or index deletion.
8050 *
8051 * Caller must initialize its value to InvalidTransactionId, which is
8052 * generally interpreted as "definitely no need for a recovery conflict".
8053 * Final value must reflect all heap tuples that caller will physically remove
8054 * (or remove TID references to) via its ongoing pruning/deletion operation.
8055 * ResolveRecoveryConflictWithSnapshot() is passed the final value (taken from
8056 * caller's WAL record) by REDO routine when it replays caller's operation.
8057 */
/*
 * NOTE(review): extraction elides the function name line and the xmin/xmax/
 * xvac local declarations; xvac/xmin/xmax below are read from those elided
 * lines. Code tokens are preserved as-is.
 */
8058void
 8060 TransactionId *snapshotConflictHorizon)
 8061{
 8065
 /* Old-style VACUUM FULL movers: ratchet horizon forward past xvac */
 8066 if (tuple->t_infomask & HEAP_MOVED)
 8067 {
 8068 if (TransactionIdPrecedes(*snapshotConflictHorizon, xvac))
 8069 *snapshotConflictHorizon = xvac;
 8070 }
 8071
 8072 /*
 8073 * Ignore tuples inserted by an aborted transaction or if the tuple was
 8074 * updated/deleted by the inserting transaction.
 8075 *
 8076 * Look for a committed hint bit, or if no xmin bit is set, check clog.
 8077 */
 8078 if (HeapTupleHeaderXminCommitted(tuple) ||
 8080 {
 8081 if (xmax != xmin &&
 8082 TransactionIdFollows(xmax, *snapshotConflictHorizon))
 8083 *snapshotConflictHorizon = xmax;
 8084 }
 8085}
8086
8087#ifdef USE_PREFETCH
8088/*
8089 * Helper function for heap_index_delete_tuples. Issues prefetch requests for
8090 * prefetch_count buffers. The prefetch_state keeps track of all the buffers
8091 * we can prefetch, and which have already been prefetched; each call to this
8092 * function picks up where the previous call left off.
8093 *
8094 * Note: we expect the deltids array to be sorted in an order that groups TIDs
8095 * by heap block, with all TIDs for each block appearing together in exactly
8096 * one group.
8097 */
/*
 * NOTE(review): extraction elides the name/parameter lines and the per-item
 * new-block test plus PrefetchBuffer call inside the loop. Code tokens are
 * preserved as-is.
 */
8098static void
 8101 int prefetch_count)
 8102{
 8104 int count = 0;
 8105 int i;
 8106 int ndeltids = prefetch_state->ndeltids;
 8107 TM_IndexDelete *deltids = prefetch_state->deltids;
 8108
 8109 for (i = prefetch_state->next_item;
 8110 i < ndeltids && count < prefetch_count;
 8111 i++)
 8112 {
 8113 ItemPointer htid = &deltids[i].tid;
 8114
 /* NOTE(review): block-change test and prefetch call elided here */
 8117 {
 8120 count++;
 8121 }
 8122 }
 8123
 8124 /*
 8125 * Save the prefetch position so that next time we can continue from that
 8126 * position.
 8127 */
 8128 prefetch_state->next_item = i;
 8129 prefetch_state->cur_hblkno = cur_hblkno;
 8130}
8131#endif
8132
8133/*
8134 * Helper function for heap_index_delete_tuples. Checks for index corruption
8135 * involving an invalid TID in index AM caller's index page.
8136 *
8137 * This is an ideal place for these checks. The index AM must hold a buffer
8138 * lock on the index page containing the TIDs we examine here, so we don't
8139 * have to worry about concurrent VACUUMs at all. We can be sure that the
8140 * index is corrupt when htid points directly to an LP_UNUSED item or
8141 * heap-only tuple, which is not the case during standard index scans.
8142 */
/*
 * NOTE(review): extraction elides the name/parameter lines and the
 * errcode()/errdetail argument lines of each ereport. Code tokens are
 * preserved as-is; do not edit the error messages without consulting
 * upstream.
 */
8143static inline void
 8145 Page page, OffsetNumber maxoff,
 8147{
 8149 ItemId iid;
 8150
 8151 Assert(OffsetNumberIsValid(istatus->idxoffnum));
 8152
 /* TID's offset must lie within the page's line pointer array */
 8153 if (unlikely(indexpagehoffnum > maxoff))
 8154 ereport(ERROR,
 8156 errmsg_internal("heap tid from index tuple (%u,%u) points past end of heap page line pointer array at offset %u of block %u in index \"%s\"",
 8159 istatus->idxoffnum, delstate->iblknum,
 8161
 /* An index TID must never point at an LP_UNUSED item */
 8163 if (unlikely(!ItemIdIsUsed(iid)))
 8164 ereport(ERROR,
 8166 errmsg_internal("heap tid from index tuple (%u,%u) points to unused heap page item at offset %u of block %u in index \"%s\"",
 8169 istatus->idxoffnum, delstate->iblknum,
 8171
 8172 if (ItemIdHasStorage(iid))
 8173 {
 8174 HeapTupleHeader htup;
 8175
 8177 htup = (HeapTupleHeader) PageGetItem(page, iid);
 8178
 /* NOTE(review): heap-only-tuple test elided by extraction */
 8180 ereport(ERROR,
 8182 errmsg_internal("heap tid from index tuple (%u,%u) points to heap-only tuple at offset %u of block %u in index \"%s\"",
 8185 istatus->idxoffnum, delstate->iblknum,
 8187 }
 8188}
8189
8190/*
8191 * heapam implementation of tableam's index_delete_tuples interface.
8192 *
8193 * This helper function is called by index AMs during index tuple deletion.
8194 * See tableam header comments for an explanation of the interface implemented
8195 * here and a general theory of operation. Note that each call here is either
8196 * a simple index deletion call, or a bottom-up index deletion call.
8197 *
8198 * It's possible for this to generate a fair amount of I/O, since we may be
8199 * deleting hundreds of tuples from a single index block. To amortize that
8200 * cost to some degree, this uses prefetching and combines repeat accesses to
8201 * the same heap block.
8202 */
/*
 * NOTE(review): extraction elides the return-type/signature lines
 * (heap_index_delete_tuples) and many statements throughout (declarations,
 * buffer release/lock calls, prefetch calls, HOT-chain checks). Code tokens
 * are preserved byte-for-byte; reconcile with upstream heapam.c before any
 * logic change.
 */
8205{
 8206 /* Initial assumption is that earlier pruning took care of conflict */
 8207 TransactionId snapshotConflictHorizon = InvalidTransactionId;
 8210 Page page = NULL;
 8213#ifdef USE_PREFETCH
 8216#endif
 8218 int finalndeltids = 0,
 8219 nblocksaccessed = 0;
 8220
 8221 /* State that's only used in bottom-up index deletion case */
 8222 int nblocksfavorable = 0;
 8223 int curtargetfreespace = delstate->bottomupfreespace,
 8224 lastfreespace = 0,
 8225 actualfreespace = 0;
 8226 bool bottomup_final_block = false;
 8227
 8229
 8230 /* Sort caller's deltids array by TID for further processing */
 8232
 8233 /*
 8234 * Bottom-up case: resort deltids array in an order attuned to where the
 8235 * greatest number of promising TIDs are to be found, and determine how
 8236 * many blocks from the start of sorted array should be considered
 8237 * favorable. This will also shrink the deltids array in order to
 8238 * eliminate completely unfavorable blocks up front.
 8239 */
 8240 if (delstate->bottomup)
 8242
 8243#ifdef USE_PREFETCH
 8244 /* Initialize prefetch state. */
 8246 prefetch_state.next_item = 0;
 8247 prefetch_state.ndeltids = delstate->ndeltids;
 8248 prefetch_state.deltids = delstate->deltids;
 8249
 8250 /*
 8251 * Determine the prefetch distance that we will attempt to maintain.
 8252 *
 8253 * Since the caller holds a buffer lock somewhere in rel, we'd better make
 8254 * sure that isn't a catalog relation before we call code that does
 8255 * syscache lookups, to avoid risk of deadlock.
 8256 */
 8257 if (IsCatalogRelation(rel))
 8259 else
 8262
 8263 /* Cap initial prefetch distance for bottom-up deletion caller */
 8264 if (delstate->bottomup)
 8265 {
 8269 }
 8270
 8271 /* Start prefetching. */
 8273#endif
 8274
 8275 /* Iterate over deltids, determine which to delete, check their horizon */
 8276 Assert(delstate->ndeltids > 0);
 8277 for (int i = 0; i < delstate->ndeltids; i++)
 8278 {
 8279 TM_IndexDelete *ideltid = &delstate->deltids[i];
 8280 TM_IndexStatus *istatus = delstate->status + ideltid->id;
 8281 ItemPointer htid = &ideltid->tid;
 8282 OffsetNumber offnum;
 8283
 8284 /*
 8285 * Read buffer, and perform required extra steps each time a new block
 8286 * is encountered. Avoid refetching if it's the same block as the one
 8287 * from the last htid.
 8288 */
 8289 if (blkno == InvalidBlockNumber ||
 8291 {
 8292 /*
 8293 * Consider giving up early for bottom-up index deletion caller
 8294 * first. (Only prefetch next-next block afterwards, when it
 8295 * becomes clear that we're at least going to access the next
 8296 * block in line.)
 8297 *
 8298 * Sometimes the first block frees so much space for bottom-up
 8299 * caller that the deletion process can end without accessing any
 8300 * more blocks. It is usually necessary to access 2 or 3 blocks
 8301 * per bottom-up deletion operation, though.
 8302 */
 8303 if (delstate->bottomup)
 8304 {
 8305 /*
 8306 * We often allow caller to delete a few additional items
 8307 * whose entries we reached after the point that space target
 8308 * from caller was satisfied. The cost of accessing the page
 8309 * was already paid at that point, so it made sense to finish
 8310 * it off. When that happened, we finalize everything here
 8311 * (by finishing off the whole bottom-up deletion operation
 8312 * without needlessly paying the cost of accessing any more
 8313 * blocks).
 8314 */
 8316 break;
 8317
 8318 /*
 8319 * Give up when we didn't enable our caller to free any
 8320 * additional space as a result of processing the page that we
 8321 * just finished up with. This rule is the main way in which
 8322 * we keep the cost of bottom-up deletion under control.
 8323 */
 8325 break;
 8326 lastfreespace = actualfreespace; /* for next time */
 8327
 8328 /*
 8329 * Deletion operation (which is bottom-up) will definitely
 8330 * access the next block in line. Prepare for that now.
 8331 *
 8332 * Decay target free space so that we don't hang on for too
 8333 * long with a marginal case. (Space target is only truly
 8334 * helpful when it allows us to recognize that we don't need
 8335 * to access more than 1 or 2 blocks to satisfy caller due to
 8336 * agreeable workload characteristics.)
 8337 *
 8338 * We are a bit more patient when we encounter contiguous
 8339 * blocks, though: these are treated as favorable blocks. The
 8340 * decay process is only applied when the next block in line
 8341 * is not a favorable/contiguous block. This is not an
 8342 * exception to the general rule; we still insist on finding
 8343 * at least one deletable item per block accessed. See
 8344 * bottomup_nblocksfavorable() for full details of the theory
 8345 * behind favorable blocks and heap block locality in general.
 8346 *
 8347 * Note: The first block in line is always treated as a
 8348 * favorable block, so the earliest possible point that the
 8349 * decay can be applied is just before we access the second
 8350 * block in line. The Assert() verifies this for us.
 8351 */
 8353 if (nblocksfavorable > 0)
 8355 else
 8356 curtargetfreespace /= 2;
 8357 }
 8358
 8359 /* release old buffer */
 8360 if (BufferIsValid(buf))
 8362
 8364 buf = ReadBuffer(rel, blkno);
 8366 Assert(!delstate->bottomup ||
 8368
 8369#ifdef USE_PREFETCH
 8370
 8371 /*
 8372 * To maintain the prefetch distance, prefetch one more page for
 8373 * each page we read.
 8374 */
 8376#endif
 8377
 8379
 8380 page = BufferGetPage(buf);
 8381 maxoff = PageGetMaxOffsetNumber(page);
 8382 }
 8383
 8384 /*
 8385 * In passing, detect index corruption involving an index page with a
 8386 * TID that points to a location in the heap that couldn't possibly be
 8387 * correct. We only do this with actual TIDs from caller's index page
 8388 * (not items reached by traversing through a HOT chain).
 8389 */
 8391
 8392 if (istatus->knowndeletable)
 8393 Assert(!delstate->bottomup && !istatus->promising);
 8394 else
 8395 {
 8396 ItemPointerData tmp = *htid;
 8398
 8399 /* Are any tuples from this HOT chain non-vacuumable? */
 8401 &heapTuple, NULL, true))
 8402 continue; /* can't delete entry */
 8403
 8404 /* Caller will delete, since whole HOT chain is vacuumable */
 8405 istatus->knowndeletable = true;
 8406
 8407 /* Maintain index free space info for bottom-up deletion case */
 8408 if (delstate->bottomup)
 8409 {
 8410 Assert(istatus->freespace > 0);
 8411 actualfreespace += istatus->freespace;
 8413 bottomup_final_block = true;
 8414 }
 8415 }
 8416
 8417 /*
 8418 * Maintain snapshotConflictHorizon value for deletion operation as a
 8419 * whole by advancing current value using heap tuple headers. This is
 8420 * loosely based on the logic for pruning a HOT chain.
 8421 */
 8423 priorXmax = InvalidTransactionId; /* cannot check first XMIN */
 8424 for (;;)
 8425 {
 8426 ItemId lp;
 8427 HeapTupleHeader htup;
 8428
 8429 /* Sanity check (pure paranoia) */
 8430 if (offnum < FirstOffsetNumber)
 8431 break;
 8432
 8433 /*
 8434 * An offset past the end of page's line pointer array is possible
 8435 * when the array was truncated
 8436 */
 8437 if (offnum > maxoff)
 8438 break;
 8439
 8440 lp = PageGetItemId(page, offnum);
 8442 {
 8443 offnum = ItemIdGetRedirect(lp);
 8444 continue;
 8445 }
 8446
 8447 /*
 8448 * We'll often encounter LP_DEAD line pointers (especially with an
 8449 * entry marked knowndeletable by our caller up front). No heap
 8450 * tuple headers get examined for an htid that leads us to an
 8451 * LP_DEAD item. This is okay because the earlier pruning
 8452 * operation that made the line pointer LP_DEAD in the first place
 8453 * must have considered the original tuple header as part of
 8454 * generating its own snapshotConflictHorizon value.
 8455 *
 8456 * Relying on XLOG_HEAP2_PRUNE_VACUUM_SCAN records like this is
 8457 * the same strategy that index vacuuming uses in all cases. Index
 8458 * VACUUM WAL records don't even have a snapshotConflictHorizon
 8459 * field of their own for this reason.
 8460 */
 8461 if (!ItemIdIsNormal(lp))
 8462 break;
 8463
 8464 htup = (HeapTupleHeader) PageGetItem(page, lp);
 8465
 8466 /*
 8467 * Check the tuple XMIN against prior XMAX, if any
 8468 */
 8471 break;
 8472
 8474 &snapshotConflictHorizon);
 8475
 8476 /*
 8477 * If the tuple is not HOT-updated, then we are at the end of this
 8478 * HOT-chain. No need to visit later tuples from the same update
 8479 * chain (they get their own index entries) -- just move on to
 8480 * next htid from index AM caller.
 8481 */
 8482 if (!HeapTupleHeaderIsHotUpdated(htup))
 8483 break;
 8484
 8485 /* Advance to next HOT chain member */
 8486 Assert(ItemPointerGetBlockNumber(&htup->t_ctid) == blkno)
 8487 offnum = ItemPointerGetOffsetNumber(&htup->t_ctid);
 8489 }
 8490
 8491 /* Enable further/final shrinking of deltids for caller */
 8492 finalndeltids = i + 1;
 8493 }
 8494
 8496
 8497 /*
 8498 * Shrink deltids array to exclude non-deletable entries at the end. This
 8499 * is not just a minor optimization. Final deltids array size might be
 8500 * zero for a bottom-up caller. Index AM is explicitly allowed to rely on
 8501 * ndeltids being zero in all cases with zero total deletable entries.
 8502 */
 8503 Assert(finalndeltids > 0 || delstate->bottomup);
 8504 delstate->ndeltids = finalndeltids;
 8505
 8506 return snapshotConflictHorizon;
 8507}
8508
8509/*
8510 * Specialized inlineable comparison function for index_delete_sort()
8511 */
/*
 * NOTE(review): extraction elides the name/parameter line and the
 * BlockNumber/OffsetNumber local declarations inside the two scope blocks.
 * Code tokens are preserved as-is.
 */
8512static inline int
 8514{
 8515 ItemPointer tid1 = &deltid1->tid;
 8516 ItemPointer tid2 = &deltid2->tid;
 8517
 8518 {
 /* NOTE(review): blk1/blk2 declarations elided by extraction */
 8521
 8522 if (blk1 != blk2)
 8523 return (blk1 < blk2) ? -1 : 1;
 8524 }
 8525 {
 /* NOTE(review): pos1/pos2 declarations elided by extraction */
 8528
 8529 if (pos1 != pos2)
 8530 return (pos1 < pos2) ? -1 : 1;
 8531 }
 8532
 /* TIDs must be distinct here; equal TIDs indicate caller error */
 8533 Assert(false);
 8534
 8535 return 0;
 8536}
8537
8538/*
8539 * Sort deltids array from delstate by TID. This prepares it for further
8540 * processing by heap_index_delete_tuples().
8541 *
8542 * This operation becomes a noticeable consumer of CPU cycles with some
8543 * workloads, so we go to the trouble of specialization/micro optimization.
8544 * We use shellsort for this because it's easy to specialize, compiles to
8545 * relatively few instructions, and is adaptive to presorted inputs/subsets
8546 * (which are typical here).
8547 */
8548static void
8550{
8551 TM_IndexDelete *deltids = delstate->deltids;
8552 int ndeltids = delstate->ndeltids;
8553
8554 /*
8555 * Shellsort gap sequence (taken from Sedgewick-Incerpi paper).
8556 *
8557 * This implementation is fast with array sizes up to ~4500. This covers
8558 * all supported BLCKSZ values.
8559 */
8560 const int gaps[9] = {1968, 861, 336, 112, 48, 21, 7, 3, 1};
8561
8562 /* Think carefully before changing anything here -- keep swaps cheap */
8563 StaticAssertDecl(sizeof(TM_IndexDelete) <= 8,
8564 "element size exceeds 8 bytes");
8565
8566 for (int g = 0; g < lengthof(gaps); g++)
8567 {
8568 for (int hi = gaps[g], i = hi; i < ndeltids; i++)
8569 {
8570 TM_IndexDelete d = deltids[i];
8571 int j = i;
8572
8573 while (j >= hi && index_delete_sort_cmp(&deltids[j - hi], &d) >= 0)
8574 {
8575 deltids[j] = deltids[j - hi];
8576 j -= hi;
8577 }
8578 deltids[j] = d;
8579 }
8580 }
8581}
8582
8583/*
8584 * Returns how many blocks should be considered favorable/contiguous for a
8585 * bottom-up index deletion pass. This is a number of heap blocks that starts
8586 * from and includes the first block in line.
8587 *
8588 * There is always at least one favorable block during bottom-up index
8589 * deletion. In the worst case (i.e. with totally random heap blocks) the
8590 * first block in line (the only favorable block) can be thought of as a
8591 * degenerate array of contiguous blocks that consists of a single block.
8592 * heap_index_delete_tuples() will expect this.
8593 *
8594 * Caller passes blockgroups, a description of the final order that deltids
8595 * will be sorted in for heap_index_delete_tuples() bottom-up index deletion
8596 * processing. Note that deltids need not actually be sorted just yet (caller
8597 * only passes deltids to us so that we can interpret blockgroups).
8598 *
8599 * You might guess that the existence of contiguous blocks cannot matter much,
8600 * since in general the main factor that determines which blocks we visit is
8601 * the number of promising TIDs, which is a fixed hint from the index AM.
8602 * We're not really targeting the general case, though -- the actual goal is
8603 * to adapt our behavior to a wide variety of naturally occurring conditions.
8604 * The effects of most of the heuristics we apply are only noticeable in the
8605 * aggregate, over time and across many _related_ bottom-up index deletion
8606 * passes.
8607 *
8608 * Deeming certain blocks favorable allows heapam to recognize and adapt to
8609 * workloads where heap blocks visited during bottom-up index deletion can be
8610 * accessed contiguously, in the sense that each newly visited block is the
8611 * neighbor of the block that bottom-up deletion just finished processing (or
8612 * close enough to it). It will likely be cheaper to access more favorable
8613 * blocks sooner rather than later (e.g. in this pass, not across a series of
8614 * related bottom-up passes). Either way it is probably only a matter of time
8615 * (or a matter of further correlated version churn) before all blocks that
8616 * appear together as a single large batch of favorable blocks get accessed by
8617 * _some_ bottom-up pass. Large batches of favorable blocks tend to either
8618 * appear almost constantly or not even once (it all depends on per-index
8619 * workload characteristics).
8620 *
8621 * Note that the blockgroups sort order applies a power-of-two bucketing
8622 * scheme that creates opportunities for contiguous groups of blocks to get
8623 * batched together, at least with workloads that are naturally amenable to
8624 * being driven by heap block locality. This doesn't just enhance the spatial
8625 * locality of bottom-up heap block processing in the obvious way. It also
8626 * enables temporal locality of access, since sorting by heap block number
8627 * naturally tends to make the bottom-up processing order deterministic.
8628 *
8629 * Consider the following example to get a sense of how temporal locality
8630 * might matter: There is a heap relation with several indexes, each of which
8631 * is low to medium cardinality. It is subject to constant non-HOT updates.
8632 * The updates are skewed (in one part of the primary key, perhaps). None of
8633 * the indexes are logically modified by the UPDATE statements (if they were
8634 * then bottom-up index deletion would not be triggered in the first place).
8635 * Naturally, each new round of index tuples (for each heap tuple that gets a
8636 * heap_update() call) will have the same heap TID in each and every index.
8637 * Since these indexes are low cardinality and never get logically modified,
8638 * heapam processing during bottom-up deletion passes will access heap blocks
8639 * in approximately sequential order. Temporal locality of access occurs due
8640 * to bottom-up deletion passes behaving very similarly across each of the
8641 * indexes at any given moment. This keeps the number of buffer misses needed
8642 * to visit heap blocks to a minimum.
8643 */
/*
 * NOTE(review): extraction elides the name line, the block-number local
 * declaration, the tolerance condition continuation, the counter increment,
 * and the minimum-of-one clamp. Code tokens are preserved as-is.
 */
8644static int
 8646 TM_IndexDelete *deltids)
 8647{
 8648 int64 lastblock = -1;
 8649 int nblocksfavorable = 0;
 8650
 8651 Assert(nblockgroups >= 1);
 8653
 8654 /*
 8655 * We tolerate heap blocks that will be accessed only slightly out of
 8656 * physical order. Small blips occur when a pair of almost-contiguous
 8657 * blocks happen to fall into different buckets (perhaps due only to a
 8658 * small difference in npromisingtids that the bucketing scheme didn't
 8659 * quite manage to ignore). We effectively ignore these blips by applying
 8660 * a small tolerance. The precise tolerance we use is a little arbitrary,
 8661 * but it works well enough in practice.
 8662 */
 8663 for (int b = 0; b < nblockgroups; b++)
 8664 {
 8665 IndexDeleteCounts *group = blockgroups + b;
 8666 TM_IndexDelete *firstdtid = deltids + group->ifirsttid;
 8668
 8669 if (lastblock != -1 &&
 /* NOTE(review): tolerance-window condition elided by extraction */
 8672 break;
 8673
 8675 lastblock = block;
 8676 }
 8677
 8678 /* Always indicate that there is at least 1 favorable block */
 8680
 8681 return nblocksfavorable;
 8682}
8683
8684/*
8685 * qsort comparison function for bottomup_sort_and_shrink()
8686 */
/*
 * NOTE(review): extraction elides the IndexDeleteCounts casts of arg1/arg2
 * and the rounded ntids1/ntids2 declarations. Code tokens are preserved
 * as-is.
 */
8687static int
8688bottomup_sort_and_shrink_cmp(const void *arg1, const void *arg2)
8689{
 /* NOTE(review): group1/group2 declarations elided by extraction */
 8692
 8693 /*
 8694 * Most significant field is npromisingtids (which we invert the order of
 8695 * so as to sort in desc order).
 8696 *
 8697 * Caller should have already normalized npromisingtids fields into
 8698 * power-of-two values (buckets).
 8699 */
 8700 if (group1->npromisingtids > group2->npromisingtids)
 8701 return -1;
 8702 if (group1->npromisingtids < group2->npromisingtids)
 8703 return 1;
 8704
 8705 /*
 8706 * Tiebreak: desc ntids sort order.
 8707 *
 8708 * We cannot expect power-of-two values for ntids fields. We should
 8709 * behave as if they were already rounded up for us instead.
 8710 */
 8711 if (group1->ntids != group2->ntids)
 8712 {
 /* NOTE(review): rounded ntids1/ntids2 declarations elided by extraction */
 8715
 8716 if (ntids1 > ntids2)
 8717 return -1;
 8718 if (ntids1 < ntids2)
 8719 return 1;
 8720 }
 8721
 8722 /*
 8723 * Tiebreak: asc offset-into-deltids-for-block (offset to first TID for
 8724 * block in deltids array) order.
 8725 *
 8726 * This is equivalent to sorting in ascending heap block number order
 8727 * (among otherwise equal subsets of the array). This approach allows us
 8728 * to avoid accessing the out-of-line TID. (We rely on the assumption
 8729 * that the deltids array was sorted in ascending heap TID order when
 8730 * these offsets to the first TID from each heap block group were formed.)
 8731 */
 8732 if (group1->ifirsttid > group2->ifirsttid)
 8733 return 1;
 8734 if (group1->ifirsttid < group2->ifirsttid)
 8735 return -1;
 8736
 /* NOTE(review): unreachable marker elided by extraction */
 8738
 8739 return 0;
 8740}
8741
8742/*
8743 * heap_index_delete_tuples() helper function for bottom-up deletion callers.
8744 *
8745 * Sorts deltids array in the order needed for useful processing by bottom-up
8746 * deletion. The array should already be sorted in TID order when we're
8747 * called. The sort process groups heap TIDs from deltids into heap block
8748 * groupings. Earlier/more-promising groups/blocks are usually those that are
8749 * known to have the most "promising" TIDs.
8750 *
8751 * Sets new size of deltids array (ndeltids) in state. deltids will only have
8752 * TIDs from the BOTTOMUP_MAX_NBLOCKS most promising heap blocks when we
8753 * return. This often means that deltids will be shrunk to a small fraction
8754 * of its original size (we eliminate many heap blocks from consideration for
8755 * caller up front).
8756 *
8757 * Returns the number of "favorable" blocks. See bottomup_nblocksfavorable()
8758 * for a definition and full details.
8759 */
/*
 * NOTE(review): extraction elides the signature line, several local
 * declarations, the new-block test, the BOTTOMUP_MAX_NBLOCKS capacity
 * checks, the power-of-two rounding expression, the qsort call, the memcpy
 * calls, and the pfree cleanup. Code tokens are preserved as-is; reconcile
 * with upstream heapam.c before editing.
 */
8760static int
 8762{
 8766 int nblockgroups = 0;
 8767 int ncopied = 0;
 8768 int nblocksfavorable = 0;
 8769
 8770 Assert(delstate->bottomup);
 8771 Assert(delstate->ndeltids > 0);
 8772
 8773 /* Calculate per-heap-block count of TIDs */
 8775 for (int i = 0; i < delstate->ndeltids; i++)
 8776 {
 8777 TM_IndexDelete *ideltid = &delstate->deltids[i];
 8778 TM_IndexStatus *istatus = delstate->status + ideltid->id;
 8779 ItemPointer htid = &ideltid->tid;
 8780 bool promising = istatus->promising;
 8781
 /* NOTE(review): new-block-group test elided by extraction */
 8783 {
 8784 /* New block group */
 8785 nblockgroups++;
 8786
 8789
 8791 blockgroups[nblockgroups - 1].ifirsttid = i;
 8792 blockgroups[nblockgroups - 1].ntids = 1;
 8793 blockgroups[nblockgroups - 1].npromisingtids = 0;
 8794 }
 8795 else
 8796 {
 8797 blockgroups[nblockgroups - 1].ntids++;
 8798 }
 8799
 8800 if (promising)
 8801 blockgroups[nblockgroups - 1].npromisingtids++;
 8802 }
 8803
 8804 /*
 8805 * We're about ready to sort block groups to determine the optimal order
 8806 * for visiting heap blocks. But before we do, round the number of
 8807 * promising tuples for each block group up to the next power-of-two,
 8808 * unless it is very low (less than 4), in which case we round up to 4.
 8809 * npromisingtids is far too noisy to trust when choosing between a pair
 8810 * of block groups that both have very low values.
 8811 *
 8812 * This scheme divides heap blocks/block groups into buckets. Each bucket
 8813 * contains blocks that have _approximately_ the same number of promising
 8814 * TIDs as each other. The goal is to ignore relatively small differences
 8815 * in the total number of promising entries, so that the whole process can
 8816 * give a little weight to heapam factors (like heap block locality)
 8817 * instead. This isn't a trade-off, really -- we have nothing to lose. It
 8818 * would be foolish to interpret small differences in npromisingtids
 8819 * values as anything more than noise.
 8820 *
 8821 * We tiebreak on nhtids when sorting block group subsets that have the
 8822 * same npromisingtids, but this has the same issues as npromisingtids,
 8823 * and so nhtids is subject to the same power-of-two bucketing scheme. The
 8824 * only reason that we don't fix nhtids in the same way here too is that
 8825 * we'll need accurate nhtids values after the sort. We handle nhtids
 8826 * bucketization dynamically instead (in the sort comparator).
 8827 *
 8828 * See bottomup_nblocksfavorable() for a full explanation of when and how
 8829 * heap locality/favorable blocks can significantly influence when and how
 8830 * heap blocks are accessed.
 8831 */
 8832 for (int b = 0; b < nblockgroups; b++)
 8833 {
 8834 IndexDeleteCounts *group = blockgroups + b;
 8835
 8836 /* Better off falling back on nhtids with low npromisingtids */
 8837 if (group->npromisingtids <= 4)
 8838 group->npromisingtids = 4;
 8839 else
 8840 group->npromisingtids =
 8842 }
 8843
 8844 /* Sort groups and rearrange caller's deltids array */
 8847 reordereddeltids = palloc(delstate->ndeltids * sizeof(TM_IndexDelete));
 8848
 8850 /* Determine number of favorable blocks at the start of final deltids */
 8852 delstate->deltids);
 8853
 8854 for (int b = 0; b < nblockgroups; b++)
 8855 {
 8856 IndexDeleteCounts *group = blockgroups + b;
 8857 TM_IndexDelete *firstdtid = delstate->deltids + group->ifirsttid;
 8858
 8860 sizeof(TM_IndexDelete) * group->ntids);
 8861 ncopied += group->ntids;
 8862 }
 8863
 8864 /* Copy final grouped and sorted TIDs back into start of caller's array */
 8866 sizeof(TM_IndexDelete) * ncopied);
 8867 delstate->ndeltids = ncopied;
 8868
 8871
 8872 return nblocksfavorable;
 8873}
8874
8875/*
8876 * Perform XLogInsert for a heap-visible operation. 'block' is the block
8877 * being marked all-visible, and vm_buffer is the buffer containing the
8878 * corresponding visibility map block. Both should have already been modified
8879 * and dirtied.
8880 *
8881 * snapshotConflictHorizon comes from the largest xmin on the page being
8882 * marked all-visible. REDO routine uses it to generate recovery conflicts.
8883 *
8884 * If checksums or wal_log_hints are enabled, we may also generate a full-page
8885 * image of heap_buffer. Otherwise, we optimize away the FPI (by specifying
8886 * REGBUF_NO_IMAGE for the heap buffer), in which case the caller should *not*
8887 * update the heap page's LSN.
8888 */
/*
 * NOTE(review): this listing dropped the hyperlinked lines here, including
 * the first line(s) of the signature and the XLogBeginInsert / register /
 * XLogInsert calls.  Per the declaration index later in this listing, the
 * full prototype is:
 *   XLogRecPtr log_heap_visible(Relation rel, Buffer heap_buffer,
 *                               Buffer vm_buffer,
 *                               TransactionId snapshotConflictHorizon,
 *                               uint8 vmflags)
 * Verify against upstream heapam.c before relying on this copy.
 */
8891 TransactionId snapshotConflictHorizon, uint8 vmflags)
8892{
8895 uint8 flags;
8896
8899
 /* Fill the WAL record from the caller-supplied values. */
8900 xlrec.snapshotConflictHorizon = snapshotConflictHorizon;
8901 xlrec.flags = vmflags;
8906
8908
 /*
  * Flags for registering the heap buffer: omit its full-page image unless
  * checksums or wal_log_hints require one (see header comment above).
  * The registration call itself was dropped from this listing.
  */
8909 flags = REGBUF_STANDARD;
8910 if (!XLogHintBitIsNeeded())
8911 flags |= REGBUF_NO_IMAGE;
8913
8915
8916 return recptr;
8917}
8918
8919/*
8920 * Perform XLogInsert for a heap-update operation. Caller must already
8921 * have modified the buffer(s) and marked them dirty.
8922 */
/*
 * NOTE(review): this listing dropped many hyperlinked lines of this
 * function: the parameter list (per the declaration index later in this
 * listing: reln, oldbuf, newbuf, oldtup, newtup, old_key_tuple,
 * all_visible_cleared, new_all_visible_cleared), several local
 * declarations, and most XLogBeginInsert / XLogRegisterData /
 * XLogRegisterBuffer calls.  Verify against upstream heapam.c before
 * relying on this copy.
 */
8923static XLogRecPtr
8928{
8932 uint8 info;
8934 uint16 prefixlen = 0,
8935 suffixlen = 0;
8937 Page page = BufferGetPage(newbuf);
8939 bool init;
8940 int bufflags;
8941
8942 /* Caller should not call me on a non-WAL-logged relation */
8944
8946
 /*
  * Choose the record opcode: HOT updates get their own opcode.  The
  * guarding if-condition line was dropped from this listing.
  */
8948 info = XLOG_HEAP_HOT_UPDATE;
8949 else
8950 info = XLOG_HEAP_UPDATE;
8951
8952 /*
8953 * If the old and new tuple are on the same page, we only need to log the
8954 * parts of the new tuple that were changed. That saves on the amount of
8955 * WAL we need to write. Currently, we just count any unchanged bytes in
8956 * the beginning and end of the tuple. That's quick to check, and
8957 * perfectly covers the common case that only one field is updated.
8958 *
8959 * We could do this even if the old and new tuple are on different pages,
8960 * but only if we don't make a full-page image of the old page, which is
8961 * difficult to know in advance. Also, if the old tuple is corrupt for
8962 * some reason, it would allow the corruption to propagate the new page,
8963 * so it seems best to avoid. Under the general assumption that most
8964 * updates tend to create the new tuple version on the same page, there
8965 * isn't much to be gained by doing this across pages anyway.
8966 *
8967 * Skip this if we're taking a full-page image of the new page, as we
8968 * don't include the new tuple in the WAL record in that case. Also
8969 * disable if effective_wal_level='logical', as logical decoding needs to
8970 * be able to read the new tuple in whole from the WAL record alone.
8971 */
8972 if (oldbuf == newbuf && !need_tuple_data &&
8974 {
8975 char *oldp = (char *) oldtup->t_data + oldtup->t_data->t_hoff;
8976 char *newp = (char *) newtup->t_data + newtup->t_data->t_hoff;
8977 int oldlen = oldtup->t_len - oldtup->t_data->t_hoff;
8978 int newlen = newtup->t_len - newtup->t_data->t_hoff;
8979
8980 /* Check for common prefix between old and new tuple */
8981 for (prefixlen = 0; prefixlen < Min(oldlen, newlen); prefixlen++)
8982 {
8983 if (newp[prefixlen] != oldp[prefixlen])
8984 break;
8985 }
8986
8987 /*
8988 * Storing the length of the prefix takes 2 bytes, so we need to save
8989 * at least 3 bytes or there's no point.
8990 */
8991 if (prefixlen < 3)
8992 prefixlen = 0;
8993
8994 /* Same for suffix */
 /* NOTE(review): the suffix-scan loop header (line 8995) was dropped. */
8996 {
8997 if (newp[newlen - suffixlen - 1] != oldp[oldlen - suffixlen - 1])
8998 break;
8999 }
9000 if (suffixlen < 3)
9001 suffixlen = 0;
9002 }
9003
9004 /* Prepare main WAL data chain */
9005 xlrec.flags = 0;
 /*
  * NOTE(review): the bodies of the following flag-setting branches were
  * dropped from this listing; only the conditions remain.
  */
9010 if (prefixlen > 0)
9012 if (suffixlen > 0)
9014 if (need_tuple_data)
9015 {
9017 if (old_key_tuple)
9018 {
9019 if (reln->rd_rel->relreplident == REPLICA_IDENTITY_FULL)
9021 else
9023 }
9024 }
9025
9026 /* If new tuple is the single and first tuple on page... */
 /* NOTE(review): the if-condition (lines 9027-9028) was dropped here. */
9029 {
9030 info |= XLOG_HEAP_INIT_PAGE;
9031 init = true;
9032 }
9033 else
9034 init = false;
9035
9036 /* Prepare WAL data for the old page */
9037 xlrec.old_offnum = ItemPointerGetOffsetNumber(&oldtup->t_self);
9038 xlrec.old_xmax = HeapTupleHeaderGetRawXmax(oldtup->t_data);
9039 xlrec.old_infobits_set = compute_infobits(oldtup->t_data->t_infomask,
9040 oldtup->t_data->t_infomask2);
9041
9042 /* Prepare WAL data for the new page */
9043 xlrec.new_offnum = ItemPointerGetOffsetNumber(&newtup->t_self);
9044 xlrec.new_xmax = HeapTupleHeaderGetRawXmax(newtup->t_data);
9045
 /*
  * NOTE(review): the bufflags assignments and the buffer/data
  * registration calls in this region were dropped from this listing.
  */
9047 if (init)
9049 if (need_tuple_data)
9051
9053 if (oldbuf != newbuf)
9055
9057
9058 /*
9059 * Prepare WAL data for the new tuple.
9060 */
9061 if (prefixlen > 0 || suffixlen > 0)
9062 {
9063 if (prefixlen > 0 && suffixlen > 0)
9064 {
 /*
  * NOTE(review): the declaration/initialization of prefix_suffix
  * (lines 9065-9066) was dropped here.
  */
9067 XLogRegisterBufData(0, &prefix_suffix, sizeof(uint16) * 2);
9068 }
9069 else if (prefixlen > 0)
9070 {
9071 XLogRegisterBufData(0, &prefixlen, sizeof(uint16));
9072 }
9073 else
9074 {
9075 XLogRegisterBufData(0, &suffixlen, sizeof(uint16));
9076 }
9077 }
9078
9079 xlhdr.t_infomask2 = newtup->t_data->t_infomask2;
9080 xlhdr.t_infomask = newtup->t_data->t_infomask;
9081 xlhdr.t_hoff = newtup->t_data->t_hoff;
9083
9084 /*
9085 * PG73FORMAT: write bitmap [+ padding] [+ oid] + data
9086 *
9087 * The 'data' doesn't include the common prefix or suffix.
9088 */
9090 if (prefixlen == 0)
9091 {
 /* NOTE(review): the XLogRegisterBufData call line (9092) was dropped. */
9093 (char *) newtup->t_data + SizeofHeapTupleHeader,
9095 }
9096 else
9097 {
9098 /*
9099 * Have to write the null bitmap and data after the common prefix as
9100 * two separate rdata entries.
9101 */
9102 /* bitmap [+ padding] [+ oid] */
9103 if (newtup->t_data->t_hoff - SizeofHeapTupleHeader > 0)
9104 {
9106 (char *) newtup->t_data + SizeofHeapTupleHeader,
9107 newtup->t_data->t_hoff - SizeofHeapTupleHeader);
9108 }
9109
9110 /* data after common prefix */
9112 (char *) newtup->t_data + newtup->t_data->t_hoff + prefixlen,
9113 newtup->t_len - newtup->t_data->t_hoff - prefixlen - suffixlen);
9114 }
9115
9116 /* We need to log a tuple identity */
 /* NOTE(review): the guarding if-condition (line 9117) was dropped here. */
9118 {
9119 /* don't really need this, but its more comfy to decode */
9120 xlhdr_idx.t_infomask2 = old_key_tuple->t_data->t_infomask2;
9121 xlhdr_idx.t_infomask = old_key_tuple->t_data->t_infomask;
9122 xlhdr_idx.t_hoff = old_key_tuple->t_data->t_hoff;
9123
9125
9126 /* PG73FORMAT: write bitmap [+ padding] [+ oid] + data */
9129 }
9130
9131 /* filtering by origin on a row level is much more efficient */
9133
9134 recptr = XLogInsert(RM_HEAP_ID, info);
9135
9136 return recptr;
9137}
9138
9139/*
9140 * Perform XLogInsert of an XLOG_HEAP2_NEW_CID record
9141 *
9142 * This is only used when effective_wal_level is logical, and only for
9143 * catalog tuples.
9144 */
/*
 * NOTE(review): this listing dropped hyperlinked lines of this function:
 * the parameter list (per the declaration index later in this listing:
 * Relation relation, HeapTuple tup), the xl_heap_new_cid record
 * declaration, and the XLogBeginInsert / XLogRegisterData / XLogInsert
 * calls near the end.  Verify against upstream heapam.c.
 */
9145static XLogRecPtr
9147{
9149
9151 HeapTupleHeader hdr = tup->t_data;
9152
 /* The tuple must carry a valid TID and its owning table's OID. */
9153 Assert(ItemPointerIsValid(&tup->t_self));
9154 Assert(tup->t_tableOid != InvalidOid);
9155
 /* Identify the tuple: top-level xid, relation locator, item pointer. */
9156 xlrec.top_xid = GetTopTransactionId();
9157 xlrec.target_locator = relation->rd_locator;
9158 xlrec.target_tid = tup->t_self;
9159
9160 /*
9161 * If the tuple got inserted & deleted in the same TX we definitely have a
9162 * combo CID, set cmin and cmax.
9163 */
9164 if (hdr->t_infomask & HEAP_COMBOCID)
9165 {
9168 xlrec.cmin = HeapTupleHeaderGetCmin(hdr);
9169 xlrec.cmax = HeapTupleHeaderGetCmax(hdr);
9170 xlrec.combocid = HeapTupleHeaderGetRawCommandId(hdr);
9171 }
9172 /* No combo CID, so only cmin or cmax can be set by this TX */
9173 else
9174 {
9175 /*
9176 * Tuple inserted.
9177 *
9178 * We need to check for LOCK ONLY because multixacts might be
9179 * transferred to the new tuple in case of FOR KEY SHARE updates in
9180 * which case there will be an xmax, although the tuple just got
9181 * inserted.
9182 */
9183 if (hdr->t_infomask & HEAP_XMAX_INVALID ||
 /* NOTE(review): the rest of this condition (line 9184) was dropped. */
9185 {
 /* NOTE(review): a line (9186) was dropped here; presumably sets cmin. */
9187 xlrec.cmax = InvalidCommandId;
9188 }
9189 /* Tuple from a different tx updated or deleted. */
9190 else
9191 {
9192 xlrec.cmin = InvalidCommandId;
 /* NOTE(review): a line (9193) was dropped here; presumably sets cmax. */
9194 }
9195 xlrec.combocid = InvalidCommandId;
9196 }
9197
9198 /*
9199 * Note that we don't need to register the buffer here, because this
9200 * operation does not modify the page. The insert/update/delete that
9201 * called us certainly did, but that's WAL-logged separately.
9202 */
9205
9206 /* will be looked at irrespective of origin */
9207
9209
9210 return recptr;
9211}
9212
9213/*
9214 * Build a heap tuple representing the configured REPLICA IDENTITY to represent
9215 * the old tuple in an UPDATE or DELETE.
9216 *
9217 * Returns NULL if there's no need to log an identity or if there's no suitable
9218 * key defined.
9219 *
9220 * Pass key_required true if any replica identity columns changed value, or if
9221 * any of them have any external data. Delete must always pass true.
9222 *
9223 * *copy is set to true if the returned tuple is a modified copy rather than
9224 * the same tuple that was passed in.
9225 */
/*
 * NOTE(review): this listing dropped hyperlinked lines of this function:
 * the first signature line (per the declaration index later in this
 * listing: Relation relation, HeapTuple tp, bool key_required), the
 * values[] and other local declarations, the call that populates
 * idattrs, the bms_free() call, and the trailing external-datum flatten
 * block.  Verify against upstream heapam.c.
 */
9226static HeapTuple
9228 bool *copy)
9229{
9230 TupleDesc desc = RelationGetDescr(relation);
9231 char replident = relation->rd_rel->relreplident;
9234 bool nulls[MaxHeapAttributeNumber];
9236
 /* By default the caller's own tuple is what gets logged (no copy). */
9237 *copy = false;
9238
 /* Nothing to log unless the relation is subject to logical decoding. */
9239 if (!RelationIsLogicallyLogged(relation))
9240 return NULL;
9241
9242 if (replident == REPLICA_IDENTITY_NOTHING)
9243 return NULL;
9244
9245 if (replident == REPLICA_IDENTITY_FULL)
9246 {
9247 /*
9248 * When logging the entire old tuple, it very well could contain
9249 * toasted columns. If so, force them to be inlined.
9250 */
9251 if (HeapTupleHasExternal(tp))
9252 {
9253 *copy = true;
9254 tp = toast_flatten_tuple(tp, desc);
9255 }
9256 return tp;
9257 }
9258
9259 /* if the key isn't required and we're only logging the key, we're done */
9260 if (!key_required)
9261 return NULL;
9262
9263 /* find out the replica identity columns */
9266
9267 /*
9268 * If there's no defined replica identity columns, treat as !key_required.
9269 * (This case should not be reachable from heap_update, since that should
9270 * calculate key_required accurately. But heap_delete just passes
9271 * constant true for key_required, so we can hit this case in deletes.)
9272 */
9273 if (bms_is_empty(idattrs))
9274 return NULL;
9275
9276 /*
9277 * Construct a new tuple containing only the replica identity columns,
9278 * with nulls elsewhere. While we're at it, assert that the replica
9279 * identity columns aren't null.
9280 */
9281 heap_deform_tuple(tp, desc, values, nulls);
9282
9283 for (int i = 0; i < desc->natts; i++)
9284 {
 /*
  * NOTE(review): the first line of this membership test (line 9285)
  * was dropped; it appears to be a bms_is_member() call on idattrs.
  */
9286 idattrs))
9287 Assert(!nulls[i]);
9288 else
9289 nulls[i] = true;
9290 }
9291
9292 key_tuple = heap_form_tuple(desc, values, nulls);
9293 *copy = true;
9294
9296
9297 /*
9298 * If the tuple, which by here only contains indexed columns, still has
9299 * toasted columns, force them to be inlined. This is somewhat unlikely
9300 * since there's limits on the size of indexed columns, so we don't
9301 * duplicate toast_flatten_tuple()s functionality in the above loop over
9302 * the indexed columns, even if it would be more efficient.
9303 */
9305 {
9307
9310 }
9311
9312 return key_tuple;
9313}
9314
9315/*
9316 * HeapCheckForSerializableConflictOut
9317 * We are reading a tuple. If it's not visible, there may be a
9318 * rw-conflict out with the inserter. Otherwise, if it is visible to us
9319 * but has been deleted, there may be a rw-conflict out with the deleter.
9320 *
9321 * We will determine the top level xid of the writing transaction with which
9322 * we may be in conflict, and ask CheckForSerializableConflictOut() to check
9323 * for overlap with our own transaction.
9324 *
9325 * This function should be called just about anywhere in heapam.c where a
9326 * tuple has been read. The caller must hold at least a shared lock on the
9327 * buffer, because this function might set hint bits on the tuple. There is
9328 * currently no known reason to call this function from an index AM.
9329 */
9330void
9331HeapCheckForSerializableConflictOut(bool visible, Relation relation,
9332 HeapTuple tuple, Buffer buffer,
9333 Snapshot snapshot)
9334{
9335 TransactionId xid;
 /*
  * NOTE(review): the declaration of htsvResult (an HTSV_Result) and the
  * HeapTupleSatisfiesVacuum() call that assigns it appear to have been
  * dropped from this listing -- htsvResult is used below.
  */
9337
 /* Quick exit when this relation/snapshot cannot create rw-conflicts. */
9338 if (!CheckForSerializableConflictOutNeeded(relation, snapshot))
9339 return;
9340
9341 /*
9342 * Check to see whether the tuple has been written to by a concurrent
9343 * transaction, either to create it not visible to us, or to delete it
9344 * while it is visible to us. The "visible" bool indicates whether the
9345 * tuple is visible to us, while HeapTupleSatisfiesVacuum checks what else
9346 * is going on with it.
9347 *
9348 * In the event of a concurrently inserted tuple that also happens to have
9349 * been concurrently updated (by a separate transaction), the xmin of the
9350 * tuple will be used -- not the updater's xid.
9351 */
9353 switch (htsvResult)
9354 {
9355 case HEAPTUPLE_LIVE:
 /* Live and visible to us: no concurrent writer to conflict with. */
9356 if (visible)
9357 return;
9358 xid = HeapTupleHeaderGetXmin(tuple->t_data);
9359 break;
 /*
  * NOTE(review): a case label (lines 9360-9361) was dropped here;
  * presumably HEAPTUPLE_DELETE_IN_PROGRESS -- confirm upstream.
  */
9362 if (visible)
9363 xid = HeapTupleHeaderGetUpdateXid(tuple->t_data);
9364 else
9365 xid = HeapTupleHeaderGetXmin(tuple->t_data);
9366
 /* NOTE(review): the guarding if-condition (line 9367) was dropped. */
9368 {
9369 /* This is like the HEAPTUPLE_DEAD case */
9370 Assert(!visible);
9371 return;
9372 }
9373 break;
 /*
  * NOTE(review): another case label (line 9374) was dropped here;
  * presumably HEAPTUPLE_INSERT_IN_PROGRESS -- confirm upstream.
  */
9375 xid = HeapTupleHeaderGetXmin(tuple->t_data);
9376 break;
9377 case HEAPTUPLE_DEAD:
9378 Assert(!visible);
9379 return;
9380 default:
9381
9382 /*
9383 * The only way to get to this default clause is if a new value is
9384 * added to the enum type without adding it to this switch
9385 * statement. That's a bug, so elog.
9386 */
9387 elog(ERROR, "unrecognized return value from HeapTupleSatisfiesVacuum: %u", htsvResult);
9388
9389 /*
9390 * In spite of having all enum values covered and calling elog on
9391 * this default, some compilers think this is a code path which
9392 * allows xid to be used below without initialization. Silence
9393 * that warning.
9394 */
 /* NOTE(review): the dummy xid assignment (line 9395) was dropped. */
9396 }
9397
9400
9401 /*
9402 * Find top level xid. Bail out if xid is too early to be a conflict, or
9403 * if it's our own xid.
9404 */
 /*
  * NOTE(review): the conditions guarding these two early returns
  * (lines 9405 and 9407-9408) were dropped from this listing.
  */
9406 return;
9409 return;
9410
9411 CheckForSerializableConflictOut(relation, xid, snapshot);
9412}
int16 AttrNumber
Definition attnum.h:21
int bms_next_member(const Bitmapset *a, int prevbit)
Definition bitmapset.c:1290
void bms_free(Bitmapset *a)
Definition bitmapset.c:239
bool bms_is_member(int x, const Bitmapset *a)
Definition bitmapset.c:510
Bitmapset * bms_add_member(Bitmapset *a, int x)
Definition bitmapset.c:799
Bitmapset * bms_add_members(Bitmapset *a, const Bitmapset *b)
Definition bitmapset.c:901
bool bms_overlap(const Bitmapset *a, const Bitmapset *b)
Definition bitmapset.c:575
#define bms_is_empty(a)
Definition bitmapset.h:118
uint32 BlockNumber
Definition block.h:31
#define InvalidBlockNumber
Definition block.h:33
static bool BlockNumberIsValid(BlockNumber blockNumber)
Definition block.h:71
static int32 next
Definition blutils.c:225
static Datum values[MAXATTR]
Definition bootstrap.c:187
int Buffer
Definition buf.h:23
#define InvalidBuffer
Definition buf.h:25
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition bufmgr.c:4355
PrefetchBufferResult PrefetchBuffer(Relation reln, ForkNumber forkNum, BlockNumber blockNum)
Definition bufmgr.c:774
void BufferGetTag(Buffer buffer, RelFileLocator *rlocator, ForkNumber *forknum, BlockNumber *blknum)
Definition bufmgr.c:4376
bool BufferIsDirty(Buffer buffer)
Definition bufmgr.c:3022
void ReleaseBuffer(Buffer buffer)
Definition bufmgr.c:5501
void UnlockReleaseBuffer(Buffer buffer)
Definition bufmgr.c:5518
void MarkBufferDirty(Buffer buffer)
Definition bufmgr.c:3055
int maintenance_io_concurrency
Definition bufmgr.c:193
Buffer ReadBuffer(Relation reln, BlockNumber blockNum)
Definition bufmgr.c:866
@ BAS_BULKREAD
Definition bufmgr.h:37
@ BAS_BULKWRITE
Definition bufmgr.h:39
#define RelationGetNumberOfBlocks(reln)
Definition bufmgr.h:307
static Page BufferGetPage(Buffer buffer)
Definition bufmgr.h:470
static Block BufferGetBlock(Buffer buffer)
Definition bufmgr.h:437
@ BUFFER_LOCK_SHARE
Definition bufmgr.h:210
@ BUFFER_LOCK_EXCLUSIVE
Definition bufmgr.h:220
@ BUFFER_LOCK_UNLOCK
Definition bufmgr.h:205
static void LockBuffer(Buffer buffer, BufferLockMode mode)
Definition bufmgr.h:332
static bool BufferIsValid(Buffer bufnum)
Definition bufmgr.h:421
Size PageGetHeapFreeSpace(const PageData *page)
Definition bufpage.c:990
PageHeaderData * PageHeader
Definition bufpage.h:173
static bool PageIsAllVisible(const PageData *page)
Definition bufpage.h:428
static void PageClearAllVisible(Page page)
Definition bufpage.h:438
#define SizeOfPageHeaderData
Definition bufpage.h:216
static void PageSetAllVisible(Page page)
Definition bufpage.h:433
static ItemId PageGetItemId(Page page, OffsetNumber offsetNumber)
Definition bufpage.h:243
static void * PageGetItem(PageData *page, const ItemIdData *itemId)
Definition bufpage.h:353
static void PageSetFull(Page page)
Definition bufpage.h:417
static void PageSetLSN(Page page, XLogRecPtr lsn)
Definition bufpage.h:390
PageData * Page
Definition bufpage.h:81
#define PageClearPrunable(page)
Definition bufpage.h:459
#define PageSetPrunable(page, xid)
Definition bufpage.h:452
static OffsetNumber PageGetMaxOffsetNumber(const PageData *page)
Definition bufpage.h:371
#define NameStr(name)
Definition c.h:798
#define InvalidCommandId
Definition c.h:716
#define pg_noinline
Definition c.h:307
#define Min(x, y)
Definition c.h:1054
#define likely(x)
Definition c.h:423
#define MAXALIGN(LEN)
Definition c.h:859
uint8_t uint8
Definition c.h:577
#define Assert(condition)
Definition c.h:906
int64_t int64
Definition c.h:576
TransactionId MultiXactId
Definition c.h:709
#define pg_attribute_always_inline
Definition c.h:291
int16_t int16
Definition c.h:574
#define SHORTALIGN(LEN)
Definition c.h:855
uint16_t uint16
Definition c.h:578
#define pg_unreachable()
Definition c.h:353
#define unlikely(x)
Definition c.h:424
uint32_t uint32
Definition c.h:579
#define lengthof(array)
Definition c.h:836
#define StaticAssertDecl(condition, errmessage)
Definition c.h:971
uint32 CommandId
Definition c.h:713
uint32 TransactionId
Definition c.h:699
#define OidIsValid(objectId)
Definition c.h:821
size_t Size
Definition c.h:652
bool IsToastRelation(Relation relation)
Definition catalog.c:206
bool IsCatalogRelation(Relation relation)
Definition catalog.c:104
bool IsSharedRelation(Oid relationId)
Definition catalog.c:304
bool IsInplaceUpdateRelation(Relation relation)
Definition catalog.c:183
CommandId HeapTupleHeaderGetCmin(const HeapTupleHeaderData *tup)
Definition combocid.c:104
void HeapTupleHeaderAdjustCmax(const HeapTupleHeaderData *tup, CommandId *cmax, bool *iscombo)
Definition combocid.c:153
CommandId HeapTupleHeaderGetCmax(const HeapTupleHeaderData *tup)
Definition combocid.c:118
bool datumIsEqual(Datum value1, Datum value2, bool typByVal, int typLen)
Definition datum.c:223
Datum arg
Definition elog.c:1322
int errcode(int sqlerrcode)
Definition elog.c:874
int int errdetail_internal(const char *fmt,...) pg_attribute_printf(1
int int errmsg_internal(const char *fmt,...) pg_attribute_printf(1
#define WARNING
Definition elog.h:36
#define ERROR
Definition elog.h:39
#define elog(elevel,...)
Definition elog.h:226
#define ereport(elevel,...)
Definition elog.h:150
HeapTuple ExecFetchSlotHeapTuple(TupleTableSlot *slot, bool materialize, bool *shouldFree)
TupleTableSlot * ExecStoreBufferHeapTuple(HeapTuple tuple, TupleTableSlot *slot, Buffer buffer)
#define palloc_object(type)
Definition fe_memutils.h:74
#define palloc_array(type, count)
Definition fe_memutils.h:76
BufferAccessStrategy GetAccessStrategy(BufferAccessStrategyType btype)
Definition freelist.c:461
void FreeAccessStrategy(BufferAccessStrategy strategy)
Definition freelist.c:643
int NBuffers
Definition globals.c:142
Oid MyDatabaseTableSpace
Definition globals.c:96
Oid MyDatabaseId
Definition globals.c:94
void simple_heap_update(Relation relation, const ItemPointerData *otid, HeapTuple tup, TU_UpdateIndexes *update_indexes)
Definition heapam.c:4557
static bool DoesMultiXactIdConflict(MultiXactId multi, uint16 infomask, LockTupleMode lockmode, bool *current_is_member)
Definition heapam.c:7682
void heap_insert(Relation relation, HeapTuple tup, CommandId cid, int options, BulkInsertState bistate)
Definition heapam.c:2142
static XLogRecPtr log_heap_new_cid(Relation relation, HeapTuple tup)
Definition heapam.c:9147
XLogRecPtr log_heap_visible(Relation rel, Buffer heap_buffer, Buffer vm_buffer, TransactionId snapshotConflictHorizon, uint8 vmflags)
Definition heapam.c:8891
static void compute_new_xmax_infomask(TransactionId xmax, uint16 old_infomask, uint16 old_infomask2, TransactionId add_to_xmax, LockTupleMode mode, bool is_update, TransactionId *result_xmax, uint16 *result_infomask, uint16 *result_infomask2)
Definition heapam.c:5396
static TM_Result heap_lock_updated_tuple_rec(Relation rel, TransactionId priorXmax, const ItemPointerData *tid, TransactionId xid, LockTupleMode mode)
Definition heapam.c:5768
static void heap_fetch_next_buffer(HeapScanDesc scan, ScanDirection dir)
Definition heapam.c:707
bool heap_inplace_lock(Relation relation, HeapTuple oldtup_ptr, Buffer buffer, void(*release_callback)(void *), void *arg)
Definition heapam.c:6438
bool heap_fetch(Relation relation, Snapshot snapshot, HeapTuple tuple, Buffer *userbuf, bool keep_buf)
Definition heapam.c:1659
#define BOTTOMUP_TOLERANCE_NBLOCKS
Definition heapam.c:190
static HeapTuple heap_prepare_insert(Relation relation, HeapTuple tup, TransactionId xid, CommandId cid, int options)
Definition heapam.c:2333
static BlockNumber heap_scan_stream_read_next_parallel(ReadStream *stream, void *callback_private_data, void *per_buffer_data)
Definition heapam.c:252
static int bottomup_sort_and_shrink(TM_IndexDeleteOp *delstate)
Definition heapam.c:8762
static bool heap_acquire_tuplock(Relation relation, const ItemPointerData *tid, LockTupleMode mode, LockWaitPolicy wait_policy, bool *have_tuple_lock)
Definition heapam.c:5347
static int heap_multi_insert_pages(HeapTuple *heaptuples, int done, int ntuples, Size saveFreeSpace)
Definition heapam.c:2381
static pg_attribute_always_inline int page_collect_tuples(HeapScanDesc scan, Snapshot snapshot, Page page, Buffer buffer, BlockNumber block, int lines, bool all_visible, bool check_serializable)
Definition heapam.c:522
static BlockNumber heap_scan_stream_read_next_serial(ReadStream *stream, void *callback_private_data, void *per_buffer_data)
Definition heapam.c:292
static void GetMultiXactIdHintBits(MultiXactId multi, uint16 *new_infomask, uint16 *new_infomask2)
Definition heapam.c:7533
void heap_finish_speculative(Relation relation, const ItemPointerData *tid)
Definition heapam.c:6169
void HeapTupleHeaderAdvanceConflictHorizon(HeapTupleHeader tuple, TransactionId *snapshotConflictHorizon)
Definition heapam.c:8060
bool heap_getnextslot(TableScanDesc sscan, ScanDirection direction, TupleTableSlot *slot)
Definition heapam.c:1449
#define LOCKMODE_from_mxstatus(status)
Definition heapam.c:159
void heap_endscan(TableScanDesc sscan)
Definition heapam.c:1371
#define FRM_RETURN_IS_XID
Definition heapam.c:6726
#define TUPLOCK_from_mxstatus(status)
Definition heapam.c:218
void heap_rescan(TableScanDesc sscan, ScanKey key, bool set_params, bool allow_strat, bool allow_sync, bool allow_pagemode)
Definition heapam.c:1318
void heap_inplace_unlock(Relation relation, HeapTuple oldtup, Buffer buffer)
Definition heapam.c:6716
TM_Result heap_update(Relation relation, const ItemPointerData *otid, HeapTuple newtup, CommandId cid, Snapshot crosscheck, bool wait, TM_FailureData *tmfd, LockTupleMode *lockmode, TU_UpdateIndexes *update_indexes)
Definition heapam.c:3313
static int index_delete_sort_cmp(TM_IndexDelete *deltid1, TM_IndexDelete *deltid2)
Definition heapam.c:8514
static bool ConditionalMultiXactIdWait(MultiXactId multi, MultiXactStatus status, uint16 infomask, Relation rel, int *remaining, bool logLockFailure)
Definition heapam.c:7882
bool heap_tuple_needs_eventual_freeze(HeapTupleHeader tuple)
Definition heapam.c:7897
TM_Result heap_delete(Relation relation, const ItemPointerData *tid, CommandId cid, Snapshot crosscheck, bool wait, TM_FailureData *tmfd, bool changingPart)
Definition heapam.c:2844
static TransactionId FreezeMultiXactId(MultiXactId multi, uint16 t_infomask, const struct VacuumCutoffs *cutoffs, uint16 *flags, HeapPageFreeze *pagefrz)
Definition heapam.c:6777
static HeapTuple ExtractReplicaIdentity(Relation relation, HeapTuple tp, bool key_required, bool *copy)
Definition heapam.c:9228
static pg_noinline BlockNumber heapgettup_initial_block(HeapScanDesc scan, ScanDirection dir)
Definition heapam.c:752
static TM_Result heap_lock_updated_tuple(Relation rel, uint16 prior_infomask, TransactionId prior_raw_xmax, const ItemPointerData *prior_ctid, TransactionId xid, LockTupleMode mode)
Definition heapam.c:6116
#define LockTupleTuplock(rel, tup, mode)
Definition heapam.c:167
bool heap_tuple_should_freeze(HeapTupleHeader tuple, const struct VacuumCutoffs *cutoffs, TransactionId *NoFreezePageRelfrozenXid, MultiXactId *NoFreezePageRelminMxid)
Definition heapam.c:7952
bool heap_freeze_tuple(HeapTupleHeader tuple, TransactionId relfrozenxid, TransactionId relminmxid, TransactionId FreezeLimit, TransactionId MultiXactCutoff)
Definition heapam.c:7488
void heap_inplace_update_and_unlock(Relation relation, HeapTuple oldtup, HeapTuple tuple, Buffer buffer)
Definition heapam.c:6576
static BlockNumber heapgettup_advance_block(HeapScanDesc scan, BlockNumber block, ScanDirection dir)
Definition heapam.c:876
static TransactionId MultiXactIdGetUpdateXid(TransactionId xmax, uint16 t_infomask)
Definition heapam.c:7614
#define BOTTOMUP_MAX_NBLOCKS
Definition heapam.c:189
void ReleaseBulkInsertStatePin(BulkInsertState bistate)
Definition heapam.c:2104
#define FRM_MARK_COMMITTED
Definition heapam.c:6728
#define FRM_NOOP
Definition heapam.c:6724
static void index_delete_check_htid(TM_IndexDeleteOp *delstate, Page page, OffsetNumber maxoff, const ItemPointerData *htid, TM_IndexStatus *istatus)
Definition heapam.c:8145
HeapTuple heap_getnext(TableScanDesc sscan, ScanDirection direction)
Definition heapam.c:1410
bool heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer, Snapshot snapshot, HeapTuple heapTuple, bool *all_dead, bool first_call)
Definition heapam.c:1779
void heap_freeze_prepared_tuples(Buffer buffer, HeapTupleFreeze *tuples, int ntuples)
Definition heapam.c:7466
bool heap_getnextslot_tidrange(TableScanDesc sscan, ScanDirection direction, TupleTableSlot *slot)
Definition heapam.c:1552
static void MultiXactIdWait(MultiXactId multi, MultiXactStatus status, uint16 infomask, Relation rel, const ItemPointerData *ctid, XLTW_Oper oper, int *remaining)
Definition heapam.c:7860
void heap_set_tidrange(TableScanDesc sscan, ItemPointer mintid, ItemPointer maxtid)
Definition heapam.c:1479
void heap_abort_speculative(Relation relation, const ItemPointerData *tid)
Definition heapam.c:6256
static BlockNumber bitmapheap_stream_read_next(ReadStream *pgsr, void *private_data, void *per_buffer_data)
Definition heapam.c:317
TableScanDesc heap_beginscan(Relation relation, Snapshot snapshot, int nkeys, ScanKey key, ParallelTableScanDesc parallel_scan, uint32 flags)
Definition heapam.c:1164
static void heapgettup(HeapScanDesc scan, ScanDirection dir, int nkeys, ScanKey key)
Definition heapam.c:960
static Page heapgettup_continue_page(HeapScanDesc scan, ScanDirection dir, int *linesleft, OffsetNumber *lineoff)
Definition heapam.c:830
static uint8 compute_infobits(uint16 infomask, uint16 infomask2)
Definition heapam.c:2799
#define FRM_RETURN_IS_MULTI
Definition heapam.c:6727
#define FRM_INVALIDATE_XMAX
Definition heapam.c:6725
static bool heap_attr_equals(TupleDesc tupdesc, int attrnum, Datum value1, Datum value2, bool isnull1, bool isnull2)
Definition heapam.c:4416
static void index_delete_sort(TM_IndexDeleteOp *delstate)
Definition heapam.c:8550
void heap_prepare_pagescan(TableScanDesc sscan)
Definition heapam.c:616
static Bitmapset * HeapDetermineColumnsInfo(Relation relation, Bitmapset *interesting_cols, Bitmapset *external_cols, HeapTuple oldtup, HeapTuple newtup, bool *has_external)
Definition heapam.c:4467
static const int MultiXactStatusLock[MaxMultiXactStatus+1]
Definition heapam.c:207
void simple_heap_insert(Relation relation, HeapTuple tup)
Definition heapam.c:2786
static bool xmax_infomask_changed(uint16 new_infomask, uint16 old_infomask)
Definition heapam.c:2821
#define UnlockTupleTuplock(rel, tup, mode)
Definition heapam.c:169
static TM_Result test_lockmode_for_conflict(MultiXactStatus status, TransactionId xid, LockTupleMode mode, HeapTuple tup, bool *needwait)
Definition heapam.c:5677
bool heap_prepare_freeze_tuple(HeapTupleHeader tuple, const struct VacuumCutoffs *cutoffs, HeapPageFreeze *pagefrz, HeapTupleFreeze *frz, bool *totally_frozen)
Definition heapam.c:7133
static void AssertHasSnapshotForToast(Relation rel)
Definition heapam.c:225
void simple_heap_delete(Relation relation, const ItemPointerData *tid)
Definition heapam.c:3267
static const struct @15 tupleLockExtraInfo[]
static XLogRecPtr log_heap_update(Relation reln, Buffer oldbuf, Buffer newbuf, HeapTuple oldtup, HeapTuple newtup, HeapTuple old_key_tuple, bool all_visible_cleared, bool new_all_visible_cleared)
Definition heapam.c:8925
TransactionId HeapTupleGetUpdateXid(const HeapTupleHeaderData *tup)
Definition heapam.c:7666
TransactionId heap_index_delete_tuples(Relation rel, TM_IndexDeleteOp *delstate)
Definition heapam.c:8205
void heap_multi_insert(Relation relation, TupleTableSlot **slots, int ntuples, CommandId cid, int options, BulkInsertState bistate)
Definition heapam.c:2413
#define ConditionalLockTupleTuplock(rel, tup, mode, log)
Definition heapam.c:171
static void initscan(HeapScanDesc scan, ScanKey key, bool keep_startblock)
Definition heapam.c:357
static int bottomup_nblocksfavorable(IndexDeleteCounts *blockgroups, int nblockgroups, TM_IndexDelete *deltids)
Definition heapam.c:8646
static void heapgettup_pagemode(HeapScanDesc scan, ScanDirection dir, int nkeys, ScanKey key)
Definition heapam.c:1070
TM_Result heap_lock_tuple(Relation relation, HeapTuple tuple, CommandId cid, LockTupleMode mode, LockWaitPolicy wait_policy, bool follow_updates, Buffer *buffer, TM_FailureData *tmfd)
Definition heapam.c:4645
static void UpdateXmaxHintBits(HeapTupleHeader tuple, Buffer buffer, TransactionId xid)
Definition heapam.c:2053
static bool Do_MultiXactIdWait(MultiXactId multi, MultiXactStatus status, uint16 infomask, bool nowait, Relation rel, const ItemPointerData *ctid, XLTW_Oper oper, int *remaining, bool logLockFailure)
Definition heapam.c:7782
static int bottomup_sort_and_shrink_cmp(const void *arg1, const void *arg2)
Definition heapam.c:8689
void heap_get_latest_tid(TableScanDesc sscan, ItemPointer tid)
Definition heapam.c:1931
void heap_setscanlimits(TableScanDesc sscan, BlockNumber startBlk, BlockNumber numBlks)
Definition heapam.c:500
void HeapCheckForSerializableConflictOut(bool visible, Relation relation, HeapTuple tuple, Buffer buffer, Snapshot snapshot)
Definition heapam.c:9332
static Page heapgettup_start_page(HeapScanDesc scan, ScanDirection dir, int *linesleft, OffsetNumber *lineoff)
Definition heapam.c:799
static MultiXactStatus get_mxact_status_for_lock(LockTupleMode mode, bool is_update)
Definition heapam.c:4598
void heap_pre_freeze_checks(Buffer buffer, HeapTupleFreeze *tuples, int ntuples)
Definition heapam.c:7413
BulkInsertState GetBulkInsertState(void)
Definition heapam.c:2075
void FreeBulkInsertState(BulkInsertState bistate)
Definition heapam.c:2092
#define HEAP_INSERT_SPECULATIVE
Definition heapam.h:40
#define HEAP_FREEZE_CHECK_XMAX_ABORTED
Definition heapam.h:138
struct HeapScanDescData * HeapScanDesc
Definition heapam.h:102
HTSV_Result
Definition heapam.h:125
@ HEAPTUPLE_RECENTLY_DEAD
Definition heapam.h:128
@ HEAPTUPLE_INSERT_IN_PROGRESS
Definition heapam.h:129
@ HEAPTUPLE_LIVE
Definition heapam.h:127
@ HEAPTUPLE_DELETE_IN_PROGRESS
Definition heapam.h:130
@ HEAPTUPLE_DEAD
Definition heapam.h:126
struct BitmapHeapScanDescData * BitmapHeapScanDesc
Definition heapam.h:110
#define HEAP_INSERT_FROZEN
Definition heapam.h:38
static void heap_execute_freeze_tuple(HeapTupleHeader tuple, HeapTupleFreeze *frz)
Definition heapam.h:505
#define HEAP_FREEZE_CHECK_XMIN_COMMITTED
Definition heapam.h:137
#define HEAP_INSERT_NO_LOGICAL
Definition heapam.h:39
struct BulkInsertStateData * BulkInsertState
Definition heapam.h:46
const TableAmRoutine * GetHeapamTableAmRoutine(void)
void HeapTupleSetHintBits(HeapTupleHeader tuple, Buffer buffer, uint16 infomask, TransactionId xid)
bool HeapTupleSatisfiesVisibility(HeapTuple htup, Snapshot snapshot, Buffer buffer)
bool HeapTupleIsSurelyDead(HeapTuple htup, GlobalVisState *vistest)
HTSV_Result HeapTupleSatisfiesVacuum(HeapTuple htup, TransactionId OldestXmin, Buffer buffer)
int HeapTupleSatisfiesMVCCBatch(Snapshot snapshot, Buffer buffer, int ntups, BatchMVCCState *batchmvcc, OffsetNumber *vistuples_dense)
bool HeapTupleHeaderIsOnlyLocked(HeapTupleHeader tuple)
TM_Result HeapTupleSatisfiesUpdate(HeapTuple htup, CommandId curcid, Buffer buffer)
#define XLH_INSERT_ON_TOAST_RELATION
Definition heapam_xlog.h:76
#define SizeOfHeapMultiInsert
#define XLOG_HEAP2_MULTI_INSERT
Definition heapam_xlog.h:64
#define SizeOfHeapUpdate
#define XLH_INVALID_XVAC
#define XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED
Definition heapam_xlog.h:87
#define SizeOfHeapVisible
#define XLOG_HEAP_HOT_UPDATE
Definition heapam_xlog.h:37
#define XLOG_HEAP_DELETE
Definition heapam_xlog.h:34
#define XLH_INSERT_IS_SPECULATIVE
Definition heapam_xlog.h:74
#define XLH_LOCK_ALL_FROZEN_CLEARED
#define XLH_DELETE_CONTAINS_OLD_KEY
#define XLH_UPDATE_CONTAINS_NEW_TUPLE
Definition heapam_xlog.h:90
#define XLH_INSERT_LAST_IN_MULTI
Definition heapam_xlog.h:73
#define XLH_INSERT_ALL_FROZEN_SET
Definition heapam_xlog.h:79
#define XLH_FREEZE_XVAC
#define XLOG_HEAP_UPDATE
Definition heapam_xlog.h:35
#define XLHL_XMAX_KEYSHR_LOCK
#define XLH_DELETE_ALL_VISIBLE_CLEARED
#define XLH_UPDATE_CONTAINS_OLD_TUPLE
Definition heapam_xlog.h:88
#define SizeOfHeapNewCid
#define SizeOfHeapLockUpdated
#define XLHL_XMAX_IS_MULTI
#define XLH_INSERT_ALL_VISIBLE_CLEARED
Definition heapam_xlog.h:72
#define SizeOfHeapHeader
#define XLH_DELETE_IS_PARTITION_MOVE
#define MinSizeOfHeapInplace
#define XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED
Definition heapam_xlog.h:85
#define XLHL_XMAX_LOCK_ONLY
#define XLOG_HEAP_INPLACE
Definition heapam_xlog.h:40
#define XLOG_HEAP2_LOCK_UPDATED
Definition heapam_xlog.h:65
#define XLH_UPDATE_SUFFIX_FROM_OLD
Definition heapam_xlog.h:92
#define XLH_UPDATE_PREFIX_FROM_OLD
Definition heapam_xlog.h:91
#define SizeOfMultiInsertTuple
#define XLHL_XMAX_EXCL_LOCK
#define XLOG_HEAP2_NEW_CID
Definition heapam_xlog.h:66
#define XLH_DELETE_CONTAINS_OLD_TUPLE
#define XLOG_HEAP_LOCK
Definition heapam_xlog.h:39
#define XLOG_HEAP_INSERT
Definition heapam_xlog.h:33
#define SizeOfHeapInsert
#define SizeOfHeapDelete
#define XLH_DELETE_IS_SUPER
#define XLH_UPDATE_CONTAINS_OLD_KEY
Definition heapam_xlog.h:89
#define XLHL_KEYS_UPDATED
#define XLOG_HEAP2_VISIBLE
Definition heapam_xlog.h:63
#define XLH_INSERT_CONTAINS_NEW_TUPLE
Definition heapam_xlog.h:75
#define XLOG_HEAP_INIT_PAGE
Definition heapam_xlog.h:47
#define SizeOfHeapConfirm
#define SizeOfHeapLock
#define XLOG_HEAP_CONFIRM
Definition heapam_xlog.h:38
void heap_toast_delete(Relation rel, HeapTuple oldtup, bool is_speculative)
Definition heaptoast.c:43
HeapTuple heap_toast_insert_or_update(Relation rel, HeapTuple newtup, HeapTuple oldtup, int options)
Definition heaptoast.c:96
HeapTuple toast_flatten_tuple(HeapTuple tup, TupleDesc tupleDesc)
Definition heaptoast.c:350
#define TOAST_TUPLE_THRESHOLD
Definition heaptoast.h:48
HeapTuple heap_form_tuple(TupleDesc tupleDescriptor, const Datum *values, const bool *isnull)
Definition heaptuple.c:1117
void heap_deform_tuple(HeapTuple tuple, TupleDesc tupleDesc, Datum *values, bool *isnull)
Definition heaptuple.c:1346
void heap_freetuple(HeapTuple htup)
Definition heaptuple.c:1435
void RelationPutHeapTuple(Relation relation, Buffer buffer, HeapTuple tuple, bool token)
Definition hio.c:35
Buffer RelationGetBufferForTuple(Relation relation, Size len, Buffer otherBuffer, int options, BulkInsertState bistate, Buffer *vmbuffer, Buffer *vmbuffer_other, int num_pages)
Definition hio.c:500
HeapTupleHeaderData * HeapTupleHeader
Definition htup.h:23
#define HEAP_MOVED_OFF
#define HEAP_XMAX_SHR_LOCK
static bool HeapTupleIsHotUpdated(const HeapTupleData *tuple)
#define HEAP_XMIN_FROZEN
static Datum heap_getattr(HeapTuple tup, int attnum, TupleDesc tupleDesc, bool *isnull)
static bool HeapTupleHeaderXminFrozen(const HeapTupleHeaderData *tup)
#define HeapTupleHeaderGetNatts(tup)
static void HeapTupleHeaderSetXminFrozen(HeapTupleHeaderData *tup)
#define SizeofHeapTupleHeader
#define HEAP_KEYS_UPDATED
static bool HEAP_XMAX_IS_SHR_LOCKED(uint16 infomask)
static bool HEAP_XMAX_IS_LOCKED_ONLY(uint16 infomask)
static bool HeapTupleHeaderXminInvalid(const HeapTupleHeaderData *tup)
static void HeapTupleClearHotUpdated(const HeapTupleData *tuple)
static bool HeapTupleHasExternal(const HeapTupleData *tuple)
static TransactionId HeapTupleHeaderGetXvac(const HeapTupleHeaderData *tup)
#define HEAP2_XACT_MASK
static void HeapTupleHeaderSetCmax(HeapTupleHeaderData *tup, CommandId cid, bool iscombo)
#define HEAP_XMAX_LOCK_ONLY
static void HeapTupleHeaderClearHotUpdated(HeapTupleHeaderData *tup)
static void HeapTupleHeaderSetCmin(HeapTupleHeaderData *tup, CommandId cid)
#define HEAP_XMAX_BITS
#define HEAP_LOCK_MASK
static CommandId HeapTupleHeaderGetRawCommandId(const HeapTupleHeaderData *tup)
static TransactionId HeapTupleHeaderGetRawXmax(const HeapTupleHeaderData *tup)
static bool HeapTupleHeaderIsHeapOnly(const HeapTupleHeaderData *tup)
static bool HeapTupleIsHeapOnly(const HeapTupleData *tuple)
#define HEAP_MOVED
static void HeapTupleSetHeapOnly(const HeapTupleData *tuple)
#define HEAP_XMAX_IS_MULTI
static bool HEAP_XMAX_IS_KEYSHR_LOCKED(uint16 infomask)
#define HEAP_XMAX_COMMITTED
static TransactionId HeapTupleHeaderGetXmin(const HeapTupleHeaderData *tup)
#define HEAP_COMBOCID
#define HEAP_XACT_MASK
static bool HeapTupleHeaderIndicatesMovedPartitions(const HeapTupleHeaderData *tup)
static void HeapTupleSetHotUpdated(const HeapTupleData *tuple)
#define HEAP_XMAX_EXCL_LOCK
static bool HeapTupleHeaderIsHotUpdated(const HeapTupleHeaderData *tup)
#define HEAP_XMAX_INVALID
static TransactionId HeapTupleHeaderGetRawXmin(const HeapTupleHeaderData *tup)
static void * GETSTRUCT(const HeapTupleData *tuple)
static void HeapTupleClearHeapOnly(const HeapTupleData *tuple)
#define MaxHeapAttributeNumber
static bool HeapTupleHeaderIsSpeculative(const HeapTupleHeaderData *tup)
static TransactionId HeapTupleHeaderGetUpdateXid(const HeapTupleHeaderData *tup)
#define MaxHeapTuplesPerPage
static bool HEAP_XMAX_IS_EXCL_LOCKED(uint16 infomask)
static void HeapTupleHeaderSetXmin(HeapTupleHeaderData *tup, TransactionId xid)
static bool HEAP_LOCKED_UPGRADED(uint16 infomask)
#define HEAP_UPDATED
#define HEAP_XMAX_KEYSHR_LOCK
static void HeapTupleHeaderSetMovedPartitions(HeapTupleHeaderData *tup)
static void HeapTupleHeaderSetXmax(HeapTupleHeaderData *tup, TransactionId xid)
static bool HeapTupleHeaderXminCommitted(const HeapTupleHeaderData *tup)
#define IsParallelWorker()
Definition parallel.h:62
void index_close(Relation relation, LOCKMODE lockmode)
Definition indexam.c:177
Relation index_open(Oid relationId, LOCKMODE lockmode)
Definition indexam.c:133
int remaining
Definition informix.c:692
#define INJECTION_POINT(name, arg)
void AcceptInvalidationMessages(void)
Definition inval.c:930
int inplaceGetInvalidationMessages(SharedInvalidationMessage **msgs, bool *RelcacheInitFileInval)
Definition inval.c:1088
void PreInplace_Inval(void)
Definition inval.c:1250
void CacheInvalidateHeapTupleInplace(Relation relation, HeapTuple key_equivalent_tuple)
Definition inval.c:1593
void AtInplace_Inval(void)
Definition inval.c:1263
void ForgetInplace_Inval(void)
Definition inval.c:1286
void CacheInvalidateHeapTuple(Relation relation, HeapTuple tuple, HeapTuple newtuple)
Definition inval.c:1571
int b
Definition isn.c:74
int j
Definition isn.c:78
int i
Definition isn.c:77
#define ItemIdGetLength(itemId)
Definition itemid.h:59
#define ItemIdIsNormal(itemId)
Definition itemid.h:99
#define ItemIdGetRedirect(itemId)
Definition itemid.h:78
#define ItemIdIsUsed(itemId)
Definition itemid.h:92
#define ItemIdIsRedirected(itemId)
Definition itemid.h:106
#define ItemIdHasStorage(itemId)
Definition itemid.h:120
int32 ItemPointerCompare(const ItemPointerData *arg1, const ItemPointerData *arg2)
Definition itemptr.c:51
bool ItemPointerEquals(const ItemPointerData *pointer1, const ItemPointerData *pointer2)
Definition itemptr.c:35
static void ItemPointerSet(ItemPointerData *pointer, BlockNumber blockNumber, OffsetNumber offNum)
Definition itemptr.h:135
static void ItemPointerSetInvalid(ItemPointerData *pointer)
Definition itemptr.h:184
static void ItemPointerSetOffsetNumber(ItemPointerData *pointer, OffsetNumber offsetNumber)
Definition itemptr.h:158
static void ItemPointerSetBlockNumber(ItemPointerData *pointer, BlockNumber blockNumber)
Definition itemptr.h:147
static OffsetNumber ItemPointerGetOffsetNumber(const ItemPointerData *pointer)
Definition itemptr.h:124
static bool ItemPointerIndicatesMovedPartitions(const ItemPointerData *pointer)
Definition itemptr.h:197
static BlockNumber ItemPointerGetBlockNumber(const ItemPointerData *pointer)
Definition itemptr.h:103
static BlockNumber ItemPointerGetBlockNumberNoCheck(const ItemPointerData *pointer)
Definition itemptr.h:93
static void ItemPointerCopy(const ItemPointerData *fromPointer, ItemPointerData *toPointer)
Definition itemptr.h:172
static bool ItemPointerIsValid(const ItemPointerData *pointer)
Definition itemptr.h:83
void UnlockTuple(Relation relation, const ItemPointerData *tid, LOCKMODE lockmode)
Definition lmgr.c:601
bool ConditionalXactLockTableWait(TransactionId xid, bool logLockFailure)
Definition lmgr.c:739
void LockTuple(Relation relation, const ItemPointerData *tid, LOCKMODE lockmode)
Definition lmgr.c:562
void XactLockTableWait(TransactionId xid, Relation rel, const ItemPointerData *ctid, XLTW_Oper oper)
Definition lmgr.c:663
XLTW_Oper
Definition lmgr.h:25
@ XLTW_None
Definition lmgr.h:26
@ XLTW_Lock
Definition lmgr.h:29
@ XLTW_Delete
Definition lmgr.h:28
@ XLTW_LockUpdated
Definition lmgr.h:30
@ XLTW_Update
Definition lmgr.h:27
bool LockHeldByMe(const LOCKTAG *locktag, LOCKMODE lockmode, bool orstronger)
Definition lock.c:643
bool DoLockModesConflict(LOCKMODE mode1, LOCKMODE mode2)
Definition lock.c:623
bool log_lock_failures
Definition lock.c:54
#define SET_LOCKTAG_RELATION(locktag, dboid, reloid)
Definition lock.h:183
#define SET_LOCKTAG_TUPLE(locktag, dboid, reloid, blocknum, offnum)
Definition lock.h:219
int LOCKMODE
Definition lockdefs.h:26
#define ShareRowExclusiveLock
Definition lockdefs.h:41
#define AccessShareLock
Definition lockdefs.h:36
#define InplaceUpdateTupleLock
Definition lockdefs.h:48
#define ShareUpdateExclusiveLock
Definition lockdefs.h:39
LockWaitPolicy
Definition lockoptions.h:38
@ LockWaitSkip
Definition lockoptions.h:42
@ LockWaitBlock
Definition lockoptions.h:40
@ LockWaitError
Definition lockoptions.h:44
LockTupleMode
Definition lockoptions.h:51
@ LockTupleExclusive
Definition lockoptions.h:59
@ LockTupleNoKeyExclusive
Definition lockoptions.h:57
@ LockTupleShare
Definition lockoptions.h:55
@ LockTupleKeyShare
Definition lockoptions.h:53
void pfree(void *pointer)
Definition mcxt.c:1616
void * palloc(Size size)
Definition mcxt.c:1387
#define IsBootstrapProcessingMode()
Definition miscadmin.h:477
#define START_CRIT_SECTION()
Definition miscadmin.h:150
#define CHECK_FOR_INTERRUPTS()
Definition miscadmin.h:123
#define IsNormalProcessingMode()
Definition miscadmin.h:479
#define END_CRIT_SECTION()
Definition miscadmin.h:152
MultiXactId MultiXactIdExpand(MultiXactId multi, TransactionId xid, MultiXactStatus status)
Definition multixact.c:390
bool MultiXactIdPrecedes(MultiXactId multi1, MultiXactId multi2)
Definition multixact.c:2818
bool MultiXactIdPrecedesOrEquals(MultiXactId multi1, MultiXactId multi2)
Definition multixact.c:2832
bool MultiXactIdIsRunning(MultiXactId multi, bool isLockOnly)
Definition multixact.c:501
void MultiXactIdSetOldestMember(void)
Definition multixact.c:575
MultiXactId MultiXactIdCreateFromMembers(int nmembers, MultiXactMember *members)
Definition multixact.c:694
MultiXactId MultiXactIdCreate(TransactionId xid1, MultiXactStatus status1, TransactionId xid2, MultiXactStatus status2)
Definition multixact.c:337
int GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members, bool from_pgupgrade, bool isLockOnly)
Definition multixact.c:1151
#define MultiXactIdIsValid(multi)
Definition multixact.h:29
MultiXactStatus
Definition multixact.h:37
@ MultiXactStatusForShare
Definition multixact.h:39
@ MultiXactStatusForNoKeyUpdate
Definition multixact.h:40
@ MultiXactStatusNoKeyUpdate
Definition multixact.h:43
@ MultiXactStatusUpdate
Definition multixact.h:45
@ MultiXactStatusForUpdate
Definition multixact.h:41
@ MultiXactStatusForKeyShare
Definition multixact.h:38
#define ISUPDATE_from_mxstatus(status)
Definition multixact.h:51
#define InvalidMultiXactId
Definition multixact.h:25
#define MaxMultiXactStatus
Definition multixact.h:48
static char * errmsg
#define InvalidOffsetNumber
Definition off.h:26
#define OffsetNumberIsValid(offsetNumber)
Definition off.h:39
#define OffsetNumberNext(offsetNumber)
Definition off.h:52
uint16 OffsetNumber
Definition off.h:24
#define FirstOffsetNumber
Definition off.h:27
#define OffsetNumberPrev(offsetNumber)
Definition off.h:54
#define MaxOffsetNumber
Definition off.h:28
Datum lower(PG_FUNCTION_ARGS)
Datum upper(PG_FUNCTION_ARGS)
Operator oper(ParseState *pstate, List *opname, Oid ltypeId, Oid rtypeId, bool noError, int location)
Definition parse_oper.c:372
int16 attlen
#define ERRCODE_DATA_CORRUPTED
static uint32 pg_nextpower2_32(uint32 num)
static PgChecksumMode mode
static const struct exclude_list_item skip[]
FormData_pg_class * Form_pg_class
Definition pg_class.h:160
END_CATALOG_STRUCT typedef FormData_pg_database * Form_pg_database
static char buf[DEFAULT_XLOG_SEG_SIZE]
#define pgstat_count_heap_getnext(rel)
Definition pgstat.h:698
#define pgstat_count_heap_scan(rel)
Definition pgstat.h:693
void pgstat_count_heap_update(Relation rel, bool hot, bool newpage)
void pgstat_count_heap_delete(Relation rel)
void pgstat_count_heap_insert(Relation rel, PgStat_Counter n)
#define qsort(a, b, c, d)
Definition port.h:495
static Oid DatumGetObjectId(Datum X)
Definition postgres.h:252
uint64_t Datum
Definition postgres.h:70
static Pointer DatumGetPointer(Datum X)
Definition postgres.h:342
#define InvalidOid
unsigned int Oid
void CheckForSerializableConflictIn(Relation relation, const ItemPointerData *tid, BlockNumber blkno)
Definition predicate.c:4335
void CheckForSerializableConflictOut(Relation relation, TransactionId xid, Snapshot snapshot)
Definition predicate.c:4022
void PredicateLockRelation(Relation relation, Snapshot snapshot)
Definition predicate.c:2575
void PredicateLockTID(Relation relation, const ItemPointerData *tid, Snapshot snapshot, TransactionId tuple_xid)
Definition predicate.c:2620
bool CheckForSerializableConflictOutNeeded(Relation relation, Snapshot snapshot)
Definition predicate.c:3990
static int fb(int x)
GlobalVisState * GlobalVisTestFor(Relation rel)
Definition procarray.c:4114
bool TransactionIdIsInProgress(TransactionId xid)
Definition procarray.c:1401
void heap_page_prune_opt(Relation relation, Buffer buffer)
Definition pruneheap.c:212
void read_stream_reset(ReadStream *stream)
Buffer read_stream_next_buffer(ReadStream *stream, void **per_buffer_data)
ReadStream * read_stream_begin_relation(int flags, BufferAccessStrategy strategy, Relation rel, ForkNumber forknum, ReadStreamBlockNumberCB callback, void *callback_private_data, size_t per_buffer_data_size)
void read_stream_end(ReadStream *stream)
#define READ_STREAM_USE_BATCHING
Definition read_stream.h:64
BlockNumber(* ReadStreamBlockNumberCB)(ReadStream *stream, void *callback_private_data, void *per_buffer_data)
Definition read_stream.h:77
#define READ_STREAM_DEFAULT
Definition read_stream.h:21
#define READ_STREAM_SEQUENTIAL
Definition read_stream.h:36
#define RelationGetRelid(relation)
Definition rel.h:514
#define RelationIsLogicallyLogged(relation)
Definition rel.h:710
#define RelationGetTargetPageFreeSpace(relation, defaultff)
Definition rel.h:389
#define RelationGetDescr(relation)
Definition rel.h:540
#define RelationGetNumberOfAttributes(relation)
Definition rel.h:520
#define RelationGetRelationName(relation)
Definition rel.h:548
#define RelationIsAccessibleInLogicalDecoding(relation)
Definition rel.h:693
#define RelationNeedsWAL(relation)
Definition rel.h:637
#define RelationUsesLocalBuffers(relation)
Definition rel.h:646
#define HEAP_DEFAULT_FILLFACTOR
Definition rel.h:360
void RelationDecrementReferenceCount(Relation rel)
Definition relcache.c:2195
Bitmapset * RelationGetIndexAttrBitmap(Relation relation, IndexAttrBitmapKind attrKind)
Definition relcache.c:5298
void RelationIncrementReferenceCount(Relation rel)
Definition relcache.c:2182
@ INDEX_ATTR_BITMAP_KEY
Definition relcache.h:69
@ INDEX_ATTR_BITMAP_HOT_BLOCKING
Definition relcache.h:72
@ INDEX_ATTR_BITMAP_SUMMARIZED
Definition relcache.h:73
@ INDEX_ATTR_BITMAP_IDENTITY_KEY
Definition relcache.h:71
ForkNumber
Definition relpath.h:56
@ MAIN_FORKNUM
Definition relpath.h:58
struct ParallelBlockTableScanDescData * ParallelBlockTableScanDesc
Definition relscan.h:103
#define ScanDirectionIsForward(direction)
Definition sdir.h:64
#define ScanDirectionIsBackward(direction)
Definition sdir.h:50
ScanDirection
Definition sdir.h:25
@ ForwardScanDirection
Definition sdir.h:28
TransactionId RecentXmin
Definition snapmgr.c:160
void UnregisterSnapshot(Snapshot snapshot)
Definition snapmgr.c:866
TransactionId TransactionXmin
Definition snapmgr.c:159
bool HaveRegisteredOrActiveSnapshot(void)
Definition snapmgr.c:1644
void InvalidateCatalogSnapshot(void)
Definition snapmgr.c:455
#define IsHistoricMVCCSnapshot(snapshot)
Definition snapmgr.h:59
#define SnapshotAny
Definition snapmgr.h:33
#define InitNonVacuumableSnapshot(snapshotdata, vistestp)
Definition snapmgr.h:50
#define IsMVCCSnapshot(snapshot)
Definition snapmgr.h:55
#define InvalidSnapshot
Definition snapshot.h:119
int get_tablespace_maintenance_io_concurrency(Oid spcid)
Definition spccache.c:230
#define init()
BlockNumber last_free
Definition hio.h:49
BufferAccessStrategy strategy
Definition hio.h:31
uint32 already_extended_by
Definition hio.h:50
BlockNumber next_free
Definition hio.h:48
Buffer current_buf
Definition hio.h:32
MultiXactId NoFreezePageRelminMxid
Definition heapam.h:232
TransactionId FreezePageConflictXid
Definition heapam.h:221
TransactionId FreezePageRelfrozenXid
Definition heapam.h:208
bool freeze_required
Definition heapam.h:182
MultiXactId FreezePageRelminMxid
Definition heapam.h:209
TransactionId NoFreezePageRelfrozenXid
Definition heapam.h:231
BufferAccessStrategy rs_strategy
Definition heapam.h:73
ScanDirection rs_dir
Definition heapam.h:88
uint32 rs_ntuples
Definition heapam.h:99
OffsetNumber rs_coffset
Definition heapam.h:68
Buffer rs_cbuf
Definition heapam.h:70
ParallelBlockTableScanWorkerData * rs_parallelworkerdata
Definition heapam.h:95
BlockNumber rs_startblock
Definition heapam.h:62
HeapTupleData rs_ctup
Definition heapam.h:75
OffsetNumber rs_vistuples[MaxHeapTuplesPerPage]
Definition heapam.h:100
BlockNumber rs_numblocks
Definition heapam.h:63
BlockNumber rs_nblocks
Definition heapam.h:61
ReadStream * rs_read_stream
Definition heapam.h:78
uint32 rs_cindex
Definition heapam.h:98
BlockNumber rs_prefetch_block
Definition heapam.h:89
BlockNumber rs_cblock
Definition heapam.h:69
TableScanDescData rs_base
Definition heapam.h:58
ItemPointerData t_self
Definition htup.h:65
uint32 t_len
Definition htup.h:64
HeapTupleHeader t_data
Definition htup.h:68
Oid t_tableOid
Definition htup.h:66
TransactionId t_xmin
union HeapTupleHeaderData::@51 t_choice
ItemPointerData t_ctid
HeapTupleFields t_heap
int16 npromisingtids
Definition heapam.c:198
LockRelId lockRelId
Definition rel.h:46
Oid relId
Definition rel.h:40
Oid dbId
Definition rel.h:41
TransactionId xid
Definition multixact.h:57
MultiXactStatus status
Definition multixact.h:58
LockInfoData rd_lockInfo
Definition rel.h:114
Form_pg_index rd_index
Definition rel.h:192
RelFileLocator rd_locator
Definition rel.h:57
Form_pg_class rd_rel
Definition rel.h:111
bool takenDuringRecovery
Definition snapshot.h:180
TransactionId xmax
Definition tableam.h:150
CommandId cmax
Definition tableam.h:151
ItemPointerData ctid
Definition tableam.h:149
ItemPointerData tid
Definition tableam.h:212
Relation rs_rd
Definition relscan.h:35
uint32 rs_flags
Definition relscan.h:63
struct ScanKeyData * rs_key
Definition relscan.h:38
struct SnapshotData * rs_snapshot
Definition relscan.h:36
struct ParallelTableScanDescData * rs_parallel
Definition relscan.h:65
TransactionId FreezeLimit
Definition vacuum.h:289
TransactionId OldestXmin
Definition vacuum.h:279
TransactionId relfrozenxid
Definition vacuum.h:263
MultiXactId relminmxid
Definition vacuum.h:264
MultiXactId MultiXactCutoff
Definition vacuum.h:290
MultiXactId OldestMxact
Definition vacuum.h:280
Definition c.h:739
OffsetNumber offnum
TransactionId SubTransGetTopmostTransaction(TransactionId xid)
Definition subtrans.c:162
void ss_report_location(Relation rel, BlockNumber location)
Definition syncscan.c:289
BlockNumber ss_get_location(Relation rel, BlockNumber relnblocks)
Definition syncscan.c:254
#define FirstLowInvalidHeapAttributeNumber
Definition sysattr.h:27
#define TableOidAttributeNumber
Definition sysattr.h:26
bool RelationSupportsSysCache(Oid relid)
Definition syscache.c:762
void table_block_parallelscan_startblock_init(Relation rel, ParallelBlockTableScanWorker pbscanwork, ParallelBlockTableScanDesc pbscan, BlockNumber startblock, BlockNumber numblocks)
Definition tableam.c:451
BlockNumber table_block_parallelscan_nextpage(Relation rel, ParallelBlockTableScanWorker pbscanwork, ParallelBlockTableScanDesc pbscan)
Definition tableam.c:546
bool synchronize_seqscans
Definition tableam.c:50
@ SO_ALLOW_STRAT
Definition tableam.h:58
@ SO_TYPE_TIDRANGESCAN
Definition tableam.h:53
@ SO_TEMP_SNAPSHOT
Definition tableam.h:65
@ SO_ALLOW_PAGEMODE
Definition tableam.h:62
@ SO_TYPE_SAMPLESCAN
Definition tableam.h:51
@ SO_ALLOW_SYNC
Definition tableam.h:60
@ SO_TYPE_SEQSCAN
Definition tableam.h:49
@ SO_TYPE_BITMAPSCAN
Definition tableam.h:50
TU_UpdateIndexes
Definition tableam.h:111
@ TU_Summarizing
Definition tableam.h:119
@ TU_All
Definition tableam.h:116
@ TU_None
Definition tableam.h:113
TM_Result
Definition tableam.h:73
@ TM_Ok
Definition tableam.h:78
@ TM_BeingModified
Definition tableam.h:100
@ TM_Deleted
Definition tableam.h:93
@ TM_WouldBlock
Definition tableam.h:103
@ TM_Updated
Definition tableam.h:90
@ TM_SelfModified
Definition tableam.h:84
@ TM_Invisible
Definition tableam.h:81
bool tbm_iterate(TBMIterator *iterator, TBMIterateResult *tbmres)
Definition tidbitmap.c:1614
bool TransactionIdDidCommit(TransactionId transactionId)
Definition transam.c:126
bool TransactionIdDidAbort(TransactionId transactionId)
Definition transam.c:188
static bool TransactionIdFollows(TransactionId id1, TransactionId id2)
Definition transam.h:297
#define InvalidTransactionId
Definition transam.h:31
static bool TransactionIdPrecedesOrEquals(TransactionId id1, TransactionId id2)
Definition transam.h:282
static bool TransactionIdFollowsOrEquals(TransactionId id1, TransactionId id2)
Definition transam.h:312
#define TransactionIdEquals(id1, id2)
Definition transam.h:43
#define TransactionIdIsValid(xid)
Definition transam.h:41
#define TransactionIdIsNormal(xid)
Definition transam.h:42
static bool TransactionIdPrecedes(TransactionId id1, TransactionId id2)
Definition transam.h:263
static CompactAttribute * TupleDescCompactAttr(TupleDesc tupdesc, int i)
Definition tupdesc.h:175
static TupleTableSlot * ExecClearTuple(TupleTableSlot *slot)
Definition tuptable.h:457
static bool HeapKeyTest(HeapTuple tuple, TupleDesc tupdesc, int nkeys, ScanKey keys)
Definition valid.h:28
static bool VARATT_IS_EXTERNAL(const void *PTR)
Definition varatt.h:354
bool visibilitymap_clear(Relation rel, BlockNumber heapBlk, Buffer vmbuf, uint8 flags)
void visibilitymap_pin(Relation rel, BlockNumber heapBlk, Buffer *vmbuf)
void visibilitymap_set_vmbits(BlockNumber heapBlk, Buffer vmBuf, uint8 flags, const RelFileLocator rlocator)
#define VISIBILITYMAP_VALID_BITS
#define VISIBILITYMAP_ALL_FROZEN
#define VISIBILITYMAP_XLOG_CATALOG_REL
#define VISIBILITYMAP_ALL_VISIBLE
TransactionId GetTopTransactionId(void)
Definition xact.c:428
TransactionId GetTopTransactionIdIfAny(void)
Definition xact.c:443
bool TransactionIdIsCurrentTransactionId(TransactionId xid)
Definition xact.c:943
bool IsInParallelMode(void)
Definition xact.c:1091
TransactionId GetCurrentTransactionId(void)
Definition xact.c:456
CommandId GetCurrentCommandId(bool used)
Definition xact.c:831
#define IsolationIsSerializable()
Definition xact.h:53
#define XLOG_INCLUDE_ORIGIN
Definition xlog.h:165
#define XLogHintBitIsNeeded()
Definition xlog.h:122
#define XLogStandbyInfoActive()
Definition xlog.h:125
uint64 XLogRecPtr
Definition xlogdefs.h:21
XLogRecPtr XLogInsert(RmgrId rmid, uint8 info)
Definition xloginsert.c:478
void XLogRegisterBufData(uint8 block_id, const void *data, uint32 len)
Definition xloginsert.c:409
bool XLogCheckBufferNeedsBackup(Buffer buffer)
void XLogRegisterData(const void *data, uint32 len)
Definition xloginsert.c:368
void XLogSetRecordFlags(uint8 flags)
Definition xloginsert.c:460
void XLogRegisterBlock(uint8 block_id, RelFileLocator *rlocator, ForkNumber forknum, BlockNumber blknum, const PageData *page, uint8 flags)
Definition xloginsert.c:313
void XLogRegisterBuffer(uint8 block_id, Buffer buffer, uint8 flags)
Definition xloginsert.c:245
void XLogBeginInsert(void)
Definition xloginsert.c:152
#define REGBUF_STANDARD
Definition xloginsert.h:35
#define REGBUF_NO_IMAGE
Definition xloginsert.h:33
#define REGBUF_KEEP_DATA
Definition xloginsert.h:36
#define REGBUF_WILL_INIT
Definition xloginsert.h:34

◆ FRM_INVALIDATE_XMAX

#define FRM_INVALIDATE_XMAX   0x0002

Definition at line 6725 of file heapam.c.

◆ FRM_MARK_COMMITTED

#define FRM_MARK_COMMITTED   0x0010

Definition at line 6728 of file heapam.c.

◆ FRM_NOOP

#define FRM_NOOP   0x0001

Definition at line 6724 of file heapam.c.

◆ FRM_RETURN_IS_MULTI

#define FRM_RETURN_IS_MULTI   0x0008

Definition at line 6727 of file heapam.c.

◆ FRM_RETURN_IS_XID

#define FRM_RETURN_IS_XID   0x0004

Definition at line 6726 of file heapam.c.

◆ LOCKMODE_from_mxstatus

#define LOCKMODE_from_mxstatus (   status)     (tupleLockExtraInfo[TUPLOCK_from_mxstatus((status))].hwlock)

Definition at line 159 of file heapam.c.

◆ LockTupleTuplock

#define LockTupleTuplock (   rel,
  tup,
  mode 
)     LockTuple((rel), (tup), tupleLockExtraInfo[mode].hwlock)

Definition at line 167 of file heapam.c.

◆ TUPLOCK_from_mxstatus

#define TUPLOCK_from_mxstatus (   status)     (MultiXactStatusLock[(status)])

Definition at line 218 of file heapam.c.

◆ UnlockTupleTuplock

#define UnlockTupleTuplock (   rel,
  tup,
  mode 
)     UnlockTuple((rel), (tup), tupleLockExtraInfo[mode].hwlock)

Definition at line 169 of file heapam.c.

Typedef Documentation

◆ IndexDeleteCounts

Function Documentation

◆ AssertHasSnapshotForToast()

static void AssertHasSnapshotForToast ( Relation  rel)
inlinestatic

Definition at line 225 of file heapam.c.

226{
227#ifdef USE_ASSERT_CHECKING
228
229 /* bootstrap mode in particular breaks this rule */
231 return;
232
233 /* if the relation doesn't have a TOAST table, we are good */
234 if (!OidIsValid(rel->rd_rel->reltoastrelid))
235 return;
236
238
239#endif /* USE_ASSERT_CHECKING */
240}

References Assert, HaveRegisteredOrActiveSnapshot(), IsNormalProcessingMode, OidIsValid, and RelationData::rd_rel.

Referenced by heap_delete(), heap_insert(), heap_multi_insert(), and heap_update().

◆ bitmapheap_stream_read_next()

static BlockNumber bitmapheap_stream_read_next ( ReadStream pgsr,
void private_data,
void per_buffer_data 
)
static

Definition at line 317 of file heapam.c.

319{
320 TBMIterateResult *tbmres = per_buffer_data;
323 TableScanDesc sscan = &hscan->rs_base;
324
325 for (;;)
326 {
328
329 /* no more entries in the bitmap */
330 if (!tbm_iterate(&sscan->st.rs_tbmiterator, tbmres))
331 return InvalidBlockNumber;
332
333 /*
334 * Ignore any claimed entries past what we think is the end of the
335 * relation. It may have been extended after the start of our scan (we
336 * only hold an AccessShareLock, and it could be inserts from this
337 * backend). We don't take this optimization in SERIALIZABLE
338 * isolation though, as we need to examine all invisible tuples
339 * reachable by the index.
340 */
342 tbmres->blockno >= hscan->rs_nblocks)
343 continue;
344
345 return tbmres->blockno;
346 }
347
348 /* not reachable */
349 Assert(false);
350}

References Assert, CHECK_FOR_INTERRUPTS, fb(), InvalidBlockNumber, IsolationIsSerializable, and tbm_iterate().

Referenced by heap_beginscan().

◆ bottomup_nblocksfavorable()

static int bottomup_nblocksfavorable ( IndexDeleteCounts blockgroups,
int  nblockgroups,
TM_IndexDelete deltids 
)
static

Definition at line 8646 of file heapam.c.

8648{
8649 int64 lastblock = -1;
8650 int nblocksfavorable = 0;
8651
8652 Assert(nblockgroups >= 1);
8654
8655 /*
8656 * We tolerate heap blocks that will be accessed only slightly out of
8657 * physical order. Small blips occur when a pair of almost-contiguous
8658 * blocks happen to fall into different buckets (perhaps due only to a
8659 * small difference in npromisingtids that the bucketing scheme didn't
8660 * quite manage to ignore). We effectively ignore these blips by applying
8661 * a small tolerance. The precise tolerance we use is a little arbitrary,
8662 * but it works well enough in practice.
8663 */
8664 for (int b = 0; b < nblockgroups; b++)
8665 {
8666 IndexDeleteCounts *group = blockgroups + b;
8667 TM_IndexDelete *firstdtid = deltids + group->ifirsttid;
8669
8670 if (lastblock != -1 &&
8673 break;
8674
8676 lastblock = block;
8677 }
8678
8679 /* Always indicate that there is at least 1 favorable block */
8681
8682 return nblocksfavorable;
8683}

References Assert, b, BOTTOMUP_MAX_NBLOCKS, BOTTOMUP_TOLERANCE_NBLOCKS, fb(), IndexDeleteCounts::ifirsttid, and ItemPointerGetBlockNumber().

Referenced by bottomup_sort_and_shrink().

◆ bottomup_sort_and_shrink()

static int bottomup_sort_and_shrink ( TM_IndexDeleteOp delstate)
static

Definition at line 8762 of file heapam.c.

8763{
8767 int nblockgroups = 0;
8768 int ncopied = 0;
8769 int nblocksfavorable = 0;
8770
8771 Assert(delstate->bottomup);
8772 Assert(delstate->ndeltids > 0);
8773
8774 /* Calculate per-heap-block count of TIDs */
8776 for (int i = 0; i < delstate->ndeltids; i++)
8777 {
8778 TM_IndexDelete *ideltid = &delstate->deltids[i];
8779 TM_IndexStatus *istatus = delstate->status + ideltid->id;
8780 ItemPointer htid = &ideltid->tid;
8781 bool promising = istatus->promising;
8782
8784 {
8785 /* New block group */
8786 nblockgroups++;
8787
8790
8792 blockgroups[nblockgroups - 1].ifirsttid = i;
8793 blockgroups[nblockgroups - 1].ntids = 1;
8794 blockgroups[nblockgroups - 1].npromisingtids = 0;
8795 }
8796 else
8797 {
8798 blockgroups[nblockgroups - 1].ntids++;
8799 }
8800
8801 if (promising)
8802 blockgroups[nblockgroups - 1].npromisingtids++;
8803 }
8804
8805 /*
8806 * We're about ready to sort block groups to determine the optimal order
8807 * for visiting heap blocks. But before we do, round the number of
8808 * promising tuples for each block group up to the next power-of-two,
8809 * unless it is very low (less than 4), in which case we round up to 4.
8810 * npromisingtids is far too noisy to trust when choosing between a pair
8811 * of block groups that both have very low values.
8812 *
8813 * This scheme divides heap blocks/block groups into buckets. Each bucket
8814 * contains blocks that have _approximately_ the same number of promising
8815 * TIDs as each other. The goal is to ignore relatively small differences
8816 * in the total number of promising entries, so that the whole process can
8817 * give a little weight to heapam factors (like heap block locality)
8818 * instead. This isn't a trade-off, really -- we have nothing to lose. It
8819 * would be foolish to interpret small differences in npromisingtids
8820 * values as anything more than noise.
8821 *
8822 * We tiebreak on nhtids when sorting block group subsets that have the
8823 * same npromisingtids, but this has the same issues as npromisingtids,
8824 * and so nhtids is subject to the same power-of-two bucketing scheme. The
8825 * only reason that we don't fix nhtids in the same way here too is that
8826 * we'll need accurate nhtids values after the sort. We handle nhtids
8827 * bucketization dynamically instead (in the sort comparator).
8828 *
8829 * See bottomup_nblocksfavorable() for a full explanation of when and how
8830 * heap locality/favorable blocks can significantly influence when and how
8831 * heap blocks are accessed.
8832 */
8833 for (int b = 0; b < nblockgroups; b++)
8834 {
8835 IndexDeleteCounts *group = blockgroups + b;
8836
8837 /* Better off falling back on nhtids with low npromisingtids */
8838 if (group->npromisingtids <= 4)
8839 group->npromisingtids = 4;
8840 else
8841 group->npromisingtids =
8843 }
8844
8845 /* Sort groups and rearrange caller's deltids array */
8848 reordereddeltids = palloc(delstate->ndeltids * sizeof(TM_IndexDelete));
8849
8851 /* Determine number of favorable blocks at the start of final deltids */
8853 delstate->deltids);
8854
8855 for (int b = 0; b < nblockgroups; b++)
8856 {
8857 IndexDeleteCounts *group = blockgroups + b;
8858 TM_IndexDelete *firstdtid = delstate->deltids + group->ifirsttid;
8859
8861 sizeof(TM_IndexDelete) * group->ntids);
8862 ncopied += group->ntids;
8863 }
8864
8865 /* Copy final grouped and sorted TIDs back into start of caller's array */
8867 sizeof(TM_IndexDelete) * ncopied);
8868 delstate->ndeltids = ncopied;
8869
8872
8873 return nblocksfavorable;
8874}

References Assert, b, BlockNumberIsValid(), BOTTOMUP_MAX_NBLOCKS, bottomup_nblocksfavorable(), bottomup_sort_and_shrink_cmp(), fb(), i, IndexDeleteCounts::ifirsttid, InvalidBlockNumber, ItemPointerGetBlockNumber(), Min, IndexDeleteCounts::npromisingtids, IndexDeleteCounts::ntids, palloc(), palloc_array, pfree(), pg_nextpower2_32(), and qsort.

Referenced by heap_index_delete_tuples().

◆ bottomup_sort_and_shrink_cmp()

static int bottomup_sort_and_shrink_cmp ( const void arg1,
const void arg2 
)
static

Definition at line 8689 of file heapam.c.

8690{
8693
8694 /*
8695 * Most significant field is npromisingtids (which we invert the order of
8696 * so as to sort in desc order).
8697 *
8698 * Caller should have already normalized npromisingtids fields into
8699 * power-of-two values (buckets).
8700 */
8701 if (group1->npromisingtids > group2->npromisingtids)
8702 return -1;
8703 if (group1->npromisingtids < group2->npromisingtids)
8704 return 1;
8705
8706 /*
8707 * Tiebreak: desc ntids sort order.
8708 *
8709 * We cannot expect power-of-two values for ntids fields. We should
8710 * behave as if they were already rounded up for us instead.
8711 */
8712 if (group1->ntids != group2->ntids)
8713 {
8716
8717 if (ntids1 > ntids2)
8718 return -1;
8719 if (ntids1 < ntids2)
8720 return 1;
8721 }
8722
8723 /*
8724 * Tiebreak: asc offset-into-deltids-for-block (offset to first TID for
8725 * block in deltids array) order.
8726 *
8727 * This is equivalent to sorting in ascending heap block number order
8728 * (among otherwise equal subsets of the array). This approach allows us
8729 * to avoid accessing the out-of-line TID. (We rely on the assumption
8730 * that the deltids array was sorted in ascending heap TID order when
8731 * these offsets to the first TID from each heap block group were formed.)
8732 */
8733 if (group1->ifirsttid > group2->ifirsttid)
8734 return 1;
8735 if (group1->ifirsttid < group2->ifirsttid)
8736 return -1;
8737
8739
8740 return 0;
8741}

References fb(), pg_nextpower2_32(), and pg_unreachable.

Referenced by bottomup_sort_and_shrink().

◆ compute_infobits()

◆ compute_new_xmax_infomask()

static void compute_new_xmax_infomask ( TransactionId  xmax,
uint16  old_infomask,
uint16  old_infomask2,
TransactionId  add_to_xmax,
LockTupleMode  mode,
bool  is_update,
TransactionId result_xmax,
uint16 result_infomask,
uint16 result_infomask2 
)
static

Definition at line 5396 of file heapam.c.

5401{
5402 TransactionId new_xmax;
5405
5407
5408l5:
5409 new_infomask = 0;
5410 new_infomask2 = 0;
5412 {
5413 /*
5414 * No previous locker; we just insert our own TransactionId.
5415 *
5416 * Note that it's critical that this case be the first one checked,
5417 * because there are several blocks below that come back to this one
5418 * to implement certain optimizations; old_infomask might contain
5419 * other dirty bits in those cases, but we don't really care.
5420 */
5421 if (is_update)
5422 {
5423 new_xmax = add_to_xmax;
5424 if (mode == LockTupleExclusive)
5426 }
5427 else
5428 {
5430 switch (mode)
5431 {
5432 case LockTupleKeyShare:
5433 new_xmax = add_to_xmax;
5435 break;
5436 case LockTupleShare:
5437 new_xmax = add_to_xmax;
5439 break;
5441 new_xmax = add_to_xmax;
5443 break;
5444 case LockTupleExclusive:
5445 new_xmax = add_to_xmax;
5448 break;
5449 default:
5450 new_xmax = InvalidTransactionId; /* silence compiler */
5451 elog(ERROR, "invalid lock mode");
5452 }
5453 }
5454 }
5456 {
5458
5459 /*
5460 * Currently we don't allow XMAX_COMMITTED to be set for multis, so
5461 * cross-check.
5462 */
5464
5465 /*
5466 * A multixact together with LOCK_ONLY set but neither lock bit set
5467 * (i.e. a pg_upgraded share locked tuple) cannot possibly be running
5468 * anymore. This check is critical for databases upgraded by
5469 * pg_upgrade; both MultiXactIdIsRunning and MultiXactIdExpand assume
5470 * that such multis are never passed.
5471 */
5473 {
5476 goto l5;
5477 }
5478
5479 /*
5480 * If the XMAX is already a MultiXactId, then we need to expand it to
5481 * include add_to_xmax; but if all the members were lockers and are
5482 * all gone, we can do away with the IS_MULTI bit and just set
5483 * add_to_xmax as the only locker/updater. If all lockers are gone
5484 * and we have an updater that aborted, we can also do without a
5485 * multi.
5486 *
5487 * The cost of doing GetMultiXactIdMembers would be paid by
5488 * MultiXactIdExpand if we weren't to do this, so this check is not
5489 * incurring extra work anyhow.
5490 */
5492 {
5495 old_infomask)))
5496 {
5497 /*
5498 * Reset these bits and restart; otherwise fall through to
5499 * create a new multi below.
5500 */
5503 goto l5;
5504 }
5505 }
5506
5508
5509 new_xmax = MultiXactIdExpand((MultiXactId) xmax, add_to_xmax,
5510 new_status);
5512 }
5514 {
5515 /*
 5516 * It's a committed update, so we need to preserve it as the updater
 5517 * of the tuple.
5518 */
5519 MultiXactStatus status;
5521
5523 status = MultiXactStatusUpdate;
5524 else
5526
5528
5529 /*
5530 * since it's not running, it's obviously impossible for the old
5531 * updater to be identical to the current one, so we need not check
5532 * for that case as we do in the block above.
5533 */
5534 new_xmax = MultiXactIdCreate(xmax, status, add_to_xmax, new_status);
5536 }
5537 else if (TransactionIdIsInProgress(xmax))
5538 {
5539 /*
5540 * If the XMAX is a valid, in-progress TransactionId, then we need to
5541 * create a new MultiXactId that includes both the old locker or
5542 * updater and our own TransactionId.
5543 */
5547
5549 {
5555 {
5558 else
5560 }
5561 else
5562 {
5563 /*
5564 * LOCK_ONLY can be present alone only when a page has been
5565 * upgraded by pg_upgrade. But in that case,
5566 * TransactionIdIsInProgress() should have returned false. We
5567 * assume it's no longer locked in this case.
5568 */
5569 elog(WARNING, "LOCK_ONLY found for Xid in progress %u", xmax);
5572 goto l5;
5573 }
5574 }
5575 else
5576 {
5577 /* it's an update, but which kind? */
5580 else
5582 }
5583
5585
5586 /*
5587 * If the lock to be acquired is for the same TransactionId as the
5588 * existing lock, there's an optimization possible: consider only the
5589 * strongest of both locks as the only one present, and restart.
5590 */
5591 if (xmax == add_to_xmax)
5592 {
5593 /*
5594 * Note that it's not possible for the original tuple to be
5595 * updated: we wouldn't be here because the tuple would have been
5596 * invisible and we wouldn't try to update it. As a subtlety,
5597 * this code can also run when traversing an update chain to lock
5598 * future versions of a tuple. But we wouldn't be here either,
5599 * because the add_to_xmax would be different from the original
5600 * updater.
5601 */
5603
5604 /* acquire the strongest of both */
5605 if (mode < old_mode)
5606 mode = old_mode;
5607 /* mustn't touch is_update */
5608
5610 goto l5;
5611 }
5612
5613 /* otherwise, just fall back to creating a new multixact */
5615 new_xmax = MultiXactIdCreate(xmax, old_status,
5618 }
5621 {
5622 /*
 5623 * It's a committed update, so we must preserve it as the updater of
 5624 * the tuple.
5625 */
5626 MultiXactStatus status;
5628
5630 status = MultiXactStatusUpdate;
5631 else
5633
5635
5636 /*
5637 * since it's not running, it's obviously impossible for the old
5638 * updater to be identical to the current one, so we need not check
5639 * for that case as we do in the block above.
5640 */
5641 new_xmax = MultiXactIdCreate(xmax, status, add_to_xmax, new_status);
5643 }
5644 else
5645 {
5646 /*
5647 * Can get here iff the locking/updating transaction was running when
5648 * the infomask was extracted from the tuple, but finished before
5649 * TransactionIdIsInProgress got to run. Deal with it as if there was
5650 * no locker at all in the first place.
5651 */
5653 goto l5;
5654 }
5655
5658 *result_xmax = new_xmax;
5659}

References Assert, elog, ERROR, fb(), get_mxact_status_for_lock(), GetMultiXactIdHintBits(), HEAP_KEYS_UPDATED, HEAP_LOCKED_UPGRADED(), HEAP_XMAX_COMMITTED, HEAP_XMAX_EXCL_LOCK, HEAP_XMAX_INVALID, HEAP_XMAX_IS_EXCL_LOCKED(), HEAP_XMAX_IS_KEYSHR_LOCKED(), HEAP_XMAX_IS_LOCKED_ONLY(), HEAP_XMAX_IS_MULTI, HEAP_XMAX_IS_SHR_LOCKED(), HEAP_XMAX_KEYSHR_LOCK, HEAP_XMAX_LOCK_ONLY, HEAP_XMAX_SHR_LOCK, InvalidTransactionId, LockTupleExclusive, LockTupleKeyShare, LockTupleNoKeyExclusive, LockTupleShare, mode, MultiXactIdCreate(), MultiXactIdExpand(), MultiXactIdGetUpdateXid(), MultiXactIdIsRunning(), MultiXactStatusForKeyShare, MultiXactStatusForNoKeyUpdate, MultiXactStatusForShare, MultiXactStatusForUpdate, MultiXactStatusNoKeyUpdate, MultiXactStatusUpdate, TransactionIdDidCommit(), TransactionIdIsCurrentTransactionId(), TransactionIdIsInProgress(), TUPLOCK_from_mxstatus, and WARNING.

Referenced by heap_delete(), heap_lock_tuple(), heap_lock_updated_tuple_rec(), and heap_update().

◆ ConditionalMultiXactIdWait()

static bool ConditionalMultiXactIdWait ( MultiXactId  multi,
MultiXactStatus  status,
uint16  infomask,
Relation  rel,
int remaining,
bool  logLockFailure 
)
static

Definition at line 7882 of file heapam.c.

7885{
7886 return Do_MultiXactIdWait(multi, status, infomask, true,
7888}

References Do_MultiXactIdWait(), fb(), remaining, and XLTW_None.

Referenced by heap_lock_tuple().

◆ Do_MultiXactIdWait()

static bool Do_MultiXactIdWait ( MultiXactId  multi,
MultiXactStatus  status,
uint16  infomask,
bool  nowait,
Relation  rel,
const ItemPointerData ctid,
XLTW_Oper  oper,
int remaining,
bool  logLockFailure 
)
static

Definition at line 7782 of file heapam.c.

7786{
7787 bool result = true;
7788 MultiXactMember *members;
7789 int nmembers;
7790 int remain = 0;
7791
7792 /* for pre-pg_upgrade tuples, no need to sleep at all */
7793 nmembers = HEAP_LOCKED_UPGRADED(infomask) ? -1 :
7794 GetMultiXactIdMembers(multi, &members, false,
7796
7797 if (nmembers >= 0)
7798 {
7799 int i;
7800
7801 for (i = 0; i < nmembers; i++)
7802 {
7803 TransactionId memxid = members[i].xid;
7804 MultiXactStatus memstatus = members[i].status;
7805
7807 {
7808 remain++;
7809 continue;
7810 }
7811
7813 LOCKMODE_from_mxstatus(status)))
7814 {
7816 remain++;
7817 continue;
7818 }
7819
7820 /*
7821 * This member conflicts with our multi, so we have to sleep (or
7822 * return failure, if asked to avoid waiting.)
7823 *
7824 * Note that we don't set up an error context callback ourselves,
7825 * but instead we pass the info down to XactLockTableWait. This
7826 * might seem a bit wasteful because the context is set up and
 7827 * torn down for each member of the multixact, but in reality it
7828 * should be barely noticeable, and it avoids duplicate code.
7829 */
7830 if (nowait)
7831 {
7833 if (!result)
7834 break;
7835 }
7836 else
7837 XactLockTableWait(memxid, rel, ctid, oper);
7838 }
7839
7840 pfree(members);
7841 }
7842
7843 if (remaining)
7844 *remaining = remain;
7845
7846 return result;
7847}

References ConditionalXactLockTableWait(), DoLockModesConflict(), fb(), GetMultiXactIdMembers(), HEAP_LOCKED_UPGRADED(), HEAP_XMAX_IS_LOCKED_ONLY(), i, LOCKMODE_from_mxstatus, oper(), pfree(), remaining, MultiXactMember::status, TransactionIdIsCurrentTransactionId(), TransactionIdIsInProgress(), XactLockTableWait(), and MultiXactMember::xid.

Referenced by ConditionalMultiXactIdWait(), and MultiXactIdWait().

◆ DoesMultiXactIdConflict()

static bool DoesMultiXactIdConflict ( MultiXactId  multi,
uint16  infomask,
LockTupleMode  lockmode,
bool current_is_member 
)
static

Definition at line 7682 of file heapam.c.

7684{
7685 int nmembers;
7686 MultiXactMember *members;
7687 bool result = false;
7688 LOCKMODE wanted = tupleLockExtraInfo[lockmode].hwlock;
7689
7691 return false;
7692
7693 nmembers = GetMultiXactIdMembers(multi, &members, false,
7695 if (nmembers >= 0)
7696 {
7697 int i;
7698
7699 for (i = 0; i < nmembers; i++)
7700 {
7703
7704 if (result && (current_is_member == NULL || *current_is_member))
7705 break;
7706
7707 memlockmode = LOCKMODE_from_mxstatus(members[i].status);
7708
7709 /* ignore members from current xact (but track their presence) */
7710 memxid = members[i].xid;
7712 {
7713 if (current_is_member != NULL)
7714 *current_is_member = true;
7715 continue;
7716 }
7717 else if (result)
7718 continue;
7719
7720 /* ignore members that don't conflict with the lock we want */
7722 continue;
7723
7724 if (ISUPDATE_from_mxstatus(members[i].status))
7725 {
7726 /* ignore aborted updaters */
7728 continue;
7729 }
7730 else
7731 {
7732 /* ignore lockers-only that are no longer in progress */
7734 continue;
7735 }
7736
7737 /*
7738 * Whatever remains are either live lockers that conflict with our
7739 * wanted lock, and updaters that are not aborted. Those conflict
7740 * with what we want. Set up to return true, but keep going to
7741 * look for the current transaction among the multixact members,
7742 * if needed.
7743 */
7744 result = true;
7745 }
7746 pfree(members);
7747 }
7748
7749 return result;
7750}

References DoLockModesConflict(), fb(), GetMultiXactIdMembers(), HEAP_LOCKED_UPGRADED(), HEAP_XMAX_IS_LOCKED_ONLY(), i, ISUPDATE_from_mxstatus, LOCKMODE_from_mxstatus, pfree(), TransactionIdDidAbort(), TransactionIdIsCurrentTransactionId(), TransactionIdIsInProgress(), tupleLockExtraInfo, and MultiXactMember::xid.

Referenced by heap_delete(), heap_inplace_lock(), heap_lock_tuple(), and heap_update().

◆ ExtractReplicaIdentity()

static HeapTuple ExtractReplicaIdentity ( Relation  relation,
HeapTuple  tp,
bool  key_required,
bool copy 
)
static

Definition at line 9228 of file heapam.c.

9230{
9231 TupleDesc desc = RelationGetDescr(relation);
9232 char replident = relation->rd_rel->relreplident;
9235 bool nulls[MaxHeapAttributeNumber];
9237
9238 *copy = false;
9239
9240 if (!RelationIsLogicallyLogged(relation))
9241 return NULL;
9242
9243 if (replident == REPLICA_IDENTITY_NOTHING)
9244 return NULL;
9245
9246 if (replident == REPLICA_IDENTITY_FULL)
9247 {
9248 /*
9249 * When logging the entire old tuple, it very well could contain
9250 * toasted columns. If so, force them to be inlined.
9251 */
9252 if (HeapTupleHasExternal(tp))
9253 {
9254 *copy = true;
9255 tp = toast_flatten_tuple(tp, desc);
9256 }
9257 return tp;
9258 }
9259
9260 /* if the key isn't required and we're only logging the key, we're done */
9261 if (!key_required)
9262 return NULL;
9263
9264 /* find out the replica identity columns */
9267
9268 /*
9269 * If there's no defined replica identity columns, treat as !key_required.
9270 * (This case should not be reachable from heap_update, since that should
9271 * calculate key_required accurately. But heap_delete just passes
9272 * constant true for key_required, so we can hit this case in deletes.)
9273 */
9274 if (bms_is_empty(idattrs))
9275 return NULL;
9276
9277 /*
9278 * Construct a new tuple containing only the replica identity columns,
9279 * with nulls elsewhere. While we're at it, assert that the replica
9280 * identity columns aren't null.
9281 */
9282 heap_deform_tuple(tp, desc, values, nulls);
9283
9284 for (int i = 0; i < desc->natts; i++)
9285 {
9287 idattrs))
9288 Assert(!nulls[i]);
9289 else
9290 nulls[i] = true;
9291 }
9292
9293 key_tuple = heap_form_tuple(desc, values, nulls);
9294 *copy = true;
9295
9297
9298 /*
9299 * If the tuple, which by here only contains indexed columns, still has
9300 * toasted columns, force them to be inlined. This is somewhat unlikely
9301 * since there's limits on the size of indexed columns, so we don't
9302 * duplicate toast_flatten_tuple()s functionality in the above loop over
9303 * the indexed columns, even if it would be more efficient.
9304 */
9306 {
9308
9311 }
9312
9313 return key_tuple;
9314}

References Assert, bms_free(), bms_is_empty, bms_is_member(), fb(), FirstLowInvalidHeapAttributeNumber, heap_deform_tuple(), heap_form_tuple(), heap_freetuple(), HeapTupleHasExternal(), i, INDEX_ATTR_BITMAP_IDENTITY_KEY, MaxHeapAttributeNumber, TupleDescData::natts, RelationData::rd_rel, RelationGetDescr, RelationGetIndexAttrBitmap(), RelationIsLogicallyLogged, toast_flatten_tuple(), and values.

Referenced by heap_delete(), and heap_update().

◆ FreeBulkInsertState()

◆ FreezeMultiXactId()

static TransactionId FreezeMultiXactId ( MultiXactId  multi,
uint16  t_infomask,
const struct VacuumCutoffs cutoffs,
uint16 flags,
HeapPageFreeze pagefrz 
)
static

Definition at line 6777 of file heapam.c.

6780{
6782 MultiXactMember *members;
6783 int nmembers;
6784 bool need_replace;
6785 int nnewmembers;
6787 bool has_lockers;
6789 bool update_committed;
6790 TransactionId FreezePageRelfrozenXid;
6791
6792 *flags = 0;
6793
6794 /* We should only be called in Multis */
6795 Assert(t_infomask & HEAP_XMAX_IS_MULTI);
6796
6797 if (!MultiXactIdIsValid(multi) ||
6798 HEAP_LOCKED_UPGRADED(t_infomask))
6799 {
6800 *flags |= FRM_INVALIDATE_XMAX;
6801 pagefrz->freeze_required = true;
6802 return InvalidTransactionId;
6803 }
6804 else if (MultiXactIdPrecedes(multi, cutoffs->relminmxid))
6805 ereport(ERROR,
6807 errmsg_internal("found multixact %u from before relminmxid %u",
6808 multi, cutoffs->relminmxid)));
6809 else if (MultiXactIdPrecedes(multi, cutoffs->OldestMxact))
6810 {
6812
6813 /*
6814 * This old multi cannot possibly have members still running, but
6815 * verify just in case. If it was a locker only, it can be removed
6816 * without any further consideration; but if it contained an update,
6817 * we might need to preserve it.
6818 */
6819 if (MultiXactIdIsRunning(multi,
6820 HEAP_XMAX_IS_LOCKED_ONLY(t_infomask)))
6821 ereport(ERROR,
6823 errmsg_internal("multixact %u from before multi freeze cutoff %u found to be still running",
6824 multi, cutoffs->OldestMxact)));
6825
6826 if (HEAP_XMAX_IS_LOCKED_ONLY(t_infomask))
6827 {
6828 *flags |= FRM_INVALIDATE_XMAX;
6829 pagefrz->freeze_required = true;
6830 return InvalidTransactionId;
6831 }
6832
6833 /* replace multi with single XID for its updater? */
6834 update_xact = MultiXactIdGetUpdateXid(multi, t_infomask);
6836 ereport(ERROR,
6838 errmsg_internal("multixact %u contains update XID %u from before relfrozenxid %u",
6839 multi, update_xact,
6840 cutoffs->relfrozenxid)));
6841 else if (TransactionIdPrecedes(update_xact, cutoffs->OldestXmin))
6842 {
6843 /*
6844 * Updater XID has to have aborted (otherwise the tuple would have
6845 * been pruned away instead, since updater XID is < OldestXmin).
6846 * Just remove xmax.
6847 */
6849 ereport(ERROR,
6851 errmsg_internal("multixact %u contains committed update XID %u from before removable cutoff %u",
6852 multi, update_xact,
6853 cutoffs->OldestXmin)));
6854 *flags |= FRM_INVALIDATE_XMAX;
6855 pagefrz->freeze_required = true;
6856 return InvalidTransactionId;
6857 }
6858
6859 /* Have to keep updater XID as new xmax */
6860 *flags |= FRM_RETURN_IS_XID;
6861 pagefrz->freeze_required = true;
6862 return update_xact;
6863 }
6864
6865 /*
6866 * Some member(s) of this Multi may be below FreezeLimit xid cutoff, so we
6867 * need to walk the whole members array to figure out what to do, if
6868 * anything.
6869 */
6870 nmembers =
6871 GetMultiXactIdMembers(multi, &members, false,
6872 HEAP_XMAX_IS_LOCKED_ONLY(t_infomask));
6873 if (nmembers <= 0)
6874 {
6875 /* Nothing worth keeping */
6876 *flags |= FRM_INVALIDATE_XMAX;
6877 pagefrz->freeze_required = true;
6878 return InvalidTransactionId;
6879 }
6880
6881 /*
6882 * The FRM_NOOP case is the only case where we might need to ratchet back
6883 * FreezePageRelfrozenXid or FreezePageRelminMxid. It is also the only
6884 * case where our caller might ratchet back its NoFreezePageRelfrozenXid
6885 * or NoFreezePageRelminMxid "no freeze" trackers to deal with a multi.
6886 * FRM_NOOP handling should result in the NewRelfrozenXid/NewRelminMxid
 6887 * trackers managed by VACUUM being ratcheted back by xmax to the degree
6888 * required to make it safe to leave xmax undisturbed, independent of
6889 * whether or not page freezing is triggered somewhere else.
6890 *
6891 * Our policy is to force freezing in every case other than FRM_NOOP,
6892 * which obviates the need to maintain either set of trackers, anywhere.
6893 * Every other case will reliably execute a freeze plan for xmax that
6894 * either replaces xmax with an XID/MXID >= OldestXmin/OldestMxact, or
6895 * sets xmax to an InvalidTransactionId XID, rendering xmax fully frozen.
6896 * (VACUUM's NewRelfrozenXid/NewRelminMxid trackers are initialized with
6897 * OldestXmin/OldestMxact, so later values never need to be tracked here.)
6898 */
6899 need_replace = false;
6900 FreezePageRelfrozenXid = pagefrz->FreezePageRelfrozenXid;
6901 for (int i = 0; i < nmembers; i++)
6902 {
6903 TransactionId xid = members[i].xid;
6904
6905 Assert(!TransactionIdPrecedes(xid, cutoffs->relfrozenxid));
6906
6907 if (TransactionIdPrecedes(xid, cutoffs->FreezeLimit))
6908 {
6909 /* Can't violate the FreezeLimit postcondition */
6910 need_replace = true;
6911 break;
6912 }
6913 if (TransactionIdPrecedes(xid, FreezePageRelfrozenXid))
6914 FreezePageRelfrozenXid = xid;
6915 }
6916
6917 /* Can't violate the MultiXactCutoff postcondition, either */
6918 if (!need_replace)
6920
6921 if (!need_replace)
6922 {
6923 /*
6924 * vacuumlazy.c might ratchet back NewRelminMxid, NewRelfrozenXid, or
6925 * both together to make it safe to retain this particular multi after
6926 * freezing its page
6927 */
6928 *flags |= FRM_NOOP;
6929 pagefrz->FreezePageRelfrozenXid = FreezePageRelfrozenXid;
6930 if (MultiXactIdPrecedes(multi, pagefrz->FreezePageRelminMxid))
6931 pagefrz->FreezePageRelminMxid = multi;
6932 pfree(members);
6933 return multi;
6934 }
6935
6936 /*
6937 * Do a more thorough second pass over the multi to figure out which
6938 * member XIDs actually need to be kept. Checking the precise status of
6939 * individual members might even show that we don't need to keep anything.
6940 * That is quite possible even though the Multi must be >= OldestMxact,
6941 * since our second pass only keeps member XIDs when it's truly necessary;
6942 * even member XIDs >= OldestXmin often won't be kept by second pass.
6943 */
6944 nnewmembers = 0;
6946 has_lockers = false;
6948 update_committed = false;
6949
6950 /*
6951 * Determine whether to keep each member xid, or to ignore it instead
6952 */
6953 for (int i = 0; i < nmembers; i++)
6954 {
6955 TransactionId xid = members[i].xid;
6956 MultiXactStatus mstatus = members[i].status;
6957
6958 Assert(!TransactionIdPrecedes(xid, cutoffs->relfrozenxid));
6959
6960 if (!ISUPDATE_from_mxstatus(mstatus))
6961 {
6962 /*
6963 * Locker XID (not updater XID). We only keep lockers that are
6964 * still running.
6965 */
6968 {
6969 if (TransactionIdPrecedes(xid, cutoffs->OldestXmin))
6970 ereport(ERROR,
6972 errmsg_internal("multixact %u contains running locker XID %u from before removable cutoff %u",
6973 multi, xid,
6974 cutoffs->OldestXmin)));
6975 newmembers[nnewmembers++] = members[i];
6976 has_lockers = true;
6977 }
6978
6979 continue;
6980 }
6981
6982 /*
6983 * Updater XID (not locker XID). Should we keep it?
6984 *
6985 * Since the tuple wasn't totally removed when vacuum pruned, the
6986 * update Xid cannot possibly be older than OldestXmin cutoff unless
6987 * the updater XID aborted. If the updater transaction is known
6988 * aborted or crashed then it's okay to ignore it, otherwise not.
6989 *
6990 * In any case the Multi should never contain two updaters, whatever
6991 * their individual commit status. Check for that first, in passing.
6992 */
6994 ereport(ERROR,
6996 errmsg_internal("multixact %u has two or more updating members",
6997 multi),
6998 errdetail_internal("First updater XID=%u second updater XID=%u.",
6999 update_xid, xid)));
7000
7001 /*
7002 * As with all tuple visibility routines, it's critical to test
7003 * TransactionIdIsInProgress before TransactionIdDidCommit, because of
7004 * race conditions explained in detail in heapam_visibility.c.
7005 */
7008 update_xid = xid;
7009 else if (TransactionIdDidCommit(xid))
7010 {
7011 /*
7012 * The transaction committed, so we can tell caller to set
7013 * HEAP_XMAX_COMMITTED. (We can only do this because we know the
7014 * transaction is not running.)
7015 */
7016 update_committed = true;
7017 update_xid = xid;
7018 }
7019 else
7020 {
7021 /*
7022 * Not in progress, not committed -- must be aborted or crashed;
7023 * we can ignore it.
7024 */
7025 continue;
7026 }
7027
7028 /*
7029 * We determined that updater must be kept -- add it to pending new
7030 * members list
7031 */
7032 if (TransactionIdPrecedes(xid, cutoffs->OldestXmin))
7033 ereport(ERROR,
7035 errmsg_internal("multixact %u contains committed update XID %u from before removable cutoff %u",
7036 multi, xid, cutoffs->OldestXmin)));
7037 newmembers[nnewmembers++] = members[i];
7038 }
7039
7040 pfree(members);
7041
7042 /*
7043 * Determine what to do with caller's multi based on information gathered
7044 * during our second pass
7045 */
7046 if (nnewmembers == 0)
7047 {
7048 /* Nothing worth keeping */
7049 *flags |= FRM_INVALIDATE_XMAX;
7051 }
7053 {
7054 /*
7055 * If there's a single member and it's an update, pass it back alone
7056 * without creating a new Multi. (XXX we could do this when there's a
7057 * single remaining locker, too, but that would complicate the API too
7058 * much; moreover, the case with the single updater is more
7059 * interesting, because those are longer-lived.)
7060 */
7061 Assert(nnewmembers == 1);
7062 *flags |= FRM_RETURN_IS_XID;
7063 if (update_committed)
7064 *flags |= FRM_MARK_COMMITTED;
7066 }
7067 else
7068 {
7069 /*
7070 * Create a new multixact with the surviving members of the previous
7071 * one, to set as new Xmax in the tuple
7072 */
7074 *flags |= FRM_RETURN_IS_MULTI;
7075 }
7076
7078
7079 pagefrz->freeze_required = true;
7080 return newxmax;
7081}

References Assert, ereport, errcode(), ERRCODE_DATA_CORRUPTED, errdetail_internal(), errmsg_internal(), ERROR, fb(), HeapPageFreeze::freeze_required, VacuumCutoffs::FreezeLimit, HeapPageFreeze::FreezePageRelfrozenXid, HeapPageFreeze::FreezePageRelminMxid, FRM_INVALIDATE_XMAX, FRM_MARK_COMMITTED, FRM_NOOP, FRM_RETURN_IS_MULTI, FRM_RETURN_IS_XID, GetMultiXactIdMembers(), HEAP_LOCKED_UPGRADED(), HEAP_XMAX_IS_LOCKED_ONLY(), HEAP_XMAX_IS_MULTI, i, InvalidTransactionId, ISUPDATE_from_mxstatus, VacuumCutoffs::MultiXactCutoff, MultiXactIdCreateFromMembers(), MultiXactIdGetUpdateXid(), MultiXactIdIsRunning(), MultiXactIdIsValid, MultiXactIdPrecedes(), VacuumCutoffs::OldestMxact, VacuumCutoffs::OldestXmin, palloc_array, pfree(), VacuumCutoffs::relfrozenxid, VacuumCutoffs::relminmxid, MultiXactMember::status, TransactionIdDidCommit(), TransactionIdIsCurrentTransactionId(), TransactionIdIsInProgress(), TransactionIdIsValid, TransactionIdPrecedes(), and MultiXactMember::xid.

Referenced by heap_prepare_freeze_tuple().

◆ get_mxact_status_for_lock()

static MultiXactStatus get_mxact_status_for_lock ( LockTupleMode  mode,
bool  is_update 
)
static

Definition at line 4598 of file heapam.c.

4599{
4600 int retval;
4601
4602 if (is_update)
4603 retval = tupleLockExtraInfo[mode].updstatus;
4604 else
4605 retval = tupleLockExtraInfo[mode].lockstatus;
4606
4607 if (retval == -1)
4608 elog(ERROR, "invalid lock tuple mode %d/%s", mode,
4609 is_update ? "true" : "false");
4610
4611 return (MultiXactStatus) retval;
4612}

References elog, ERROR, fb(), mode, and tupleLockExtraInfo.

Referenced by compute_new_xmax_infomask(), heap_lock_tuple(), and test_lockmode_for_conflict().

◆ GetBulkInsertState()

◆ GetMultiXactIdHintBits()

static void GetMultiXactIdHintBits ( MultiXactId  multi,
uint16 new_infomask,
uint16 new_infomask2 
)
static

Definition at line 7533 of file heapam.c.

7535{
7536 int nmembers;
7537 MultiXactMember *members;
7538 int i;
7540 uint16 bits2 = 0;
7541 bool has_update = false;
7543
7544 /*
7545 * We only use this in multis we just created, so they cannot be values
7546 * pre-pg_upgrade.
7547 */
7548 nmembers = GetMultiXactIdMembers(multi, &members, false, false);
7549
7550 for (i = 0; i < nmembers; i++)
7551 {
7553
7554 /*
7555 * Remember the strongest lock mode held by any member of the
7556 * multixact.
7557 */
7558 mode = TUPLOCK_from_mxstatus(members[i].status);
7559 if (mode > strongest)
7560 strongest = mode;
7561
7562 /* See what other bits we need */
7563 switch (members[i].status)
7564 {
7568 break;
7569
7572 break;
7573
7575 has_update = true;
7576 break;
7577
7580 has_update = true;
7581 break;
7582 }
7583 }
7584
7587 bits |= HEAP_XMAX_EXCL_LOCK;
7588 else if (strongest == LockTupleShare)
7589 bits |= HEAP_XMAX_SHR_LOCK;
7590 else if (strongest == LockTupleKeyShare)
7591 bits |= HEAP_XMAX_KEYSHR_LOCK;
7592
7593 if (!has_update)
7594 bits |= HEAP_XMAX_LOCK_ONLY;
7595
7596 if (nmembers > 0)
7597 pfree(members);
7598
7599 *new_infomask = bits;
7601}

References fb(), GetMultiXactIdMembers(), HEAP_KEYS_UPDATED, HEAP_XMAX_EXCL_LOCK, HEAP_XMAX_IS_MULTI, HEAP_XMAX_KEYSHR_LOCK, HEAP_XMAX_LOCK_ONLY, HEAP_XMAX_SHR_LOCK, i, LockTupleExclusive, LockTupleKeyShare, LockTupleNoKeyExclusive, LockTupleShare, mode, MultiXactStatusForKeyShare, MultiXactStatusForNoKeyUpdate, MultiXactStatusForShare, MultiXactStatusForUpdate, MultiXactStatusNoKeyUpdate, MultiXactStatusUpdate, pfree(), and TUPLOCK_from_mxstatus.

Referenced by compute_new_xmax_infomask(), heap_prepare_freeze_tuple(), and heap_update().

◆ heap_abort_speculative()

void heap_abort_speculative ( Relation  relation,
const ItemPointerData *tid 
)

Definition at line 6256 of file heapam.c.

6257{
6259 ItemId lp;
6260 HeapTupleData tp;
6261 Page page;
6262 BlockNumber block;
6263 Buffer buffer;
6264
6266
6267 block = ItemPointerGetBlockNumber(tid);
6268 buffer = ReadBuffer(relation, block);
6269 page = BufferGetPage(buffer);
6270
6272
6273 /*
6274 * Page can't be all visible, we just inserted into it, and are still
6275 * running.
6276 */
6277 Assert(!PageIsAllVisible(page));
6278
6281
6282 tp.t_tableOid = RelationGetRelid(relation);
6283 tp.t_data = (HeapTupleHeader) PageGetItem(page, lp);
6284 tp.t_len = ItemIdGetLength(lp);
6285 tp.t_self = *tid;
6286
6287 /*
6288 * Sanity check that the tuple really is a speculatively inserted tuple,
6289 * inserted by us.
6290 */
6291 if (tp.t_data->t_choice.t_heap.t_xmin != xid)
6292 elog(ERROR, "attempted to kill a tuple inserted by another transaction");
6293 if (!(IsToastRelation(relation) || HeapTupleHeaderIsSpeculative(tp.t_data)))
6294 elog(ERROR, "attempted to kill a non-speculative tuple");
6296
6297 /*
6298 * No need to check for serializable conflicts here. There is never a
6299 * need for a combo CID, either. No need to extract replica identity, or
6300 * do anything special with infomask bits.
6301 */
6302
6304
6305 /*
6306 * The tuple will become DEAD immediately. Flag that this page is a
6307 * candidate for pruning by setting xmin to TransactionXmin. While not
6308 * immediately prunable, it is the oldest xid we can cheaply determine
6309 * that's safe against wraparound / being older than the table's
6310 * relfrozenxid. To defend against the unlikely case of a new relation
6311 * having a newer relfrozenxid than our TransactionXmin, use relfrozenxid
6312 * if so (vacuum can't subsequently move relfrozenxid to beyond
6313 * TransactionXmin, so there's no race here).
6314 */
6316 {
6317 TransactionId relfrozenxid = relation->rd_rel->relfrozenxid;
6319
6320 if (TransactionIdPrecedes(TransactionXmin, relfrozenxid))
6321 prune_xid = relfrozenxid;
6322 else
6325 }
6326
6327 /* store transaction information of xact deleting the tuple */
6330
6331 /*
 6332 * Set the tuple header xmin to InvalidTransactionId. This makes the
 6333 * tuple immediately invisible to everyone. (In particular, to any
 6334 * transactions waiting on the speculative token, woken up later.)
6335 */
6337
6338 /* Clear the speculative insertion token too */
6339 tp.t_data->t_ctid = tp.t_self;
6340
6341 MarkBufferDirty(buffer);
6342
6343 /*
6344 * XLOG stuff
6345 *
6346 * The WAL records generated here match heap_delete(). The same recovery
6347 * routines are used.
6348 */
6349 if (RelationNeedsWAL(relation))
6350 {
6353
6355 xlrec.infobits_set = compute_infobits(tp.t_data->t_infomask,
6356 tp.t_data->t_infomask2);
6358 xlrec.xmax = xid;
6359
6363
6364 /* No replica identity & replication origin logged */
6365
6367
6368 PageSetLSN(page, recptr);
6369 }
6370
6372
6374
6375 if (HeapTupleHasExternal(&tp))
6376 {
6377 Assert(!IsToastRelation(relation));
6378 heap_toast_delete(relation, &tp, true);
6379 }
6380
6381 /*
6382 * Never need to mark tuple for invalidation, since catalogs don't support
6383 * speculative insertion
6384 */
6385
6386 /* Now we can release the buffer */
6387 ReleaseBuffer(buffer);
6388
6389 /* count deletion, as we counted the insertion too */
6390 pgstat_count_heap_delete(relation);
6391}

References Assert, BUFFER_LOCK_EXCLUSIVE, BUFFER_LOCK_UNLOCK, BufferGetPage(), compute_infobits(), elog, END_CRIT_SECTION, ERROR, fb(), xl_heap_delete::flags, GetCurrentTransactionId(), HEAP_MOVED, heap_toast_delete(), HEAP_XMAX_BITS, HeapTupleHasExternal(), HeapTupleHeaderIsHeapOnly(), HeapTupleHeaderIsSpeculative(), HeapTupleHeaderSetXmin(), InvalidTransactionId, IsToastRelation(), ItemIdGetLength, ItemIdIsNormal, ItemPointerGetBlockNumber(), ItemPointerGetOffsetNumber(), ItemPointerIsValid(), LockBuffer(), MarkBufferDirty(), PageGetItem(), PageGetItemId(), PageIsAllVisible(), PageSetLSN(), PageSetPrunable, pgstat_count_heap_delete(), RelationData::rd_rel, ReadBuffer(), REGBUF_STANDARD, RelationGetRelid, RelationNeedsWAL, ReleaseBuffer(), SizeOfHeapDelete, START_CRIT_SECTION, HeapTupleHeaderData::t_choice, HeapTupleHeaderData::t_ctid, HeapTupleData::t_data, HeapTupleHeaderData::t_heap, HeapTupleHeaderData::t_infomask, HeapTupleHeaderData::t_infomask2, HeapTupleData::t_len, HeapTupleData::t_self, HeapTupleData::t_tableOid, HeapTupleFields::t_xmin, TransactionIdIsValid, TransactionIdPrecedes(), TransactionXmin, XLH_DELETE_IS_SUPER, XLOG_HEAP_DELETE, XLogBeginInsert(), XLogInsert(), XLogRegisterBuffer(), and XLogRegisterData().

Referenced by heapam_tuple_complete_speculative(), and toast_delete_datum().

◆ heap_acquire_tuplock()

static bool heap_acquire_tuplock ( Relation  relation,
const ItemPointerData *tid,
LockTupleMode  mode,
LockWaitPolicy  wait_policy,
bool *have_tuple_lock 
)
static

Definition at line 5347 of file heapam.c.

5349{
5350 if (*have_tuple_lock)
5351 return true;
5352
5353 switch (wait_policy)
5354 {
5355 case LockWaitBlock:
5356 LockTupleTuplock(relation, tid, mode);
5357 break;
5358
5359 case LockWaitSkip:
5360 if (!ConditionalLockTupleTuplock(relation, tid, mode, false))
5361 return false;
5362 break;
5363
5364 case LockWaitError:
5366 ereport(ERROR,
5368 errmsg("could not obtain lock on row in relation \"%s\"",
5369 RelationGetRelationName(relation))));
5370 break;
5371 }
5372 *have_tuple_lock = true;
5373
5374 return true;
5375}

References ConditionalLockTupleTuplock, ereport, errcode(), errmsg, ERROR, fb(), LockTupleTuplock, LockWaitBlock, LockWaitError, LockWaitSkip, log_lock_failures, mode, and RelationGetRelationName.

Referenced by heap_delete(), heap_lock_tuple(), and heap_update().

◆ heap_attr_equals()

static bool heap_attr_equals ( TupleDesc  tupdesc,
int  attrnum,
Datum  value1,
Datum  value2,
bool  isnull1,
bool  isnull2 
)
static

Definition at line 4416 of file heapam.c.

4418{
4419 /*
4420 * If one value is NULL and other is not, then they are certainly not
4421 * equal
4422 */
4423 if (isnull1 != isnull2)
4424 return false;
4425
4426 /*
4427 * If both are NULL, they can be considered equal.
4428 */
4429 if (isnull1)
4430 return true;
4431
4432 /*
4433 * We do simple binary comparison of the two datums. This may be overly
4434 * strict because there can be multiple binary representations for the
4435 * same logical value. But we should be OK as long as there are no false
4436 * positives. Using a type-specific equality operator is messy because
4437 * there could be multiple notions of equality in different operator
4438 * classes; furthermore, we cannot safely invoke user-defined functions
4439 * while holding exclusive buffer lock.
4440 */
4441 if (attrnum <= 0)
4442 {
4443 /* The only allowed system columns are OIDs, so do this */
4445 }
4446 else
4447 {
4449
4451 att = TupleDescCompactAttr(tupdesc, attrnum - 1);
4452 return datumIsEqual(value1, value2, att->attbyval, att->attlen);
4453 }
4454}

References Assert, DatumGetObjectId(), datumIsEqual(), fb(), and TupleDescCompactAttr().

Referenced by HeapDetermineColumnsInfo().

◆ heap_beginscan()

TableScanDesc heap_beginscan ( Relation  relation,
Snapshot  snapshot,
int  nkeys,
ScanKey  key,
ParallelTableScanDesc  parallel_scan,
uint32  flags 
)

Definition at line 1164 of file heapam.c.

1168{
1169 HeapScanDesc scan;
1170
1171 /*
1172 * increment relation ref count while scanning relation
1173 *
1174 * This is just to make really sure the relcache entry won't go away while
1175 * the scan has a pointer to it. Caller should be holding the rel open
1176 * anyway, so this is redundant in all normal scenarios...
1177 */
1179
1180 /*
1181 * allocate and initialize scan descriptor
1182 */
1183 if (flags & SO_TYPE_BITMAPSCAN)
1184 {
1186
1187 /*
1188 * Bitmap Heap scans do not have any fields that a normal Heap Scan
1189 * does not have, so no special initializations required here.
1190 */
1191 scan = (HeapScanDesc) bscan;
1192 }
1193 else
1195
1196 scan->rs_base.rs_rd = relation;
1197 scan->rs_base.rs_snapshot = snapshot;
1198 scan->rs_base.rs_nkeys = nkeys;
1199 scan->rs_base.rs_flags = flags;
1200 scan->rs_base.rs_parallel = parallel_scan;
1201 scan->rs_strategy = NULL; /* set in initscan */
1202 scan->rs_cbuf = InvalidBuffer;
1203
1204 /*
1205 * Disable page-at-a-time mode if it's not a MVCC-safe snapshot.
1206 */
1207 if (!(snapshot && IsMVCCSnapshot(snapshot)))
1209
1210 /* Check that a historic snapshot is not used for non-catalog tables */
1211 if (snapshot &&
1212 IsHistoricMVCCSnapshot(snapshot) &&
1214 {
1215 ereport(ERROR,
1217 errmsg("cannot query non-catalog table \"%s\" during logical decoding",
1218 RelationGetRelationName(relation))));
1219 }
1220
1221 /*
1222 * For seqscan and sample scans in a serializable transaction, acquire a
1223 * predicate lock on the entire relation. This is required not only to
1224 * lock all the matching tuples, but also to conflict with new insertions
1225 * into the table. In an indexscan, we take page locks on the index pages
1226 * covering the range specified in the scan qual, but in a heap scan there
1227 * is nothing more fine-grained to lock. A bitmap scan is a different
1228 * story, there we have already scanned the index and locked the index
1229 * pages covering the predicate. But in that case we still have to lock
1230 * any matching heap tuples. For sample scan we could optimize the locking
1231 * to be at least page-level granularity, but we'd need to add per-tuple
1232 * locking for that.
1233 */
1235 {
1236 /*
1237 * Ensure a missing snapshot is noticed reliably, even if the
1238 * isolation mode means predicate locking isn't performed (and
1239 * therefore the snapshot isn't used here).
1240 */
1241 Assert(snapshot);
1242 PredicateLockRelation(relation, snapshot);
1243 }
1244
1245 /* we only need to set this up once */
1246 scan->rs_ctup.t_tableOid = RelationGetRelid(relation);
1247
1248 /*
1249 * Allocate memory to keep track of page allocation for parallel workers
1250 * when doing a parallel scan.
1251 */
1252 if (parallel_scan != NULL)
1254 else
1256
1257 /*
1258 * we do this here instead of in initscan() because heap_rescan also calls
1259 * initscan() and we don't want to allocate memory again
1260 */
1261 if (nkeys > 0)
1262 scan->rs_base.rs_key = palloc_array(ScanKeyData, nkeys);
1263 else
1264 scan->rs_base.rs_key = NULL;
1265
1266 initscan(scan, key, false);
1267
1268 scan->rs_read_stream = NULL;
1269
1270 /*
1271 * Set up a read stream for sequential scans and TID range scans. This
1272 * should be done after initscan() because initscan() allocates the
1273 * BufferAccessStrategy object passed to the read stream API.
1274 */
1275 if (scan->rs_base.rs_flags & SO_TYPE_SEQSCAN ||
1277 {
1279
1280 if (scan->rs_base.rs_parallel)
1282 else
1284
1285 /* ---
1286 * It is safe to use batchmode as the only locks taken by `cb`
1287 * are never taken while waiting for IO:
1288 * - SyncScanLock is used in the non-parallel case
1289 * - in the parallel case, only spinlocks and atomics are used
1290 * ---
1291 */
1294 scan->rs_strategy,
1295 scan->rs_base.rs_rd,
1297 cb,
1298 scan,
1299 0);
1300 }
1301 else if (scan->rs_base.rs_flags & SO_TYPE_BITMAPSCAN)
1302 {
1305 scan->rs_strategy,
1306 scan->rs_base.rs_rd,
1309 scan,
1310 sizeof(TBMIterateResult));
1311 }
1312
1313
1314 return (TableScanDesc) scan;
1315}

References Assert, bitmapheap_stream_read_next(), ereport, errcode(), errmsg, ERROR, fb(), heap_scan_stream_read_next_parallel(), heap_scan_stream_read_next_serial(), initscan(), InvalidBuffer, IsHistoricMVCCSnapshot, IsMVCCSnapshot, MAIN_FORKNUM, palloc_array, palloc_object, PredicateLockRelation(), read_stream_begin_relation(), READ_STREAM_DEFAULT, READ_STREAM_SEQUENTIAL, READ_STREAM_USE_BATCHING, RelationGetRelationName, RelationGetRelid, RelationIncrementReferenceCount(), RelationIsAccessibleInLogicalDecoding, HeapScanDescData::rs_base, HeapScanDescData::rs_cbuf, HeapScanDescData::rs_ctup, TableScanDescData::rs_flags, TableScanDescData::rs_key, TableScanDescData::rs_nkeys, TableScanDescData::rs_parallel, HeapScanDescData::rs_parallelworkerdata, TableScanDescData::rs_rd, HeapScanDescData::rs_read_stream, TableScanDescData::rs_snapshot, HeapScanDescData::rs_strategy, SO_TYPE_BITMAPSCAN, SO_TYPE_SAMPLESCAN, SO_TYPE_SEQSCAN, SO_TYPE_TIDRANGESCAN, and HeapTupleData::t_tableOid.

◆ heap_delete()

TM_Result heap_delete ( Relation  relation,
const ItemPointerData *tid,
CommandId  cid,
Snapshot  crosscheck,
bool  wait,
TM_FailureData tmfd,
bool  changingPart 
)

Definition at line 2844 of file heapam.c.

2847{
2848 TM_Result result;
2850 ItemId lp;
2851 HeapTupleData tp;
2852 Page page;
2853 BlockNumber block;
2854 Buffer buffer;
2855 Buffer vmbuffer = InvalidBuffer;
2856 TransactionId new_xmax;
2859 bool have_tuple_lock = false;
2860 bool iscombo;
2861 bool all_visible_cleared = false;
2862 HeapTuple old_key_tuple = NULL; /* replica identity of the tuple */
2863 bool old_key_copied = false;
2864
2866
2867 AssertHasSnapshotForToast(relation);
2868
2869 /*
2870 * Forbid this during a parallel operation, lest it allocate a combo CID.
2871 * Other workers might need that combo CID for visibility checks, and we
2872 * have no provision for broadcasting it to them.
2873 */
2874 if (IsInParallelMode())
2875 ereport(ERROR,
2877 errmsg("cannot delete tuples during a parallel operation")));
2878
2879 block = ItemPointerGetBlockNumber(tid);
2880 buffer = ReadBuffer(relation, block);
2881 page = BufferGetPage(buffer);
2882
2883 /*
2884 * Before locking the buffer, pin the visibility map page if it appears to
2885 * be necessary. Since we haven't got the lock yet, someone else might be
2886 * in the middle of changing this, so we'll need to recheck after we have
2887 * the lock.
2888 */
2889 if (PageIsAllVisible(page))
2890 visibilitymap_pin(relation, block, &vmbuffer);
2891
2893
2896
2897 tp.t_tableOid = RelationGetRelid(relation);
2898 tp.t_data = (HeapTupleHeader) PageGetItem(page, lp);
2899 tp.t_len = ItemIdGetLength(lp);
2900 tp.t_self = *tid;
2901
2902l1:
2903
2904 /*
2905 * If we didn't pin the visibility map page and the page has become all
2906 * visible while we were busy locking the buffer, we'll have to unlock and
2907 * re-lock, to avoid holding the buffer lock across an I/O. That's a bit
2908 * unfortunate, but hopefully shouldn't happen often.
2909 */
2910 if (vmbuffer == InvalidBuffer && PageIsAllVisible(page))
2911 {
2913 visibilitymap_pin(relation, block, &vmbuffer);
2915 }
2916
2917 result = HeapTupleSatisfiesUpdate(&tp, cid, buffer);
2918
2919 if (result == TM_Invisible)
2920 {
2921 UnlockReleaseBuffer(buffer);
2922 ereport(ERROR,
2924 errmsg("attempted to delete invisible tuple")));
2925 }
2926 else if (result == TM_BeingModified && wait)
2927 {
2930
2931 /* must copy state data before unlocking buffer */
2934
2935 /*
2936 * Sleep until concurrent transaction ends -- except when there's a
2937 * single locker and it's our own transaction. Note we don't care
2938 * which lock mode the locker has, because we need the strongest one.
2939 *
2940 * Before sleeping, we need to acquire tuple lock to establish our
2941 * priority for the tuple (see heap_lock_tuple). LockTuple will
2942 * release us when we are next-in-line for the tuple.
2943 *
2944 * If we are forced to "start over" below, we keep the tuple lock;
2945 * this arranges that we stay at the head of the line while rechecking
2946 * tuple state.
2947 */
2949 {
2950 bool current_is_member = false;
2951
2954 {
2956
2957 /*
2958 * Acquire the lock, if necessary (but skip it when we're
2959 * requesting a lock and already have one; avoids deadlock).
2960 */
2961 if (!current_is_member)
2964
2965 /* wait for multixact */
2967 relation, &(tp.t_self), XLTW_Delete,
2968 NULL);
2970
2971 /*
2972 * If xwait had just locked the tuple then some other xact
2973 * could update this tuple before we get to this point. Check
2974 * for xmax change, and start over if so.
2975 *
2976 * We also must start over if we didn't pin the VM page, and
2977 * the page has become all visible.
2978 */
2979 if ((vmbuffer == InvalidBuffer && PageIsAllVisible(page)) ||
2982 xwait))
2983 goto l1;
2984 }
2985
2986 /*
2987 * You might think the multixact is necessarily done here, but not
2988 * so: it could have surviving members, namely our own xact or
2989 * other subxacts of this backend. It is legal for us to delete
2990 * the tuple in either case, however (the latter case is
2991 * essentially a situation of upgrading our former shared lock to
2992 * exclusive). We don't bother changing the on-disk hint bits
2993 * since we are about to overwrite the xmax altogether.
2994 */
2995 }
2997 {
2998 /*
2999 * Wait for regular transaction to end; but first, acquire tuple
3000 * lock.
3001 */
3005 XactLockTableWait(xwait, relation, &(tp.t_self), XLTW_Delete);
3007
3008 /*
3009 * xwait is done, but if xwait had just locked the tuple then some
3010 * other xact could update this tuple before we get to this point.
3011 * Check for xmax change, and start over if so.
3012 *
3013 * We also must start over if we didn't pin the VM page, and the
3014 * page has become all visible.
3015 */
3016 if ((vmbuffer == InvalidBuffer && PageIsAllVisible(page)) ||
3019 xwait))
3020 goto l1;
3021
3022 /* Otherwise check if it committed or aborted */
3023 UpdateXmaxHintBits(tp.t_data, buffer, xwait);
3024 }
3025
3026 /*
3027 * We may overwrite if previous xmax aborted, or if it committed but
3028 * only locked the tuple without updating it.
3029 */
3030 if ((tp.t_data->t_infomask & HEAP_XMAX_INVALID) ||
3033 result = TM_Ok;
3034 else if (!ItemPointerEquals(&tp.t_self, &tp.t_data->t_ctid))
3035 result = TM_Updated;
3036 else
3037 result = TM_Deleted;
3038 }
3039
3040 /* sanity check the result HeapTupleSatisfiesUpdate() and the logic above */
3041 if (result != TM_Ok)
3042 {
3043 Assert(result == TM_SelfModified ||
3044 result == TM_Updated ||
3045 result == TM_Deleted ||
3046 result == TM_BeingModified);
3048 Assert(result != TM_Updated ||
3050 }
3051
3052 if (crosscheck != InvalidSnapshot && result == TM_Ok)
3053 {
3054 /* Perform additional check for transaction-snapshot mode RI updates */
3055 if (!HeapTupleSatisfiesVisibility(&tp, crosscheck, buffer))
3056 result = TM_Updated;
3057 }
3058
3059 if (result != TM_Ok)
3060 {
3061 tmfd->ctid = tp.t_data->t_ctid;
3063 if (result == TM_SelfModified)
3065 else
3066 tmfd->cmax = InvalidCommandId;
3067 UnlockReleaseBuffer(buffer);
3068 if (have_tuple_lock)
3070 if (vmbuffer != InvalidBuffer)
3071 ReleaseBuffer(vmbuffer);
3072 return result;
3073 }
3074
3075 /*
3076 * We're about to do the actual delete -- check for conflict first, to
3077 * avoid possibly having to roll back work we've just done.
3078 *
3079 * This is safe without a recheck as long as there is no possibility of
3080 * another process scanning the page between this check and the delete
3081 * being visible to the scan (i.e., an exclusive buffer content lock is
3082 * continuously held from this point until the tuple delete is visible).
3083 */
3085
3086 /* replace cid with a combo CID if necessary */
3088
3089 /*
3090 * Compute replica identity tuple before entering the critical section so
3091 * we don't PANIC upon a memory allocation failure.
3092 */
3093 old_key_tuple = ExtractReplicaIdentity(relation, &tp, true, &old_key_copied);
3094
3095 /*
3096 * If this is the first possibly-multixact-able operation in the current
3097 * transaction, set my per-backend OldestMemberMXactId setting. We can be
3098 * certain that the transaction will never become a member of any older
3099 * MultiXactIds than that. (We have to do this even if we end up just
3100 * using our own TransactionId below, since some other backend could
3101 * incorporate our XID into a MultiXact immediately afterwards.)
3102 */
3104
3107 xid, LockTupleExclusive, true,
3108 &new_xmax, &new_infomask, &new_infomask2);
3109
3111
3112 /*
3113 * If this transaction commits, the tuple will become DEAD sooner or
3114 * later. Set flag that this page is a candidate for pruning once our xid
3115 * falls below the OldestXmin horizon. If the transaction finally aborts,
3116 * the subsequent page pruning will be a no-op and the hint will be
3117 * cleared.
3118 */
3119 PageSetPrunable(page, xid);
3120
3121 if (PageIsAllVisible(page))
3122 {
3123 all_visible_cleared = true;
3124 PageClearAllVisible(page);
3125 visibilitymap_clear(relation, BufferGetBlockNumber(buffer),
3126 vmbuffer, VISIBILITYMAP_VALID_BITS);
3127 }
3128
3129 /* store transaction information of xact deleting the tuple */
3135 HeapTupleHeaderSetXmax(tp.t_data, new_xmax);
3137 /* Make sure there is no forward chain link in t_ctid */
3138 tp.t_data->t_ctid = tp.t_self;
3139
3140 /* Signal that this is actually a move into another partition */
3141 if (changingPart)
3143
3144 MarkBufferDirty(buffer);
3145
3146 /*
3147 * XLOG stuff
3148 *
3149 * NB: heap_abort_speculative() uses the same xlog record and replay
3150 * routines.
3151 */
3152 if (RelationNeedsWAL(relation))
3153 {
3157
3158 /*
3159 * For logical decode we need combo CIDs to properly decode the
3160 * catalog
3161 */
3163 log_heap_new_cid(relation, &tp);
3164
3165 xlrec.flags = 0;
3168 if (changingPart)
3170 xlrec.infobits_set = compute_infobits(tp.t_data->t_infomask,
3171 tp.t_data->t_infomask2);
3173 xlrec.xmax = new_xmax;
3174
3175 if (old_key_tuple != NULL)
3176 {
3177 if (relation->rd_rel->relreplident == REPLICA_IDENTITY_FULL)
3179 else
3181 }
3182
3185
3187
3188 /*
3189 * Log replica identity of the deleted tuple if there is one
3190 */
3191 if (old_key_tuple != NULL)
3192 {
3193 xlhdr.t_infomask2 = old_key_tuple->t_data->t_infomask2;
3194 xlhdr.t_infomask = old_key_tuple->t_data->t_infomask;
3195 xlhdr.t_hoff = old_key_tuple->t_data->t_hoff;
3196
3198 XLogRegisterData((char *) old_key_tuple->t_data
3200 old_key_tuple->t_len
3202 }
3203
3204 /* filtering by origin on a row level is much more efficient */
3206
3208
3209 PageSetLSN(page, recptr);
3210 }
3211
3213
3215
3216 if (vmbuffer != InvalidBuffer)
3217 ReleaseBuffer(vmbuffer);
3218
3219 /*
3220 * If the tuple has toasted out-of-line attributes, we need to delete
3221 * those items too. We have to do this before releasing the buffer
3222 * because we need to look at the contents of the tuple, but it's OK to
3223 * release the content lock on the buffer first.
3224 */
3225 if (relation->rd_rel->relkind != RELKIND_RELATION &&
3226 relation->rd_rel->relkind != RELKIND_MATVIEW)
3227 {
3228 /* toast table entries should never be recursively toasted */
3230 }
3231 else if (HeapTupleHasExternal(&tp))
3232 heap_toast_delete(relation, &tp, false);
3233
3234 /*
3235 * Mark tuple for invalidation from system caches at next command
3236 * boundary. We have to do this before releasing the buffer because we
3237 * need to look at the contents of the tuple.
3238 */
3239 CacheInvalidateHeapTuple(relation, &tp, NULL);
3240
3241 /* Now we can release the buffer */
3242 ReleaseBuffer(buffer);
3243
3244 /*
3245 * Release the lmgr tuple lock, if we had it.
3246 */
3247 if (have_tuple_lock)
3249
3250 pgstat_count_heap_delete(relation);
3251
3254
3255 return TM_Ok;
3256}

References Assert, AssertHasSnapshotForToast(), BUFFER_LOCK_EXCLUSIVE, BUFFER_LOCK_UNLOCK, BufferGetBlockNumber(), BufferGetPage(), CacheInvalidateHeapTuple(), CheckForSerializableConflictIn(), TM_FailureData::cmax, compute_infobits(), compute_new_xmax_infomask(), TM_FailureData::ctid, DoesMultiXactIdConflict(), END_CRIT_SECTION, ereport, errcode(), errmsg, ERROR, ExtractReplicaIdentity(), fb(), GetCurrentTransactionId(), heap_acquire_tuplock(), heap_freetuple(), HEAP_MOVED, heap_toast_delete(), HEAP_XMAX_BITS, HEAP_XMAX_INVALID, HEAP_XMAX_IS_LOCKED_ONLY(), HEAP_XMAX_IS_MULTI, HeapTupleHasExternal(), HeapTupleHeaderAdjustCmax(), HeapTupleHeaderClearHotUpdated(), HeapTupleHeaderGetCmax(), HeapTupleHeaderGetRawXmax(), HeapTupleHeaderGetUpdateXid(), HeapTupleHeaderIsOnlyLocked(), HeapTupleHeaderSetCmax(), HeapTupleHeaderSetMovedPartitions(), HeapTupleHeaderSetXmax(), HeapTupleSatisfiesUpdate(), HeapTupleSatisfiesVisibility(), InvalidBuffer, InvalidCommandId, InvalidSnapshot, IsInParallelMode(), ItemIdGetLength, ItemIdIsNormal, ItemPointerEquals(), ItemPointerGetBlockNumber(), ItemPointerGetOffsetNumber(), ItemPointerIsValid(), LockBuffer(), LockTupleExclusive, LockWaitBlock, log_heap_new_cid(), MarkBufferDirty(), MultiXactIdSetOldestMember(), MultiXactIdWait(), MultiXactStatusUpdate, PageClearAllVisible(), PageGetItem(), PageGetItemId(), PageIsAllVisible(), PageSetLSN(), PageSetPrunable, pgstat_count_heap_delete(), RelationData::rd_rel, ReadBuffer(), REGBUF_STANDARD, RelationGetRelid, RelationIsAccessibleInLogicalDecoding, RelationNeedsWAL, ReleaseBuffer(), SizeOfHeapDelete, SizeOfHeapHeader, SizeofHeapTupleHeader, START_CRIT_SECTION, HeapTupleHeaderData::t_ctid, HeapTupleData::t_data, HeapTupleHeaderData::t_infomask, HeapTupleHeaderData::t_infomask2, HeapTupleData::t_len, HeapTupleData::t_self, HeapTupleData::t_tableOid, TM_BeingModified, TM_Deleted, TM_Invisible, TM_Ok, TM_SelfModified, TM_Updated, TransactionIdEquals, TransactionIdIsCurrentTransactionId(), 
UnlockReleaseBuffer(), UnlockTupleTuplock, UpdateXmaxHintBits(), visibilitymap_clear(), visibilitymap_pin(), VISIBILITYMAP_VALID_BITS, XactLockTableWait(), XLH_DELETE_ALL_VISIBLE_CLEARED, XLH_DELETE_CONTAINS_OLD_KEY, XLH_DELETE_CONTAINS_OLD_TUPLE, XLH_DELETE_IS_PARTITION_MOVE, XLOG_HEAP_DELETE, XLOG_INCLUDE_ORIGIN, XLogBeginInsert(), XLogInsert(), XLogRegisterBuffer(), XLogRegisterData(), XLogSetRecordFlags(), XLTW_Delete, TM_FailureData::xmax, and xmax_infomask_changed().

Referenced by heapam_tuple_delete(), and simple_heap_delete().

◆ heap_endscan()

void heap_endscan ( TableScanDesc  sscan)

Definition at line 1371 of file heapam.c.

1372{
1374
1375 /* Note: no locking manipulations needed */
1376
1377 /*
1378 * unpin scan buffers
1379 */
1380 if (BufferIsValid(scan->rs_cbuf))
1381 ReleaseBuffer(scan->rs_cbuf);
1382
1383 /*
1384 * Must free the read stream before freeing the BufferAccessStrategy.
1385 */
1386 if (scan->rs_read_stream)
1388
1389 /*
1390 * decrement relation reference count and free scan descriptor storage
1391 */
1393
1394 if (scan->rs_base.rs_key)
1395 pfree(scan->rs_base.rs_key);
1396
1397 if (scan->rs_strategy != NULL)
1399
1400 if (scan->rs_parallelworkerdata != NULL)
1402
1403 if (scan->rs_base.rs_flags & SO_TEMP_SNAPSHOT)
1405
1406 pfree(scan);
1407}

References BufferIsValid(), fb(), FreeAccessStrategy(), pfree(), read_stream_end(), RelationDecrementReferenceCount(), ReleaseBuffer(), HeapScanDescData::rs_base, HeapScanDescData::rs_cbuf, TableScanDescData::rs_flags, TableScanDescData::rs_key, HeapScanDescData::rs_parallelworkerdata, TableScanDescData::rs_rd, HeapScanDescData::rs_read_stream, TableScanDescData::rs_snapshot, HeapScanDescData::rs_strategy, SO_TEMP_SNAPSHOT, and UnregisterSnapshot().

◆ heap_fetch()

bool heap_fetch ( Relation  relation,
Snapshot  snapshot,
HeapTuple  tuple,
Buffer *userbuf,
bool  keep_buf 
)

Definition at line 1659 of file heapam.c.

1664{
1665 ItemPointer tid = &(tuple->t_self);
1666 ItemId lp;
1667 Buffer buffer;
1668 Page page;
1669 OffsetNumber offnum;
1670 bool valid;
1671
1672 /*
1673 * Fetch and pin the appropriate page of the relation.
1674 */
1675 buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid));
1676
1677 /*
1678 * Need share lock on buffer to examine tuple commit status.
1679 */
1681 page = BufferGetPage(buffer);
1682
1683 /*
1684 * We'd better check for out-of-range offnum in case of VACUUM since the
1685 * TID was obtained.
1686 */
1687 offnum = ItemPointerGetOffsetNumber(tid);
1689 {
1691 ReleaseBuffer(buffer);
1693 tuple->t_data = NULL;
1694 return false;
1695 }
1696
1697 /*
1698 * get the item line pointer corresponding to the requested tid
1699 */
1700 lp = PageGetItemId(page, offnum);
1701
1702 /*
1703 * Must check for deleted tuple.
1704 */
1705 if (!ItemIdIsNormal(lp))
1706 {
1708 ReleaseBuffer(buffer);
1710 tuple->t_data = NULL;
1711 return false;
1712 }
1713
1714 /*
1715 * fill in *tuple fields
1716 */
1717 tuple->t_data = (HeapTupleHeader) PageGetItem(page, lp);
1718 tuple->t_len = ItemIdGetLength(lp);
1719 tuple->t_tableOid = RelationGetRelid(relation);
1720
1721 /*
1722 * check tuple visibility, then release lock
1723 */
1724 valid = HeapTupleSatisfiesVisibility(tuple, snapshot, buffer);
1725
1726 if (valid)
1727 PredicateLockTID(relation, &(tuple->t_self), snapshot,
1729
1730 HeapCheckForSerializableConflictOut(valid, relation, tuple, buffer, snapshot);
1731
1733
1734 if (valid)
1735 {
1736 /*
1737 * All checks passed, so return the tuple as valid. Caller is now
1738 * responsible for releasing the buffer.
1739 */
1740 *userbuf = buffer;
1741
1742 return true;
1743 }
1744
1745 /* Tuple failed time qual, but maybe caller wants to see it anyway. */
1746 if (keep_buf)
1747 *userbuf = buffer;
1748 else
1749 {
1750 ReleaseBuffer(buffer);
1752 tuple->t_data = NULL;
1753 }
1754
1755 return false;
1756}

References BUFFER_LOCK_SHARE, BUFFER_LOCK_UNLOCK, BufferGetPage(), fb(), HeapCheckForSerializableConflictOut(), HeapTupleHeaderGetXmin(), HeapTupleSatisfiesVisibility(), InvalidBuffer, ItemIdGetLength, ItemIdIsNormal, ItemPointerGetBlockNumber(), ItemPointerGetOffsetNumber(), LockBuffer(), PageGetItem(), PageGetItemId(), PageGetMaxOffsetNumber(), PredicateLockTID(), ReadBuffer(), RelationGetRelid, ReleaseBuffer(), HeapTupleData::t_data, HeapTupleData::t_len, HeapTupleData::t_self, and HeapTupleData::t_tableOid.

Referenced by heap_lock_updated_tuple_rec(), heapam_fetch_row_version(), and heapam_tuple_lock().

◆ heap_fetch_next_buffer()

static void heap_fetch_next_buffer ( HeapScanDesc  scan,
ScanDirection  dir 
)
inlinestatic

Definition at line 707 of file heapam.c.

708{
709 Assert(scan->rs_read_stream);
710
711 /* release previous scan buffer, if any */
712 if (BufferIsValid(scan->rs_cbuf))
713 {
714 ReleaseBuffer(scan->rs_cbuf);
715 scan->rs_cbuf = InvalidBuffer;
716 }
717
718 /*
719 * Be sure to check for interrupts at least once per page. Checks at
720 * higher code levels won't be able to stop a seqscan that encounters many
721 * pages' worth of consecutive dead tuples.
722 */
724
725 /*
726 * If the scan direction is changing, reset the prefetch block to the
727 * current block. Otherwise, we will incorrectly prefetch the blocks
728 * between the prefetch block and the current block again before
729 * prefetching blocks in the new, correct scan direction.
730 */
731 if (unlikely(scan->rs_dir != dir))
732 {
733 scan->rs_prefetch_block = scan->rs_cblock;
735 }
736
737 scan->rs_dir = dir;
738
740 if (BufferIsValid(scan->rs_cbuf))
742}

References Assert, BufferGetBlockNumber(), BufferIsValid(), CHECK_FOR_INTERRUPTS, fb(), InvalidBuffer, read_stream_next_buffer(), read_stream_reset(), ReleaseBuffer(), HeapScanDescData::rs_cblock, HeapScanDescData::rs_cbuf, HeapScanDescData::rs_dir, HeapScanDescData::rs_prefetch_block, HeapScanDescData::rs_read_stream, and unlikely.

Referenced by heapgettup(), and heapgettup_pagemode().

◆ heap_finish_speculative()

void heap_finish_speculative ( Relation  relation,
const ItemPointerData *tid 
)

Definition at line 6169 of file heapam.c.

6170{
6171 Buffer buffer;
6172 Page page;
6173 OffsetNumber offnum;
6174 ItemId lp;
6175 HeapTupleHeader htup;
6176
6177 buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid));
6179 page = BufferGetPage(buffer);
6180
6181 offnum = ItemPointerGetOffsetNumber(tid);
6183 elog(ERROR, "offnum out of range");
6184 lp = PageGetItemId(page, offnum);
6185 if (!ItemIdIsNormal(lp))
6186 elog(ERROR, "invalid lp");
6187
6188 htup = (HeapTupleHeader) PageGetItem(page, lp);
6189
6190 /* NO EREPORT(ERROR) from here till changes are logged */
6192
6194
6195 MarkBufferDirty(buffer);
6196
6197 /*
6198 * Replace the speculative insertion token with a real t_ctid, pointing to
6199 * itself like it does on regular tuples.
6200 */
6201 htup->t_ctid = *tid;
6202
6203 /* XLOG stuff */
6204 if (RelationNeedsWAL(relation))
6205 {
6208
6210
6212
6213 /* We want the same filtering on this as on a plain insert */
6215
6218
6220
6221 PageSetLSN(page, recptr);
6222 }
6223
6225
6226 UnlockReleaseBuffer(buffer);
6227}

References Assert, BUFFER_LOCK_EXCLUSIVE, BufferGetPage(), elog, END_CRIT_SECTION, ERROR, fb(), HeapTupleHeaderIsSpeculative(), ItemIdIsNormal, ItemPointerGetBlockNumber(), ItemPointerGetOffsetNumber(), LockBuffer(), MarkBufferDirty(), xl_heap_confirm::offnum, PageGetItem(), PageGetItemId(), PageGetMaxOffsetNumber(), PageSetLSN(), ReadBuffer(), REGBUF_STANDARD, RelationNeedsWAL, SizeOfHeapConfirm, START_CRIT_SECTION, HeapTupleHeaderData::t_ctid, UnlockReleaseBuffer(), XLOG_HEAP_CONFIRM, XLOG_INCLUDE_ORIGIN, XLogBeginInsert(), XLogInsert(), XLogRegisterBuffer(), XLogRegisterData(), and XLogSetRecordFlags().

Referenced by heapam_tuple_complete_speculative().

◆ heap_freeze_prepared_tuples()

void heap_freeze_prepared_tuples ( Buffer  buffer,
HeapTupleFreeze tuples,
int  ntuples 
)

Definition at line 7466 of file heapam.c.

7467{
7468 Page page = BufferGetPage(buffer);
7469
7470 for (int i = 0; i < ntuples; i++)
7471 {
7472 HeapTupleFreeze *frz = tuples + i;
7473 ItemId itemid = PageGetItemId(page, frz->offset);
7474 HeapTupleHeader htup;
7475
7476 htup = (HeapTupleHeader) PageGetItem(page, itemid);
7478 }
7479}

References BufferGetPage(), fb(), heap_execute_freeze_tuple(), i, PageGetItem(), and PageGetItemId().

Referenced by heap_page_prune_and_freeze().

◆ heap_freeze_tuple()

bool heap_freeze_tuple ( HeapTupleHeader  tuple,
TransactionId  relfrozenxid,
TransactionId  relminmxid,
TransactionId  FreezeLimit,
TransactionId  MultiXactCutoff 
)

Definition at line 7488 of file heapam.c.

7491{
7493 bool do_freeze;
7494 bool totally_frozen;
7495 struct VacuumCutoffs cutoffs;
7496 HeapPageFreeze pagefrz;
7497
7498 cutoffs.relfrozenxid = relfrozenxid;
7499 cutoffs.relminmxid = relminmxid;
7500 cutoffs.OldestXmin = FreezeLimit;
7501 cutoffs.OldestMxact = MultiXactCutoff;
7502 cutoffs.FreezeLimit = FreezeLimit;
7503 cutoffs.MultiXactCutoff = MultiXactCutoff;
7504
7505 pagefrz.freeze_required = true;
7506 pagefrz.FreezePageRelfrozenXid = FreezeLimit;
7507 pagefrz.FreezePageRelminMxid = MultiXactCutoff;
7508 pagefrz.FreezePageConflictXid = InvalidTransactionId;
7509 pagefrz.NoFreezePageRelfrozenXid = FreezeLimit;
7510 pagefrz.NoFreezePageRelminMxid = MultiXactCutoff;
7511
7512 do_freeze = heap_prepare_freeze_tuple(tuple, &cutoffs,
7513 &pagefrz, &frz, &totally_frozen);
7514
7515 /*
7516 * Note that because this is not a WAL-logged operation, we don't need to
7517 * fill in the offset in the freeze record.
7518 */
7519
7520 if (do_freeze)
7522 return do_freeze;
7523}

References fb(), VacuumCutoffs::FreezeLimit, heap_execute_freeze_tuple(), heap_prepare_freeze_tuple(), InvalidTransactionId, VacuumCutoffs::MultiXactCutoff, VacuumCutoffs::OldestMxact, VacuumCutoffs::OldestXmin, VacuumCutoffs::relfrozenxid, and VacuumCutoffs::relminmxid.

Referenced by rewrite_heap_tuple().

◆ heap_get_latest_tid()

void heap_get_latest_tid ( TableScanDesc  sscan,
ItemPointer  tid 
)

Definition at line 1931 of file heapam.c.

1933{
1934 Relation relation = sscan->rs_rd;
1935 Snapshot snapshot = sscan->rs_snapshot;
1936 ItemPointerData ctid;
1938
1939 /*
1940 * table_tuple_get_latest_tid() verified that the passed in tid is valid.
1941 * Assume that t_ctid links are valid however - there shouldn't be invalid
1942 * ones in the table.
1943 */
1945
1946 /*
1947 * Loop to chase down t_ctid links. At top of loop, ctid is the tuple we
1948 * need to examine, and *tid is the TID we will return if ctid turns out
1949 * to be bogus.
1950 *
1951 * Note that we will loop until we reach the end of the t_ctid chain.
1952 * Depending on the snapshot passed, there might be at most one visible
1953 * version of the row, but we don't try to optimize for that.
1954 */
1955 ctid = *tid;
1956 priorXmax = InvalidTransactionId; /* cannot check first XMIN */
1957 for (;;)
1958 {
1959 Buffer buffer;
1960 Page page;
1961 OffsetNumber offnum;
1962 ItemId lp;
1963 HeapTupleData tp;
1964 bool valid;
1965
1966 /*
1967 * Read, pin, and lock the page.
1968 */
1969 buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(&ctid));
1971 page = BufferGetPage(buffer);
1972
1973 /*
1974 * Check for bogus item number. This is not treated as an error
1975 * condition because it can happen while following a t_ctid link. We
1976 * just assume that the prior tid is OK and return it unchanged.
1977 */
1978 offnum = ItemPointerGetOffsetNumber(&ctid);
1980 {
1981 UnlockReleaseBuffer(buffer);
1982 break;
1983 }
1984 lp = PageGetItemId(page, offnum);
1985 if (!ItemIdIsNormal(lp))
1986 {
1987 UnlockReleaseBuffer(buffer);
1988 break;
1989 }
1990
1991 /* OK to access the tuple */
1992 tp.t_self = ctid;
1993 tp.t_data = (HeapTupleHeader) PageGetItem(page, lp);
1994 tp.t_len = ItemIdGetLength(lp);
1995 tp.t_tableOid = RelationGetRelid(relation);
1996
1997 /*
1998 * After following a t_ctid link, we might arrive at an unrelated
1999 * tuple. Check for XMIN match.
2000 */
2003 {
2004 UnlockReleaseBuffer(buffer);
2005 break;
2006 }
2007
2008 /*
2009 * Check tuple visibility; if visible, set it as the new result
2010 * candidate.
2011 */
2012 valid = HeapTupleSatisfiesVisibility(&tp, snapshot, buffer);
2013 HeapCheckForSerializableConflictOut(valid, relation, &tp, buffer, snapshot);
2014 if (valid)
2015 *tid = ctid;
2016
2017 /*
2018 * If there's a valid t_ctid link, follow it, else we're done.
2019 */
2020 if ((tp.t_data->t_infomask & HEAP_XMAX_INVALID) ||
2024 {
2025 UnlockReleaseBuffer(buffer);
2026 break;
2027 }
2028
2029 ctid = tp.t_data->t_ctid;
2031 UnlockReleaseBuffer(buffer);
2032 } /* end of loop */
2033}

References Assert, BUFFER_LOCK_SHARE, BufferGetPage(), fb(), HEAP_XMAX_INVALID, HeapCheckForSerializableConflictOut(), HeapTupleHeaderGetUpdateXid(), HeapTupleHeaderGetXmin(), HeapTupleHeaderIndicatesMovedPartitions(), HeapTupleHeaderIsOnlyLocked(), HeapTupleSatisfiesVisibility(), InvalidTransactionId, ItemIdGetLength, ItemIdIsNormal, ItemPointerEquals(), ItemPointerGetBlockNumber(), ItemPointerGetOffsetNumber(), ItemPointerIsValid(), LockBuffer(), PageGetItem(), PageGetItemId(), PageGetMaxOffsetNumber(), ReadBuffer(), RelationGetRelid, HeapTupleHeaderData::t_ctid, HeapTupleData::t_data, HeapTupleHeaderData::t_infomask, HeapTupleData::t_len, HeapTupleData::t_self, HeapTupleData::t_tableOid, TransactionIdEquals, TransactionIdIsValid, and UnlockReleaseBuffer().

◆ heap_getnext()

HeapTuple heap_getnext ( TableScanDesc  sscan,
ScanDirection  direction 
)

Definition at line 1410 of file heapam.c.

1411{
1413
1414 /*
1415 * This is still widely used directly, without going through table AM, so
1416 * add a safety check. It's possible we should, at a later point,
1417 * downgrade this to an assert. The reason for checking the AM routine,
1418 * rather than the AM oid, is that this allows to write regression tests
1419 * that create another AM reusing the heap handler.
1420 */
1421 if (unlikely(sscan->rs_rd->rd_tableam != GetHeapamTableAmRoutine()))
1422 ereport(ERROR,
1424 errmsg_internal("only heap AM is supported")));
1425
1426 /* Note: no locking manipulations needed */
1427
1429 heapgettup_pagemode(scan, direction,
1430 scan->rs_base.rs_nkeys, scan->rs_base.rs_key);
1431 else
1432 heapgettup(scan, direction,
1433 scan->rs_base.rs_nkeys, scan->rs_base.rs_key);
1434
1435 if (scan->rs_ctup.t_data == NULL)
1436 return NULL;
1437
1438 /*
1439 * if we get here it means we have a new current scan tuple, so point to
1440 * the proper return buffer and return the tuple.
1441 */
1442
1444
1445 return &scan->rs_ctup;
1446}

References ereport, errcode(), errmsg_internal(), ERROR, fb(), GetHeapamTableAmRoutine(), heapgettup(), heapgettup_pagemode(), pgstat_count_heap_getnext, HeapScanDescData::rs_base, HeapScanDescData::rs_ctup, TableScanDescData::rs_flags, TableScanDescData::rs_key, TableScanDescData::rs_nkeys, TableScanDescData::rs_rd, SO_ALLOW_PAGEMODE, HeapTupleData::t_data, and unlikely.

Referenced by AlterTableMoveAll(), AlterTableSpaceOptions(), check_db_file_conflict(), CreateDatabaseUsingFileCopy(), do_autovacuum(), DropSetting(), DropTableSpace(), find_typed_table_dependencies(), get_all_vacuum_rels(), get_database_list(), get_subscription_list(), get_tables_to_repack(), get_tablespace_name(), get_tablespace_oid(), GetAllPublicationRelations(), getRelationsInNamespace(), GetSchemaPublicationRelations(), heapam_index_build_range_scan(), heapam_index_validate_scan(), objectsInSchemaToOids(), pgrowlocks(), pgstat_heap(), populate_typ_list(), ReindexMultipleTables(), remove_dbtablespaces(), RemoveSubscriptionRel(), RenameTableSpace(), ThereIsAtLeastOneRole(), and vac_truncate_clog().

◆ heap_getnextslot()

bool heap_getnextslot ( TableScanDesc  sscan,
ScanDirection  direction,
TupleTableSlot slot 
)

Definition at line 1449 of file heapam.c.

1450{
1452
1453 /* Note: no locking manipulations needed */
1454
1455 if (sscan->rs_flags & SO_ALLOW_PAGEMODE)
1456 heapgettup_pagemode(scan, direction, sscan->rs_nkeys, sscan->rs_key);
1457 else
1458 heapgettup(scan, direction, sscan->rs_nkeys, sscan->rs_key);
1459
1460 if (scan->rs_ctup.t_data == NULL)
1461 {
1462 ExecClearTuple(slot);
1463 return false;
1464 }
1465
1466 /*
1467 * if we get here it means we have a new current scan tuple, so point to
1468 * the proper return buffer and return the tuple.
1469 */
1470
1472
1473 ExecStoreBufferHeapTuple(&scan->rs_ctup, slot,
1474 scan->rs_cbuf);
1475 return true;
1476}

References ExecClearTuple(), ExecStoreBufferHeapTuple(), fb(), heapgettup(), heapgettup_pagemode(), pgstat_count_heap_getnext, HeapScanDescData::rs_base, HeapScanDescData::rs_cbuf, HeapScanDescData::rs_ctup, TableScanDescData::rs_rd, SO_ALLOW_PAGEMODE, and HeapTupleData::t_data.

◆ heap_getnextslot_tidrange()

bool heap_getnextslot_tidrange ( TableScanDesc  sscan,
ScanDirection  direction,
TupleTableSlot slot 
)

Definition at line 1552 of file heapam.c.

1554{
1556 ItemPointer mintid = &sscan->st.tidrange.rs_mintid;
1557 ItemPointer maxtid = &sscan->st.tidrange.rs_maxtid;
1558
1559 /* Note: no locking manipulations needed */
1560 for (;;)
1561 {
1562 if (sscan->rs_flags & SO_ALLOW_PAGEMODE)
1563 heapgettup_pagemode(scan, direction, sscan->rs_nkeys, sscan->rs_key);
1564 else
1565 heapgettup(scan, direction, sscan->rs_nkeys, sscan->rs_key);
1566
1567 if (scan->rs_ctup.t_data == NULL)
1568 {
1569 ExecClearTuple(slot);
1570 return false;
1571 }
1572
1573 /*
1574 * heap_set_tidrange will have used heap_setscanlimits to limit the
1575 * range of pages we scan to only ones that can contain the TID range
1576 * we're scanning for. Here we must filter out any tuples from these
1577 * pages that are outside of that range.
1578 */
1579 if (ItemPointerCompare(&scan->rs_ctup.t_self, mintid) < 0)
1580 {
1581 ExecClearTuple(slot);
1582
1583 /*
1584 * When scanning backwards, the TIDs will be in descending order.
1585 * Future tuples in this direction will be lower still, so we can
1586 * just return false to indicate there will be no more tuples.
1587 */
1588 if (ScanDirectionIsBackward(direction))
1589 return false;
1590
1591 continue;
1592 }
1593
1594 /*
1595 * Likewise for the final page, we must filter out TIDs greater than
1596 * maxtid.
1597 */
1598 if (ItemPointerCompare(&scan->rs_ctup.t_self, maxtid) > 0)
1599 {
1600 ExecClearTuple(slot);
1601
1602 /*
1603 * When scanning forward, the TIDs will be in ascending order.
1604 * Future tuples in this direction will be higher still, so we can
1605 * just return false to indicate there will be no more tuples.
1606 */
1607 if (ScanDirectionIsForward(direction))
1608 return false;
1609 continue;
1610 }
1611
1612 break;
1613 }
1614
1615 /*
1616 * if we get here it means we have a new current scan tuple, so point to
1617 * the proper return buffer and return the tuple.
1618 */
1620
1621 ExecStoreBufferHeapTuple(&scan->rs_ctup, slot, scan->rs_cbuf);
1622 return true;
1623}

References ExecClearTuple(), ExecStoreBufferHeapTuple(), fb(), heapgettup(), heapgettup_pagemode(), ItemPointerCompare(), pgstat_count_heap_getnext, HeapScanDescData::rs_base, HeapScanDescData::rs_cbuf, HeapScanDescData::rs_ctup, TableScanDescData::rs_rd, ScanDirectionIsBackward, ScanDirectionIsForward, SO_ALLOW_PAGEMODE, HeapTupleData::t_data, and HeapTupleData::t_self.

◆ heap_hot_search_buffer()

bool heap_hot_search_buffer ( ItemPointer  tid,
Relation  relation,
Buffer  buffer,
Snapshot  snapshot,
HeapTuple  heapTuple,
bool all_dead,
bool  first_call 
)

Definition at line 1779 of file heapam.c.

1782{
1783 Page page = BufferGetPage(buffer);
1785 BlockNumber blkno;
1786 OffsetNumber offnum;
1787 bool at_chain_start;
1788 bool valid;
1789 bool skip;
1790 GlobalVisState *vistest = NULL;
1791
1792 /* If this is not the first call, previous call returned a (live!) tuple */
1793 if (all_dead)
1795
1796 blkno = ItemPointerGetBlockNumber(tid);
1797 offnum = ItemPointerGetOffsetNumber(tid);
1799 skip = !first_call;
1800
1801 /* XXX: we should assert that a snapshot is pushed or registered */
1803 Assert(BufferGetBlockNumber(buffer) == blkno);
1804
1805 /* Scan through possible multiple members of HOT-chain */
1806 for (;;)
1807 {
1808 ItemId lp;
1809
1810 /* check for bogus TID */
1812 break;
1813
1814 lp = PageGetItemId(page, offnum);
1815
1816 /* check for unused, dead, or redirected items */
1817 if (!ItemIdIsNormal(lp))
1818 {
1819 /* We should only see a redirect at start of chain */
1821 {
1822 /* Follow the redirect */
1823 offnum = ItemIdGetRedirect(lp);
1824 at_chain_start = false;
1825 continue;
1826 }
1827 /* else must be end of chain */
1828 break;
1829 }
1830
1831 /*
1832 * Update heapTuple to point to the element of the HOT chain we're
1833 * currently investigating. Having t_self set correctly is important
1834 * because the SSI checks and the *Satisfies routine for historical
1835 * MVCC snapshots need the correct tid to decide about the visibility.
1836 */
1837 heapTuple->t_data = (HeapTupleHeader) PageGetItem(page, lp);
1838 heapTuple->t_len = ItemIdGetLength(lp);
1839 heapTuple->t_tableOid = RelationGetRelid(relation);
1840 ItemPointerSet(&heapTuple->t_self, blkno, offnum);
1841
1842 /*
1843 * Shouldn't see a HEAP_ONLY tuple at chain start.
1844 */
1846 break;
1847
1848 /*
1849 * The xmin should match the previous xmax value, else chain is
1850 * broken.
1851 */
1855 break;
1856
1857 /*
1858 * When first_call is true (and thus, skip is initially false) we'll
1859 * return the first tuple we find. But on later passes, heapTuple
1860 * will initially be pointing to the tuple we returned last time.
1861 * Returning it again would be incorrect (and would loop forever), so
1862 * we skip it and return the next match we find.
1863 */
1864 if (!skip)
1865 {
1866 /* If it's visible per the snapshot, we must return it */
1867 valid = HeapTupleSatisfiesVisibility(heapTuple, snapshot, buffer);
1869 buffer, snapshot);
1870
1871 if (valid)
1872 {
1873 ItemPointerSetOffsetNumber(tid, offnum);
1874 PredicateLockTID(relation, &heapTuple->t_self, snapshot,
1876 if (all_dead)
1877 *all_dead = false;
1878 return true;
1879 }
1880 }
1881 skip = false;
1882
1883 /*
1884 * If we can't see it, maybe no one else can either. At caller
1885 * request, check whether all chain members are dead to all
1886 * transactions.
1887 *
1888 * Note: if you change the criterion here for what is "dead", fix the
1889 * planner's get_actual_variable_range() function to match.
1890 */
1891 if (all_dead && *all_dead)
1892 {
1893 if (!vistest)
1894 vistest = GlobalVisTestFor(relation);
1895
1896 if (!HeapTupleIsSurelyDead(heapTuple, vistest))
1897 *all_dead = false;
1898 }
1899
1900 /*
1901 * Check to see if HOT chain continues past this tuple; if so fetch
1902 * the next offnum and loop around.
1903 */
1905 {
1906 Assert(ItemPointerGetBlockNumber(&heapTuple->t_data->t_ctid) ==
1907 blkno);
1908 offnum = ItemPointerGetOffsetNumber(&heapTuple->t_data->t_ctid);
1909 at_chain_start = false;
1911 }
1912 else
1913 break; /* end of chain */
1914 }
1915
1916 return false;
1917}

References Assert, BufferGetBlockNumber(), BufferGetPage(), fb(), GlobalVisTestFor(), HeapCheckForSerializableConflictOut(), HeapTupleHeaderGetUpdateXid(), HeapTupleHeaderGetXmin(), HeapTupleIsHeapOnly(), HeapTupleIsHotUpdated(), HeapTupleIsSurelyDead(), HeapTupleSatisfiesVisibility(), InvalidTransactionId, ItemIdGetLength, ItemIdGetRedirect, ItemIdIsNormal, ItemIdIsRedirected, ItemPointerGetBlockNumber(), ItemPointerGetOffsetNumber(), ItemPointerSet(), ItemPointerSetOffsetNumber(), PageGetItem(), PageGetItemId(), PageGetMaxOffsetNumber(), PredicateLockTID(), RecentXmin, RelationGetRelid, skip, TransactionIdEquals, and TransactionIdIsValid.

Referenced by BitmapHeapScanNextBlock(), heap_index_delete_tuples(), and heapam_index_fetch_tuple().

◆ heap_index_delete_tuples()

TransactionId heap_index_delete_tuples ( Relation  rel,
TM_IndexDeleteOp delstate 
)

Definition at line 8205 of file heapam.c.

8206{
8207 /* Initial assumption is that earlier pruning took care of conflict */
8208 TransactionId snapshotConflictHorizon = InvalidTransactionId;
8211 Page page = NULL;
8214#ifdef USE_PREFETCH
8217#endif
8219 int finalndeltids = 0,
8220 nblocksaccessed = 0;
8221
8222 /* State that's only used in bottom-up index deletion case */
8223 int nblocksfavorable = 0;
8224 int curtargetfreespace = delstate->bottomupfreespace,
8225 lastfreespace = 0,
8226 actualfreespace = 0;
8227 bool bottomup_final_block = false;
8228
8230
8231 /* Sort caller's deltids array by TID for further processing */
8233
8234 /*
8235 * Bottom-up case: resort deltids array in an order attuned to where the
8236 * greatest number of promising TIDs are to be found, and determine how
8237 * many blocks from the start of sorted array should be considered
8238 * favorable. This will also shrink the deltids array in order to
8239 * eliminate completely unfavorable blocks up front.
8240 */
8241 if (delstate->bottomup)
8243
8244#ifdef USE_PREFETCH
8245 /* Initialize prefetch state. */
8247 prefetch_state.next_item = 0;
8248 prefetch_state.ndeltids = delstate->ndeltids;
8249 prefetch_state.deltids = delstate->deltids;
8250
8251 /*
8252 * Determine the prefetch distance that we will attempt to maintain.
8253 *
8254 * Since the caller holds a buffer lock somewhere in rel, we'd better make
8255 * sure that isn't a catalog relation before we call code that does
8256 * syscache lookups, to avoid risk of deadlock.
8257 */
8258 if (IsCatalogRelation(rel))
8260 else
8263
8264 /* Cap initial prefetch distance for bottom-up deletion caller */
8265 if (delstate->bottomup)
8266 {
8270 }
8271
8272 /* Start prefetching. */
8274#endif
8275
8276 /* Iterate over deltids, determine which to delete, check their horizon */
8277 Assert(delstate->ndeltids > 0);
8278 for (int i = 0; i < delstate->ndeltids; i++)
8279 {
8280 TM_IndexDelete *ideltid = &delstate->deltids[i];
8281 TM_IndexStatus *istatus = delstate->status + ideltid->id;
8282 ItemPointer htid = &ideltid->tid;
8283 OffsetNumber offnum;
8284
8285 /*
8286 * Read buffer, and perform required extra steps each time a new block
8287 * is encountered. Avoid refetching if it's the same block as the one
8288 * from the last htid.
8289 */
8290 if (blkno == InvalidBlockNumber ||
8292 {
8293 /*
8294 * Consider giving up early for bottom-up index deletion caller
8295 * first. (Only prefetch next-next block afterwards, when it
8296 * becomes clear that we're at least going to access the next
8297 * block in line.)
8298 *
8299 * Sometimes the first block frees so much space for bottom-up
8300 * caller that the deletion process can end without accessing any
8301 * more blocks. It is usually necessary to access 2 or 3 blocks
8302 * per bottom-up deletion operation, though.
8303 */
8304 if (delstate->bottomup)
8305 {
8306 /*
8307 * We often allow caller to delete a few additional items
8308 * whose entries we reached after the point that space target
8309 * from caller was satisfied. The cost of accessing the page
8310 * was already paid at that point, so it made sense to finish
8311 * it off. When that happened, we finalize everything here
8312 * (by finishing off the whole bottom-up deletion operation
8313 * without needlessly paying the cost of accessing any more
8314 * blocks).
8315 */
8317 break;
8318
8319 /*
8320 * Give up when we didn't enable our caller to free any
8321 * additional space as a result of processing the page that we
8322 * just finished up with. This rule is the main way in which
8323 * we keep the cost of bottom-up deletion under control.
8324 */
8326 break;
8327 lastfreespace = actualfreespace; /* for next time */
8328
8329 /*
8330 * Deletion operation (which is bottom-up) will definitely
8331 * access the next block in line. Prepare for that now.
8332 *
8333 * Decay target free space so that we don't hang on for too
8334 * long with a marginal case. (Space target is only truly
8335 * helpful when it allows us to recognize that we don't need
8336 * to access more than 1 or 2 blocks to satisfy caller due to
8337 * agreeable workload characteristics.)
8338 *
8339 * We are a bit more patient when we encounter contiguous
8340 * blocks, though: these are treated as favorable blocks. The
8341 * decay process is only applied when the next block in line
8342 * is not a favorable/contiguous block. This is not an
8343 * exception to the general rule; we still insist on finding
8344 * at least one deletable item per block accessed. See
8345 * bottomup_nblocksfavorable() for full details of the theory
8346 * behind favorable blocks and heap block locality in general.
8347 *
8348 * Note: The first block in line is always treated as a
8349 * favorable block, so the earliest possible point that the
8350 * decay can be applied is just before we access the second
8351 * block in line. The Assert() verifies this for us.
8352 */
8354 if (nblocksfavorable > 0)
8356 else
8357 curtargetfreespace /= 2;
8358 }
8359
8360 /* release old buffer */
8361 if (BufferIsValid(buf))
8363
8365 buf = ReadBuffer(rel, blkno);
8367 Assert(!delstate->bottomup ||
8369
8370#ifdef USE_PREFETCH
8371
8372 /*
8373 * To maintain the prefetch distance, prefetch one more page for
8374 * each page we read.
8375 */
8377#endif
8378
8380
8381 page = BufferGetPage(buf);
8382 maxoff = PageGetMaxOffsetNumber(page);
8383 }
8384
8385 /*
8386 * In passing, detect index corruption involving an index page with a
8387 * TID that points to a location in the heap that couldn't possibly be
8388 * correct. We only do this with actual TIDs from caller's index page
8389 * (not items reached by traversing through a HOT chain).
8390 */
8392
8393 if (istatus->knowndeletable)
8394 Assert(!delstate->bottomup && !istatus->promising);
8395 else
8396 {
8397 ItemPointerData tmp = *htid;
8399
8400 /* Are any tuples from this HOT chain non-vacuumable? */
8402 &heapTuple, NULL, true))
8403 continue; /* can't delete entry */
8404
8405 /* Caller will delete, since whole HOT chain is vacuumable */
8406 istatus->knowndeletable = true;
8407
8408 /* Maintain index free space info for bottom-up deletion case */
8409 if (delstate->bottomup)
8410 {
8411 Assert(istatus->freespace > 0);
8412 actualfreespace += istatus->freespace;
8414 bottomup_final_block = true;
8415 }
8416 }
8417
8418 /*
8419 * Maintain snapshotConflictHorizon value for deletion operation as a
8420 * whole by advancing current value using heap tuple headers. This is
8421 * loosely based on the logic for pruning a HOT chain.
8422 */
8424 priorXmax = InvalidTransactionId; /* cannot check first XMIN */
8425 for (;;)
8426 {
8427 ItemId lp;
8428 HeapTupleHeader htup;
8429
8430 /* Sanity check (pure paranoia) */
8431 if (offnum < FirstOffsetNumber)
8432 break;
8433
8434 /*
8435 * An offset past the end of page's line pointer array is possible
8436 * when the array was truncated
8437 */
8438 if (offnum > maxoff)
8439 break;
8440
8441 lp = PageGetItemId(page, offnum);
8443 {
8444 offnum = ItemIdGetRedirect(lp);
8445 continue;
8446 }
8447
8448 /*
8449 * We'll often encounter LP_DEAD line pointers (especially with an
8450 * entry marked knowndeletable by our caller up front). No heap
8451 * tuple headers get examined for an htid that leads us to an
8452 * LP_DEAD item. This is okay because the earlier pruning
8453 * operation that made the line pointer LP_DEAD in the first place
8454 * must have considered the original tuple header as part of
8455 * generating its own snapshotConflictHorizon value.
8456 *
8457 * Relying on XLOG_HEAP2_PRUNE_VACUUM_SCAN records like this is
8458 * the same strategy that index vacuuming uses in all cases. Index
8459 * VACUUM WAL records don't even have a snapshotConflictHorizon
8460 * field of their own for this reason.
8461 */
8462 if (!ItemIdIsNormal(lp))
8463 break;
8464
8465 htup = (HeapTupleHeader) PageGetItem(page, lp);
8466
8467 /*
8468 * Check the tuple XMIN against prior XMAX, if any
8469 */
8472 break;
8473
8475 &snapshotConflictHorizon);
8476
8477 /*
8478 * If the tuple is not HOT-updated, then we are at the end of this
8479 * HOT-chain. No need to visit later tuples from the same update
8480 * chain (they get their own index entries) -- just move on to
8481 * next htid from index AM caller.
8482 */
8483 if (!HeapTupleHeaderIsHotUpdated(htup))
8484 break;
8485
8486 /* Advance to next HOT chain member */
8487 Assert(ItemPointerGetBlockNumber(&htup->t_ctid) == blkno);
8488 offnum = ItemPointerGetOffsetNumber(&htup->t_ctid);
8490 }
8491
8492 /* Enable further/final shrinking of deltids for caller */
8493 finalndeltids = i + 1;
8494 }
8495
8497
8498 /*
8499 * Shrink deltids array to exclude non-deletable entries at the end. This
8500 * is not just a minor optimization. Final deltids array size might be
8501 * zero for a bottom-up caller. Index AM is explicitly allowed to rely on
8502 * ndeltids being zero in all cases with zero total deletable entries.
8503 */
8504 Assert(finalndeltids > 0 || delstate->bottomup);
8505 delstate->ndeltids = finalndeltids;
8506
8507 return snapshotConflictHorizon;
8508}

References Assert, BOTTOMUP_MAX_NBLOCKS, bottomup_sort_and_shrink(), buf, BUFFER_LOCK_SHARE, BufferGetPage(), BufferIsValid(), fb(), FirstOffsetNumber, get_tablespace_maintenance_io_concurrency(), GlobalVisTestFor(), heap_hot_search_buffer(), HeapTupleHeaderAdvanceConflictHorizon(), HeapTupleHeaderGetUpdateXid(), HeapTupleHeaderGetXmin(), HeapTupleHeaderIsHotUpdated(), i, index_delete_check_htid(), index_delete_sort(), InitNonVacuumableSnapshot, InvalidBlockNumber, InvalidBuffer, InvalidOffsetNumber, InvalidTransactionId, IsCatalogRelation(), ItemIdGetRedirect, ItemIdIsNormal, ItemIdIsRedirected, ItemPointerGetBlockNumber(), ItemPointerGetOffsetNumber(), LockBuffer(), maintenance_io_concurrency, Min, PageGetItem(), PageGetItemId(), PageGetMaxOffsetNumber(), RelationData::rd_rel, ReadBuffer(), HeapTupleHeaderData::t_ctid, TransactionIdEquals, TransactionIdIsValid, and UnlockReleaseBuffer().

◆ heap_inplace_lock()

bool heap_inplace_lock ( Relation  relation,
HeapTuple  oldtup_ptr,
Buffer  buffer,
void(*)(void *)  release_callback,
void arg 
)

Definition at line 6438 of file heapam.c.

6441{
6442 HeapTupleData oldtup = *oldtup_ptr; /* minimize diff vs. heap_update() */
6443 TM_Result result;
6444 bool ret;
6445
6446#ifdef USE_ASSERT_CHECKING
6447 if (RelationGetRelid(relation) == RelationRelationId)
6449#endif
6450
6451 Assert(BufferIsValid(buffer));
6452
6453 /*
6454 * Register shared cache invals if necessary. Other sessions may finish
6455 * inplace updates of this tuple between this step and LockTuple(). Since
6456 * inplace updates don't change cache keys, that's harmless.
6457 *
6458 * While it's tempting to register invals only after confirming we can
6459 * return true, the following obstacle precludes reordering steps that
6460 * way. Registering invals might reach a CatalogCacheInitializeCache()
6461 * that locks "buffer". That would hang indefinitely if running after our
6462 * own LockBuffer(). Hence, we must register invals before LockBuffer().
6463 */
6465
6466 LockTuple(relation, &oldtup.t_self, InplaceUpdateTupleLock);
6468
6469 /*----------
6470 * Interpret HeapTupleSatisfiesUpdate() like heap_update() does, except:
6471 *
6472 * - wait unconditionally
6473 * - already locked tuple above, since inplace needs that unconditionally
6474 * - don't recheck header after wait: simpler to defer to next iteration
6475 * - don't try to continue even if the updater aborts: likewise
6476 * - no crosscheck
6477 */
6479 buffer);
6480
6481 if (result == TM_Invisible)
6482 {
6483 /* no known way this can happen */
6484 ereport(ERROR,
6486 errmsg_internal("attempted to overwrite invisible tuple")));
6487 }
6488 else if (result == TM_SelfModified)
6489 {
6490 /*
6491 * CREATE INDEX might reach this if an expression is silly enough to
6492 * call e.g. SELECT ... FROM pg_class FOR SHARE. C code of other SQL
6493 * statements might get here after a heap_update() of the same row, in
6494 * the absence of an intervening CommandCounterIncrement().
6495 */
6496 ereport(ERROR,
6498 errmsg("tuple to be updated was already modified by an operation triggered by the current command")));
6499 }
6500 else if (result == TM_BeingModified)
6501 {
6504
6506 infomask = oldtup.t_data->t_infomask;
6507
6509 {
6512 int remain;
6513
6515 lockmode, NULL))
6516 {
6519 ret = false;
6521 relation, &oldtup.t_self, XLTW_Update,
6522 &remain);
6523 }
6524 else
6525 ret = true;
6526 }
6528 ret = true;
6530 ret = true;
6531 else
6532 {
6535 ret = false;
6536 XactLockTableWait(xwait, relation, &oldtup.t_self,
6537 XLTW_Update);
6538 }
6539 }
6540 else
6541 {
6542 ret = (result == TM_Ok);
6543 if (!ret)
6544 {
6547 }
6548 }
6549
6550 /*
6551 * GetCatalogSnapshot() relies on invalidation messages to know when to
6552 * take a new snapshot. COMMIT of xwait is responsible for sending the
6553 * invalidation. We're not acquiring heavyweight locks sufficient to
6554 * block if not yet sent, so we must take a new snapshot to ensure a later
6555 * attempt has a fair chance. While we don't need this if xwait aborted,
6556 * don't bother optimizing that.
6557 */
6558 if (!ret)
6559 {
6560 UnlockTuple(relation, &oldtup.t_self, InplaceUpdateTupleLock);
6563 }
6564 return ret;
6565}

References arg, Assert, BUFFER_LOCK_EXCLUSIVE, BUFFER_LOCK_UNLOCK, BufferIsValid(), CacheInvalidateHeapTupleInplace(), DoesMultiXactIdConflict(), ereport, errcode(), errmsg, errmsg_internal(), ERROR, fb(), ForgetInplace_Inval(), GetCurrentCommandId(), HEAP_XMAX_IS_KEYSHR_LOCKED(), HEAP_XMAX_IS_MULTI, HeapTupleHeaderGetRawXmax(), HeapTupleSatisfiesUpdate(), InplaceUpdateTupleLock, InvalidateCatalogSnapshot(), LockBuffer(), LockTuple(), LockTupleNoKeyExclusive, MultiXactIdWait(), MultiXactStatusNoKeyUpdate, RelationGetRelid, TM_BeingModified, TM_Invisible, TM_Ok, TM_SelfModified, TransactionIdIsCurrentTransactionId(), UnlockTuple(), XactLockTableWait(), and XLTW_Update.

Referenced by systable_inplace_update_begin().

◆ heap_inplace_unlock()

void heap_inplace_unlock ( Relation  relation,
HeapTuple  oldtup,
Buffer  buffer 
)

◆ heap_inplace_update_and_unlock()

void heap_inplace_update_and_unlock ( Relation  relation,
HeapTuple  oldtup,
HeapTuple  tuple,
Buffer  buffer 
)

Definition at line 6576 of file heapam.c.

6579{
6580 HeapTupleHeader htup = oldtup->t_data;
6581 uint32 oldlen;
6582 uint32 newlen;
6583 char *dst;
6584 char *src;
6585 int nmsgs = 0;
6587 bool RelcacheInitFileInval = false;
6588
6589 Assert(ItemPointerEquals(&oldtup->t_self, &tuple->t_self));
6590 oldlen = oldtup->t_len - htup->t_hoff;
6591 newlen = tuple->t_len - tuple->t_data->t_hoff;
6592 if (oldlen != newlen || htup->t_hoff != tuple->t_data->t_hoff)
6593 elog(ERROR, "wrong tuple length");
6594
6595 dst = (char *) htup + htup->t_hoff;
6596 src = (char *) tuple->t_data + tuple->t_data->t_hoff;
6597
6598 /* Like RecordTransactionCommit(), log only if needed */
6601 &RelcacheInitFileInval);
6602
6603 /*
6604 * Unlink relcache init files as needed. If unlinking, acquire
6605 * RelCacheInitLock until after associated invalidations. By doing this
6606 * in advance, if we checkpoint and then crash between inplace
6607 * XLogInsert() and inval, we don't rely on StartupXLOG() ->
6608 * RelationCacheInitFileRemove(). That uses elevel==LOG, so replay would
6609 * neglect to PANIC on EIO.
6610 */
6612
6613 /*----------
6614 * NO EREPORT(ERROR) from here till changes are complete
6615 *
6616 * Our exclusive buffer lock won't stop a reader having already pinned and
6617 * checked visibility for this tuple. With the usual order of changes
6618 * (i.e. updating the buffer contents before WAL logging), a reader could
6619 * observe our not-yet-persistent update to relfrozenxid and update
6620 * datfrozenxid based on that. A crash in that moment could allow
6621 * datfrozenxid to overtake relfrozenxid:
6622 *
6623 * ["D" is a VACUUM (ONLY_DATABASE_STATS)]
6624 * ["R" is a VACUUM tbl]
6625 * D: vac_update_datfrozenxid() -> systable_beginscan(pg_class)
6626 * D: systable_getnext() returns pg_class tuple of tbl
6627 * R: memcpy() into pg_class tuple of tbl
6628 * D: raise pg_database.datfrozenxid, XLogInsert(), finish
6629 * [crash]
6630 * [recovery restores datfrozenxid w/o relfrozenxid]
6631 *
6632 * We avoid that by using a temporary copy of the buffer to hide our
6633 * change from other backends until the change has been WAL-logged. We
6634 * apply our change to the temporary copy and WAL-log it, before modifying
6635 * the real page. That way any action a reader of the in-place-updated
6636 * value takes will be WAL logged after this change.
6637 */
6639
6640 MarkBufferDirty(buffer);
6641
6642 /* XLOG stuff */
6643 if (RelationNeedsWAL(relation))
6644 {
6647 char *origdata = (char *) BufferGetBlock(buffer);
6648 Page page = BufferGetPage(buffer);
6649 uint16 lower = ((PageHeader) page)->pd_lower;
6650 uint16 upper = ((PageHeader) page)->pd_upper;
6652 RelFileLocator rlocator;
6653 ForkNumber forkno;
6654 BlockNumber blkno;
6656
6657 xlrec.offnum = ItemPointerGetOffsetNumber(&tuple->t_self);
6658 xlrec.dbId = MyDatabaseId;
6660 xlrec.relcacheInitFileInval = RelcacheInitFileInval;
6661 xlrec.nmsgs = nmsgs;
6662
6665 if (nmsgs != 0)
6667 nmsgs * sizeof(SharedInvalidationMessage));
6668
6669 /* register block matching what buffer will look like after changes */
6674 BufferGetTag(buffer, &rlocator, &forkno, &blkno);
6675 Assert(forkno == MAIN_FORKNUM);
6676 XLogRegisterBlock(0, &rlocator, forkno, blkno, copied_buffer.data,
6678 XLogRegisterBufData(0, src, newlen);
6679
6680 /* inplace updates aren't decoded atm, don't log the origin */
6681
6683
6684 PageSetLSN(page, recptr);
6685 }
6686
6687 memcpy(dst, src, newlen);
6688
6690
6691 /*
6692 * Send invalidations to shared queue. SearchSysCacheLocked1() assumes we
6693 * do this before UnlockTuple().
6694 */
6696
6698 UnlockTuple(relation, &tuple->t_self, InplaceUpdateTupleLock);
6699
6700 AcceptInvalidationMessages(); /* local processing of just-sent inval */
6701
6702 /*
6703 * Queue a transactional inval, for logical decoding and for third-party
6704 * code that might have been relying on it since long before inplace
6705 * update adopted immediate invalidation. See README.tuplock section
6706 * "Reading inplace-updated columns" for logical decoding details.
6707 */
6709 CacheInvalidateHeapTuple(relation, tuple, NULL);
6710}

References AcceptInvalidationMessages(), Assert, AtInplace_Inval(), BUFFER_LOCK_UNLOCK, BufferGetBlock(), BufferGetPage(), BufferGetTag(), CacheInvalidateHeapTuple(), elog, END_CRIT_SECTION, ERROR, fb(), inplaceGetInvalidationMessages(), InplaceUpdateTupleLock, IsBootstrapProcessingMode, ItemPointerEquals(), ItemPointerGetOffsetNumber(), LockBuffer(), lower(), MAIN_FORKNUM, MarkBufferDirty(), MinSizeOfHeapInplace, MyDatabaseId, MyDatabaseTableSpace, PageSetLSN(), PreInplace_Inval(), REGBUF_STANDARD, RelationNeedsWAL, START_CRIT_SECTION, HeapTupleData::t_data, HeapTupleHeaderData::t_hoff, HeapTupleData::t_len, HeapTupleData::t_self, UnlockTuple(), upper(), XLOG_HEAP_INPLACE, XLogBeginInsert(), XLogInsert(), XLogRegisterBlock(), XLogRegisterBufData(), XLogRegisterData(), and XLogStandbyInfoActive.

Referenced by systable_inplace_update_finish().

◆ heap_insert()

void heap_insert ( Relation  relation,
HeapTuple  tup,
CommandId  cid,
int  options,
BulkInsertState  bistate 
)

Definition at line 2142 of file heapam.c.

2144{
2147 Buffer buffer;
2148 Buffer vmbuffer = InvalidBuffer;
2149 bool all_visible_cleared = false;
2150
2151 /* Cheap, simplistic check that the tuple matches the rel's rowtype. */
2154
2155 AssertHasSnapshotForToast(relation);
2156
2157 /*
2158 * Fill in tuple header fields and toast the tuple if necessary.
2159 *
2160 * Note: below this point, heaptup is the data we actually intend to store
2161 * into the relation; tup is the caller's original untoasted data.
2162 */
2163 heaptup = heap_prepare_insert(relation, tup, xid, cid, options);
2164
2165 /*
2166 * Find buffer to insert this tuple into. If the page is all visible,
2167 * this will also pin the requisite visibility map page.
2168 */
2169 buffer = RelationGetBufferForTuple(relation, heaptup->t_len,
2170 InvalidBuffer, options, bistate,
2171 &vmbuffer, NULL,
2172 0);
2173
2174 /*
2175 * We're about to do the actual insert -- but check for conflict first, to
2176 * avoid possibly having to roll back work we've just done.
2177 *
2178 * This is safe without a recheck as long as there is no possibility of
2179 * another process scanning the page between this check and the insert
2180 * being visible to the scan (i.e., an exclusive buffer content lock is
2181 * continuously held from this point until the tuple insert is visible).
2182 *
2183 * For a heap insert, we only need to check for table-level SSI locks. Our
2184 * new tuple can't possibly conflict with existing tuple locks, and heap
2185 * page locks are only consolidated versions of tuple locks; they do not
2186 * lock "gaps" as index page locks do. So we don't need to specify a
2187 * buffer when making the call, which makes for a faster check.
2188 */
2190
2191 /* NO EREPORT(ERROR) from here till changes are logged */
2193
2194 RelationPutHeapTuple(relation, buffer, heaptup,
2196
2197 if (PageIsAllVisible(BufferGetPage(buffer)))
2198 {
2199 all_visible_cleared = true;
2201 visibilitymap_clear(relation,
2203 vmbuffer, VISIBILITYMAP_VALID_BITS);
2204 }
2205
2206 /*
2207 * XXX Should we set PageSetPrunable on this page ?
2208 *
2209 * The inserting transaction may eventually abort thus making this tuple
2210 * DEAD and hence available for pruning. Though we don't want to optimize
2211 * for aborts, if no other tuple in this page is UPDATEd/DELETEd, the
2212 * aborted tuple will never be pruned until next vacuum is triggered.
2213 *
2214 * If you do add PageSetPrunable here, add it in heap_xlog_insert too.
2215 */
2216
2217 MarkBufferDirty(buffer);
2218
2219 /* XLOG stuff */
2220 if (RelationNeedsWAL(relation))
2221 {
2225 Page page = BufferGetPage(buffer);
2226 uint8 info = XLOG_HEAP_INSERT;
2227 int bufflags = 0;
2228
2229 /*
2230 * If this is a catalog, we need to transmit combo CIDs to properly
2231 * decode, so log that as well.
2232 */
2234 log_heap_new_cid(relation, heaptup);
2235
2236 /*
2237 * If this is the single and first tuple on page, we can reinit the
2238 * page instead of restoring the whole thing. Set flag, and hide
2239 * buffer references from XLogInsert.
2240 */
2243 {
2244 info |= XLOG_HEAP_INIT_PAGE;
2246 }
2247
2248 xlrec.offnum = ItemPointerGetOffsetNumber(&heaptup->t_self);
2249 xlrec.flags = 0;
2255
2256 /*
2257 * For logical decoding, we need the tuple even if we're doing a full
2258 * page write, so make sure it's included even if we take a full-page
2259 * image. (XXX We could alternatively store a pointer into the FPW).
2260 */
2261 if (RelationIsLogicallyLogged(relation) &&
2263 {
2266
2267 if (IsToastRelation(relation))
2269 }
2270
2273
2274 xlhdr.t_infomask2 = heaptup->t_data->t_infomask2;
2275 xlhdr.t_infomask = heaptup->t_data->t_infomask;
2276 xlhdr.t_hoff = heaptup->t_data->t_hoff;
2277
2278 /*
2279 * note we mark xlhdr as belonging to buffer; if XLogInsert decides to
2280 * write the whole page to the xlog, we don't need to store
2281 * xl_heap_header in the xlog.
2282 */
2285 /* PG73FORMAT: write bitmap [+ padding] [+ oid] + data */
2287 (char *) heaptup->t_data + SizeofHeapTupleHeader,
2289
2290 /* filtering by origin on a row level is much more efficient */
2292
2293 recptr = XLogInsert(RM_HEAP_ID, info);
2294
2295 PageSetLSN(page, recptr);
2296 }
2297
2299
2300 UnlockReleaseBuffer(buffer);
2301 if (vmbuffer != InvalidBuffer)
2302 ReleaseBuffer(vmbuffer);
2303
2304 /*
2305 * If tuple is cacheable, mark it for invalidation from the caches in case
2306 * we abort. Note it is OK to do this after releasing the buffer, because
2307 * the heaptup data structure is all in local memory, not in the shared
2308 * buffer.
2309 */
2311
2312 /* Note: speculative insertions are counted too, even if aborted later */
2313 pgstat_count_heap_insert(relation, 1);
2314
2315 /*
2316 * If heaptup is a private copy, release it. Don't forget to copy t_self
2317 * back to the caller's image, too.
2318 */
2319 if (heaptup != tup)
2320 {
2321 tup->t_self = heaptup->t_self;
2323 }
2324}

References Assert, AssertHasSnapshotForToast(), BufferGetBlockNumber(), BufferGetPage(), CacheInvalidateHeapTuple(), CheckForSerializableConflictIn(), END_CRIT_SECTION, fb(), FirstOffsetNumber, GetCurrentTransactionId(), heap_freetuple(), HEAP_INSERT_NO_LOGICAL, HEAP_INSERT_SPECULATIVE, heap_prepare_insert(), HeapTupleHeaderGetNatts, InvalidBlockNumber, InvalidBuffer, IsToastRelation(), ItemPointerGetBlockNumber(), ItemPointerGetOffsetNumber(), log_heap_new_cid(), MarkBufferDirty(), PageClearAllVisible(), PageGetMaxOffsetNumber(), PageIsAllVisible(), PageSetLSN(), pgstat_count_heap_insert(), REGBUF_KEEP_DATA, REGBUF_STANDARD, REGBUF_WILL_INIT, RelationGetBufferForTuple(), RelationGetNumberOfAttributes, RelationIsAccessibleInLogicalDecoding, RelationIsLogicallyLogged, RelationNeedsWAL, RelationPutHeapTuple(), ReleaseBuffer(), SizeOfHeapHeader, SizeOfHeapInsert, SizeofHeapTupleHeader, START_CRIT_SECTION, UnlockReleaseBuffer(), visibilitymap_clear(), VISIBILITYMAP_VALID_BITS, XLH_INSERT_ALL_VISIBLE_CLEARED, XLH_INSERT_CONTAINS_NEW_TUPLE, XLH_INSERT_IS_SPECULATIVE, XLH_INSERT_ON_TOAST_RELATION, XLOG_HEAP_INIT_PAGE, XLOG_HEAP_INSERT, XLOG_INCLUDE_ORIGIN, XLogBeginInsert(), XLogInsert(), XLogRegisterBufData(), XLogRegisterBuffer(), XLogRegisterData(), and XLogSetRecordFlags().

Referenced by heapam_tuple_insert(), heapam_tuple_insert_speculative(), simple_heap_insert(), and toast_save_datum().

◆ heap_lock_tuple()

TM_Result heap_lock_tuple ( Relation  relation,
HeapTuple  tuple,
CommandId  cid,
LockTupleMode  mode,
LockWaitPolicy  wait_policy,
bool  follow_updates,
Buffer buffer,
TM_FailureData tmfd 
)

Definition at line 4645 of file heapam.c.

4649{
4650 TM_Result result;
4651 ItemPointer tid = &(tuple->t_self);
4652 ItemId lp;
4653 Page page;
4654 Buffer vmbuffer = InvalidBuffer;
4655 BlockNumber block;
4656 TransactionId xid,
4657 xmax;
4661 bool first_time = true;
4662 bool skip_tuple_lock = false;
4663 bool have_tuple_lock = false;
4664 bool cleared_all_frozen = false;
4665
4666 *buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid));
4667 block = ItemPointerGetBlockNumber(tid);
4668
4669 /*
4670 * Before locking the buffer, pin the visibility map page if it appears to
4671 * be necessary. Since we haven't got the lock yet, someone else might be
4672 * in the middle of changing this, so we'll need to recheck after we have
4673 * the lock.
4674 */
4675 if (PageIsAllVisible(BufferGetPage(*buffer)))
4676 visibilitymap_pin(relation, block, &vmbuffer);
4677
4679
4680 page = BufferGetPage(*buffer);
4683
4684 tuple->t_data = (HeapTupleHeader) PageGetItem(page, lp);
4685 tuple->t_len = ItemIdGetLength(lp);
4686 tuple->t_tableOid = RelationGetRelid(relation);
4687
4688l3:
4689 result = HeapTupleSatisfiesUpdate(tuple, cid, *buffer);
4690
4691 if (result == TM_Invisible)
4692 {
4693 /*
4694 * This is possible, but only when locking a tuple for ON CONFLICT DO
4695 * SELECT/UPDATE. We return this value here rather than throwing an
4696 * error in order to give that case the opportunity to throw a more
4697 * specific error.
4698 */
4699 result = TM_Invisible;
4700 goto out_locked;
4701 }
4702 else if (result == TM_BeingModified ||
4703 result == TM_Updated ||
4704 result == TM_Deleted)
4705 {
4709 bool require_sleep;
4710 ItemPointerData t_ctid;
4711
4712 /* must copy state data before unlocking buffer */
4714 infomask = tuple->t_data->t_infomask;
4715 infomask2 = tuple->t_data->t_infomask2;
4716 ItemPointerCopy(&tuple->t_data->t_ctid, &t_ctid);
4717
4719
4720 /*
4721 * If any subtransaction of the current top transaction already holds
4722 * a lock as strong as or stronger than what we're requesting, we
4723 * effectively hold the desired lock already. We *must* succeed
4724 * without trying to take the tuple lock, else we will deadlock
4725 * against anyone wanting to acquire a stronger lock.
4726 *
4727 * Note we only do this the first time we loop on the HTSU result;
4728 * there is no point in testing in subsequent passes, because
4729 * evidently our own transaction cannot have acquired a new lock after
4730 * the first time we checked.
4731 */
4732 if (first_time)
4733 {
4734 first_time = false;
4735
4737 {
4738 int i;
4739 int nmembers;
4740 MultiXactMember *members;
4741
4742 /*
4743 * We don't need to allow old multixacts here; if that had
4744 * been the case, HeapTupleSatisfiesUpdate would have returned
4745 * MayBeUpdated and we wouldn't be here.
4746 */
4747 nmembers =
4748 GetMultiXactIdMembers(xwait, &members, false,
4750
4751 for (i = 0; i < nmembers; i++)
4752 {
4753 /* only consider members of our own transaction */
4754 if (!TransactionIdIsCurrentTransactionId(members[i].xid))
4755 continue;
4756
4757 if (TUPLOCK_from_mxstatus(members[i].status) >= mode)
4758 {
4759 pfree(members);
4760 result = TM_Ok;
4761 goto out_unlocked;
4762 }
4763 else
4764 {
4765 /*
4766 * Disable acquisition of the heavyweight tuple lock.
4767 * Otherwise, when promoting a weaker lock, we might
4768 * deadlock with another locker that has acquired the
4769 * heavyweight tuple lock and is waiting for our
4770 * transaction to finish.
4771 *
4772 * Note that in this case we still need to wait for
4773 * the multixact if required, to avoid acquiring
4774 * conflicting locks.
4775 */
4776 skip_tuple_lock = true;
4777 }
4778 }
4779
4780 if (members)
4781 pfree(members);
4782 }
4784 {
4785 switch (mode)
4786 {
4787 case LockTupleKeyShare:
4791 result = TM_Ok;
4792 goto out_unlocked;
4793 case LockTupleShare:
4796 {
4797 result = TM_Ok;
4798 goto out_unlocked;
4799 }
4800 break;
4803 {
4804 result = TM_Ok;
4805 goto out_unlocked;
4806 }
4807 break;
4808 case LockTupleExclusive:
4811 {
4812 result = TM_Ok;
4813 goto out_unlocked;
4814 }
4815 break;
4816 }
4817 }
4818 }
4819
4820 /*
4821 * Initially assume that we will have to wait for the locking
4822 * transaction(s) to finish. We check various cases below in which
4823 * this can be turned off.
4824 */
4825 require_sleep = true;
4826 if (mode == LockTupleKeyShare)
4827 {
4828 /*
4829 * If we're requesting KeyShare, and there's no update present, we
4830 * don't need to wait. Even if there is an update, we can still
4831 * continue if the key hasn't been modified.
4832 *
4833 * However, if there are updates, we need to walk the update chain
4834 * to mark future versions of the row as locked, too. That way,
4835 * if somebody deletes that future version, we're protected
4836 * against the key going away. This locking of future versions
4837 * could block momentarily, if a concurrent transaction is
4838 * deleting a key; or it could return a value to the effect that
4839 * the transaction deleting the key has already committed. So we
4840 * do this before re-locking the buffer; otherwise this would be
4841 * prone to deadlocks.
4842 *
4843 * Note that the TID we're locking was grabbed before we unlocked
4844 * the buffer. For it to change while we're not looking, the
4845 * other properties we're testing for below after re-locking the
4846 * buffer would also change, in which case we would restart this
4847 * loop above.
4848 */
4850 {
4851 bool updated;
4852
4854
4855 /*
4856 * If there are updates, follow the update chain; bail out if
4857 * that cannot be done.
4858 */
4859 if (follow_updates && updated &&
4860 !ItemPointerEquals(&tuple->t_self, &t_ctid))
4861 {
4862 TM_Result res;
4863
4864 res = heap_lock_updated_tuple(relation,
4865 infomask, xwait, &t_ctid,
4867 mode);
4868 if (res != TM_Ok)
4869 {
4870 result = res;
4871 /* recovery code expects to have buffer lock held */
4873 goto failed;
4874 }
4875 }
4876
4878
4879 /*
4880 * Make sure it's still an appropriate lock, else start over.
4881 * Also, if it wasn't updated before we released the lock, but
4882 * is updated now, we start over too; the reason is that we
4883 * now need to follow the update chain to lock the new
4884 * versions.
4885 */
4886 if (!HeapTupleHeaderIsOnlyLocked(tuple->t_data) &&
4887 ((tuple->t_data->t_infomask2 & HEAP_KEYS_UPDATED) ||
4888 !updated))
4889 goto l3;
4890
4891 /* Things look okay, so we can skip sleeping */
4892 require_sleep = false;
4893
4894 /*
4895 * Note we allow Xmax to change here; other updaters/lockers
4896 * could have modified it before we grabbed the buffer lock.
4897 * However, this is not a problem, because with the recheck we
4898 * just did we ensure that they still don't conflict with the
4899 * lock we want.
4900 */
4901 }
4902 }
4903 else if (mode == LockTupleShare)
4904 {
4905 /*
4906 * If we're requesting Share, we can similarly avoid sleeping if
4907 * there's no update and no exclusive lock present.
4908 */
4911 {
4913
4914 /*
4915 * Make sure it's still an appropriate lock, else start over.
4916 * See above about allowing xmax to change.
4917 */
4920 goto l3;
4921 require_sleep = false;
4922 }
4923 }
4924 else if (mode == LockTupleNoKeyExclusive)
4925 {
4926 /*
4927 * If we're requesting NoKeyExclusive, we might also be able to
 4928 * avoid sleeping; just ensure that there is no conflicting lock
4929 * already acquired.
4930 */
4932 {
4934 mode, NULL))
4935 {
4936 /*
4937 * No conflict, but if the xmax changed under us in the
4938 * meantime, start over.
4939 */
4943 xwait))
4944 goto l3;
4945
4946 /* otherwise, we're good */
4947 require_sleep = false;
4948 }
4949 }
4951 {
4953
4954 /* if the xmax changed in the meantime, start over */
4957 xwait))
4958 goto l3;
4959 /* otherwise, we're good */
4960 require_sleep = false;
4961 }
4962 }
4963
4964 /*
4965 * As a check independent from those above, we can also avoid sleeping
4966 * if the current transaction is the sole locker of the tuple. Note
4967 * that the strength of the lock already held is irrelevant; this is
4968 * not about recording the lock in Xmax (which will be done regardless
4969 * of this optimization, below). Also, note that the cases where we
4970 * hold a lock stronger than we are requesting are already handled
4971 * above by not doing anything.
4972 *
4973 * Note we only deal with the non-multixact case here; MultiXactIdWait
4974 * is well equipped to deal with this situation on its own.
4975 */
4978 {
4979 /* ... but if the xmax changed in the meantime, start over */
4983 xwait))
4984 goto l3;
4986 require_sleep = false;
4987 }
4988
4989 /*
4990 * Time to sleep on the other transaction/multixact, if necessary.
4991 *
4992 * If the other transaction is an update/delete that's already
4993 * committed, then sleeping cannot possibly do any good: if we're
4994 * required to sleep, get out to raise an error instead.
4995 *
4996 * By here, we either have already acquired the buffer exclusive lock,
4997 * or we must wait for the locking transaction or multixact; so below
4998 * we ensure that we grab buffer lock after the sleep.
4999 */
5000 if (require_sleep && (result == TM_Updated || result == TM_Deleted))
5001 {
5003 goto failed;
5004 }
5005 else if (require_sleep)
5006 {
5007 /*
5008 * Acquire tuple lock to establish our priority for the tuple, or
5009 * die trying. LockTuple will release us when we are next-in-line
5010 * for the tuple. We must do this even if we are share-locking,
5011 * but not if we already have a weaker lock on the tuple.
5012 *
5013 * If we are forced to "start over" below, we keep the tuple lock;
5014 * this arranges that we stay at the head of the line while
5015 * rechecking tuple state.
5016 */
5017 if (!skip_tuple_lock &&
5018 !heap_acquire_tuplock(relation, tid, mode, wait_policy,
5020 {
5021 /*
5022 * This can only happen if wait_policy is Skip and the lock
5023 * couldn't be obtained.
5024 */
5025 result = TM_WouldBlock;
5026 /* recovery code expects to have buffer lock held */
5028 goto failed;
5029 }
5030
5032 {
5034
5035 /* We only ever lock tuples, never update them */
5036 if (status >= MultiXactStatusNoKeyUpdate)
5037 elog(ERROR, "invalid lock mode in heap_lock_tuple");
5038
5039 /* wait for multixact to end, or die trying */
5040 switch (wait_policy)
5041 {
5042 case LockWaitBlock:
5044 relation, &tuple->t_self, XLTW_Lock, NULL);
5045 break;
5046 case LockWaitSkip:
5048 status, infomask, relation,
5049 NULL, false))
5050 {
5051 result = TM_WouldBlock;
5052 /* recovery code expects to have buffer lock held */
5054 goto failed;
5055 }
5056 break;
5057 case LockWaitError:
5059 status, infomask, relation,
5061 ereport(ERROR,
5063 errmsg("could not obtain lock on row in relation \"%s\"",
5064 RelationGetRelationName(relation))));
5065
5066 break;
5067 }
5068
5069 /*
5070 * Of course, the multixact might not be done here: if we're
5071 * requesting a light lock mode, other transactions with light
5072 * locks could still be alive, as well as locks owned by our
5073 * own xact or other subxacts of this backend. We need to
5074 * preserve the surviving MultiXact members. Note that it
5075 * isn't absolutely necessary in the latter case, but doing so
5076 * is simpler.
5077 */
5078 }
5079 else
5080 {
5081 /* wait for regular transaction to end, or die trying */
5082 switch (wait_policy)
5083 {
5084 case LockWaitBlock:
5085 XactLockTableWait(xwait, relation, &tuple->t_self,
5086 XLTW_Lock);
5087 break;
5088 case LockWaitSkip:
5090 {
5091 result = TM_WouldBlock;
5092 /* recovery code expects to have buffer lock held */
5094 goto failed;
5095 }
5096 break;
5097 case LockWaitError:
5099 ereport(ERROR,
5101 errmsg("could not obtain lock on row in relation \"%s\"",
5102 RelationGetRelationName(relation))));
5103 break;
5104 }
5105 }
5106
5107 /* if there are updates, follow the update chain */
5109 !ItemPointerEquals(&tuple->t_self, &t_ctid))
5110 {
5111 TM_Result res;
5112
5113 res = heap_lock_updated_tuple(relation,
5114 infomask, xwait, &t_ctid,
5116 mode);
5117 if (res != TM_Ok)
5118 {
5119 result = res;
5120 /* recovery code expects to have buffer lock held */
5122 goto failed;
5123 }
5124 }
5125
5127
5128 /*
5129 * xwait is done, but if xwait had just locked the tuple then some
5130 * other xact could update this tuple before we get to this point.
5131 * Check for xmax change, and start over if so.
5132 */
5135 xwait))
5136 goto l3;
5137
5139 {
5140 /*
5141 * Otherwise check if it committed or aborted. Note we cannot
5142 * be here if the tuple was only locked by somebody who didn't
5143 * conflict with us; that would have been handled above. So
5144 * that transaction must necessarily be gone by now. But
5145 * don't check for this in the multixact case, because some
5146 * locker transactions might still be running.
5147 */
5148 UpdateXmaxHintBits(tuple->t_data, *buffer, xwait);
5149 }
5150 }
5151
5152 /* By here, we're certain that we hold buffer exclusive lock again */
5153
5154 /*
5155 * We may lock if previous xmax aborted, or if it committed but only
5156 * locked the tuple without updating it; or if we didn't have to wait
5157 * at all for whatever reason.
5158 */
5159 if (!require_sleep ||
5160 (tuple->t_data->t_infomask & HEAP_XMAX_INVALID) ||
5163 result = TM_Ok;
5164 else if (!ItemPointerEquals(&tuple->t_self, &tuple->t_data->t_ctid))
5165 result = TM_Updated;
5166 else
5167 result = TM_Deleted;
5168 }
5169
5170failed:
5171 if (result != TM_Ok)
5172 {
5173 Assert(result == TM_SelfModified || result == TM_Updated ||
5174 result == TM_Deleted || result == TM_WouldBlock);
5175
5176 /*
5177 * When locking a tuple under LockWaitSkip semantics and we fail with
5178 * TM_WouldBlock above, it's possible for concurrent transactions to
5179 * release the lock and set HEAP_XMAX_INVALID in the meantime. So
5180 * this assert is slightly different from the equivalent one in
5181 * heap_delete and heap_update.
5182 */
5183 Assert((result == TM_WouldBlock) ||
5184 !(tuple->t_data->t_infomask & HEAP_XMAX_INVALID));
5185 Assert(result != TM_Updated ||
5186 !ItemPointerEquals(&tuple->t_self, &tuple->t_data->t_ctid));
5187 tmfd->ctid = tuple->t_data->t_ctid;
5188 tmfd->xmax = HeapTupleHeaderGetUpdateXid(tuple->t_data);
5189 if (result == TM_SelfModified)
5190 tmfd->cmax = HeapTupleHeaderGetCmax(tuple->t_data);
5191 else
5192 tmfd->cmax = InvalidCommandId;
5193 goto out_locked;
5194 }
5195
5196 /*
5197 * If we didn't pin the visibility map page and the page has become all
5198 * visible while we were busy locking the buffer, or during some
5199 * subsequent window during which we had it unlocked, we'll have to unlock
5200 * and re-lock, to avoid holding the buffer lock across I/O. That's a bit
5201 * unfortunate, especially since we'll now have to recheck whether the
5202 * tuple has been locked or updated under us, but hopefully it won't
5203 * happen very often.
5204 */
5205 if (vmbuffer == InvalidBuffer && PageIsAllVisible(page))
5206 {
5208 visibilitymap_pin(relation, block, &vmbuffer);
5210 goto l3;
5211 }
5212
5213 xmax = HeapTupleHeaderGetRawXmax(tuple->t_data);
5214 old_infomask = tuple->t_data->t_infomask;
5215
5216 /*
5217 * If this is the first possibly-multixact-able operation in the current
5218 * transaction, set my per-backend OldestMemberMXactId setting. We can be
5219 * certain that the transaction will never become a member of any older
5220 * MultiXactIds than that. (We have to do this even if we end up just
5221 * using our own TransactionId below, since some other backend could
5222 * incorporate our XID into a MultiXact immediately afterwards.)
5223 */
5225
5226 /*
5227 * Compute the new xmax and infomask to store into the tuple. Note we do
5228 * not modify the tuple just yet, because that would leave it in the wrong
5229 * state if multixact.c elogs.
5230 */
5232 GetCurrentTransactionId(), mode, false,
5233 &xid, &new_infomask, &new_infomask2);
5234
5236
5237 /*
5238 * Store transaction information of xact locking the tuple.
5239 *
5240 * Note: Cmax is meaningless in this context, so don't set it; this avoids
5241 * possibly generating a useless combo CID. Moreover, if we're locking a
5242 * previously updated tuple, it's important to preserve the Cmax.
5243 *
5244 * Also reset the HOT UPDATE bit, but only if there's no update; otherwise
5245 * we would break the HOT chain.
5246 */
5249 tuple->t_data->t_infomask |= new_infomask;
5250 tuple->t_data->t_infomask2 |= new_infomask2;
5253 HeapTupleHeaderSetXmax(tuple->t_data, xid);
5254
5255 /*
5256 * Make sure there is no forward chain link in t_ctid. Note that in the
5257 * cases where the tuple has been updated, we must not overwrite t_ctid,
5258 * because it was set by the updater. Moreover, if the tuple has been
5259 * updated, we need to follow the update chain to lock the new versions of
5260 * the tuple as well.
5261 */
5263 tuple->t_data->t_ctid = *tid;
5264
5265 /* Clear only the all-frozen bit on visibility map if needed */
5266 if (PageIsAllVisible(page) &&
5267 visibilitymap_clear(relation, block, vmbuffer,
5269 cleared_all_frozen = true;
5270
5271
5272 MarkBufferDirty(*buffer);
5273
5274 /*
5275 * XLOG stuff. You might think that we don't need an XLOG record because
5276 * there is no state change worth restoring after a crash. You would be
5277 * wrong however: we have just written either a TransactionId or a
5278 * MultiXactId that may never have been seen on disk before, and we need
5279 * to make sure that there are XLOG entries covering those ID numbers.
5280 * Else the same IDs might be re-used after a crash, which would be
5281 * disastrous if this page made it to disk before the crash. Essentially
5282 * we have to enforce the WAL log-before-data rule even in this case.
5283 * (Also, in a PITR log-shipping or 2PC environment, we have to have XLOG
5284 * entries for everything anyway.)
5285 */
5286 if (RelationNeedsWAL(relation))
5287 {
5290
5293
5294 xlrec.offnum = ItemPointerGetOffsetNumber(&tuple->t_self);
5295 xlrec.xmax = xid;
5296 xlrec.infobits_set = compute_infobits(new_infomask,
5297 tuple->t_data->t_infomask2);
5300
5301 /* we don't decode row locks atm, so no need to log the origin */
5302
5304
5305 PageSetLSN(page, recptr);
5306 }
5307
5309
5310 result = TM_Ok;
5311
5314
5316 if (BufferIsValid(vmbuffer))
5317 ReleaseBuffer(vmbuffer);
5318
5319 /*
5320 * Don't update the visibility map here. Locking a tuple doesn't change
5321 * visibility info.
5322 */
5323
5324 /*
5325 * Now that we have successfully marked the tuple as locked, we can
5326 * release the lmgr tuple lock, if we had it.
5327 */
5328 if (have_tuple_lock)
5329 UnlockTupleTuplock(relation, tid, mode);
5330
5331 return result;
5332}

References Assert, BUFFER_LOCK_EXCLUSIVE, BUFFER_LOCK_UNLOCK, BufferGetPage(), BufferIsValid(), TM_FailureData::cmax, compute_infobits(), compute_new_xmax_infomask(), ConditionalMultiXactIdWait(), ConditionalXactLockTableWait(), TM_FailureData::ctid, DoesMultiXactIdConflict(), elog, END_CRIT_SECTION, ereport, errcode(), errmsg, ERROR, fb(), get_mxact_status_for_lock(), GetCurrentTransactionId(), GetMultiXactIdMembers(), heap_acquire_tuplock(), HEAP_KEYS_UPDATED, heap_lock_updated_tuple(), HEAP_XMAX_INVALID, HEAP_XMAX_IS_EXCL_LOCKED(), HEAP_XMAX_IS_KEYSHR_LOCKED(), HEAP_XMAX_IS_LOCKED_ONLY(), HEAP_XMAX_IS_MULTI, HEAP_XMAX_IS_SHR_LOCKED(), HeapTupleHeaderClearHotUpdated(), HeapTupleHeaderGetCmax(), HeapTupleHeaderGetRawXmax(), HeapTupleHeaderGetUpdateXid(), HeapTupleHeaderIsOnlyLocked(), HeapTupleHeaderSetXmax(), HeapTupleSatisfiesUpdate(), i, InvalidBuffer, InvalidCommandId, ItemIdGetLength, ItemIdIsNormal, ItemPointerCopy(), ItemPointerEquals(), ItemPointerGetBlockNumber(), ItemPointerGetOffsetNumber(), LockBuffer(), LockTupleExclusive, LockTupleKeyShare, LockTupleNoKeyExclusive, LockTupleShare, LockWaitBlock, LockWaitError, LockWaitSkip, log_lock_failures, MarkBufferDirty(), mode, MultiXactIdSetOldestMember(), MultiXactIdWait(), MultiXactStatusNoKeyUpdate, PageGetItem(), PageGetItemId(), PageIsAllVisible(), PageSetLSN(), pfree(), ReadBuffer(), REGBUF_STANDARD, RelationGetRelationName, RelationGetRelid, RelationNeedsWAL, ReleaseBuffer(), SizeOfHeapLock, START_CRIT_SECTION, HeapTupleHeaderData::t_ctid, HeapTupleData::t_data, HeapTupleHeaderData::t_infomask, HeapTupleHeaderData::t_infomask2, HeapTupleData::t_len, HeapTupleData::t_self, HeapTupleData::t_tableOid, TM_BeingModified, TM_Deleted, TM_Invisible, TM_Ok, TM_SelfModified, TM_Updated, TM_WouldBlock, TransactionIdEquals, TransactionIdIsCurrentTransactionId(), TUPLOCK_from_mxstatus, UnlockTupleTuplock, UpdateXmaxHintBits(), VISIBILITYMAP_ALL_FROZEN, visibilitymap_clear(), visibilitymap_pin(), XactLockTableWait(), 
XLH_LOCK_ALL_FROZEN_CLEARED, XLOG_HEAP_LOCK, XLogBeginInsert(), XLogInsert(), XLogRegisterBuffer(), XLogRegisterData(), XLTW_Lock, TM_FailureData::xmax, and xmax_infomask_changed().

Referenced by heapam_tuple_lock().

◆ heap_lock_updated_tuple()

static TM_Result heap_lock_updated_tuple ( Relation  rel,
uint16  prior_infomask,
TransactionId  prior_raw_xmax,
const ItemPointerData prior_ctid,
TransactionId  xid,
LockTupleMode  mode 
)
static

Definition at line 6116 of file heapam.c.

6121{
6122 INJECTION_POINT("heap_lock_updated_tuple", NULL);
6123
6124 /*
 6125 * If the tuple has moved into another partition (effectively a delete),
 6126 * stop here.
6127 */
6129 {
6131
6132 /*
6133 * If this is the first possibly-multixact-able operation in the
6134 * current transaction, set my per-backend OldestMemberMXactId
6135 * setting. We can be certain that the transaction will never become a
6136 * member of any older MultiXactIds than that. (We have to do this
6137 * even if we end up just using our own TransactionId below, since
6138 * some other backend could incorporate our XID into a MultiXact
6139 * immediately afterwards.)
6140 */
6142
6146 }
6147
6148 /* nothing to lock */
6149 return TM_Ok;
6150}

References fb(), heap_lock_updated_tuple_rec(), HEAP_XMAX_IS_MULTI, INJECTION_POINT, ItemPointerIndicatesMovedPartitions(), mode, MultiXactIdGetUpdateXid(), MultiXactIdSetOldestMember(), and TM_Ok.

Referenced by heap_lock_tuple().

◆ heap_lock_updated_tuple_rec()

static TM_Result heap_lock_updated_tuple_rec ( Relation  rel,
TransactionId  priorXmax,
const ItemPointerData tid,
TransactionId  xid,
LockTupleMode  mode 
)
static

Definition at line 5768 of file heapam.c.

5771{
5772 TM_Result result;
5775 Buffer buf;
5780 TransactionId xmax,
5781 new_xmax;
5782 bool cleared_all_frozen = false;
5784 Buffer vmbuffer = InvalidBuffer;
5785 BlockNumber block;
5786
5787 ItemPointerCopy(tid, &tupid);
5788
5789 for (;;)
5790 {
5791 new_infomask = 0;
5792 new_xmax = InvalidTransactionId;
5794 ItemPointerCopy(&tupid, &(mytup.t_self));
5795
5796 if (!heap_fetch(rel, SnapshotAny, &mytup, &buf, false))
5797 {
5798 /*
5799 * if we fail to find the updated version of the tuple, it's
5800 * because it was vacuumed/pruned away after its creator
5801 * transaction aborted. So behave as if we got to the end of the
5802 * chain, and there's no further tuple to lock: return success to
5803 * caller.
5804 */
5805 result = TM_Ok;
5806 goto out_unlocked;
5807 }
5808
5809l4:
5811
5812 /*
5813 * Before locking the buffer, pin the visibility map page if it
5814 * appears to be necessary. Since we haven't got the lock yet,
5815 * someone else might be in the middle of changing this, so we'll need
5816 * to recheck after we have the lock.
5817 */
5819 {
5820 visibilitymap_pin(rel, block, &vmbuffer);
5821 pinned_desired_page = true;
5822 }
5823 else
5824 pinned_desired_page = false;
5825
5827
5828 /*
5829 * If we didn't pin the visibility map page and the page has become
5830 * all visible while we were busy locking the buffer, we'll have to
5831 * unlock and re-lock, to avoid holding the buffer lock across I/O.
5832 * That's a bit unfortunate, but hopefully shouldn't happen often.
5833 *
5834 * Note: in some paths through this function, we will reach here
5835 * holding a pin on a vm page that may or may not be the one matching
5836 * this page. If this page isn't all-visible, we won't use the vm
5837 * page, but we hold onto such a pin till the end of the function.
5838 */
5840 {
5842 visibilitymap_pin(rel, block, &vmbuffer);
5844 }
5845
5846 /*
5847 * Check the tuple XMIN against prior XMAX, if any. If we reached the
5848 * end of the chain, we're done, so return success.
5849 */
5852 priorXmax))
5853 {
5854 result = TM_Ok;
5855 goto out_locked;
5856 }
5857
5858 /*
5859 * Also check Xmin: if this tuple was created by an aborted
5860 * (sub)transaction, then we already locked the last live one in the
5861 * chain, thus we're done, so return success.
5862 */
5864 {
5865 result = TM_Ok;
5866 goto out_locked;
5867 }
5868
5869 old_infomask = mytup.t_data->t_infomask;
5870 old_infomask2 = mytup.t_data->t_infomask2;
5871 xmax = HeapTupleHeaderGetRawXmax(mytup.t_data);
5872
5873 /*
5874 * If this tuple version has been updated or locked by some concurrent
5875 * transaction(s), what we do depends on whether our lock mode
5876 * conflicts with what those other transactions hold, and also on the
5877 * status of them.
5878 */
5880 {
5882 bool needwait;
5883
5886 {
5887 int nmembers;
5888 int i;
5889 MultiXactMember *members;
5890
5891 /*
5892 * We don't need a test for pg_upgrade'd tuples: this is only
5893 * applied to tuples after the first in an update chain. Said
5894 * first tuple in the chain may well be locked-in-9.2-and-
5895 * pg_upgraded, but that one was already locked by our caller,
5896 * not us; and any subsequent ones cannot be because our
5897 * caller must necessarily have obtained a snapshot later than
5898 * the pg_upgrade itself.
5899 */
5900 Assert(!HEAP_LOCKED_UPGRADED(mytup.t_data->t_infomask));
5901
5902 nmembers = GetMultiXactIdMembers(rawxmax, &members, false,
5904 for (i = 0; i < nmembers; i++)
5905 {
5906 result = test_lockmode_for_conflict(members[i].status,
5907 members[i].xid,
5908 mode,
5909 &mytup,
5910 &needwait);
5911
5912 /*
5913 * If the tuple was already locked by ourselves in a
5914 * previous iteration of this (say heap_lock_tuple was
5915 * forced to restart the locking loop because of a change
5916 * in xmax), then we hold the lock already on this tuple
5917 * version and we don't need to do anything; and this is
5918 * not an error condition either. We just need to skip
5919 * this tuple and continue locking the next version in the
5920 * update chain.
5921 */
5922 if (result == TM_SelfModified)
5923 {
5924 pfree(members);
5925 goto next;
5926 }
5927
5928 if (needwait)
5929 {
5931 XactLockTableWait(members[i].xid, rel,
5932 &mytup.t_self,
5934 pfree(members);
5935 goto l4;
5936 }
5937 if (result != TM_Ok)
5938 {
5939 pfree(members);
5940 goto out_locked;
5941 }
5942 }
5943 if (members)
5944 pfree(members);
5945 }
5946 else
5947 {
5948 MultiXactStatus status;
5949
5950 /*
5951 * For a non-multi Xmax, we first need to compute the
5952 * corresponding MultiXactStatus by using the infomask bits.
5953 */
5955 {
5959 status = MultiXactStatusForShare;
5961 {
5963 status = MultiXactStatusForUpdate;
5964 else
5966 }
5967 else
5968 {
5969 /*
5970 * LOCK_ONLY present alone (a pg_upgraded tuple marked
5971 * as share-locked in the old cluster) shouldn't be
5972 * seen in the middle of an update chain.
5973 */
5974 elog(ERROR, "invalid lock status in tuple");
5975 }
5976 }
5977 else
5978 {
5979 /* it's an update, but which kind? */
5981 status = MultiXactStatusUpdate;
5982 else
5984 }
5985
5986 result = test_lockmode_for_conflict(status, rawxmax, mode,
5987 &mytup, &needwait);
5988
5989 /*
5990 * If the tuple was already locked by ourselves in a previous
5991 * iteration of this (say heap_lock_tuple was forced to
5992 * restart the locking loop because of a change in xmax), then
5993 * we hold the lock already on this tuple version and we don't
5994 * need to do anything; and this is not an error condition
5995 * either. We just need to skip this tuple and continue
5996 * locking the next version in the update chain.
5997 */
5998 if (result == TM_SelfModified)
5999 goto next;
6000
6001 if (needwait)
6002 {
6004 XactLockTableWait(rawxmax, rel, &mytup.t_self,
6006 goto l4;
6007 }
6008 if (result != TM_Ok)
6009 {
6010 goto out_locked;
6011 }
6012 }
6013 }
6014
6015 /* compute the new Xmax and infomask values for the tuple ... */
6016 compute_new_xmax_infomask(xmax, old_infomask, mytup.t_data->t_infomask2,
6017 xid, mode, false,
6018 &new_xmax, &new_infomask, &new_infomask2);
6019
6021 visibilitymap_clear(rel, block, vmbuffer,
6023 cleared_all_frozen = true;
6024
6026
6027 /* ... and set them */
6028 HeapTupleHeaderSetXmax(mytup.t_data, new_xmax);
6029 mytup.t_data->t_infomask &= ~HEAP_XMAX_BITS;
6030 mytup.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
6031 mytup.t_data->t_infomask |= new_infomask;
6032 mytup.t_data->t_infomask2 |= new_infomask2;
6033
6035
6036 /* XLOG stuff */
6037 if (RelationNeedsWAL(rel))
6038 {
6041 Page page = BufferGetPage(buf);
6042
6045
6046 xlrec.offnum = ItemPointerGetOffsetNumber(&mytup.t_self);
6047 xlrec.xmax = new_xmax;
6049 xlrec.flags =
6051
6053
6055
6056 PageSetLSN(page, recptr);
6057 }
6058
6060
6061next:
 6062 /* if we find the end of the update chain, we're done. */
6063 if (mytup.t_data->t_infomask & HEAP_XMAX_INVALID ||
6065 ItemPointerEquals(&mytup.t_self, &mytup.t_data->t_ctid) ||
6067 {
6068 result = TM_Ok;
6069 goto out_locked;
6070 }
6071
6072 /* tail recursion */
6074 ItemPointerCopy(&(mytup.t_data->t_ctid), &tupid);
6076 }
6077
6078 result = TM_Ok;
6079
6082
6084 if (vmbuffer != InvalidBuffer)
6085 ReleaseBuffer(vmbuffer);
6086
6087 return result;
6088}

References Assert, buf, BUFFER_LOCK_EXCLUSIVE, BUFFER_LOCK_UNLOCK, BufferGetPage(), CHECK_FOR_INTERRUPTS, compute_infobits(), compute_new_xmax_infomask(), elog, END_CRIT_SECTION, ERROR, fb(), GetMultiXactIdMembers(), heap_fetch(), HEAP_KEYS_UPDATED, HEAP_LOCKED_UPGRADED(), HEAP_XMAX_INVALID, HEAP_XMAX_IS_EXCL_LOCKED(), HEAP_XMAX_IS_KEYSHR_LOCKED(), HEAP_XMAX_IS_LOCKED_ONLY(), HEAP_XMAX_IS_MULTI, HEAP_XMAX_IS_SHR_LOCKED(), HeapTupleHeaderGetRawXmax(), HeapTupleHeaderGetUpdateXid(), HeapTupleHeaderGetXmin(), HeapTupleHeaderIndicatesMovedPartitions(), HeapTupleHeaderIsOnlyLocked(), HeapTupleHeaderSetXmax(), i, InvalidBuffer, InvalidTransactionId, ItemPointerCopy(), ItemPointerEquals(), ItemPointerGetBlockNumber(), ItemPointerGetOffsetNumber(), LockBuffer(), MarkBufferDirty(), mode, MultiXactStatusForKeyShare, MultiXactStatusForNoKeyUpdate, MultiXactStatusForShare, MultiXactStatusForUpdate, MultiXactStatusNoKeyUpdate, MultiXactStatusUpdate, next, PageIsAllVisible(), PageSetLSN(), pfree(), REGBUF_STANDARD, RelationNeedsWAL, ReleaseBuffer(), SizeOfHeapLockUpdated, SnapshotAny, START_CRIT_SECTION, test_lockmode_for_conflict(), TM_Ok, TM_SelfModified, TransactionIdDidAbort(), TransactionIdEquals, TransactionIdIsValid, UnlockReleaseBuffer(), VISIBILITYMAP_ALL_FROZEN, visibilitymap_clear(), visibilitymap_pin(), XactLockTableWait(), XLH_LOCK_ALL_FROZEN_CLEARED, XLOG_HEAP2_LOCK_UPDATED, XLogBeginInsert(), XLogInsert(), XLogRegisterBuffer(), XLogRegisterData(), and XLTW_LockUpdated.

Referenced by heap_lock_updated_tuple().

◆ heap_multi_insert()

void heap_multi_insert ( Relation  relation,
TupleTableSlot **  slots,
int  ntuples,
CommandId  cid,
int  options,
BulkInsertState  bistate 
)

Definition at line 2413 of file heapam.c.

2415{
2418 int i;
2419 int ndone;
2421 Page page;
2422 Buffer vmbuffer = InvalidBuffer;
2423 bool needwal;
2427 bool starting_with_empty_page = false;
2428 int npages = 0;
2429 int npages_used = 0;
2430
2431 /* currently not needed (thus unsupported) for heap_multi_insert() */
2433
2434 AssertHasSnapshotForToast(relation);
2435
2436 needwal = RelationNeedsWAL(relation);
2439
2440 /* Toast and set header data in all the slots */
2441 heaptuples = palloc(ntuples * sizeof(HeapTuple));
2442 for (i = 0; i < ntuples; i++)
2443 {
2444 HeapTuple tuple;
2445
2446 tuple = ExecFetchSlotHeapTuple(slots[i], true, NULL);
2447 slots[i]->tts_tableOid = RelationGetRelid(relation);
2448 tuple->t_tableOid = slots[i]->tts_tableOid;
2449 heaptuples[i] = heap_prepare_insert(relation, tuple, xid, cid,
2450 options);
2451 }
2452
2453 /*
2454 * We're about to do the actual inserts -- but check for conflict first,
2455 * to minimize the possibility of having to roll back work we've just
2456 * done.
2457 *
2458 * A check here does not definitively prevent a serialization anomaly;
2459 * that check MUST be done at least past the point of acquiring an
2460 * exclusive buffer content lock on every buffer that will be affected,
2461 * and MAY be done after all inserts are reflected in the buffers and
2462 * those locks are released; otherwise there is a race condition. Since
2463 * multiple buffers can be locked and unlocked in the loop below, and it
2464 * would not be feasible to identify and lock all of those buffers before
2465 * the loop, we must do a final check at the end.
2466 *
2467 * The check here could be omitted with no loss of correctness; it is
2468 * present strictly as an optimization.
2469 *
2470 * For heap inserts, we only need to check for table-level SSI locks. Our
2471 * new tuples can't possibly conflict with existing tuple locks, and heap
2472 * page locks are only consolidated versions of tuple locks; they do not
2473 * lock "gaps" as index page locks do. So we don't need to specify a
2474 * buffer when making the call, which makes for a faster check.
2475 */
2477
2478 ndone = 0;
2479 while (ndone < ntuples)
2480 {
2481 Buffer buffer;
2482 bool all_visible_cleared = false;
2483 bool all_frozen_set = false;
2484 int nthispage;
2485
2487
2488 /*
2489 * Compute number of pages needed to fit the to-be-inserted tuples in
2490 * the worst case. This will be used to determine how much to extend
2491 * the relation by in RelationGetBufferForTuple(), if needed. If we
2492 * filled a prior page from scratch, we can just update our last
2493 * computation, but if we started with a partially filled page,
 2494 * recompute from scratch, as the number of potentially required pages
 2495 * can vary due to tuples needing to fit onto the page, page headers
2496 * etc.
2497 */
2498 if (ndone == 0 || !starting_with_empty_page)
2499 {
2500 npages = heap_multi_insert_pages(heaptuples, ndone, ntuples,
2502 npages_used = 0;
2503 }
2504 else
2505 npages_used++;
2506
2507 /*
2508 * Find buffer where at least the next tuple will fit. If the page is
2509 * all-visible, this will also pin the requisite visibility map page.
2510 *
2511 * Also pin visibility map page if COPY FREEZE inserts tuples into an
2512 * empty page. See all_frozen_set below.
2513 */
2514 buffer = RelationGetBufferForTuple(relation, heaptuples[ndone]->t_len,
2515 InvalidBuffer, options, bistate,
2516 &vmbuffer, NULL,
2517 npages - npages_used);
2518 page = BufferGetPage(buffer);
2519
2521
2523 {
2524 all_frozen_set = true;
2525 /* Lock the vmbuffer before entering the critical section */
2527 }
2528
2529 /* NO EREPORT(ERROR) from here till changes are logged */
2531
2532 /*
2533 * RelationGetBufferForTuple has ensured that the first tuple fits.
2534 * Put that on the page, and then as many other tuples as fit.
2535 */
2536 RelationPutHeapTuple(relation, buffer, heaptuples[ndone], false);
2537
2538 /*
2539 * For logical decoding we need combo CIDs to properly decode the
2540 * catalog.
2541 */
2542 if (needwal && need_cids)
2543 log_heap_new_cid(relation, heaptuples[ndone]);
2544
2545 for (nthispage = 1; ndone + nthispage < ntuples; nthispage++)
2546 {
2548
2549 if (PageGetHeapFreeSpace(page) < MAXALIGN(heaptup->t_len) + saveFreeSpace)
2550 break;
2551
2552 RelationPutHeapTuple(relation, buffer, heaptup, false);
2553
2554 /*
2555 * For logical decoding we need combo CIDs to properly decode the
2556 * catalog.
2557 */
2558 if (needwal && need_cids)
2559 log_heap_new_cid(relation, heaptup);
2560 }
2561
2562 /*
2563 * If the page is all visible, need to clear that, unless we're only
2564 * going to add further frozen rows to it.
2565 *
2566 * If we're only adding already frozen rows to a previously empty
2567 * page, mark it as all-frozen and update the visibility map. We're
2568 * already holding a pin on the vmbuffer.
2569 */
2571 {
2572 all_visible_cleared = true;
2573 PageClearAllVisible(page);
2574 visibilitymap_clear(relation,
2575 BufferGetBlockNumber(buffer),
2576 vmbuffer, VISIBILITYMAP_VALID_BITS);
2577 }
2578 else if (all_frozen_set)
2579 {
2580 PageSetAllVisible(page);
2581 PageClearPrunable(page);
2583 vmbuffer,
2586 relation->rd_locator);
2587 }
2588
2589 /*
2590 * XXX Should we set PageSetPrunable on this page ? See heap_insert()
2591 */
2592
2593 MarkBufferDirty(buffer);
2594
2595 /* XLOG stuff */
2596 if (needwal)
2597 {
2601 char *tupledata;
2602 int totaldatalen;
2603 char *scratchptr = scratch.data;
2604 bool init;
2605 int bufflags = 0;
2606
2607 /*
2608 * If the page was previously empty, we can reinit the page
2609 * instead of restoring the whole thing.
2610 */
2612
2613 /* allocate xl_heap_multi_insert struct from the scratch area */
2616
2617 /*
2618 * Allocate offsets array. Unless we're reinitializing the page,
2619 * in that case the tuples are stored in order starting at
2620 * FirstOffsetNumber and we don't need to store the offsets
2621 * explicitly.
2622 */
2623 if (!init)
2624 scratchptr += nthispage * sizeof(OffsetNumber);
2625
2626 /* the rest of the scratch space is used for tuple data */
2627 tupledata = scratchptr;
2628
2629 /* check that the mutually exclusive flags are not both set */
2631
2632 xlrec->flags = 0;
2635
2636 /*
2637 * We don't have to worry about including a conflict xid in the
2638 * WAL record, as HEAP_INSERT_FROZEN intentionally violates
2639 * visibility rules.
2640 */
2641 if (all_frozen_set)
2643
2644 xlrec->ntuples = nthispage;
2645
2646 /*
2647 * Write out an xl_multi_insert_tuple and the tuple data itself
2648 * for each tuple.
2649 */
2650 for (i = 0; i < nthispage; i++)
2651 {
2653 xl_multi_insert_tuple *tuphdr;
2654 int datalen;
2655
2656 if (!init)
2657 xlrec->offsets[i] = ItemPointerGetOffsetNumber(&heaptup->t_self);
2658 /* xl_multi_insert_tuple needs two-byte alignment. */
2660 scratchptr = ((char *) tuphdr) + SizeOfMultiInsertTuple;
2661
2662 tuphdr->t_infomask2 = heaptup->t_data->t_infomask2;
2663 tuphdr->t_infomask = heaptup->t_data->t_infomask;
2664 tuphdr->t_hoff = heaptup->t_data->t_hoff;
2665
2666 /* write bitmap [+ padding] [+ oid] + data */
2667 datalen = heaptup->t_len - SizeofHeapTupleHeader;
2669 (char *) heaptup->t_data + SizeofHeapTupleHeader,
2670 datalen);
2671 tuphdr->datalen = datalen;
2672 scratchptr += datalen;
2673 }
2674 totaldatalen = scratchptr - tupledata;
2675 Assert((scratchptr - scratch.data) < BLCKSZ);
2676
2677 if (need_tuple_data)
2679
2680 /*
2681 * Signal that this is the last xl_heap_multi_insert record
2682 * emitted by this call to heap_multi_insert(). Needed for logical
2683 * decoding so it knows when to cleanup temporary data.
2684 */
2685 if (ndone + nthispage == ntuples)
2687
2688 if (init)
2689 {
2690 info |= XLOG_HEAP_INIT_PAGE;
2692 }
2693
2694 /*
2695 * If we're doing logical decoding, include the new tuple data
2696 * even if we take a full-page image of the page.
2697 */
2698 if (need_tuple_data)
2700
2702 XLogRegisterData(xlrec, tupledata - scratch.data);
2704 if (all_frozen_set)
2705 XLogRegisterBuffer(1, vmbuffer, 0);
2706
2707 XLogRegisterBufData(0, tupledata, totaldatalen);
2708
2709 /* filtering by origin on a row level is much more efficient */
2711
2712 recptr = XLogInsert(RM_HEAP2_ID, info);
2713
2714 PageSetLSN(page, recptr);
2715 if (all_frozen_set)
2716 {
2717 Assert(BufferIsDirty(vmbuffer));
2718 PageSetLSN(BufferGetPage(vmbuffer), recptr);
2719 }
2720 }
2721
2723
2724 if (all_frozen_set)
2725 LockBuffer(vmbuffer, BUFFER_LOCK_UNLOCK);
2726
2727 UnlockReleaseBuffer(buffer);
2728 ndone += nthispage;
2729
2730 /*
2731 * NB: Only release vmbuffer after inserting all tuples - it's fairly
2732 * likely that we'll insert into subsequent heap pages that are likely
2733 * to use the same vm page.
2734 */
2735 }
2736
2737 /* We're done with inserting all tuples, so release the last vmbuffer. */
2738 if (vmbuffer != InvalidBuffer)
2739 ReleaseBuffer(vmbuffer);
2740
2741 /*
2742 * We're done with the actual inserts. Check for conflicts again, to
2743 * ensure that all rw-conflicts in to these inserts are detected. Without
2744 * this final check, a sequential scan of the heap may have locked the
2745 * table after the "before" check, missing one opportunity to detect the
2746 * conflict, and then scanned the table before the new tuples were there,
2747 * missing the other chance to detect the conflict.
2748 *
2749 * For heap inserts, we only need to check for table-level SSI locks. Our
2750 * new tuples can't possibly conflict with existing tuple locks, and heap
2751 * page locks are only consolidated versions of tuple locks; they do not
2752 * lock "gaps" as index page locks do. So we don't need to specify a
2753 * buffer when making the call.
2754 */
2756
2757 /*
2758 * If tuples are cacheable, mark them for invalidation from the caches in
2759 * case we abort. Note it is OK to do this after releasing the buffer,
2760 * because the heaptuples data structure is all in local memory, not in
2761 * the shared buffer.
2762 */
2763 if (IsCatalogRelation(relation))
2764 {
2765 for (i = 0; i < ntuples; i++)
2767 }
2768
2769 /* copy t_self fields back to the caller's slots */
2770 for (i = 0; i < ntuples; i++)
2771 slots[i]->tts_tid = heaptuples[i]->t_self;
2772
2773 pgstat_count_heap_insert(relation, ntuples);
2774}

References Assert, AssertHasSnapshotForToast(), BUFFER_LOCK_EXCLUSIVE, BUFFER_LOCK_UNLOCK, BufferGetBlockNumber(), BufferGetPage(), BufferIsDirty(), CacheInvalidateHeapTuple(), CHECK_FOR_INTERRUPTS, CheckForSerializableConflictIn(), xl_multi_insert_tuple::datalen, END_CRIT_SECTION, ExecFetchSlotHeapTuple(), fb(), GetCurrentTransactionId(), HEAP_DEFAULT_FILLFACTOR, HEAP_INSERT_FROZEN, HEAP_INSERT_NO_LOGICAL, heap_multi_insert_pages(), heap_prepare_insert(), i, init, InvalidBlockNumber, InvalidBuffer, IsCatalogRelation(), ItemPointerGetOffsetNumber(), LockBuffer(), log_heap_new_cid(), MarkBufferDirty(), MAXALIGN, PageClearAllVisible(), PageClearPrunable, PageGetHeapFreeSpace(), PageGetMaxOffsetNumber(), PageIsAllVisible(), PageSetAllVisible(), PageSetLSN(), palloc(), pgstat_count_heap_insert(), RelationData::rd_locator, REGBUF_KEEP_DATA, REGBUF_STANDARD, REGBUF_WILL_INIT, RelationGetBufferForTuple(), RelationGetRelid, RelationGetTargetPageFreeSpace, RelationIsAccessibleInLogicalDecoding, RelationIsLogicallyLogged, RelationNeedsWAL, RelationPutHeapTuple(), ReleaseBuffer(), SHORTALIGN, SizeOfHeapMultiInsert, SizeofHeapTupleHeader, SizeOfMultiInsertTuple, START_CRIT_SECTION, xl_multi_insert_tuple::t_hoff, xl_multi_insert_tuple::t_infomask, xl_multi_insert_tuple::t_infomask2, HeapTupleData::t_tableOid, TupleTableSlot::tts_tableOid, UnlockReleaseBuffer(), VISIBILITYMAP_ALL_FROZEN, VISIBILITYMAP_ALL_VISIBLE, visibilitymap_clear(), visibilitymap_set_vmbits(), VISIBILITYMAP_VALID_BITS, XLH_INSERT_ALL_FROZEN_SET, XLH_INSERT_ALL_VISIBLE_CLEARED, XLH_INSERT_CONTAINS_NEW_TUPLE, XLH_INSERT_LAST_IN_MULTI, XLOG_HEAP2_MULTI_INSERT, XLOG_HEAP_INIT_PAGE, XLOG_INCLUDE_ORIGIN, XLogBeginInsert(), XLogInsert(), XLogRegisterBufData(), XLogRegisterBuffer(), XLogRegisterData(), and XLogSetRecordFlags().

Referenced by CatalogTuplesMultiInsertWithInfo().

◆ heap_multi_insert_pages()

static int heap_multi_insert_pages ( HeapTuple heaptuples,
int  done,
int  ntuples,
Size  saveFreeSpace 
)
static

Definition at line 2381 of file heapam.c.

2382{
2384 int npages = 1;
2385
2386 for (int i = done; i < ntuples; i++)
2387 {
2388 size_t tup_sz = sizeof(ItemIdData) + MAXALIGN(heaptuples[i]->t_len);
2389
2390 if (page_avail < tup_sz)
2391 {
2392 npages++;
2394 }
2395 page_avail -= tup_sz;
2396 }
2397
2398 return npages;
2399}

References fb(), i, MAXALIGN, and SizeOfPageHeaderData.

Referenced by heap_multi_insert().

◆ heap_pre_freeze_checks()

void heap_pre_freeze_checks ( Buffer  buffer,
HeapTupleFreeze tuples,
int  ntuples 
)

Definition at line 7413 of file heapam.c.

7415{
7416 Page page = BufferGetPage(buffer);
7417
7418 for (int i = 0; i < ntuples; i++)
7419 {
7420 HeapTupleFreeze *frz = tuples + i;
7421 ItemId itemid = PageGetItemId(page, frz->offset);
7422 HeapTupleHeader htup;
7423
7424 htup = (HeapTupleHeader) PageGetItem(page, itemid);
7425
7426 /* Deliberately avoid relying on tuple hint bits here */
7427 if (frz->checkflags & HEAP_FREEZE_CHECK_XMIN_COMMITTED)
7428 {
7430
7432 if (unlikely(!TransactionIdDidCommit(xmin)))
7433 ereport(ERROR,
7435 errmsg_internal("uncommitted xmin %u needs to be frozen",
7436 xmin)));
7437 }
7438
7439 /*
7440 * TransactionIdDidAbort won't work reliably in the presence of XIDs
7441 * left behind by transactions that were in progress during a crash,
7442 * so we can only check that xmax didn't commit
7443 */
7444 if (frz->checkflags & HEAP_FREEZE_CHECK_XMAX_ABORTED)
7445 {
7447
7450 ereport(ERROR,
7452 errmsg_internal("cannot freeze committed xmax %u",
7453 xmax)));
7454 }
7455 }
7456}

References Assert, BufferGetPage(), ereport, errcode(), ERRCODE_DATA_CORRUPTED, errmsg_internal(), ERROR, fb(), HEAP_FREEZE_CHECK_XMAX_ABORTED, HEAP_FREEZE_CHECK_XMIN_COMMITTED, HeapTupleHeaderGetRawXmax(), HeapTupleHeaderGetRawXmin(), HeapTupleHeaderXminFrozen(), i, PageGetItem(), PageGetItemId(), TransactionIdDidCommit(), TransactionIdIsNormal, and unlikely.

Referenced by heap_page_will_freeze().

◆ heap_prepare_freeze_tuple()

bool heap_prepare_freeze_tuple ( HeapTupleHeader  tuple,
const struct VacuumCutoffs cutoffs,
HeapPageFreeze pagefrz,
HeapTupleFreeze frz,
bool totally_frozen 
)

Definition at line 7133 of file heapam.c.

7137{
7138 bool xmin_already_frozen = false,
7139 xmax_already_frozen = false;
7140 bool freeze_xmin = false,
7141 replace_xvac = false,
7142 replace_xmax = false,
7143 freeze_xmax = false;
7144 TransactionId xid;
7145
7146 frz->xmax = HeapTupleHeaderGetRawXmax(tuple);
7147 frz->t_infomask2 = tuple->t_infomask2;
7148 frz->t_infomask = tuple->t_infomask;
7149 frz->frzflags = 0;
7150 frz->checkflags = 0;
7151
7152 /*
7153 * Process xmin, while keeping track of whether it's already frozen, or
7154 * will become frozen iff our freeze plan is executed by caller (could be
7155 * neither).
7156 */
7157 xid = HeapTupleHeaderGetXmin(tuple);
7158 if (!TransactionIdIsNormal(xid))
7159 xmin_already_frozen = true;
7160 else
7161 {
7162 if (TransactionIdPrecedes(xid, cutoffs->relfrozenxid))
7163 ereport(ERROR,
7165 errmsg_internal("found xmin %u from before relfrozenxid %u",
7166 xid, cutoffs->relfrozenxid)));
7167
7168 /* Will set freeze_xmin flags in freeze plan below */
7170
7171 /* Verify that xmin committed if and when freeze plan is executed */
7172 if (freeze_xmin)
7173 {
7176 pagefrz->FreezePageConflictXid = xid;
7177 }
7178 }
7179
7180 /*
7181 * Old-style VACUUM FULL is gone, but we have to process xvac for as long
7182 * as we support having MOVED_OFF/MOVED_IN tuples in the database
7183 */
7184 xid = HeapTupleHeaderGetXvac(tuple);
7185 if (TransactionIdIsNormal(xid))
7186 {
7188 Assert(TransactionIdPrecedes(xid, cutoffs->OldestXmin));
7189
7190 /*
7191 * For Xvac, we always freeze proactively. This allows totally_frozen
7192 * tracking to ignore xvac.
7193 */
7194 replace_xvac = pagefrz->freeze_required = true;
7195
7197 pagefrz->FreezePageConflictXid = xid;
7198
7199 /* Will set replace_xvac flags in freeze plan below */
7200 }
7201
7202 /* Now process xmax */
7203 xid = frz->xmax;
7204 if (tuple->t_infomask & HEAP_XMAX_IS_MULTI)
7205 {
7206 /* Raw xmax is a MultiXactId */
7208 uint16 flags;
7209
7210 /*
7211 * We will either remove xmax completely (in the "freeze_xmax" path),
7212 * process xmax by replacing it (in the "replace_xmax" path), or
7213 * perform no-op xmax processing. The only constraint is that the
7214 * FreezeLimit/MultiXactCutoff postcondition must never be violated.
7215 */
7216 newxmax = FreezeMultiXactId(xid, tuple->t_infomask, cutoffs,
7217 &flags, pagefrz);
7218
7219 if (flags & FRM_NOOP)
7220 {
7221 /*
7222 * xmax is a MultiXactId, and nothing about it changes for now.
7223 * This is the only case where 'freeze_required' won't have been
7224 * set for us by FreezeMultiXactId, as well as the only case where
7225 * neither freeze_xmax nor replace_xmax are set (given a multi).
7226 *
7227 * This is a no-op, but the call to FreezeMultiXactId might have
7228 * ratcheted back NewRelfrozenXid and/or NewRelminMxid trackers
7229 * for us (the "freeze page" variants, specifically). That'll
7230 * make it safe for our caller to freeze the page later on, while
7231 * leaving this particular xmax undisturbed.
7232 *
7233 * FreezeMultiXactId is _not_ responsible for the "no freeze"
7234 * NewRelfrozenXid/NewRelminMxid trackers, though -- that's our
7235 * job. A call to heap_tuple_should_freeze for this same tuple
7236 * will take place below if 'freeze_required' isn't set already.
7237 * (This repeats work from FreezeMultiXactId, but allows "no
7238 * freeze" tracker maintenance to happen in only one place.)
7239 */
7242 }
7243 else if (flags & FRM_RETURN_IS_XID)
7244 {
7245 /*
7246 * xmax will become an updater Xid (original MultiXact's updater
7247 * member Xid will be carried forward as a simple Xid in Xmax).
7248 */
7250
7251 /*
7252 * NB -- some of these transformations are only valid because we
7253 * know the return Xid is a tuple updater (i.e. not merely a
7254 * locker.) Also note that the only reason we don't explicitly
7255 * worry about HEAP_KEYS_UPDATED is because it lives in
7256 * t_infomask2 rather than t_infomask.
7257 */
7258 frz->t_infomask &= ~HEAP_XMAX_BITS;
7259 frz->xmax = newxmax;
7260 if (flags & FRM_MARK_COMMITTED)
7261 frz->t_infomask |= HEAP_XMAX_COMMITTED;
7262 replace_xmax = true;
7263 }
7264 else if (flags & FRM_RETURN_IS_MULTI)
7265 {
7268
7269 /*
7270 * xmax is an old MultiXactId that we have to replace with a new
7271 * MultiXactId, to carry forward two or more original member XIDs.
7272 */
7274
7275 /*
7276 * We can't use GetMultiXactIdHintBits directly on the new multi
7277 * here; that routine initializes the masks to all zeroes, which
7278 * would lose other bits we need. Doing it this way ensures all
7279 * unrelated bits remain untouched.
7280 */
7281 frz->t_infomask &= ~HEAP_XMAX_BITS;
7282 frz->t_infomask2 &= ~HEAP_KEYS_UPDATED;
7284 frz->t_infomask |= newbits;
7285 frz->t_infomask2 |= newbits2;
7286 frz->xmax = newxmax;
7287 replace_xmax = true;
7288 }
7289 else
7290 {
7291 /*
7292 * Freeze plan for tuple "freezes xmax" in the strictest sense:
7293 * it'll leave nothing in xmax (neither an Xid nor a MultiXactId).
7294 */
7295 Assert(flags & FRM_INVALIDATE_XMAX);
7297
7298 /* Will set freeze_xmax flags in freeze plan below */
7299 freeze_xmax = true;
7300 }
7301
7302 /* MultiXactId processing forces freezing (barring FRM_NOOP case) */
7303 Assert(pagefrz->freeze_required || (!freeze_xmax && !replace_xmax));
7304 }
7305 else if (TransactionIdIsNormal(xid))
7306 {
7307 /* Raw xmax is normal XID */
7308 if (TransactionIdPrecedes(xid, cutoffs->relfrozenxid))
7309 ereport(ERROR,
7311 errmsg_internal("found xmax %u from before relfrozenxid %u",
7312 xid, cutoffs->relfrozenxid)));
7313
7314 /* Will set freeze_xmax flags in freeze plan below */
7316
7317 /*
7318 * Verify that xmax aborted if and when freeze plan is executed,
7319 * provided it's from an update. (A lock-only xmax can be removed
7320 * independent of this, since the lock is released at xact end.)
7321 */
7323 frz->checkflags |= HEAP_FREEZE_CHECK_XMAX_ABORTED;
7324 }
7325 else if (!TransactionIdIsValid(xid))
7326 {
7327 /* Raw xmax is InvalidTransactionId XID */
7328 Assert((tuple->t_infomask & HEAP_XMAX_IS_MULTI) == 0);
7329 xmax_already_frozen = true;
7330 }
7331 else
7332 ereport(ERROR,
7334 errmsg_internal("found raw xmax %u (infomask 0x%04x) not invalid and not multi",
7335 xid, tuple->t_infomask)));
7336
7337 if (freeze_xmin)
7338 {
7340
7341 frz->t_infomask |= HEAP_XMIN_FROZEN;
7342 }
7343 if (replace_xvac)
7344 {
7345 /*
7346 * If a MOVED_OFF tuple is not dead, the xvac transaction must have
7347 * failed; whereas a non-dead MOVED_IN tuple must mean the xvac
7348 * transaction succeeded.
7349 */
7350 Assert(pagefrz->freeze_required);
7351 if (tuple->t_infomask & HEAP_MOVED_OFF)
7352 frz->frzflags |= XLH_INVALID_XVAC;
7353 else
7354 frz->frzflags |= XLH_FREEZE_XVAC;
7355 }
7356 if (replace_xmax)
7357 {
7359 Assert(pagefrz->freeze_required);
7360
7361 /* Already set replace_xmax flags in freeze plan earlier */
7362 }
7363 if (freeze_xmax)
7364 {
7366
7367 frz->xmax = InvalidTransactionId;
7368
7369 /*
7370 * The tuple might be marked either XMAX_INVALID or XMAX_COMMITTED +
7371 * LOCKED. Normalize to INVALID just to be sure no one gets confused.
7372 * Also get rid of the HEAP_KEYS_UPDATED bit.
7373 */
7374 frz->t_infomask &= ~HEAP_XMAX_BITS;
7375 frz->t_infomask |= HEAP_XMAX_INVALID;
7376 frz->t_infomask2 &= ~HEAP_HOT_UPDATED;
7377 frz->t_infomask2 &= ~HEAP_KEYS_UPDATED;
7378 }
7379
7380 /*
7381 * Determine if this tuple is already totally frozen, or will become
7382 * totally frozen (provided caller executes freeze plans for the page)
7383 */
7386
7387 if (!pagefrz->freeze_required && !(xmin_already_frozen &&
7389 {
7390 /*
7391 * So far no previous tuple from the page made freezing mandatory.
7392 * Does this tuple force caller to freeze the entire page?
7393 */
7394 pagefrz->freeze_required =
7395 heap_tuple_should_freeze(tuple, cutoffs,
7396 &pagefrz->NoFreezePageRelfrozenXid,
7397 &pagefrz->NoFreezePageRelminMxid);
7398 }
7399
7400 /* Tell caller if this tuple has a usable freeze plan set in *frz */
7402}

References Assert, ereport, errcode(), ERRCODE_DATA_CORRUPTED, errmsg_internal(), ERROR, fb(), HeapPageFreeze::freeze_required, FreezeMultiXactId(), HeapPageFreeze::FreezePageConflictXid, FRM_INVALIDATE_XMAX, FRM_MARK_COMMITTED, FRM_NOOP, FRM_RETURN_IS_MULTI, FRM_RETURN_IS_XID, GetMultiXactIdHintBits(), HEAP_FREEZE_CHECK_XMAX_ABORTED, HEAP_FREEZE_CHECK_XMIN_COMMITTED, HEAP_MOVED_OFF, heap_tuple_should_freeze(), HEAP_XMAX_COMMITTED, HEAP_XMAX_INVALID, HEAP_XMAX_IS_LOCKED_ONLY(), HEAP_XMAX_IS_MULTI, HEAP_XMIN_FROZEN, HeapTupleHeaderGetRawXmax(), HeapTupleHeaderGetXmin(), HeapTupleHeaderGetXvac(), InvalidTransactionId, VacuumCutoffs::MultiXactCutoff, MultiXactIdIsValid, MultiXactIdPrecedes(), HeapPageFreeze::NoFreezePageRelfrozenXid, HeapPageFreeze::NoFreezePageRelminMxid, VacuumCutoffs::OldestMxact, VacuumCutoffs::OldestXmin, VacuumCutoffs::relfrozenxid, HeapTupleHeaderData::t_infomask, HeapTupleHeaderData::t_infomask2, TransactionIdFollows(), TransactionIdIsNormal, TransactionIdIsValid, TransactionIdPrecedes(), TransactionIdPrecedesOrEquals(), XLH_FREEZE_XVAC, and XLH_INVALID_XVAC.

Referenced by heap_freeze_tuple(), and heap_prune_record_unchanged_lp_normal().

◆ heap_prepare_insert()

static HeapTuple heap_prepare_insert ( Relation  relation,
HeapTuple  tup,
TransactionId  xid,
CommandId  cid,
int  options 
)
static

Definition at line 2333 of file heapam.c.

/*
 * heap_prepare_insert -- prepare a tuple's header for insertion.
 *
 * Subroutine of heap_insert() and heap_multi_insert() (per the "Referenced
 * by" list).  Stamps the tuple with xmin = xid and cmin = cid, clears stale
 * transaction-status infomask bits, sets the table OID, and hands the tuple
 * to the toaster when it is over-threshold or already contains external
 * (toasted) attributes.  Returns either `tup` itself or a toasted
 * replacement tuple.
 *
 * NOTE(review): this listing is a documentation extraction; original source
 * lines 2344, 2351-2352 and 2366 are absent below.  Judging from the
 * References list (errcode(), HEAP_INSERT_FROZEN,
 * HeapTupleHeaderSetXminFrozen(), Assert), they presumably hold the
 * errcode() argument of the ereport, the HEAP_INSERT_FROZEN/xmin-frozen
 * handling, and an Assert -- confirm against heapam.c.
 */
2335{
2336 /*
2337 * To allow parallel inserts, we need to ensure that they are safe to be
2338 * performed in workers. We have the infrastructure to allow parallel
2339 * inserts in general except for the cases where inserts generate a new
2340 * CommandId (e.g. inserts into a table having a foreign key column).
2341 */
2342 if (IsParallelWorker())
2343 ereport(ERROR,
2345 errmsg("cannot insert tuples in a parallel worker")));
2346
2347 tup->t_data->t_infomask &= ~(HEAP_XACT_MASK);
2348 tup->t_data->t_infomask2 &= ~(HEAP2_XACT_MASK);
2349 tup->t_data->t_infomask |= HEAP_XMAX_INVALID;
2350 HeapTupleHeaderSetXmin(tup->t_data, xid);
/* original lines 2351-2352 elided here (presumably the HEAP_INSERT_FROZEN
 * xmin-frozen handling -- TODO confirm against heapam.c) */
2353
2354 HeapTupleHeaderSetCmin(tup->t_data, cid);
2355 HeapTupleHeaderSetXmax(tup->t_data, 0); /* for cleanliness */
2356 tup->t_tableOid = RelationGetRelid(relation);
2357
2358 /*
2359 * If the new tuple is too big for storage or contains already toasted
2360 * out-of-line attributes from some other relation, invoke the toaster.
2361 */
2362 if (relation->rd_rel->relkind != RELKIND_RELATION &&
2363 relation->rd_rel->relkind != RELKIND_MATVIEW)
2364 {
2365 /* toast table entries should never be recursively toasted */
2367 return tup;
2368 }
2369 else if (HeapTupleHasExternal(tup) || tup->t_len > TOAST_TUPLE_THRESHOLD)
2370 return heap_toast_insert_or_update(relation, tup, NULL, options);
2371 else
2372 return tup;
2373}

References Assert, ereport, errcode(), errmsg, ERROR, fb(), HEAP2_XACT_MASK, HEAP_INSERT_FROZEN, heap_toast_insert_or_update(), HEAP_XACT_MASK, HEAP_XMAX_INVALID, HeapTupleHasExternal(), HeapTupleHeaderSetCmin(), HeapTupleHeaderSetXmax(), HeapTupleHeaderSetXmin(), HeapTupleHeaderSetXminFrozen(), IsParallelWorker, RelationData::rd_rel, RelationGetRelid, and TOAST_TUPLE_THRESHOLD.

Referenced by heap_insert(), and heap_multi_insert().

◆ heap_prepare_pagescan()

void heap_prepare_pagescan ( TableScanDesc  sscan)

Definition at line 616 of file heapam.c.

/*
 * heap_prepare_pagescan -- make the scan's current page ready for
 * page-at-a-time visibility processing.
 *
 * Prunes the page opportunistically, then (while holding a share lock on
 * the buffer) collects the visible tuples into scan->rs_ntuples via
 * page_collect_tuples(), using the page-level all-visible flag as a fast
 * path when the snapshot was not taken during recovery.
 *
 * NOTE(review): documentation extraction; original lines 618, 625, 630,
 * 643, 669-670, 680, 689 and 697 are absent below (declarations, the
 * pagemode Assert, the LockBuffer share-lock/unlock calls, and the
 * CheckForSerializableConflictOutNeeded() conditionals, per the References
 * list) -- confirm against heapam.c.
 */
617{
619 Buffer buffer = scan->rs_cbuf;
620 BlockNumber block = scan->rs_cblock;
621 Snapshot snapshot;
622 Page page;
623 int lines;
624 bool all_visible;
626
627 Assert(BufferGetBlockNumber(buffer) == block);
628
629 /* ensure we're not accidentally being used when not in pagemode */
631 snapshot = scan->rs_base.rs_snapshot;
632
633 /*
634 * Prune and repair fragmentation for the whole page, if possible.
635 */
636 heap_page_prune_opt(scan->rs_base.rs_rd, buffer);
637
638 /*
639 * We must hold share lock on the buffer content while examining tuple
640 * visibility. Afterwards, however, the tuples we have found to be
641 * visible are guaranteed good as long as we hold the buffer pin.
642 */
644
645 page = BufferGetPage(buffer);
646 lines = PageGetMaxOffsetNumber(page);
647
648 /*
649 * If the all-visible flag indicates that all tuples on the page are
650 * visible to everyone, we can skip the per-tuple visibility tests.
651 *
652 * Note: In hot standby, a tuple that's already visible to all
653 * transactions on the primary might still be invisible to a read-only
654 * transaction in the standby. We partly handle this problem by tracking
655 * the minimum xmin of visible tuples as the cut-off XID while marking a
656 * page all-visible on the primary and WAL log that along with the
657 * visibility map SET operation. In hot standby, we wait for (or abort)
658 * all transactions that may not see one or more tuples on
659 * the page. That's how index-only scans work fine in hot standby. A
660 * crucial difference between index-only scans and heap scans is that the
661 * index-only scan completely relies on the visibility map whereas heap
662 * scan looks at the page-level PD_ALL_VISIBLE flag. We are not sure if
663 * the page-level flag can be trusted in the same way, because it might
664 * get propagated somehow without being explicitly WAL-logged, e.g. via a
665 * full page write. Until we can prove that beyond doubt, let's check each
666 * tuple for visibility the hard way.
667 */
668 all_visible = PageIsAllVisible(page) && !snapshot->takenDuringRecovery;
671
672 /*
673 * We call page_collect_tuples() with constant arguments, to get the
674 * compiler to constant fold the constant arguments. Separate calls with
675 * constant arguments, rather than variables, are needed on several
676 * compilers to actually perform constant folding.
677 */
678 if (likely(all_visible))
679 {
681 scan->rs_ntuples = page_collect_tuples(scan, snapshot, page, buffer,
682 block, lines, true, false);
683 else
684 scan->rs_ntuples = page_collect_tuples(scan, snapshot, page, buffer,
685 block, lines, true, true);
686 }
687 else
688 {
690 scan->rs_ntuples = page_collect_tuples(scan, snapshot, page, buffer,
691 block, lines, false, false);
692 else
693 scan->rs_ntuples = page_collect_tuples(scan, snapshot, page, buffer,
694 block, lines, false, true);
695 }
696
698}

References Assert, BUFFER_LOCK_SHARE, BUFFER_LOCK_UNLOCK, BufferGetBlockNumber(), BufferGetPage(), CheckForSerializableConflictOutNeeded(), fb(), heap_page_prune_opt(), likely, LockBuffer(), page_collect_tuples(), PageGetMaxOffsetNumber(), PageIsAllVisible(), HeapScanDescData::rs_base, HeapScanDescData::rs_cblock, HeapScanDescData::rs_cbuf, TableScanDescData::rs_flags, HeapScanDescData::rs_ntuples, TableScanDescData::rs_rd, TableScanDescData::rs_snapshot, SO_ALLOW_PAGEMODE, and SnapshotData::takenDuringRecovery.

Referenced by heapam_scan_sample_next_block(), and heapgettup_pagemode().

◆ heap_rescan()

void heap_rescan ( TableScanDesc  sscan,
ScanKey  key,
bool  set_params,
bool  allow_strat,
bool  allow_sync,
bool  allow_pagemode 
)

Definition at line 1318 of file heapam.c.

/*
 * heap_rescan -- restart a heap scan from the beginning.
 *
 * If set_params is true, the scan's strategy/syncscan/pagemode behavior is
 * re-derived from allow_strat, allow_sync and allow_pagemode; the elided
 * statements presumably toggle SO_ALLOW_STRAT, SO_ALLOW_SYNC and
 * SO_ALLOW_PAGEMODE in rs_flags (per the References list) -- confirm
 * against heapam.c.  Any pinned current buffer is released, the read
 * stream (if any) is reset, and the descriptor is reinitialized via
 * initscan().
 *
 * NOTE(review): documentation extraction; original lines 1321, 1326, 1328,
 * 1331, 1333, 1336-1337, 1339 and 1362 are absent below.
 */
1320{
1322
1323 if (set_params)
1324 {
1325 if (allow_strat)
1327 else
1329
1330 if (allow_sync)
1332 else
1334
1335 if (allow_pagemode && scan->rs_base.rs_snapshot &&
1338 else
1340 }
1341
1342 /*
1343 * unpin scan buffers
1344 */
1345 if (BufferIsValid(scan->rs_cbuf))
1346 {
1347 ReleaseBuffer(scan->rs_cbuf);
1348 scan->rs_cbuf = InvalidBuffer;
1349 }
1350
1351 /*
1352 * SO_TYPE_BITMAPSCAN would be cleaned up here, but it does not hold any
1353 * additional data vs a normal HeapScan
1354 */
1355
1356 /*
1357 * The read stream is reset on rescan. This must be done before
1358 * initscan(), as some state referred to by read_stream_reset() is reset
1359 * in initscan().
1360 */
1361 if (scan->rs_read_stream)
1363
1364 /*
1365 * reinitialize scan descriptor
1366 */
1367 initscan(scan, key, true);
1368}

References BufferIsValid(), fb(), initscan(), InvalidBuffer, IsMVCCSnapshot, read_stream_reset(), ReleaseBuffer(), HeapScanDescData::rs_base, HeapScanDescData::rs_cbuf, TableScanDescData::rs_flags, HeapScanDescData::rs_read_stream, TableScanDescData::rs_snapshot, SO_ALLOW_PAGEMODE, SO_ALLOW_STRAT, and SO_ALLOW_SYNC.

◆ heap_scan_stream_read_next_parallel()

static BlockNumber heap_scan_stream_read_next_parallel ( ReadStream stream,
void callback_private_data,
void per_buffer_data 
)
static

◆ heap_scan_stream_read_next_serial()

static BlockNumber heap_scan_stream_read_next_serial ( ReadStream stream,
void callback_private_data,
void per_buffer_data 
)
static

Definition at line 292 of file heapam.c.

/*
 * heap_scan_stream_read_next_serial -- read-stream callback supplying the
 * next block number for a non-parallel heap scan.
 *
 * On the first call it initializes the scan's prefetch position
 * (presumably via heapgettup_initial_block(), per the References list;
 * the assignment at original line 300 is elided here) and marks the scan
 * inited; on later calls it advances rs_prefetch_block with
 * heapgettup_advance_block() (assignment head at original line 304
 * elided).  Returns scan->rs_prefetch_block as the next block to read.
 */
295{
296 HeapScanDesc scan = (HeapScanDesc) callback_private_data;
297
298 if (unlikely(!scan->rs_inited))
299 {
301 scan->rs_inited = true;
302 }
303 else
305 scan->rs_prefetch_block,
306 scan->rs_dir);
307
308 return scan->rs_prefetch_block;
309}

References heapgettup_advance_block(), heapgettup_initial_block(), HeapScanDescData::rs_dir, HeapScanDescData::rs_inited, HeapScanDescData::rs_prefetch_block, and unlikely.

Referenced by heap_beginscan().

◆ heap_set_tidrange()

void heap_set_tidrange ( TableScanDesc  sscan,
ItemPointer  mintid,
ItemPointer  maxtid 
)

Definition at line 1479 of file heapam.c.

/*
 * heap_set_tidrange -- restrict a TID-range scan to [mintid, maxtid].
 *
 * Clamps the caller's bounds to the TIDs actually possible in the
 * relation, converts the resulting range into a start block and block
 * count for heap_setscanlimits(), and records the final TID bounds in
 * sscan->st.tidrange.  An empty relation returns early; an inverted/empty
 * range results in a zero-block scan.
 *
 * NOTE(review): documentation extraction; original lines 1482-1486,
 * 1500-1501, 1508-1509, 1515-1516, 1522, 1525, 1538, 1540-1541 and 1544
 * are absent below (local declarations and the ItemPointerSet()/
 * ItemPointerCompare()/heap_setscanlimits() calls, per the References
 * list) -- confirm against heapam.c.
 */
1481{
1487
1488 /*
1489 * For relations without any pages, we can simply leave the TID range
1490 * unset. There will be no tuples to scan, therefore no tuples outside
1491 * the given TID range.
1492 */
1493 if (scan->rs_nblocks == 0)
1494 return;
1495
1496 /*
1497 * Set up some ItemPointers which point to the first and last possible
1498 * tuples in the heap.
1499 */
1502
1503 /*
1504 * If the given maximum TID is below the highest possible TID in the
1505 * relation, then restrict the range to that, otherwise we scan to the end
1506 * of the relation.
1507 */
1510
1511 /*
1512 * If the given minimum TID is above the lowest possible TID in the
1513 * relation, then restrict the range to only scan for TIDs above that.
1514 */
1517
1518 /*
1519 * Check for an empty range and protect from would-be negative results
1520 * from the numBlks calculation below.
1521 */
1523 {
1524 /* Set an empty range of blocks to scan */
1526 return;
1527 }
1528
1529 /*
1530 * Calculate the first block and the number of blocks we must scan. We
1531 * could be more aggressive here and perform some more validation to try
1532 * and further narrow the scope of blocks to scan by checking if the
1533 * lowestItem has an offset above MaxOffsetNumber. In this case, we could
1534 * advance startBlk by one. Likewise, if highestItem has an offset of 0
1535 * we could scan one fewer blocks. However, such an optimization does not
1536 * seem worth troubling over, currently.
1537 */
1539
1542
1543 /* Set the start block and number of blocks to scan */
1545
1546 /* Finally, set the TID range in sscan */
1547 ItemPointerCopy(&lowestItem, &sscan->st.tidrange.rs_mintid);
1548 ItemPointerCopy(&highestItem, &sscan->st.tidrange.rs_maxtid);
1549}

References fb(), FirstOffsetNumber, heap_setscanlimits(), ItemPointerCompare(), ItemPointerCopy(), ItemPointerGetBlockNumberNoCheck(), ItemPointerSet(), MaxOffsetNumber, and HeapScanDescData::rs_nblocks.

◆ heap_setscanlimits()

void heap_setscanlimits ( TableScanDesc  sscan,
BlockNumber  startBlk,
BlockNumber  numBlks 
)

Definition at line 500 of file heapam.c.

/*
 * heap_setscanlimits -- limit a heap scan to numBlks blocks starting at
 * startBlk.
 *
 * Must be called before the scan has begun (rs_inited still false);
 * startBlk must be 0 or a valid block of the relation.  Only records
 * rs_startblock and rs_numblocks; the scan code consults them later.
 *
 * NOTE(review): documentation extraction; original lines 502 and 506 are
 * absent below (presumably the HeapScanDesc cast of sscan and an Assert
 * that SO_ALLOW_SYNC is clear, per the References list and the "else
 * rs_startblock is significant" comment) -- confirm against heapam.c.
 */
501{
503
504 Assert(!scan->rs_inited); /* else too late to change */
505 /* else rs_startblock is significant */
507
508 /* Check startBlk is valid (but allow case of zero blocks...) */
509 Assert(startBlk == 0 || startBlk < scan->rs_nblocks);
510
511 scan->rs_startblock = startBlk;
512 scan->rs_numblocks = numBlks;
513}

References Assert, fb(), HeapScanDescData::rs_base, TableScanDescData::rs_flags, HeapScanDescData::rs_inited, HeapScanDescData::rs_numblocks, HeapScanDescData::rs_startblock, and SO_ALLOW_SYNC.

Referenced by heap_set_tidrange(), and heapam_index_build_range_scan().

◆ heap_tuple_needs_eventual_freeze()

bool heap_tuple_needs_eventual_freeze ( HeapTupleHeader  tuple)

Definition at line 7897 of file heapam.c.

7898{
7899 TransactionId xid;
7900
7901 /*
7902 * If xmin is a normal transaction ID, this tuple is definitely not
7903 * frozen.
7904 */
7905 xid = HeapTupleHeaderGetXmin(tuple);
7906 if (TransactionIdIsNormal(xid))
7907 return true;
7908
7909 /*
7910 * If xmax is a valid xact or multixact, this tuple is also not frozen.
7911 */
7912 if (tuple->t_infomask & HEAP_XMAX_IS_MULTI)
7913 {
7914 MultiXactId multi;
7915
7916 multi = HeapTupleHeaderGetRawXmax(tuple);
7917 if (MultiXactIdIsValid(multi))
7918 return true;
7919 }
7920 else
7921 {
7922 xid = HeapTupleHeaderGetRawXmax(tuple);
7923 if (TransactionIdIsNormal(xid))
7924 return true;
7925 }
7926
7927 if (tuple->t_infomask & HEAP_MOVED)
7928 {
7929 xid = HeapTupleHeaderGetXvac(tuple);
7930 if (TransactionIdIsNormal(xid))
7931 return true;
7932 }
7933
7934 return false;
7935}

References HEAP_MOVED, HEAP_XMAX_IS_MULTI, HeapTupleHeaderGetRawXmax(), HeapTupleHeaderGetXmin(), HeapTupleHeaderGetXvac(), MultiXactIdIsValid, HeapTupleHeaderData::t_infomask, and TransactionIdIsNormal.

Referenced by collect_corrupt_items(), and heap_page_would_be_all_visible().

◆ heap_tuple_should_freeze()

bool heap_tuple_should_freeze ( HeapTupleHeader  tuple,
const struct VacuumCutoffs cutoffs,
TransactionId NoFreezePageRelfrozenXid,
MultiXactId NoFreezePageRelminMxid 
)

Definition at line 7952 of file heapam.c.

/*
 * heap_tuple_should_freeze -- decide whether VACUUM must freeze this tuple,
 * while tracking the oldest XID/MultiXactId the page would retain if it is
 * NOT frozen.
 *
 * Returns true when xmin, xmax (a plain XID or any member of a MultiXact),
 * or xvac falls below the relevant cutoff in *cutoffs.  As a side effect,
 * *NoFreezePageRelfrozenXid and *NoFreezePageRelminMxid are ratcheted
 * backwards to cover every normal XID/multi seen in the tuple header.
 *
 * NOTE(review): documentation extraction; original lines 7965, 7973, 7982,
 * 8007, 8015, 8020 and 8035 are absent below -- judging from the References
 * list, these are Asserts against cutoffs->relfrozenxid/relminmxid, the
 * reset of xid to InvalidTransactionId before the xmax check, and the
 * final argument of GetMultiXactIdMembers().  Confirm against heapam.c.
 */
7956{
7957 TransactionId xid;
7958 MultiXactId multi;
7959 bool freeze = false;
7960
7961 /* First deal with xmin */
7962 xid = HeapTupleHeaderGetXmin(tuple);
7963 if (TransactionIdIsNormal(xid))
7964 {
7966 if (TransactionIdPrecedes(xid, *NoFreezePageRelfrozenXid))
7967 *NoFreezePageRelfrozenXid = xid;
7968 if (TransactionIdPrecedes(xid, cutoffs->FreezeLimit))
7969 freeze = true;
7970 }
7971
7972 /* Now deal with xmax */
7974 multi = InvalidMultiXactId;
7975 if (tuple->t_infomask & HEAP_XMAX_IS_MULTI)
7976 multi = HeapTupleHeaderGetRawXmax(tuple);
7977 else
7978 xid = HeapTupleHeaderGetRawXmax(tuple);
7979
7980 if (TransactionIdIsNormal(xid))
7981 {
7983 /* xmax is a non-permanent XID */
7984 if (TransactionIdPrecedes(xid, *NoFreezePageRelfrozenXid))
7985 *NoFreezePageRelfrozenXid = xid;
7986 if (TransactionIdPrecedes(xid, cutoffs->FreezeLimit))
7987 freeze = true;
7988 }
7989 else if (!MultiXactIdIsValid(multi))
7990 {
7991 /* xmax is a permanent XID or invalid MultiXactId/XID */
7992 }
7993 else if (HEAP_LOCKED_UPGRADED(tuple->t_infomask))
7994 {
7995 /* xmax is a pg_upgrade'd MultiXact, which can't have updater XID */
7996 if (MultiXactIdPrecedes(multi, *NoFreezePageRelminMxid))
7997 *NoFreezePageRelminMxid = multi;
7998 /* heap_prepare_freeze_tuple always freezes pg_upgrade'd xmax */
7999 freeze = true;
8000 }
8001 else
8002 {
8003 /* xmax is a MultiXactId that may have an updater XID */
8004 MultiXactMember *members;
8005 int nmembers;
8006
8008 if (MultiXactIdPrecedes(multi, *NoFreezePageRelminMxid))
8009 *NoFreezePageRelminMxid = multi;
8010 if (MultiXactIdPrecedes(multi, cutoffs->MultiXactCutoff))
8011 freeze = true;
8012
8013 /* need to check whether any member of the mxact is old */
8014 nmembers = GetMultiXactIdMembers(multi, &members, false,
8016
8017 for (int i = 0; i < nmembers; i++)
8018 {
8019 xid = members[i].xid;
8021 if (TransactionIdPrecedes(xid, *NoFreezePageRelfrozenXid))
8022 *NoFreezePageRelfrozenXid = xid;
8023 if (TransactionIdPrecedes(xid, cutoffs->FreezeLimit))
8024 freeze = true;
8025 }
8026 if (nmembers > 0)
8027 pfree(members);
8028 }
8029
8030 if (tuple->t_infomask & HEAP_MOVED)
8031 {
8032 xid = HeapTupleHeaderGetXvac(tuple);
8033 if (TransactionIdIsNormal(xid))
8034 {
8036 if (TransactionIdPrecedes(xid, *NoFreezePageRelfrozenXid))
8037 *NoFreezePageRelfrozenXid = xid;
8038 /* heap_prepare_freeze_tuple forces xvac freezing */
8039 freeze = true;
8040 }
8041 }
8042
8043 return freeze;
8044}

References Assert, VacuumCutoffs::FreezeLimit, GetMultiXactIdMembers(), HEAP_LOCKED_UPGRADED(), HEAP_MOVED, HEAP_XMAX_IS_LOCKED_ONLY(), HEAP_XMAX_IS_MULTI, HeapTupleHeaderGetRawXmax(), HeapTupleHeaderGetXmin(), HeapTupleHeaderGetXvac(), i, InvalidMultiXactId, InvalidTransactionId, VacuumCutoffs::MultiXactCutoff, MultiXactIdIsValid, MultiXactIdPrecedes(), MultiXactIdPrecedesOrEquals(), pfree(), VacuumCutoffs::relfrozenxid, VacuumCutoffs::relminmxid, HeapTupleHeaderData::t_infomask, TransactionIdIsNormal, TransactionIdPrecedes(), TransactionIdPrecedesOrEquals(), and MultiXactMember::xid.

Referenced by heap_prepare_freeze_tuple(), and lazy_scan_noprune().

◆ heap_update()

TM_Result heap_update ( Relation  relation,
const ItemPointerData otid,
HeapTuple  newtup,
CommandId  cid,
Snapshot  crosscheck,
bool  wait,
TM_FailureData tmfd,
LockTupleMode lockmode,
TU_UpdateIndexes update_indexes 
)

Definition at line 3313 of file heapam.c.

3317{
3318 TM_Result result;
3326 ItemId lp;
3330 bool old_key_copied = false;
3331 Page page;
3332 BlockNumber block;
3334 Buffer buffer,
3335 newbuf,
3336 vmbuffer = InvalidBuffer,
3338 bool need_toast;
3340 pagefree;
3341 bool have_tuple_lock = false;
3342 bool iscombo;
3343 bool use_hot_update = false;
3344 bool summarized_update = false;
3345 bool key_intact;
3346 bool all_visible_cleared = false;
3347 bool all_visible_cleared_new = false;
3348 bool checked_lockers;
3349 bool locker_remains;
3350 bool id_has_external = false;
3357
3359
3360 /* Cheap, simplistic check that the tuple matches the rel's rowtype. */
3363
3364 AssertHasSnapshotForToast(relation);
3365
3366 /*
3367 * Forbid this during a parallel operation, lest it allocate a combo CID.
3368 * Other workers might need that combo CID for visibility checks, and we
3369 * have no provision for broadcasting it to them.
3370 */
3371 if (IsInParallelMode())
3372 ereport(ERROR,
3374 errmsg("cannot update tuples during a parallel operation")));
3375
3376#ifdef USE_ASSERT_CHECKING
3378#endif
3379
3380 /*
3381 * Fetch the list of attributes to be checked for various operations.
3382 *
3383 * For HOT considerations, this is wasted effort if we fail to update or
3384 * have to put the new tuple on a different page. But we must compute the
3385 * list before obtaining buffer lock --- in the worst case, if we are
3386 * doing an update on one of the relevant system catalogs, we could
3387 * deadlock if we try to fetch the list later. In any case, the relcache
3388 * caches the data so this is usually pretty cheap.
3389 *
3390 * We also need columns used by the replica identity and columns that are
3391 * considered the "key" of rows in the table.
3392 *
3393 * Note that we get copies of each bitmap, so we need not worry about
3394 * relcache flush happening midway through.
3395 */
3408
3410 INJECTION_POINT("heap_update-before-pin", NULL);
3411 buffer = ReadBuffer(relation, block);
3412 page = BufferGetPage(buffer);
3413
3414 /*
3415 * Before locking the buffer, pin the visibility map page if it appears to
3416 * be necessary. Since we haven't got the lock yet, someone else might be
3417 * in the middle of changing this, so we'll need to recheck after we have
3418 * the lock.
3419 */
3420 if (PageIsAllVisible(page))
3421 visibilitymap_pin(relation, block, &vmbuffer);
3422
3424
3426
3427 /*
3428 * Usually, a buffer pin and/or snapshot blocks pruning of otid, ensuring
3429 * we see LP_NORMAL here. When the otid origin is a syscache, we may have
3430 * neither a pin nor a snapshot. Hence, we may see other LP_ states, each
3431 * of which indicates concurrent pruning.
3432 *
3433 * Failing with TM_Updated would be most accurate. However, unlike other
3434 * TM_Updated scenarios, we don't know the successor ctid in LP_UNUSED and
3435 * LP_DEAD cases. While the distinction between TM_Updated and TM_Deleted
3436 * does matter to SQL statements UPDATE and MERGE, those SQL statements
3437 * hold a snapshot that ensures LP_NORMAL. Hence, the choice between
3438 * TM_Updated and TM_Deleted affects only the wording of error messages.
3439 * Settle on TM_Deleted, for two reasons. First, it avoids complicating
3440 * the specification of when tmfd->ctid is valid. Second, it creates
3441 * error log evidence that we took this branch.
3442 *
3443 * Since it's possible to see LP_UNUSED at otid, it's also possible to see
3444 * LP_NORMAL for a tuple that replaced LP_UNUSED. If it's a tuple for an
3445 * unrelated row, we'll fail with "duplicate key value violates unique".
3446 * XXX if otid is the live, newer version of the newtup row, we'll discard
3447 * changes originating in versions of this catalog row after the version
3448 * the caller got from syscache. See syscache-update-pruned.spec.
3449 */
3450 if (!ItemIdIsNormal(lp))
3451 {
3453
3454 UnlockReleaseBuffer(buffer);
3456 if (vmbuffer != InvalidBuffer)
3457 ReleaseBuffer(vmbuffer);
3458 tmfd->ctid = *otid;
3459 tmfd->xmax = InvalidTransactionId;
3460 tmfd->cmax = InvalidCommandId;
3462
3467 /* modified_attrs not yet initialized */
3469 return TM_Deleted;
3470 }
3471
3472 /*
3473 * Fill in enough data in oldtup for HeapDetermineColumnsInfo to work
3474 * properly.
3475 */
3476 oldtup.t_tableOid = RelationGetRelid(relation);
3477 oldtup.t_data = (HeapTupleHeader) PageGetItem(page, lp);
3478 oldtup.t_len = ItemIdGetLength(lp);
3479 oldtup.t_self = *otid;
3480
3481 /* the new tuple is ready, except for this: */
3482 newtup->t_tableOid = RelationGetRelid(relation);
3483
3484 /*
3485 * Determine columns modified by the update. Additionally, identify
3486 * whether any of the unmodified replica identity key attributes in the
3487 * old tuple is externally stored or not. This is required because for
3488 * such attributes the flattened value won't be WAL logged as part of the
3489 * new tuple so we must include it as part of the old_key_tuple. See
3490 * ExtractReplicaIdentity.
3491 */
3493 id_attrs, &oldtup,
3495
3496 /*
3497 * If we're not updating any "key" column, we can grab a weaker lock type.
3498 * This allows for more concurrency when we are running simultaneously
3499 * with foreign key checks.
3500 *
3501 * Note that if a column gets detoasted while executing the update, but
3502 * the value ends up being the same, this test will fail and we will use
3503 * the stronger lock. This is acceptable; the important case to optimize
3504 * is updates that don't manipulate key columns, not those that
3505 * serendipitously arrive at the same key values.
3506 */
3508 {
3509 *lockmode = LockTupleNoKeyExclusive;
3511 key_intact = true;
3512
3513 /*
3514 * If this is the first possibly-multixact-able operation in the
3515 * current transaction, set my per-backend OldestMemberMXactId
3516 * setting. We can be certain that the transaction will never become a
3517 * member of any older MultiXactIds than that. (We have to do this
3518 * even if we end up just using our own TransactionId below, since
3519 * some other backend could incorporate our XID into a MultiXact
3520 * immediately afterwards.)
3521 */
3523 }
3524 else
3525 {
3526 *lockmode = LockTupleExclusive;
3528 key_intact = false;
3529 }
3530
3531 /*
3532 * Note: beyond this point, use oldtup not otid to refer to old tuple.
3533 * otid may very well point at newtup->t_self, which we will overwrite
3534 * with the new tuple's location, so there's great risk of confusion if we
3535 * use otid anymore.
3536 */
3537
3538l2:
3539 checked_lockers = false;
3540 locker_remains = false;
3541 result = HeapTupleSatisfiesUpdate(&oldtup, cid, buffer);
3542
3543 /* see below about the "no wait" case */
3544 Assert(result != TM_BeingModified || wait);
3545
3546 if (result == TM_Invisible)
3547 {
3548 UnlockReleaseBuffer(buffer);
3549 ereport(ERROR,
3551 errmsg("attempted to update invisible tuple")));
3552 }
3553 else if (result == TM_BeingModified && wait)
3554 {
3557 bool can_continue = false;
3558
3559 /*
3560 * XXX note that we don't consider the "no wait" case here. This
3561 * isn't a problem currently because no caller uses that case, but it
3562 * should be fixed if such a caller is introduced. It wasn't a
3563 * problem previously because this code would always wait, but now
3564 * that some tuple locks do not conflict with one of the lock modes we
3565 * use, it is possible that this case is interesting to handle
3566 * specially.
3567 *
3568 * This may cause failures with third-party code that calls
3569 * heap_update directly.
3570 */
3571
3572 /* must copy state data before unlocking buffer */
3574 infomask = oldtup.t_data->t_infomask;
3575
3576 /*
3577 * Now we have to do something about the existing locker. If it's a
3578 * multi, sleep on it; we might be awakened before it is completely
3579 * gone (or even not sleep at all in some cases); we need to preserve
3580 * it as locker, unless it is gone completely.
3581 *
3582 * If it's not a multi, we need to check for sleeping conditions
3583 * before actually going to sleep. If the update doesn't conflict
3584 * with the locks, we just continue without sleeping (but making sure
3585 * it is preserved).
3586 *
3587 * Before sleeping, we need to acquire tuple lock to establish our
3588 * priority for the tuple (see heap_lock_tuple). LockTuple will
3589 * release us when we are next-in-line for the tuple. Note we must
3590 * not acquire the tuple lock until we're sure we're going to sleep;
3591 * otherwise we're open for race conditions with other transactions
3592 * holding the tuple lock which sleep on us.
3593 *
3594 * If we are forced to "start over" below, we keep the tuple lock;
3595 * this arranges that we stay at the head of the line while rechecking
3596 * tuple state.
3597 */
3599 {
3601 int remain;
3602 bool current_is_member = false;
3603
3605 *lockmode, &current_is_member))
3606 {
3608
3609 /*
3610 * Acquire the lock, if necessary (but skip it when we're
3611 * requesting a lock and already have one; avoids deadlock).
3612 */
3613 if (!current_is_member)
3614 heap_acquire_tuplock(relation, &(oldtup.t_self), *lockmode,
3616
3617 /* wait for multixact */
3619 relation, &oldtup.t_self, XLTW_Update,
3620 &remain);
3621 checked_lockers = true;
3622 locker_remains = remain != 0;
3624
3625 /*
3626 * If xwait had just locked the tuple then some other xact
3627 * could update this tuple before we get to this point. Check
3628 * for xmax change, and start over if so.
3629 */
3630 if (xmax_infomask_changed(oldtup.t_data->t_infomask,
3631 infomask) ||
3633 xwait))
3634 goto l2;
3635 }
3636
3637 /*
3638 * Note that the multixact may not be done by now. It could have
3639 * surviving members; our own xact or other subxacts of this
3640 * backend, and also any other concurrent transaction that locked
3641 * the tuple with LockTupleKeyShare if we only got
3642 * LockTupleNoKeyExclusive. If this is the case, we have to be
3643 * careful to mark the updated tuple with the surviving members in
3644 * Xmax.
3645 *
3646 * Note that there could have been another update in the
3647 * MultiXact. In that case, we need to check whether it committed
3648 * or aborted. If it aborted we are safe to update it again;
3649 * otherwise there is an update conflict, and we have to return
3650 * TableTuple{Deleted, Updated} below.
3651 *
3652 * In the LockTupleExclusive case, we still need to preserve the
3653 * surviving members: those would include the tuple locks we had
3654 * before this one, which are important to keep in case this
3655 * subxact aborts.
3656 */
3657 if (!HEAP_XMAX_IS_LOCKED_ONLY(oldtup.t_data->t_infomask))
3659 else
3661
3662 /*
3663 * There was no UPDATE in the MultiXact; or it aborted. No
3664 * TransactionIdIsInProgress() call needed here, since we called
3665 * MultiXactIdWait() above.
3666 */
3669 can_continue = true;
3670 }
3672 {
3673 /*
3674 * The only locker is ourselves; we can avoid grabbing the tuple
3675 * lock here, but must preserve our locking information.
3676 */
3677 checked_lockers = true;
3678 locker_remains = true;
3679 can_continue = true;
3680 }
3682 {
3683 /*
3684 * If it's just a key-share locker, and we're not changing the key
3685 * columns, we don't need to wait for it to end; but we need to
3686 * preserve it as locker.
3687 */
3688 checked_lockers = true;
3689 locker_remains = true;
3690 can_continue = true;
3691 }
3692 else
3693 {
3694 /*
3695 * Wait for regular transaction to end; but first, acquire tuple
3696 * lock.
3697 */
3699 heap_acquire_tuplock(relation, &(oldtup.t_self), *lockmode,
3701 XactLockTableWait(xwait, relation, &oldtup.t_self,
3702 XLTW_Update);
3703 checked_lockers = true;
3705
3706 /*
3707 * xwait is done, but if xwait had just locked the tuple then some
3708 * other xact could update this tuple before we get to this point.
3709 * Check for xmax change, and start over if so.
3710 */
3711 if (xmax_infomask_changed(oldtup.t_data->t_infomask, infomask) ||
3714 goto l2;
3715
3716 /* Otherwise check if it committed or aborted */
3717 UpdateXmaxHintBits(oldtup.t_data, buffer, xwait);
3718 if (oldtup.t_data->t_infomask & HEAP_XMAX_INVALID)
3719 can_continue = true;
3720 }
3721
3722 if (can_continue)
3723 result = TM_Ok;
3724 else if (!ItemPointerEquals(&oldtup.t_self, &oldtup.t_data->t_ctid))
3725 result = TM_Updated;
3726 else
3727 result = TM_Deleted;
3728 }
3729
3730 /* Sanity check the result HeapTupleSatisfiesUpdate() and the logic above */
3731 if (result != TM_Ok)
3732 {
3733 Assert(result == TM_SelfModified ||
3734 result == TM_Updated ||
3735 result == TM_Deleted ||
3736 result == TM_BeingModified);
3737 Assert(!(oldtup.t_data->t_infomask & HEAP_XMAX_INVALID));
3738 Assert(result != TM_Updated ||
3739 !ItemPointerEquals(&oldtup.t_self, &oldtup.t_data->t_ctid));
3740 }
3741
3742 if (crosscheck != InvalidSnapshot && result == TM_Ok)
3743 {
3744 /* Perform additional check for transaction-snapshot mode RI updates */
3746 result = TM_Updated;
3747 }
3748
3749 if (result != TM_Ok)
3750 {
3751 tmfd->ctid = oldtup.t_data->t_ctid;
3752 tmfd->xmax = HeapTupleHeaderGetUpdateXid(oldtup.t_data);
3753 if (result == TM_SelfModified)
3754 tmfd->cmax = HeapTupleHeaderGetCmax(oldtup.t_data);
3755 else
3756 tmfd->cmax = InvalidCommandId;
3757 UnlockReleaseBuffer(buffer);
3758 if (have_tuple_lock)
3759 UnlockTupleTuplock(relation, &(oldtup.t_self), *lockmode);
3760 if (vmbuffer != InvalidBuffer)
3761 ReleaseBuffer(vmbuffer);
3763
3770 return result;
3771 }
3772
3773 /*
3774 * If we didn't pin the visibility map page and the page has become all
3775 * visible while we were busy locking the buffer, or during some
3776 * subsequent window during which we had it unlocked, we'll have to unlock
3777 * and re-lock, to avoid holding the buffer lock across an I/O. That's a
3778 * bit unfortunate, especially since we'll now have to recheck whether the
3779 * tuple has been locked or updated under us, but hopefully it won't
3780 * happen very often.
3781 */
3782 if (vmbuffer == InvalidBuffer && PageIsAllVisible(page))
3783 {
3785 visibilitymap_pin(relation, block, &vmbuffer);
3787 goto l2;
3788 }
3789
3790 /* Fill in transaction status data */
3791
3792 /*
3793 * If the tuple we're updating is locked, we need to preserve the locking
3794 * info in the old tuple's Xmax. Prepare a new Xmax value for this.
3795 */
3797 oldtup.t_data->t_infomask,
3798 oldtup.t_data->t_infomask2,
3799 xid, *lockmode, true,
3802
3803 /*
3804 * And also prepare an Xmax value for the new copy of the tuple. If there
3805 * was no xmax previously, or there was one but all lockers are now gone,
3806 * then use InvalidTransactionId; otherwise, get the xmax from the old
3807 * tuple. (In rare cases that might also be InvalidTransactionId and yet
3808 * not have the HEAP_XMAX_INVALID bit set; that's fine.)
3809 */
3810 if ((oldtup.t_data->t_infomask & HEAP_XMAX_INVALID) ||
3811 HEAP_LOCKED_UPGRADED(oldtup.t_data->t_infomask) ||
3814 else
3816
3818 {
3821 }
3822 else
3823 {
3824 /*
3825 * If we found a valid Xmax for the new tuple, then the infomask bits
3826 * to use on the new tuple depend on what was there on the old one.
3827 * Note that since we're doing an update, the only possibility is that
3828 * the lockers had FOR KEY SHARE lock.
3829 */
3830 if (oldtup.t_data->t_infomask & HEAP_XMAX_IS_MULTI)
3831 {
3834 }
3835 else
3836 {
3839 }
3840 }
3841
3842 /*
3843 * Prepare the new tuple with the appropriate initial values of Xmin and
3844 * Xmax, as well as initial infomask bits as computed above.
3845 */
3846 newtup->t_data->t_infomask &= ~(HEAP_XACT_MASK);
3847 newtup->t_data->t_infomask2 &= ~(HEAP2_XACT_MASK);
3848 HeapTupleHeaderSetXmin(newtup->t_data, xid);
3850 newtup->t_data->t_infomask |= HEAP_UPDATED | infomask_new_tuple;
3851 newtup->t_data->t_infomask2 |= infomask2_new_tuple;
3853
3854 /*
3855 * Replace cid with a combo CID if necessary. Note that we already put
3856 * the plain cid into the new tuple.
3857 */
3859
3860 /*
3861 * If the toaster needs to be activated, OR if the new tuple will not fit
3862 * on the same page as the old, then we need to release the content lock
3863 * (but not the pin!) on the old tuple's buffer while we are off doing
3864 * TOAST and/or table-file-extension work. We must mark the old tuple to
3865 * show that it's locked, else other processes may try to update it
3866 * themselves.
3867 *
3868 * We need to invoke the toaster if there are already any out-of-line
3869 * toasted values present, or if the new tuple is over-threshold.
3870 */
3871 if (relation->rd_rel->relkind != RELKIND_RELATION &&
3872 relation->rd_rel->relkind != RELKIND_MATVIEW)
3873 {
3874 /* toast table entries should never be recursively toasted */
3877 need_toast = false;
3878 }
3879 else
3882 newtup->t_len > TOAST_TUPLE_THRESHOLD);
3883
3885
3886 newtupsize = MAXALIGN(newtup->t_len);
3887
3889 {
3893 bool cleared_all_frozen = false;
3894
3895 /*
3896 * To prevent concurrent sessions from updating the tuple, we have to
3897 * temporarily mark it locked, while we release the page-level lock.
3898 *
3899 * To satisfy the rule that any xid potentially appearing in a buffer
3900 * written out to disk, we unfortunately have to WAL log this
3901 * temporary modification. We can reuse xl_heap_lock for this
3902 * purpose. If we crash/error before following through with the
3903 * actual update, xmax will be of an aborted transaction, allowing
3904 * other sessions to proceed.
3905 */
3906
3907 /*
3908 * Compute xmax / infomask appropriate for locking the tuple. This has
3909 * to be done separately from the combo that's going to be used for
3910 * updating, because the potentially created multixact would otherwise
3911 * be wrong.
3912 */
3914 oldtup.t_data->t_infomask,
3915 oldtup.t_data->t_infomask2,
3916 xid, *lockmode, false,
3919
3921
3923
3924 /* Clear obsolete visibility flags ... */
3925 oldtup.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
3926 oldtup.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
3928 /* ... and store info about transaction updating this tuple */
3931 oldtup.t_data->t_infomask |= infomask_lock_old_tuple;
3932 oldtup.t_data->t_infomask2 |= infomask2_lock_old_tuple;
3934
3935 /* temporarily make it look not-updated, but locked */
3936 oldtup.t_data->t_ctid = oldtup.t_self;
3937
3938 /*
3939 * Clear all-frozen bit on visibility map if needed. We could
3940 * immediately reset ALL_VISIBLE, but given that the WAL logging
3941 * overhead would be unchanged, that doesn't seem necessarily
3942 * worthwhile.
3943 */
3944 if (PageIsAllVisible(page) &&
3945 visibilitymap_clear(relation, block, vmbuffer,
3947 cleared_all_frozen = true;
3948
3949 MarkBufferDirty(buffer);
3950
3951 if (RelationNeedsWAL(relation))
3952 {
3955
3958
3959 xlrec.offnum = ItemPointerGetOffsetNumber(&oldtup.t_self);
3961 xlrec.infobits_set = compute_infobits(oldtup.t_data->t_infomask,
3962 oldtup.t_data->t_infomask2);
3963 xlrec.flags =
3967 PageSetLSN(page, recptr);
3968 }
3969
3971
3973
3974 /*
3975 * Let the toaster do its thing, if needed.
3976 *
3977 * Note: below this point, heaptup is the data we actually intend to
3978 * store into the relation; newtup is the caller's original untoasted
3979 * data.
3980 */
3981 if (need_toast)
3982 {
3983 /* Note we always use WAL and FSM during updates */
3985 newtupsize = MAXALIGN(heaptup->t_len);
3986 }
3987 else
3988 heaptup = newtup;
3989
3990 /*
3991 * Now, do we need a new page for the tuple, or not? This is a bit
3992 * tricky since someone else could have added tuples to the page while
3993 * we weren't looking. We have to recheck the available space after
3994 * reacquiring the buffer lock. But don't bother to do that if the
3995 * former amount of free space is still not enough; it's unlikely
3996 * there's more free now than before.
3997 *
3998 * What's more, if we need to get a new page, we will need to acquire
3999 * buffer locks on both old and new pages. To avoid deadlock against
4000 * some other backend trying to get the same two locks in the other
4001 * order, we must be consistent about the order we get the locks in.
4002 * We use the rule "lock the lower-numbered page of the relation
4003 * first". To implement this, we must do RelationGetBufferForTuple
4004 * while not holding the lock on the old page, and we must rely on it
4005 * to get the locks on both pages in the correct order.
4006 *
4007 * Another consideration is that we need visibility map page pin(s) if
4008 * we will have to clear the all-visible flag on either page. If we
4009 * call RelationGetBufferForTuple, we rely on it to acquire any such
4010 * pins; but if we don't, we have to handle that here. Hence we need
4011 * a loop.
4012 */
4013 for (;;)
4014 {
4015 if (newtupsize > pagefree)
4016 {
4017 /* It doesn't fit, must use RelationGetBufferForTuple. */
4018 newbuf = RelationGetBufferForTuple(relation, heaptup->t_len,
4019 buffer, 0, NULL,
4020 &vmbuffer_new, &vmbuffer,
4021 0);
4022 /* We're all done. */
4023 break;
4024 }
4025 /* Acquire VM page pin if needed and we don't have it. */
4026 if (vmbuffer == InvalidBuffer && PageIsAllVisible(page))
4027 visibilitymap_pin(relation, block, &vmbuffer);
4028 /* Re-acquire the lock on the old tuple's page. */
4030 /* Re-check using the up-to-date free space */
4032 if (newtupsize > pagefree ||
4033 (vmbuffer == InvalidBuffer && PageIsAllVisible(page)))
4034 {
4035 /*
4036 * Rats, it doesn't fit anymore, or somebody just now set the
4037 * all-visible flag. We must now unlock and loop to avoid
4038 * deadlock. Fortunately, this path should seldom be taken.
4039 */
4041 }
4042 else
4043 {
4044 /* We're all done. */
4045 newbuf = buffer;
4046 break;
4047 }
4048 }
4049 }
4050 else
4051 {
4052 /* No TOAST work needed, and it'll fit on same page */
4053 newbuf = buffer;
4054 heaptup = newtup;
4055 }
4056
4057 /*
4058 * We're about to do the actual update -- check for conflict first, to
4059 * avoid possibly having to roll back work we've just done.
4060 *
4061 * This is safe without a recheck as long as there is no possibility of
4062 * another process scanning the pages between this check and the update
4063 * being visible to the scan (i.e., exclusive buffer content lock(s) are
4064 * continuously held from this point until the tuple update is visible).
4065 *
4066 * For the new tuple the only check needed is at the relation level, but
4067 * since both tuples are in the same relation and the check for oldtup
4068 * will include checking the relation level, there is no benefit to a
4069 * separate check for the new tuple.
4070 */
4071 CheckForSerializableConflictIn(relation, &oldtup.t_self,
4072 BufferGetBlockNumber(buffer));
4073
4074 /*
4075 * At this point newbuf and buffer are both pinned and locked, and newbuf
4076 * has enough space for the new tuple. If they are the same buffer, only
4077 * one pin is held.
4078 */
4079
4080 if (newbuf == buffer)
4081 {
4082 /*
4083 * Since the new tuple is going into the same page, we might be able
4084 * to do a HOT update. Check if any of the index columns have been
4085 * changed.
4086 */
4088 {
4089 use_hot_update = true;
4090
4091 /*
4092 * If none of the columns that are used in hot-blocking indexes
4093 * were updated, we can apply HOT, but we do still need to check
4094 * if we need to update the summarizing indexes, and update those
4095 * indexes if the columns were updated, or we may fail to detect
4096 * e.g. value bound changes in BRIN minmax indexes.
4097 */
4099 summarized_update = true;
4100 }
4101 }
4102 else
4103 {
4104 /* Set a hint that the old page could use prune/defrag */
4105 PageSetFull(page);
4106 }
4107
4108 /*
4109 * Compute replica identity tuple before entering the critical section so
4110 * we don't PANIC upon a memory allocation failure.
4111 * ExtractReplicaIdentity() will return NULL if nothing needs to be
4112 * logged. Pass old key required as true only if the replica identity key
4113 * columns are modified or it has external data.
4114 */
4119
4120 /* NO EREPORT(ERROR) from here till changes are logged */
4122
4123 /*
4124 * If this transaction commits, the old tuple will become DEAD sooner or
4125 * later. Set flag that this page is a candidate for pruning once our xid
4126 * falls below the OldestXmin horizon. If the transaction finally aborts,
4127 * the subsequent page pruning will be a no-op and the hint will be
4128 * cleared.
4129 *
4130 * XXX Should we set hint on newbuf as well? If the transaction aborts,
4131 * there would be a prunable tuple in the newbuf; but for now we choose
4132 * not to optimize for aborts. Note that heap_xlog_update must be kept in
4133 * sync if this decision changes.
4134 */
4135 PageSetPrunable(page, xid);
4136
4137 if (use_hot_update)
4138 {
4139 /* Mark the old tuple as HOT-updated */
4141 /* And mark the new tuple as heap-only */
4143 /* Mark the caller's copy too, in case different from heaptup */
4145 }
4146 else
4147 {
4148 /* Make sure tuples are correctly marked as not-HOT */
4152 }
4153
4154 RelationPutHeapTuple(relation, newbuf, heaptup, false); /* insert new tuple */
4155
4156
4157 /* Clear obsolete visibility flags, possibly set by ourselves above... */
4158 oldtup.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
4159 oldtup.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
4160 /* ... and store info about transaction updating this tuple */
4163 oldtup.t_data->t_infomask |= infomask_old_tuple;
4164 oldtup.t_data->t_infomask2 |= infomask2_old_tuple;
4166
4167 /* record address of new tuple in t_ctid of old one */
4168 oldtup.t_data->t_ctid = heaptup->t_self;
4169
4170 /* clear PD_ALL_VISIBLE flags, reset all visibilitymap bits */
4171 if (PageIsAllVisible(BufferGetPage(buffer)))
4172 {
4173 all_visible_cleared = true;
4175 visibilitymap_clear(relation, BufferGetBlockNumber(buffer),
4176 vmbuffer, VISIBILITYMAP_VALID_BITS);
4177 }
4178 if (newbuf != buffer && PageIsAllVisible(BufferGetPage(newbuf)))
4179 {
4184 }
4185
4186 if (newbuf != buffer)
4188 MarkBufferDirty(buffer);
4189
4190 /* XLOG stuff */
4191 if (RelationNeedsWAL(relation))
4192 {
4194
4195 /*
4196 * For logical decoding we need combo CIDs to properly decode the
4197 * catalog.
4198 */
4200 {
4201 log_heap_new_cid(relation, &oldtup);
4202 log_heap_new_cid(relation, heaptup);
4203 }
4204
4205 recptr = log_heap_update(relation, buffer,
4210 if (newbuf != buffer)
4211 {
4213 }
4215 }
4216
4218
4219 if (newbuf != buffer)
4222
4223 /*
4224 * Mark old tuple for invalidation from system caches at next command
4225 * boundary, and mark the new tuple for invalidation in case we abort. We
4226 * have to do this before releasing the buffer because oldtup is in the
4227 * buffer. (heaptup is all in local memory, but it's necessary to process
4228 * both tuple versions in one call to inval.c so we can avoid redundant
4229 * sinval messages.)
4230 */
4232
4233 /* Now we can release the buffer(s) */
4234 if (newbuf != buffer)
4236 ReleaseBuffer(buffer);
4239 if (BufferIsValid(vmbuffer))
4240 ReleaseBuffer(vmbuffer);
4241
4242 /*
4243 * Release the lmgr tuple lock, if we had it.
4244 */
4245 if (have_tuple_lock)
4246 UnlockTupleTuplock(relation, &(oldtup.t_self), *lockmode);
4247
4248 pgstat_count_heap_update(relation, use_hot_update, newbuf != buffer);
4249
4250 /*
4251 * If heaptup is a private copy, release it. Don't forget to copy t_self
4252 * back to the caller's image, too.
4253 */
4254 if (heaptup != newtup)
4255 {
4256 newtup->t_self = heaptup->t_self;
4258 }
4259
4260 /*
4261 * If it is a HOT update, the update may still need to update summarized
4262 * indexes, lest we fail to update those summaries and get incorrect
4263 * results (for example, minmax bounds of the block may change with this
4264 * update).
4265 */
4266 if (use_hot_update)
4267 {
4270 else
4272 }
4273 else
4275
4278
4285
4286 return TM_Ok;
4287}

References Assert, AssertHasSnapshotForToast(), bms_add_members(), bms_free(), bms_overlap(), BUFFER_LOCK_EXCLUSIVE, BUFFER_LOCK_UNLOCK, BufferGetBlockNumber(), BufferGetPage(), BufferIsValid(), CacheInvalidateHeapTuple(), CheckForSerializableConflictIn(), TM_FailureData::cmax, compute_infobits(), compute_new_xmax_infomask(), TM_FailureData::ctid, DoesMultiXactIdConflict(), END_CRIT_SECTION, ereport, errcode(), errmsg, ERROR, ExtractReplicaIdentity(), fb(), GetCurrentTransactionId(), GetMultiXactIdHintBits(), HEAP2_XACT_MASK, heap_acquire_tuplock(), heap_freetuple(), HEAP_LOCKED_UPGRADED(), HEAP_MOVED, heap_toast_insert_or_update(), HEAP_UPDATED, HEAP_XACT_MASK, HEAP_XMAX_BITS, HEAP_XMAX_INVALID, HEAP_XMAX_IS_KEYSHR_LOCKED(), HEAP_XMAX_IS_LOCKED_ONLY(), HEAP_XMAX_IS_MULTI, HEAP_XMAX_KEYSHR_LOCK, HEAP_XMAX_LOCK_ONLY, HeapDetermineColumnsInfo(), HeapTupleClearHeapOnly(), HeapTupleClearHotUpdated(), HeapTupleGetUpdateXid(), HeapTupleHasExternal(), HeapTupleHeaderAdjustCmax(), HeapTupleHeaderGetCmax(), HeapTupleHeaderGetNatts, HeapTupleHeaderGetRawXmax(), HeapTupleHeaderGetUpdateXid(), HeapTupleHeaderSetCmax(), HeapTupleHeaderSetCmin(), HeapTupleHeaderSetXmax(), HeapTupleHeaderSetXmin(), HeapTupleSatisfiesUpdate(), HeapTupleSatisfiesVisibility(), HeapTupleSetHeapOnly(), HeapTupleSetHotUpdated(), INDEX_ATTR_BITMAP_HOT_BLOCKING, INDEX_ATTR_BITMAP_IDENTITY_KEY, INDEX_ATTR_BITMAP_KEY, INDEX_ATTR_BITMAP_SUMMARIZED, INJECTION_POINT, InvalidBuffer, InvalidCommandId, InvalidSnapshot, InvalidTransactionId, IsInParallelMode(), ItemIdGetLength, ItemIdIsNormal, ItemPointerEquals(), ItemPointerGetBlockNumber(), ItemPointerGetOffsetNumber(), ItemPointerIsValid(), LockBuffer(), LockTupleExclusive, LockTupleNoKeyExclusive, LockWaitBlock, log_heap_new_cid(), log_heap_update(), MarkBufferDirty(), MAXALIGN, MultiXactIdSetOldestMember(), MultiXactIdWait(), MultiXactStatusNoKeyUpdate, MultiXactStatusUpdate, PageClearAllVisible(), PageGetHeapFreeSpace(), PageGetItem(), PageGetItemId(), 
PageIsAllVisible(), PageSetFull(), PageSetLSN(), PageSetPrunable, pgstat_count_heap_update(), RelationData::rd_rel, ReadBuffer(), REGBUF_STANDARD, RelationGetBufferForTuple(), RelationGetIndexAttrBitmap(), RelationGetNumberOfAttributes, RelationGetRelid, RelationIsAccessibleInLogicalDecoding, RelationNeedsWAL, RelationPutHeapTuple(), RelationSupportsSysCache(), ReleaseBuffer(), SizeOfHeapLock, START_CRIT_SECTION, TM_BeingModified, TM_Deleted, TM_Invisible, TM_Ok, TM_SelfModified, TM_Updated, TOAST_TUPLE_THRESHOLD, TransactionIdDidAbort(), TransactionIdEquals, TransactionIdIsCurrentTransactionId(), TransactionIdIsValid, TU_All, TU_None, TU_Summarizing, UnlockReleaseBuffer(), UnlockTupleTuplock, UpdateXmaxHintBits(), VISIBILITYMAP_ALL_FROZEN, visibilitymap_clear(), visibilitymap_pin(), VISIBILITYMAP_VALID_BITS, XactLockTableWait(), XLH_LOCK_ALL_FROZEN_CLEARED, XLOG_HEAP_LOCK, XLogBeginInsert(), XLogInsert(), XLogRegisterBuffer(), XLogRegisterData(), XLTW_Update, TM_FailureData::xmax, and xmax_infomask_changed().

Referenced by heapam_tuple_update(), and simple_heap_update().

◆ HeapCheckForSerializableConflictOut()

void HeapCheckForSerializableConflictOut ( bool  visible,
Relation  relation,
HeapTuple  tuple,
Buffer  buffer,
Snapshot  snapshot 
)

Definition at line 9332 of file heapam.c.

9335{
 /*
  * Purpose: during a heap scan under SERIALIZABLE isolation, detect a
  * read-write conflict caused by a concurrent writer of this tuple and
  * report it via CheckForSerializableConflictOut().
  *
  * NOTE(review): this rendering elides several lines (the htsvResult
  * declaration, the HeapTupleSatisfiesVacuum() call, some case labels,
  * and the early-bail-out conditions near the end); comments below
  * describe only what is visible.
  */
9336 TransactionId xid;
9338
 /* Fast exit when this relation/snapshot cannot produce SSI conflicts. */
9339 if (!CheckForSerializableConflictOutNeeded(relation, snapshot))
9340 return;
9341
9342 /*
9343 * Check to see whether the tuple has been written to by a concurrent
9344 * transaction, either to create it not visible to us, or to delete it
9345 * while it is visible to us. The "visible" bool indicates whether the
9346 * tuple is visible to us, while HeapTupleSatisfiesVacuum checks what else
9347 * is going on with it.
9348 *
9349 * In the event of a concurrently inserted tuple that also happens to have
9350 * been concurrently updated (by a separate transaction), the xmin of the
9351 * tuple will be used -- not the updater's xid.
9352 */
 /* Pick the xid whose transaction is the potential conflict partner. */
9354 switch (htsvResult)
9355 {
9356 case HEAPTUPLE_LIVE:
 /* Live and visible: no writer to conflict with. */
9357 if (visible)
9358 return;
9359 xid = HeapTupleHeaderGetXmin(tuple->t_data);
9360 break;
 /* (elided case label) visible => conflict with the deleter/updater */
9363 if (visible)
9364 xid = HeapTupleHeaderGetUpdateXid(tuple->t_data);
9365 else
9366 xid = HeapTupleHeaderGetXmin(tuple->t_data);
9367
9369 {
9370 /* This is like the HEAPTUPLE_DEAD case */
9371 Assert(!visible);
9372 return;
9373 }
9374 break;
9376 xid = HeapTupleHeaderGetXmin(tuple->t_data);
9377 break;
9378 case HEAPTUPLE_DEAD:
 /* A dead tuple can never be visible to our snapshot. */
9379 Assert(!visible);
9380 return;
9381 default:
9382
9383 /*
9384 * The only way to get to this default clause is if a new value is
9385 * added to the enum type without adding it to this switch
9386 * statement. That's a bug, so elog.
9387 */
9388 elog(ERROR, "unrecognized return value from HeapTupleSatisfiesVacuum: %u", htsvResult);
9389
9390 /*
9391 * In spite of having all enum values covered and calling elog on
9392 * this default, some compilers think this is a code path which
9393 * allows xid to be used below without initialization. Silence
9394 * that warning.
9395 */
9397 }
9398
9401
9402 /*
9403 * Find top level xid. Bail out if xid is too early to be a conflict, or
9404 * if it's our own xid.
9405 */
 /* (elided conditions) too-old xid, or our own xid: nothing to report */
9407 return;
9410 return;
9411
9412 CheckForSerializableConflictOut(relation, xid, snapshot);
9413}

References Assert, CheckForSerializableConflictOut(), CheckForSerializableConflictOutNeeded(), elog, ERROR, fb(), GetTopTransactionIdIfAny(), HEAPTUPLE_DEAD, HEAPTUPLE_DELETE_IN_PROGRESS, HEAPTUPLE_INSERT_IN_PROGRESS, HEAPTUPLE_LIVE, HEAPTUPLE_RECENTLY_DEAD, HeapTupleHeaderGetUpdateXid(), HeapTupleHeaderGetXmin(), HeapTupleSatisfiesVacuum(), InvalidTransactionId, SubTransGetTopmostTransaction(), HeapTupleData::t_data, TransactionIdEquals, TransactionIdFollowsOrEquals(), TransactionIdIsValid, TransactionIdPrecedes(), and TransactionXmin.

Referenced by BitmapHeapScanNextBlock(), heap_fetch(), heap_get_latest_tid(), heap_hot_search_buffer(), heapam_scan_sample_next_tuple(), heapgettup(), and page_collect_tuples().

◆ HeapDetermineColumnsInfo()

static Bitmapset * HeapDetermineColumnsInfo ( Relation  relation,
Bitmapset interesting_cols,
Bitmapset external_cols,
HeapTuple  oldtup,
HeapTuple  newtup,
bool has_external 
)
static

Definition at line 4467 of file heapam.c.

4472{
 /*
  * Purpose: compare old and new tuple versions for each column in
  * interesting_cols and return a Bitmapset of the (zero-based) indexes
  * of columns whose values differ.  Also sets *has_external when a
  * modified-check survivor in the old tuple is stored externally and is
  * a member of external_cols (the external check itself is elided in
  * this rendering -- verify against the real source).
  *
  * NOTE(review): the declaration of the "modified" result set and the
  * attrnum computation line are elided here.
  */
4473 int attidx;
4475 TupleDesc tupdesc = RelationGetDescr(relation);
4476
 /* bms_next_member() iteration starts from -1 by convention. */
4477 attidx = -1;
4478 while ((attidx = bms_next_member(interesting_cols, attidx)) >= 0)
4479 {
4480 /* attidx is zero-based, attrnum is the normal attribute number */
4482 Datum value1,
4483 value2;
4484 bool isnull1,
4485 isnull2;
4486
4487 /*
4488 * If it's a whole-tuple reference, say "not equal". It's not really
4489 * worth supporting this case, since it could only succeed after a
4490 * no-op update, which is hardly a case worth optimizing for.
4491 */
4492 if (attrnum == 0)
4493 {
4494 modified = bms_add_member(modified, attidx);
4495 continue;
4496 }
4497
4498 /*
4499 * Likewise, automatically say "not equal" for any system attribute
4500 * other than tableOID; we cannot expect these to be consistent in a
4501 * HOT chain, or even to be set correctly yet in the new tuple.
4502 */
4503 if (attrnum < 0)
4504 {
4505 if (attrnum != TableOidAttributeNumber)
4506 {
4507 modified = bms_add_member(modified, attidx);
4508 continue;
4509 }
4510 }
4511
4512 /*
4513 * Extract the corresponding values. XXX this is pretty inefficient
4514 * if there are many indexed columns. Should we do a single
4515 * heap_deform_tuple call on each tuple, instead? But that doesn't
4516 * work for system columns ...
4517 */
4518 value1 = heap_getattr(oldtup, attrnum, tupdesc, &isnull1);
4519 value2 = heap_getattr(newtup, attrnum, tupdesc, &isnull2);
4520
4521 if (!heap_attr_equals(tupdesc, attrnum, value1,
4522 value2, isnull1, isnull2))
4523 {
4524 modified = bms_add_member(modified, attidx);
4525 continue;
4526 }
4527
4528 /*
4529 * No need to check attributes that can't be stored externally. Note
4530 * that system attributes can't be stored externally.
4531 */
 /* attlen != -1 means fixed-length or cstring: never toasted out-of-line */
4532 if (attrnum < 0 || isnull1 ||
4533 TupleDescCompactAttr(tupdesc, attrnum - 1)->attlen != -1)
4534 continue;
4535
4536 /*
4537 * Check if the old tuple's attribute is stored externally and is a
4538 * member of external_cols.
4539 */
 /* (elided condition: VARATT_IS_EXTERNAL + bms_is_member -- per References) */
4542 *has_external = true;
4543 }
4544
4545 return modified;
4546}

References attlen, bms_add_member(), bms_is_member(), bms_next_member(), DatumGetPointer(), fb(), FirstLowInvalidHeapAttributeNumber, heap_attr_equals(), heap_getattr(), RelationGetDescr, TableOidAttributeNumber, TupleDescCompactAttr(), and VARATT_IS_EXTERNAL().

Referenced by heap_update().

◆ heapgettup()

static void heapgettup ( HeapScanDesc  scan,
ScanDirection  dir,
int  nkeys,
ScanKey  key 
)
static

Definition at line 960 of file heapam.c.

964{
 /*
  * Purpose: advance a non-pagemode heap scan in direction "dir" to the
  * next tuple visible to the scan's snapshot that passes the optional
  * scan keys, storing it in scan->rs_ctup.  On scan exhaustion the
  * current buffer is released and rs_inited is cleared.
  *
  * NOTE(review): this rendering elides some lines (the lineoff
  * declaration, the buffer lock/unlock calls, and the "continue_page:"
  * label the goto below targets).
  */
965 HeapTuple tuple = &(scan->rs_ctup);
966 Page page;
968 int linesleft;
969
970 if (likely(scan->rs_inited))
971 {
972 /* continue from previously returned page/tuple */
974 page = heapgettup_continue_page(scan, dir, &linesleft, &lineoff);
975 goto continue_page;
976 }
977
978 /*
979 * advance the scan until we find a qualifying tuple or run out of stuff
980 * to scan
981 */
982 while (true)
983 {
984 heap_fetch_next_buffer(scan, dir);
985
986 /* did we run out of blocks to scan? */
987 if (!BufferIsValid(scan->rs_cbuf))
988 break;
989
991
993 page = heapgettup_start_page(scan, dir, &linesleft, &lineoff);
995
996 /*
997 * Only continue scanning the page while we have lines left.
998 *
999 * Note that this protects us from accessing line pointers past
1000 * PageGetMaxOffsetNumber(); both for forward scans when we resume the
1001 * table scan, and for when we start scanning a new page.
1002 */
 /* "lineoff += dir" walks forward (+1) or backward (-1) over line ptrs */
1003 for (; linesleft > 0; linesleft--, lineoff += dir)
1004 {
1005 bool visible;
1007
1008 if (!ItemIdIsNormal(lpp))
1009 continue;
1010
1011 tuple->t_data = (HeapTupleHeader) PageGetItem(page, lpp);
1012 tuple->t_len = ItemIdGetLength(lpp);
1013 ItemPointerSet(&(tuple->t_self), scan->rs_cblock, lineoff);
1014
1015 visible = HeapTupleSatisfiesVisibility(tuple,
1016 scan->rs_base.rs_snapshot,
1017 scan->rs_cbuf);
1018
 /* SSI conflict check runs even for invisible tuples (see callee) */
1020 tuple, scan->rs_cbuf,
1021 scan->rs_base.rs_snapshot);
1022
1023 /* skip tuples not visible to this snapshot */
1024 if (!visible)
1025 continue;
1026
1027 /* skip any tuples that don't match the scan key */
1028 if (key != NULL &&
1030 nkeys, key))
1031 continue;
1032
 /* Found a match: remember position so the next call can resume. */
1034 scan->rs_coffset = lineoff;
1035 return;
1036 }
1037
1038 /*
1039 * if we get here, it means we've exhausted the items on this page and
1040 * it's time to move to the next.
1041 */
1043 }
1044
1045 /* end of scan */
1046 if (BufferIsValid(scan->rs_cbuf))
1047 ReleaseBuffer(scan->rs_cbuf);
1048
 /* Reset scan state so a rescan starts fresh. */
1049 scan->rs_cbuf = InvalidBuffer;
1052 tuple->t_data = NULL;
1053 scan->rs_inited = false;
1054}

References Assert, BUFFER_LOCK_SHARE, BUFFER_LOCK_UNLOCK, BufferGetBlockNumber(), BufferIsValid(), fb(), heap_fetch_next_buffer(), HeapCheckForSerializableConflictOut(), heapgettup_continue_page(), heapgettup_start_page(), HeapKeyTest(), HeapTupleSatisfiesVisibility(), InvalidBlockNumber, InvalidBuffer, ItemIdGetLength, ItemIdIsNormal, ItemPointerSet(), likely, LockBuffer(), PageGetItem(), PageGetItemId(), RelationGetDescr, ReleaseBuffer(), HeapScanDescData::rs_base, HeapScanDescData::rs_cblock, HeapScanDescData::rs_cbuf, HeapScanDescData::rs_coffset, HeapScanDescData::rs_ctup, HeapScanDescData::rs_inited, HeapScanDescData::rs_prefetch_block, TableScanDescData::rs_rd, TableScanDescData::rs_snapshot, HeapTupleData::t_data, HeapTupleData::t_len, and HeapTupleData::t_self.

Referenced by heap_getnext(), heap_getnextslot(), and heap_getnextslot_tidrange().

◆ heapgettup_advance_block()

static BlockNumber heapgettup_advance_block ( HeapScanDesc  scan,
BlockNumber  block,
ScanDirection  dir 
)
inlinestatic

Definition at line 876 of file heapam.c.

877{
 /*
  * Purpose: compute the next block of a serial (non-parallel) heap scan
  * given the current block and scan direction, honoring syncscan
  * wraparound and the heap_setscanlimits() block limit.  Returns
  * InvalidBlockNumber when the scan is complete.
  *
  * NOTE(review): the forward/backward direction test at the top (line
  * 880) is elided in this rendering; the first branch is the forward
  * case, the "else" the backward case.
  */
878 Assert(scan->rs_base.rs_parallel == NULL);
879
881 {
882 block++;
883
884 /* wrap back to the start of the heap */
885 if (block >= scan->rs_nblocks)
886 block = 0;
887
888 /*
889 * Report our new scan position for synchronization purposes. We don't
890 * do that when moving backwards, however. That would just mess up any
891 * other forward-moving scanners.
892 *
893 * Note: we do this before checking for end of scan so that the final
894 * state of the position hint is back at the start of the rel. That's
895 * not strictly necessary, but otherwise when you run the same query
896 * multiple times the starting position would shift a little bit
897 * backwards on every invocation, which is confusing. We don't
898 * guarantee any specific ordering in general, though.
899 */
900 if (scan->rs_base.rs_flags & SO_ALLOW_SYNC)
901 ss_report_location(scan->rs_base.rs_rd, block);
902
903 /* we're done if we're back at where we started */
904 if (block == scan->rs_startblock)
905 return InvalidBlockNumber;
906
907 /* check if the limit imposed by heap_setscanlimits() is met */
908 if (scan->rs_numblocks != InvalidBlockNumber)
909 {
910 if (--scan->rs_numblocks == 0)
911 return InvalidBlockNumber;
912 }
913
914 return block;
915 }
916 else
917 {
918 /* we're done if the last block is the start position */
919 if (block == scan->rs_startblock)
920 return InvalidBlockNumber;
921
922 /* check if the limit imposed by heap_setscanlimits() is met */
923 if (scan->rs_numblocks != InvalidBlockNumber)
924 {
925 if (--scan->rs_numblocks == 0)
926 return InvalidBlockNumber;
927 }
928
929 /* wrap to the end of the heap when the last page was page 0 */
930 if (block == 0)
931 block = scan->rs_nblocks;
932
933 block--;
934
935 return block;
936 }
937}

References Assert, fb(), InvalidBlockNumber, likely, HeapScanDescData::rs_base, TableScanDescData::rs_flags, HeapScanDescData::rs_nblocks, HeapScanDescData::rs_numblocks, TableScanDescData::rs_parallel, TableScanDescData::rs_rd, HeapScanDescData::rs_startblock, ScanDirectionIsForward, SO_ALLOW_SYNC, and ss_report_location().

Referenced by heap_scan_stream_read_next_serial().

◆ heapgettup_continue_page()

static Page heapgettup_continue_page ( HeapScanDesc  scan,
ScanDirection  dir,
int linesleft,
OffsetNumber lineoff 
)
inlinestatic

Definition at line 830 of file heapam.c.

832{
 /*
  * Purpose: when resuming a scan on the page we already have pinned,
  * compute the starting line offset (*lineoff) and the number of line
  * pointers left to examine (*linesleft), and return the page.
  *
  * NOTE(review): the buffer-validity Assert, the forward-direction
  * lineoff computation (OffsetNumberNext of rs_coffset), and the
  * backward clamp (Min with PageGetMaxOffsetNumber, then
  * OffsetNumberPrev) are elided in this rendering -- see References.
  */
833 Page page;
834
835 Assert(scan->rs_inited);
837
838 /* Caller is responsible for ensuring buffer is locked if needed */
839 page = BufferGetPage(scan->rs_cbuf);
840
841 if (ScanDirectionIsForward(dir))
842 {
 /* remaining entries from *lineoff through the page's last offset */
844 *linesleft = PageGetMaxOffsetNumber(page) - (*lineoff) + 1;
845 }
846 else
847 {
848 /*
849 * The previous returned tuple may have been vacuumed since the
850 * previous scan when we use a non-MVCC snapshot, so we must
851 * re-establish the lineoff <= PageGetMaxOffsetNumber(page) invariant
852 */
854 *linesleft = *lineoff;
855 }
856
857 /* lineoff now references the physically previous or next tid */
858 return page;
859}

References Assert, BufferGetPage(), BufferIsValid(), fb(), Min, OffsetNumberNext, OffsetNumberPrev, PageGetMaxOffsetNumber(), HeapScanDescData::rs_cbuf, HeapScanDescData::rs_coffset, HeapScanDescData::rs_inited, and ScanDirectionIsForward.

Referenced by heapgettup().

◆ heapgettup_initial_block()

static pg_noinline BlockNumber heapgettup_initial_block ( HeapScanDesc  scan,
ScanDirection  dir 
)
static

Definition at line 752 of file heapam.c.

753{
 /*
  * Purpose: pick the first block for a serial (non-parallel) heap scan,
  * or InvalidBlockNumber when there is nothing to scan.  Forward scans
  * begin at rs_startblock (possibly chosen by syncscan); backward scans
  * begin at the last block the scan would visit.
  */
754 Assert(!scan->rs_inited);
755 Assert(scan->rs_base.rs_parallel == NULL);
756
757 /* When there are no pages to scan, return InvalidBlockNumber */
758 if (scan->rs_nblocks == 0 || scan->rs_numblocks == 0)
759 return InvalidBlockNumber;
760
761 if (ScanDirectionIsForward(dir))
762 {
763 return scan->rs_startblock;
764 }
765 else
766 {
767 /*
768 * Disable reporting to syncscan logic in a backwards scan; it's not
769 * very likely anyone else is doing the same thing at the same time,
770 * and much more likely that we'll just bollix things for forward
771 * scanners.
772 */
 /* (flag-clearing statement elided in this rendering) */
774
775 /*
776 * Start from last page of the scan. Ensure we take into account
777 * rs_numblocks if it's been adjusted by heap_setscanlimits().
778 */
779 if (scan->rs_numblocks != InvalidBlockNumber)
780 return (scan->rs_startblock + scan->rs_numblocks - 1) % scan->rs_nblocks;
781
 /* No limit: last block is the one just before the start, wrapping. */
782 if (scan->rs_startblock > 0)
783 return scan->rs_startblock - 1;
784
785 return scan->rs_nblocks - 1;
786 }
787}

References Assert, fb(), InvalidBlockNumber, HeapScanDescData::rs_base, TableScanDescData::rs_flags, HeapScanDescData::rs_inited, HeapScanDescData::rs_nblocks, HeapScanDescData::rs_numblocks, TableScanDescData::rs_parallel, HeapScanDescData::rs_startblock, and ScanDirectionIsForward.

Referenced by heap_scan_stream_read_next_serial().

◆ heapgettup_pagemode()

static void heapgettup_pagemode ( HeapScanDesc  scan,
ScanDirection  dir,
int  nkeys,
ScanKey  key 
)
static

Definition at line 1070 of file heapam.c.

1074{
 /*
  * Purpose: pagemode variant of heapgettup().  Visibility was already
  * determined for the whole page (see heap_prepare_pagescan in the
  * References), so this walks the precomputed array of visible tuple
  * offsets (rs_ntuples entries) instead of re-checking each tuple,
  * applying only the optional scan keys.
  *
  * NOTE(review): declarations of lineindex/linesleft, the
  * "continue_page:" label, and several offset-array accesses are
  * elided in this rendering.
  */
1075 HeapTuple tuple = &(scan->rs_ctup);
1076 Page page;
1079
1080 if (likely(scan->rs_inited))
1081 {
1082 /* continue from previously returned page/tuple */
1083 page = BufferGetPage(scan->rs_cbuf);
1084
 /* rs_cindex is the index of the previously returned visible tuple */
1085 lineindex = scan->rs_cindex + dir;
1086 if (ScanDirectionIsForward(dir))
1087 linesleft = scan->rs_ntuples - lineindex;
1088 else
1089 linesleft = scan->rs_cindex;
1090 /* lineindex now references the next or previous visible tid */
1091
1092 goto continue_page;
1093 }
1094
1095 /*
1096 * advance the scan until we find a qualifying tuple or run out of stuff
1097 * to scan
1098 */
1099 while (true)
1100 {
1101 heap_fetch_next_buffer(scan, dir);
1102
1103 /* did we run out of blocks to scan? */
1104 if (!BufferIsValid(scan->rs_cbuf))
1105 break;
1106
1108
1109 /* prune the page and determine visible tuple offsets */
1111 page = BufferGetPage(scan->rs_cbuf);
1112 linesleft = scan->rs_ntuples;
1114
1115 /* block is the same for all tuples, set it once outside the loop */
1117
1118 /* lineindex now references the next or previous visible tid */
1120
 /* iterate over the precomputed visible-offset array in either direction */
1121 for (; linesleft > 0; linesleft--, lineindex += dir)
1122 {
1123 ItemId lpp;
1125
1126 Assert(lineindex < scan->rs_ntuples);
1128 lpp = PageGetItemId(page, lineoff);
1130
1131 tuple->t_data = (HeapTupleHeader) PageGetItem(page, lpp);
1132 tuple->t_len = ItemIdGetLength(lpp);
1134
1135 /* skip any tuples that don't match the scan key */
1136 if (key != NULL &&
1138 nkeys, key))
1139 continue;
1140
 /* Found a match: remember array position so the next call resumes. */
1141 scan->rs_cindex = lineindex;
1142 return;
1143 }
1144 }
1145
1146 /* end of scan */
1147 if (BufferIsValid(scan->rs_cbuf))
1148 ReleaseBuffer(scan->rs_cbuf);
 /* Reset scan state so a rescan starts fresh. */
1149 scan->rs_cbuf = InvalidBuffer;
1152 tuple->t_data = NULL;
1153 scan->rs_inited = false;
1154}

References Assert, BufferGetBlockNumber(), BufferGetPage(), BufferIsValid(), fb(), heap_fetch_next_buffer(), heap_prepare_pagescan(), HeapKeyTest(), InvalidBlockNumber, InvalidBuffer, ItemIdGetLength, ItemIdIsNormal, ItemPointerSetBlockNumber(), ItemPointerSetOffsetNumber(), likely, PageGetItem(), PageGetItemId(), RelationGetDescr, ReleaseBuffer(), HeapScanDescData::rs_cblock, HeapScanDescData::rs_cbuf, HeapScanDescData::rs_cindex, HeapScanDescData::rs_ctup, HeapScanDescData::rs_inited, HeapScanDescData::rs_ntuples, HeapScanDescData::rs_prefetch_block, TableScanDescData::rs_rd, ScanDirectionIsForward, HeapTupleData::t_data, HeapTupleData::t_len, and HeapTupleData::t_self.

Referenced by heap_getnext(), heap_getnextslot(), and heap_getnextslot_tidrange().

◆ heapgettup_start_page()

static Page heapgettup_start_page ( HeapScanDesc  scan,
ScanDirection  dir,
int linesleft,
OffsetNumber lineoff 
)
static

Definition at line 799 of file heapam.c.

801{
 /*
  * Purpose: set up iteration state for a freshly fetched page of a
  * non-pagemode scan: *linesleft gets the page's line-pointer count and
  * *lineoff the first offset to examine for the given direction.
  *
  * NOTE(review): the buffer Assert, the *linesleft assignment, and the
  * direction-dependent *lineoff assignments (FirstOffsetNumber vs. the
  * page's max offset, per References) are elided in this rendering.
  */
802 Page page;
803
804 Assert(scan->rs_inited);
806
807 /* Caller is responsible for ensuring buffer is locked if needed */
808 page = BufferGetPage(scan->rs_cbuf);
809
811
812 if (ScanDirectionIsForward(dir))
814 else
816
817 /* lineoff now references the physically previous or next tid */
818 return page;
819}

References Assert, BufferGetPage(), BufferIsValid(), fb(), FirstOffsetNumber, PageGetMaxOffsetNumber(), HeapScanDescData::rs_cbuf, HeapScanDescData::rs_inited, and ScanDirectionIsForward.

Referenced by heapgettup().

◆ HeapTupleGetUpdateXid()

◆ HeapTupleHeaderAdvanceConflictHorizon()

void HeapTupleHeaderAdvanceConflictHorizon ( HeapTupleHeader  tuple,
TransactionId snapshotConflictHorizon 
)

Definition at line 8060 of file heapam.c.

8062{
 /*
  * Purpose: push *snapshotConflictHorizon forward to cover this tuple's
  * removal, for recovery-conflict purposes: consider the xvac of a
  * HEAP_MOVED tuple, and the xmax of a tuple whose inserting
  * transaction committed.
  *
  * NOTE(review): the local declarations/initializations of xmin, xmax
  * and xvac (from the tuple header accessors named in the References
  * list) are elided in this rendering.
  */
8066
8067 if (tuple->t_infomask & HEAP_MOVED)
8068 {
 /* old-style VACUUM FULL move: its xvac may constrain the horizon */
8069 if (TransactionIdPrecedes(*snapshotConflictHorizon, xvac))
8070 *snapshotConflictHorizon = xvac;
8071 }
8072
8073 /*
8074 * Ignore tuples inserted by an aborted transaction or if the tuple was
8075 * updated/deleted by the inserting transaction.
8076 *
8077 * Look for a committed hint bit, or if no xmin bit is set, check clog.
8078 */
8079 if (HeapTupleHeaderXminCommitted(tuple) ||
8081 {
 /* xmax == xmin means the inserter also deleted it: no conflict needed */
8082 if (xmax != xmin &&
8083 TransactionIdFollows(xmax, *snapshotConflictHorizon))
8084 *snapshotConflictHorizon = xmax;
8085 }
8086}

References fb(), HEAP_MOVED, HeapTupleHeaderGetUpdateXid(), HeapTupleHeaderGetXmin(), HeapTupleHeaderGetXvac(), HeapTupleHeaderXminCommitted(), HeapTupleHeaderXminInvalid(), HeapTupleHeaderData::t_infomask, TransactionIdDidCommit(), TransactionIdFollows(), and TransactionIdPrecedes().

Referenced by heap_index_delete_tuples(), heap_prune_chain(), and prune_freeze_plan().

◆ index_delete_check_htid()

static void index_delete_check_htid ( TM_IndexDeleteOp delstate,
Page  page,
OffsetNumber  maxoff,
const ItemPointerData htid,
TM_IndexStatus istatus 
)
inlinestatic

Definition at line 8145 of file heapam.c.

8148{
8150 ItemId iid;
8151
8152 Assert(OffsetNumberIsValid(istatus->idxoffnum));
8153
8154 if (unlikely(indexpagehoffnum > maxoff))
8155 ereport(ERROR,
8157 errmsg_internal("heap tid from index tuple (%u,%u) points past end of heap page line pointer array at offset %u of block %u in index \"%s\"",
8160 istatus->idxoffnum, delstate->iblknum,
8162
8164 if (unlikely(!ItemIdIsUsed(iid)))
8165 ereport(ERROR,
8167 errmsg_internal("heap tid from index tuple (%u,%u) points to unused heap page item at offset %u of block %u in index \"%s\"",
8170 istatus->idxoffnum, delstate->iblknum,
8172
8173 if (ItemIdHasStorage(iid))
8174 {
8175 HeapTupleHeader htup;
8176
8178 htup = (HeapTupleHeader) PageGetItem(page, iid);
8179
8181 ereport(ERROR,
8183 errmsg_internal("heap tid from index tuple (%u,%u) points to heap-only tuple at offset %u of block %u in index \"%s\"",
8186 istatus->idxoffnum, delstate->iblknum,
8188 }
8189}

References Assert, ereport, errcode(), errmsg_internal(), ERROR, fb(), HeapTupleHeaderIsHeapOnly(), ItemIdHasStorage, ItemIdIsNormal, ItemIdIsUsed, ItemPointerGetBlockNumber(), ItemPointerGetOffsetNumber(), OffsetNumberIsValid, PageGetItem(), PageGetItemId(), RelationGetRelationName, and unlikely.

Referenced by heap_index_delete_tuples().

◆ index_delete_sort()

static void index_delete_sort ( TM_IndexDeleteOp delstate)
static

Definition at line 8550 of file heapam.c.

8551{
8552 TM_IndexDelete *deltids = delstate->deltids;
8553 int ndeltids = delstate->ndeltids;
8554
8555 /*
8556 * Shellsort gap sequence (taken from Sedgewick-Incerpi paper).
8557 *
8558 * This implementation is fast with array sizes up to ~4500. This covers
8559 * all supported BLCKSZ values.
8560 */
8561 const int gaps[9] = {1968, 861, 336, 112, 48, 21, 7, 3, 1};
8562
8563 /* Think carefully before changing anything here -- keep swaps cheap */
8564 StaticAssertDecl(sizeof(TM_IndexDelete) <= 8,
8565 "element size exceeds 8 bytes");
8566
8567 for (int g = 0; g < lengthof(gaps); g++)
8568 {
8569 for (int hi = gaps[g], i = hi; i < ndeltids; i++)
8570 {
8571 TM_IndexDelete d = deltids[i];
8572 int j = i;
8573
8574 while (j >= hi && index_delete_sort_cmp(&deltids[j - hi], &d) >= 0)
8575 {
8576 deltids[j] = deltids[j - hi];
8577 j -= hi;
8578 }
8579 deltids[j] = d;
8580 }
8581 }
8582}

References fb(), i, index_delete_sort_cmp(), j, lengthof, and StaticAssertDecl.

Referenced by heap_index_delete_tuples().

◆ index_delete_sort_cmp()

static int index_delete_sort_cmp ( TM_IndexDelete deltid1,
TM_IndexDelete deltid2 
)
inlinestatic

Definition at line 8514 of file heapam.c.

8515{
8516 ItemPointer tid1 = &deltid1->tid;
8517 ItemPointer tid2 = &deltid2->tid;
8518
8519 {
8522
8523 if (blk1 != blk2)
8524 return (blk1 < blk2) ? -1 : 1;
8525 }
8526 {
8529
8530 if (pos1 != pos2)
8531 return (pos1 < pos2) ? -1 : 1;
8532 }
8533
8534 Assert(false);
8535
8536 return 0;
8537}

References Assert, fb(), ItemPointerGetBlockNumber(), and ItemPointerGetOffsetNumber().

Referenced by index_delete_sort().

◆ initscan()

static void initscan ( HeapScanDesc  scan,
ScanKey  key,
bool  keep_startblock 
)
static

Definition at line 357 of file heapam.c.

358{
360 bool allow_strat;
361 bool allow_sync;
362
363 /*
364 * Determine the number of blocks we have to scan.
365 *
366 * It is sufficient to do this once at scan start, since any tuples added
367 * while the scan is in progress will be invisible to my snapshot anyway.
368 * (That is not true when using a non-MVCC snapshot. However, we couldn't
369 * guarantee to return tuples added after scan start anyway, since they
370 * might go into pages we already scanned. To guarantee consistent
371 * results for a non-MVCC snapshot, the caller must hold some higher-level
372 * lock that ensures the interesting tuple(s) won't change.)
373 */
374 if (scan->rs_base.rs_parallel != NULL)
375 {
377 scan->rs_nblocks = bpscan->phs_nblocks;
378 }
379 else
381
382 /*
383 * If the table is large relative to NBuffers, use a bulk-read access
384 * strategy and enable synchronized scanning (see syncscan.c). Although
385 * the thresholds for these features could be different, we make them the
386 * same so that there are only two behaviors to tune rather than four.
387 * (However, some callers need to be able to disable one or both of these
388 * behaviors, independently of the size of the table; also there is a GUC
389 * variable that can disable synchronized scanning.)
390 *
391 * Note that table_block_parallelscan_initialize has a very similar test;
392 * if you change this, consider changing that one, too.
393 */
395 scan->rs_nblocks > NBuffers / 4)
396 {
398 allow_sync = (scan->rs_base.rs_flags & SO_ALLOW_SYNC) != 0;
399 }
400 else
401 allow_strat = allow_sync = false;
402
403 if (allow_strat)
404 {
405 /* During a rescan, keep the previous strategy object. */
406 if (scan->rs_strategy == NULL)
408 }
409 else
410 {
411 if (scan->rs_strategy != NULL)
413 scan->rs_strategy = NULL;
414 }
415
416 if (scan->rs_base.rs_parallel != NULL)
417 {
418 /* For parallel scan, believe whatever ParallelTableScanDesc says. */
421 else
423
424 /*
425 * If not rescanning, initialize the startblock. Finding the actual
426 * start location is done in table_block_parallelscan_startblock_init,
427 * based on whether an alternative start location has been set with
428 * heap_setscanlimits, or using the syncscan location, when syncscan
429 * is enabled.
430 */
431 if (!keep_startblock)
433 }
434 else
435 {
436 if (keep_startblock)
437 {
438 /*
439 * When rescanning, we want to keep the previous startblock
440 * setting, so that rewinding a cursor doesn't generate surprising
441 * results. Reset the active syncscan setting, though.
442 */
445 else
447 }
449 {
452 }
453 else
454 {
456 scan->rs_startblock = 0;
457 }
458 }
459
461 scan->rs_inited = false;
462 scan->rs_ctup.t_data = NULL;
464 scan->rs_cbuf = InvalidBuffer;
466 scan->rs_ntuples = 0;
467 scan->rs_cindex = 0;
468
469 /*
470 * Initialize to ForwardScanDirection because it is most common and
471 * because heap scans go forward before going backward (e.g. CURSORs).
472 */
475
476 /* page-at-a-time fields are always invalid when not rs_inited */
477
478 /*
479 * copy the scan key, if appropriate
480 */
481 if (key != NULL && scan->rs_base.rs_nkeys > 0)
482 memcpy(scan->rs_base.rs_key, key, scan->rs_base.rs_nkeys * sizeof(ScanKeyData));
483
484 /*
485 * Currently, we only have a stats counter for sequential heap scans (but
486 * e.g for bitmap scans the underlying bitmap index scans will be counted,
487 * and for sample scans we update stats for tuple fetches).
488 */
489 if (scan->rs_base.rs_flags & SO_TYPE_SEQSCAN)
491}

References BAS_BULKREAD, fb(), ForwardScanDirection, FreeAccessStrategy(), GetAccessStrategy(), InvalidBlockNumber, InvalidBuffer, ItemPointerSetInvalid(), NBuffers, pgstat_count_heap_scan, ParallelTableScanDescData::phs_syncscan, RelationGetNumberOfBlocks, RelationUsesLocalBuffers, HeapScanDescData::rs_base, HeapScanDescData::rs_cblock, HeapScanDescData::rs_cbuf, HeapScanDescData::rs_cindex, HeapScanDescData::rs_ctup, HeapScanDescData::rs_dir, TableScanDescData::rs_flags, HeapScanDescData::rs_inited, TableScanDescData::rs_key, HeapScanDescData::rs_nblocks, TableScanDescData::rs_nkeys, HeapScanDescData::rs_ntuples, HeapScanDescData::rs_numblocks, TableScanDescData::rs_parallel, HeapScanDescData::rs_prefetch_block, TableScanDescData::rs_rd, HeapScanDescData::rs_startblock, HeapScanDescData::rs_strategy, SO_ALLOW_STRAT, SO_ALLOW_SYNC, SO_TYPE_SEQSCAN, ss_get_location(), synchronize_seqscans, HeapTupleData::t_data, and HeapTupleData::t_self.

Referenced by heap_beginscan(), and heap_rescan().

◆ log_heap_new_cid()

static XLogRecPtr log_heap_new_cid ( Relation  relation,
HeapTuple  tup 
)
static

Definition at line 9147 of file heapam.c.

9148{
9150
9152 HeapTupleHeader hdr = tup->t_data;
9153
9154 Assert(ItemPointerIsValid(&tup->t_self));
9155 Assert(tup->t_tableOid != InvalidOid);
9156
9157 xlrec.top_xid = GetTopTransactionId();
9158 xlrec.target_locator = relation->rd_locator;
9159 xlrec.target_tid = tup->t_self;
9160
9161 /*
9162 * If the tuple got inserted & deleted in the same TX we definitely have a
9163 * combo CID, set cmin and cmax.
9164 */
9165 if (hdr->t_infomask & HEAP_COMBOCID)
9166 {
9169 xlrec.cmin = HeapTupleHeaderGetCmin(hdr);
9170 xlrec.cmax = HeapTupleHeaderGetCmax(hdr);
9171 xlrec.combocid = HeapTupleHeaderGetRawCommandId(hdr);
9172 }
9173 /* No combo CID, so only cmin or cmax can be set by this TX */
9174 else
9175 {
9176 /*
9177 * Tuple inserted.
9178 *
9179 * We need to check for LOCK ONLY because multixacts might be
9180 * transferred to the new tuple in case of FOR KEY SHARE updates in
9181 * which case there will be an xmax, although the tuple just got
9182 * inserted.
9183 */
9184 if (hdr->t_infomask & HEAP_XMAX_INVALID ||
9186 {
9188 xlrec.cmax = InvalidCommandId;
9189 }
9190 /* Tuple from a different tx updated or deleted. */
9191 else
9192 {
9193 xlrec.cmin = InvalidCommandId;
9195 }
9196 xlrec.combocid = InvalidCommandId;
9197 }
9198
9199 /*
9200 * Note that we don't need to register the buffer here, because this
9201 * operation does not modify the page. The insert/update/delete that
9202 * called us certainly did, but that's WAL-logged separately.
9203 */
9206
9207 /* will be looked at irrespective of origin */
9208
9210
9211 return recptr;
9212}

References Assert, fb(), GetTopTransactionId(), HEAP_COMBOCID, HEAP_XMAX_INVALID, HEAP_XMAX_IS_LOCKED_ONLY(), HeapTupleHeaderGetCmax(), HeapTupleHeaderGetCmin(), HeapTupleHeaderGetRawCommandId(), HeapTupleHeaderXminInvalid(), InvalidCommandId, InvalidOid, ItemPointerIsValid(), RelationData::rd_locator, SizeOfHeapNewCid, HeapTupleHeaderData::t_infomask, XLOG_HEAP2_NEW_CID, XLogBeginInsert(), XLogInsert(), and XLogRegisterData().

Referenced by heap_delete(), heap_insert(), heap_multi_insert(), and heap_update().

◆ log_heap_update()

static XLogRecPtr log_heap_update ( Relation  reln,
Buffer  oldbuf,
Buffer  newbuf,
HeapTuple  oldtup,
HeapTuple  newtup,
HeapTuple  old_key_tuple,
bool  all_visible_cleared,
bool  new_all_visible_cleared 
)
static

Definition at line 8925 of file heapam.c.

8929{
8933 uint8 info;
8935 uint16 prefixlen = 0,
8936 suffixlen = 0;
8938 Page page = BufferGetPage(newbuf);
8940 bool init;
8941 int bufflags;
8942
8943 /* Caller should not call me on a non-WAL-logged relation */
8945
8947
8949 info = XLOG_HEAP_HOT_UPDATE;
8950 else
8951 info = XLOG_HEAP_UPDATE;
8952
8953 /*
8954 * If the old and new tuple are on the same page, we only need to log the
8955 * parts of the new tuple that were changed. That saves on the amount of
8956 * WAL we need to write. Currently, we just count any unchanged bytes in
8957 * the beginning and end of the tuple. That's quick to check, and
8958 * perfectly covers the common case that only one field is updated.
8959 *
8960 * We could do this even if the old and new tuple are on different pages,
8961 * but only if we don't make a full-page image of the old page, which is
8962 * difficult to know in advance. Also, if the old tuple is corrupt for
 8963 * some reason, it would allow the corruption to propagate to the new page,
8964 * so it seems best to avoid. Under the general assumption that most
8965 * updates tend to create the new tuple version on the same page, there
8966 * isn't much to be gained by doing this across pages anyway.
8967 *
8968 * Skip this if we're taking a full-page image of the new page, as we
8969 * don't include the new tuple in the WAL record in that case. Also
8970 * disable if effective_wal_level='logical', as logical decoding needs to
8971 * be able to read the new tuple in whole from the WAL record alone.
8972 */
8973 if (oldbuf == newbuf && !need_tuple_data &&
8975 {
8976 char *oldp = (char *) oldtup->t_data + oldtup->t_data->t_hoff;
8977 char *newp = (char *) newtup->t_data + newtup->t_data->t_hoff;
8978 int oldlen = oldtup->t_len - oldtup->t_data->t_hoff;
8979 int newlen = newtup->t_len - newtup->t_data->t_hoff;
8980
8981 /* Check for common prefix between old and new tuple */
8982 for (prefixlen = 0; prefixlen < Min(oldlen, newlen); prefixlen++)
8983 {
8984 if (newp[prefixlen] != oldp[prefixlen])
8985 break;
8986 }
8987
8988 /*
8989 * Storing the length of the prefix takes 2 bytes, so we need to save
8990 * at least 3 bytes or there's no point.
8991 */
8992 if (prefixlen < 3)
8993 prefixlen = 0;
8994
8995 /* Same for suffix */
8997 {
8998 if (newp[newlen - suffixlen - 1] != oldp[oldlen - suffixlen - 1])
8999 break;
9000 }
9001 if (suffixlen < 3)
9002 suffixlen = 0;
9003 }
9004
9005 /* Prepare main WAL data chain */
9006 xlrec.flags = 0;
9011 if (prefixlen > 0)
9013 if (suffixlen > 0)
9015 if (need_tuple_data)
9016 {
9018 if (old_key_tuple)
9019 {
9020 if (reln->rd_rel->relreplident == REPLICA_IDENTITY_FULL)
9022 else
9024 }
9025 }
9026
9027 /* If new tuple is the single and first tuple on page... */
9030 {
9031 info |= XLOG_HEAP_INIT_PAGE;
9032 init = true;
9033 }
9034 else
9035 init = false;
9036
9037 /* Prepare WAL data for the old page */
9038 xlrec.old_offnum = ItemPointerGetOffsetNumber(&oldtup->t_self);
9039 xlrec.old_xmax = HeapTupleHeaderGetRawXmax(oldtup->t_data);
9040 xlrec.old_infobits_set = compute_infobits(oldtup->t_data->t_infomask,
9041 oldtup->t_data->t_infomask2);
9042
9043 /* Prepare WAL data for the new page */
9044 xlrec.new_offnum = ItemPointerGetOffsetNumber(&newtup->t_self);
9045 xlrec.new_xmax = HeapTupleHeaderGetRawXmax(newtup->t_data);
9046
9048 if (init)
9050 if (need_tuple_data)
9052
9054 if (oldbuf != newbuf)
9056
9058
9059 /*
9060 * Prepare WAL data for the new tuple.
9061 */
9062 if (prefixlen > 0 || suffixlen > 0)
9063 {
9064 if (prefixlen > 0 && suffixlen > 0)
9065 {
9068 XLogRegisterBufData(0, &prefix_suffix, sizeof(uint16) * 2);
9069 }
9070 else if (prefixlen > 0)
9071 {
9072 XLogRegisterBufData(0, &prefixlen, sizeof(uint16));
9073 }
9074 else
9075 {
9076 XLogRegisterBufData(0, &suffixlen, sizeof(uint16));
9077 }
9078 }
9079
9080 xlhdr.t_infomask2 = newtup->t_data->t_infomask2;
9081 xlhdr.t_infomask = newtup->t_data->t_infomask;
9082 xlhdr.t_hoff = newtup->t_data->t_hoff;
9084
9085 /*
9086 * PG73FORMAT: write bitmap [+ padding] [+ oid] + data
9087 *
9088 * The 'data' doesn't include the common prefix or suffix.
9089 */
9091 if (prefixlen == 0)
9092 {
9094 (char *) newtup->t_data + SizeofHeapTupleHeader,
9096 }
9097 else
9098 {
9099 /*
9100 * Have to write the null bitmap and data after the common prefix as
9101 * two separate rdata entries.
9102 */
9103 /* bitmap [+ padding] [+ oid] */
9104 if (newtup->t_data->t_hoff - SizeofHeapTupleHeader > 0)
9105 {
9107 (char *) newtup->t_data + SizeofHeapTupleHeader,
9108 newtup->t_data->t_hoff - SizeofHeapTupleHeader);
9109 }
9110
9111 /* data after common prefix */
9113 (char *) newtup->t_data + newtup->t_data->t_hoff + prefixlen,
9114 newtup->t_len - newtup->t_data->t_hoff - prefixlen - suffixlen);
9115 }
9116
9117 /* We need to log a tuple identity */
9119 {
 9120 /* don't really need this, but it's more convenient to decode */
9121 xlhdr_idx.t_infomask2 = old_key_tuple->t_data->t_infomask2;
9122 xlhdr_idx.t_infomask = old_key_tuple->t_data->t_infomask;
9123 xlhdr_idx.t_hoff = old_key_tuple->t_data->t_hoff;
9124
9126
9127 /* PG73FORMAT: write bitmap [+ padding] [+ oid] + data */
9130 }
9131
9132 /* filtering by origin on a row level is much more efficient */
9134
9135 recptr = XLogInsert(RM_HEAP_ID, info);
9136
9137 return recptr;
9138}

References Assert, BufferGetPage(), compute_infobits(), fb(), FirstOffsetNumber, HeapTupleHeaderGetRawXmax(), HeapTupleIsHeapOnly(), init, ItemPointerGetOffsetNumber(), Min, PageGetMaxOffsetNumber(), REGBUF_KEEP_DATA, REGBUF_STANDARD, REGBUF_WILL_INIT, RelationIsLogicallyLogged, RelationNeedsWAL, SizeOfHeapHeader, SizeofHeapTupleHeader, SizeOfHeapUpdate, XLH_UPDATE_CONTAINS_NEW_TUPLE, XLH_UPDATE_CONTAINS_OLD_KEY, XLH_UPDATE_CONTAINS_OLD_TUPLE, XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED, XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED, XLH_UPDATE_PREFIX_FROM_OLD, XLH_UPDATE_SUFFIX_FROM_OLD, XLOG_HEAP_HOT_UPDATE, XLOG_HEAP_INIT_PAGE, XLOG_HEAP_UPDATE, XLOG_INCLUDE_ORIGIN, XLogBeginInsert(), XLogCheckBufferNeedsBackup(), XLogInsert(), XLogRegisterBufData(), XLogRegisterBuffer(), XLogRegisterData(), and XLogSetRecordFlags().

Referenced by heap_update().

◆ log_heap_visible()

XLogRecPtr log_heap_visible ( Relation  rel,
Buffer  heap_buffer,
Buffer  vm_buffer,
TransactionId  snapshotConflictHorizon,
uint8  vmflags 
)

◆ MultiXactIdGetUpdateXid()

static TransactionId MultiXactIdGetUpdateXid ( TransactionId  xmax,
uint16  t_infomask 
)
static

Definition at line 7614 of file heapam.c.

7615{
7617 MultiXactMember *members;
7618 int nmembers;
7619
7620 Assert(!(t_infomask & HEAP_XMAX_LOCK_ONLY));
7621 Assert(t_infomask & HEAP_XMAX_IS_MULTI);
7622
7623 /*
7624 * Since we know the LOCK_ONLY bit is not set, this cannot be a multi from
7625 * pre-pg_upgrade.
7626 */
7627 nmembers = GetMultiXactIdMembers(xmax, &members, false, false);
7628
7629 if (nmembers > 0)
7630 {
7631 int i;
7632
7633 for (i = 0; i < nmembers; i++)
7634 {
7635 /* Ignore lockers */
7636 if (!ISUPDATE_from_mxstatus(members[i].status))
7637 continue;
7638
7639 /* there can be at most one updater */
7641 update_xact = members[i].xid;
7642#ifndef USE_ASSERT_CHECKING
7643
7644 /*
7645 * in an assert-enabled build, walk the whole array to ensure
7646 * there's no other updater.
7647 */
7648 break;
7649#endif
7650 }
7651
7652 pfree(members);
7653 }
7654
7655 return update_xact;
7656}

References Assert, fb(), GetMultiXactIdMembers(), HEAP_XMAX_IS_MULTI, HEAP_XMAX_LOCK_ONLY, i, InvalidTransactionId, ISUPDATE_from_mxstatus, pfree(), and MultiXactMember::xid.

Referenced by compute_new_xmax_infomask(), FreezeMultiXactId(), heap_lock_updated_tuple(), and HeapTupleGetUpdateXid().

◆ MultiXactIdWait()

static void MultiXactIdWait ( MultiXactId  multi,
MultiXactStatus  status,
uint16  infomask,
Relation  rel,
const ItemPointerData ctid,
XLTW_Oper  oper,
int remaining 
)
static

Definition at line 7860 of file heapam.c.

7863{
7864 (void) Do_MultiXactIdWait(multi, status, infomask, false,
7865 rel, ctid, oper, remaining, false);
7866}

References Do_MultiXactIdWait(), fb(), oper(), and remaining.

Referenced by heap_delete(), heap_inplace_lock(), heap_lock_tuple(), and heap_update().

◆ page_collect_tuples()

static pg_attribute_always_inline int page_collect_tuples ( HeapScanDesc  scan,
Snapshot  snapshot,
Page  page,
Buffer  buffer,
BlockNumber  block,
int  lines,
bool  all_visible,
bool  check_serializable 
)
static

Definition at line 522 of file heapam.c.

526{
527 Oid relid = RelationGetRelid(scan->rs_base.rs_rd);
528 int ntup = 0;
529 int nvis = 0;
531
532 /* page at a time should have been disabled otherwise */
533 Assert(IsMVCCSnapshot(snapshot));
534
535 /* first find all tuples on the page */
537 {
540
542 continue;
543
544 /*
545 * If the page is not all-visible or we need to check serializability,
546 * maintain enough state to be able to refind the tuple efficiently,
547 * without again first needing to fetch the item and then via that the
548 * tuple.
549 */
550 if (!all_visible || check_serializable)
551 {
552 tup = &batchmvcc.tuples[ntup];
553
555 tup->t_len = ItemIdGetLength(lpp);
556 tup->t_tableOid = relid;
557 ItemPointerSet(&(tup->t_self), block, lineoff);
558 }
559
560 /*
561 * If the page is all visible, these fields otherwise won't be
562 * populated in loop below.
563 */
564 if (all_visible)
565 {
567 {
568 batchmvcc.visible[ntup] = true;
569 }
570 scan->rs_vistuples[ntup] = lineoff;
571 }
572
573 ntup++;
574 }
575
577
578 /*
579 * Unless the page is all visible, test visibility for all tuples one go.
580 * That is considerably more efficient than calling
581 * HeapTupleSatisfiesMVCC() one-by-one.
582 */
583 if (all_visible)
584 nvis = ntup;
585 else
586 nvis = HeapTupleSatisfiesMVCCBatch(snapshot, buffer,
587 ntup,
588 &batchmvcc,
589 scan->rs_vistuples);
590
591 /*
 592 * So far we don't have a batch API for testing serializability, so do so
593 * one-by-one.
594 */
596 {
597 for (int i = 0; i < ntup; i++)
598 {
600 scan->rs_base.rs_rd,
601 &batchmvcc.tuples[i],
602 buffer, snapshot);
603 }
604 }
605
606 return nvis;
607}

References Assert, fb(), FirstOffsetNumber, HeapCheckForSerializableConflictOut(), HeapTupleSatisfiesMVCCBatch(), i, IsMVCCSnapshot, ItemIdGetLength, ItemIdIsNormal, ItemPointerSet(), MaxHeapTuplesPerPage, PageGetItem(), PageGetItemId(), RelationGetRelid, HeapScanDescData::rs_base, TableScanDescData::rs_rd, HeapScanDescData::rs_vistuples, HeapTupleData::t_data, and unlikely.

Referenced by heap_prepare_pagescan().

◆ ReleaseBulkInsertStatePin()

void ReleaseBulkInsertStatePin ( BulkInsertState  bistate)

Definition at line 2104 of file heapam.c.

2105{
2106 if (bistate->current_buf != InvalidBuffer)
2107 ReleaseBuffer(bistate->current_buf);
2108 bistate->current_buf = InvalidBuffer;
2109
2110 /*
2111 * Despite the name, we also reset bulk relation extension state.
2112 * Otherwise we can end up erroring out due to looking for free space in
2113 * ->next_free of one partition, even though ->next_free was set when
2114 * extending another partition. It could obviously also be bad for
2115 * efficiency to look at existing blocks at offsets from another
2116 * partition, even if we don't error out.
2117 */
2118 bistate->next_free = InvalidBlockNumber;
2119 bistate->last_free = InvalidBlockNumber;
2120}

References BulkInsertStateData::current_buf, InvalidBlockNumber, InvalidBuffer, BulkInsertStateData::last_free, BulkInsertStateData::next_free, and ReleaseBuffer().

Referenced by CopyFrom().

◆ simple_heap_delete()

void simple_heap_delete ( Relation  relation,
const ItemPointerData tid 
)

Definition at line 3267 of file heapam.c.

3268{
3269 TM_Result result;
3270 TM_FailureData tmfd;
3271
3272 result = heap_delete(relation, tid,
3274 true /* wait for commit */ ,
3275 &tmfd, false /* changingPart */ );
3276 switch (result)
3277 {
3278 case TM_SelfModified:
3279 /* Tuple was already updated in current command? */
3280 elog(ERROR, "tuple already updated by self");
3281 break;
3282
3283 case TM_Ok:
3284 /* done successfully */
3285 break;
3286
3287 case TM_Updated:
3288 elog(ERROR, "tuple concurrently updated");
3289 break;
3290
3291 case TM_Deleted:
3292 elog(ERROR, "tuple concurrently deleted");
3293 break;
3294
3295 default:
3296 elog(ERROR, "unrecognized heap_delete status: %u", result);
3297 break;
3298 }
3299}

References elog, ERROR, GetCurrentCommandId(), heap_delete(), InvalidSnapshot, TM_Deleted, TM_Ok, TM_SelfModified, and TM_Updated.

Referenced by CatalogTupleDelete(), and toast_delete_datum().

◆ simple_heap_insert()

void simple_heap_insert ( Relation  relation,
HeapTuple  tup 
)

Definition at line 2786 of file heapam.c.

2787{
2788 heap_insert(relation, tup, GetCurrentCommandId(true), 0, NULL);
2789}

References fb(), GetCurrentCommandId(), and heap_insert().

Referenced by CatalogTupleInsert(), CatalogTupleInsertWithInfo(), and InsertOneTuple().

◆ simple_heap_update()

void simple_heap_update ( Relation  relation,
const ItemPointerData otid,
HeapTuple  tup,
TU_UpdateIndexes update_indexes 
)

Definition at line 4557 of file heapam.c.

4559{
4560 TM_Result result;
4561 TM_FailureData tmfd;
4562 LockTupleMode lockmode;
4563
4564 result = heap_update(relation, otid, tup,
4566 true /* wait for commit */ ,
4567 &tmfd, &lockmode, update_indexes);
4568 switch (result)
4569 {
4570 case TM_SelfModified:
4571 /* Tuple was already updated in current command? */
4572 elog(ERROR, "tuple already updated by self");
4573 break;
4574
4575 case TM_Ok:
4576 /* done successfully */
4577 break;
4578
4579 case TM_Updated:
4580 elog(ERROR, "tuple concurrently updated");
4581 break;
4582
4583 case TM_Deleted:
4584 elog(ERROR, "tuple concurrently deleted");
4585 break;
4586
4587 default:
4588 elog(ERROR, "unrecognized heap_update status: %u", result);
4589 break;
4590 }
4591}

References elog, ERROR, fb(), GetCurrentCommandId(), heap_update(), InvalidSnapshot, TM_Deleted, TM_Ok, TM_SelfModified, and TM_Updated.

Referenced by CatalogTupleUpdate(), and CatalogTupleUpdateWithInfo().

◆ test_lockmode_for_conflict()

static TM_Result test_lockmode_for_conflict ( MultiXactStatus  status,
TransactionId  xid,
LockTupleMode  mode,
HeapTuple  tup,
bool needwait 
)
static

Definition at line 5677 of file heapam.c.

5680{
5682
5683 *needwait = false;
5685
5686 /*
5687 * Note: we *must* check TransactionIdIsInProgress before
5688 * TransactionIdDidAbort/Commit; see comment at top of heapam_visibility.c
5689 * for an explanation.
5690 */
5692 {
5693 /*
5694 * The tuple has already been locked by our own transaction. This is
5695 * very rare but can happen if multiple transactions are trying to
5696 * lock an ancient version of the same tuple.
5697 */
5698 return TM_SelfModified;
5699 }
5700 else if (TransactionIdIsInProgress(xid))
5701 {
5702 /*
5703 * If the locking transaction is running, what we do depends on
5704 * whether the lock modes conflict: if they do, then we must wait for
5705 * it to finish; otherwise we can fall through to lock this tuple
5706 * version without waiting.
5707 */
5710 {
5711 *needwait = true;
5712 }
5713
5714 /*
5715 * If we set needwait above, then this value doesn't matter;
5716 * otherwise, this value signals to caller that it's okay to proceed.
5717 */
5718 return TM_Ok;
5719 }
5720 else if (TransactionIdDidAbort(xid))
5721 return TM_Ok;
5722 else if (TransactionIdDidCommit(xid))
5723 {
5724 /*
5725 * The other transaction committed. If it was only a locker, then the
5726 * lock is completely gone now and we can return success; but if it
5727 * was an update, then what we do depends on whether the two lock
5728 * modes conflict. If they conflict, then we must report error to
5729 * caller. But if they don't, we can fall through to allow the current
5730 * transaction to lock the tuple.
5731 *
5732 * Note: the reason we worry about ISUPDATE here is because as soon as
5733 * a transaction ends, all its locks are gone and meaningless, and
5734 * thus we can ignore them; whereas its updates persist. In the
5735 * TransactionIdIsInProgress case, above, we don't need to check
5736 * because we know the lock is still "alive" and thus a conflict needs
5737 * always be checked.
5738 */
5739 if (!ISUPDATE_from_mxstatus(status))
5740 return TM_Ok;
5741
5744 {
5745 /* bummer */
5746 if (!ItemPointerEquals(&tup->t_self, &tup->t_data->t_ctid))
5747 return TM_Updated;
5748 else
5749 return TM_Deleted;
5750 }
5751
5752 return TM_Ok;
5753 }
5754
5755 /* Not in progress, not aborted, not committed -- must have crashed */
5756 return TM_Ok;
5757}

References DoLockModesConflict(), fb(), get_mxact_status_for_lock(), ISUPDATE_from_mxstatus, ItemPointerEquals(), LOCKMODE_from_mxstatus, mode, TM_Deleted, TM_Ok, TM_SelfModified, TM_Updated, TransactionIdDidAbort(), TransactionIdDidCommit(), TransactionIdIsCurrentTransactionId(), and TransactionIdIsInProgress().

Referenced by heap_lock_updated_tuple_rec().

◆ UpdateXmaxHintBits()

◆ xmax_infomask_changed()

static bool xmax_infomask_changed ( uint16  new_infomask,
uint16  old_infomask 
)
inlinestatic

Definition at line 2821 of file heapam.c.

2822{
2823 const uint16 interesting =
2825
2826 if ((new_infomask & interesting) != (old_infomask & interesting))
2827 return true;
2828
2829 return false;
2830}

References fb(), HEAP_LOCK_MASK, HEAP_XMAX_IS_MULTI, and HEAP_XMAX_LOCK_ONLY.

Referenced by heap_delete(), heap_lock_tuple(), and heap_update().

Variable Documentation

◆ hwlock

LOCKMODE hwlock

Definition at line 128 of file heapam.c.

◆ lockstatus

int lockstatus

Definition at line 129 of file heapam.c.

◆ MultiXactStatusLock

const int MultiXactStatusLock[MaxMultiXactStatus+1]
static
Initial value:

Definition at line 207 of file heapam.c.

208{
209 LockTupleKeyShare, /* ForKeyShare */
210 LockTupleShare, /* ForShare */
211 LockTupleNoKeyExclusive, /* ForNoKeyUpdate */
212 LockTupleExclusive, /* ForUpdate */
213 LockTupleNoKeyExclusive, /* NoKeyUpdate */
214 LockTupleExclusive /* Update */
215};

◆ [struct]

const struct { ... } tupleLockExtraInfo[]
Initial value:
=
{
.hwlock = AccessShareLock,
.updstatus = -1
},
.hwlock = RowShareLock,
.lockstatus = MultiXactStatusForShare,
.updstatus = -1
},
.hwlock = ExclusiveLock,
},
.lockstatus = MultiXactStatusForUpdate,
.updstatus = MultiXactStatusUpdate
}
}
#define AccessExclusiveLock
Definition lockdefs.h:43
#define ExclusiveLock
Definition lockdefs.h:42
#define RowShareLock
Definition lockdefs.h:37

Referenced by DoesMultiXactIdConflict(), and get_mxact_status_for_lock().

◆ updstatus

int updstatus

Definition at line 130 of file heapam.c.