PostgreSQL Source Code git master
heapam.c File Reference
#include "postgres.h"
#include "access/heapam.h"
#include "access/heaptoast.h"
#include "access/hio.h"
#include "access/multixact.h"
#include "access/subtrans.h"
#include "access/syncscan.h"
#include "access/valid.h"
#include "access/visibilitymap.h"
#include "access/xloginsert.h"
#include "catalog/pg_database.h"
#include "catalog/pg_database_d.h"
#include "commands/vacuum.h"
#include "pgstat.h"
#include "port/pg_bitutils.h"
#include "storage/lmgr.h"
#include "storage/predicate.h"
#include "storage/procarray.h"
#include "utils/datum.h"
#include "utils/injection_point.h"
#include "utils/inval.h"
#include "utils/spccache.h"
#include "utils/syscache.h"
Go to the source code of this file.

Data Structures

struct  IndexDeleteCounts
 

Macros

#define LOCKMODE_from_mxstatus(status)    (tupleLockExtraInfo[TUPLOCK_from_mxstatus((status))].hwlock)
 
#define LockTupleTuplock(rel, tup, mode)    LockTuple((rel), (tup), tupleLockExtraInfo[mode].hwlock)
 
#define UnlockTupleTuplock(rel, tup, mode)    UnlockTuple((rel), (tup), tupleLockExtraInfo[mode].hwlock)
 
#define ConditionalLockTupleTuplock(rel, tup, mode, log)    ConditionalLockTuple((rel), (tup), tupleLockExtraInfo[mode].hwlock, (log))
 
#define BOTTOMUP_MAX_NBLOCKS   6
 
#define BOTTOMUP_TOLERANCE_NBLOCKS   3
 
#define TUPLOCK_from_mxstatus(status)    (MultiXactStatusLock[(status)])
 
#define FRM_NOOP   0x0001
 
#define FRM_INVALIDATE_XMAX   0x0002
 
#define FRM_RETURN_IS_XID   0x0004
 
#define FRM_RETURN_IS_MULTI   0x0008
 
#define FRM_MARK_COMMITTED   0x0010
 

Typedefs

typedef struct IndexDeleteCounts IndexDeleteCounts
 

Functions

static HeapTuple heap_prepare_insert (Relation relation, HeapTuple tup, TransactionId xid, CommandId cid, int options)
 
static XLogRecPtr log_heap_update (Relation reln, Buffer oldbuf, Buffer newbuf, HeapTuple oldtup, HeapTuple newtup, HeapTuple old_key_tuple, bool all_visible_cleared, bool new_all_visible_cleared)
 
static Bitmapset * HeapDetermineColumnsInfo (Relation relation, Bitmapset *interesting_cols, Bitmapset *external_cols, HeapTuple oldtup, HeapTuple newtup, bool *has_external)
 
static bool heap_acquire_tuplock (Relation relation, const ItemPointerData *tid, LockTupleMode mode, LockWaitPolicy wait_policy, bool *have_tuple_lock)
 
static BlockNumber heapgettup_advance_block (HeapScanDesc scan, BlockNumber block, ScanDirection dir)
 
static pg_noinline BlockNumber heapgettup_initial_block (HeapScanDesc scan, ScanDirection dir)
 
static void compute_new_xmax_infomask (TransactionId xmax, uint16 old_infomask, uint16 old_infomask2, TransactionId add_to_xmax, LockTupleMode mode, bool is_update, TransactionId *result_xmax, uint16 *result_infomask, uint16 *result_infomask2)
 
static TM_Result heap_lock_updated_tuple (Relation rel, uint16 prior_infomask, TransactionId prior_raw_xmax, const ItemPointerData *prior_ctid, TransactionId xid, LockTupleMode mode)
 
static void GetMultiXactIdHintBits (MultiXactId multi, uint16 *new_infomask, uint16 *new_infomask2)
 
static TransactionId MultiXactIdGetUpdateXid (TransactionId xmax, uint16 t_infomask)
 
static bool DoesMultiXactIdConflict (MultiXactId multi, uint16 infomask, LockTupleMode lockmode, bool *current_is_member)
 
static void MultiXactIdWait (MultiXactId multi, MultiXactStatus status, uint16 infomask, Relation rel, const ItemPointerData *ctid, XLTW_Oper oper, int *remaining)
 
static bool ConditionalMultiXactIdWait (MultiXactId multi, MultiXactStatus status, uint16 infomask, Relation rel, int *remaining, bool logLockFailure)
 
static void index_delete_sort (TM_IndexDeleteOp *delstate)
 
static int bottomup_sort_and_shrink (TM_IndexDeleteOp *delstate)
 
static XLogRecPtr log_heap_new_cid (Relation relation, HeapTuple tup)
 
static HeapTuple ExtractReplicaIdentity (Relation relation, HeapTuple tp, bool key_required, bool *copy)
 
static void AssertHasSnapshotForToast (Relation rel)
 
static BlockNumber heap_scan_stream_read_next_parallel (ReadStream *stream, void *callback_private_data, void *per_buffer_data)
 
static BlockNumber heap_scan_stream_read_next_serial (ReadStream *stream, void *callback_private_data, void *per_buffer_data)
 
static BlockNumber bitmapheap_stream_read_next (ReadStream *pgsr, void *private_data, void *per_buffer_data)
 
static void initscan (HeapScanDesc scan, ScanKey key, bool keep_startblock)
 
void heap_setscanlimits (TableScanDesc sscan, BlockNumber startBlk, BlockNumber numBlks)
 
static pg_attribute_always_inline int page_collect_tuples (HeapScanDesc scan, Snapshot snapshot, Page page, Buffer buffer, BlockNumber block, int lines, bool all_visible, bool check_serializable)
 
void heap_prepare_pagescan (TableScanDesc sscan)
 
static void heap_fetch_next_buffer (HeapScanDesc scan, ScanDirection dir)
 
static Page heapgettup_start_page (HeapScanDesc scan, ScanDirection dir, int *linesleft, OffsetNumber *lineoff)
 
static Page heapgettup_continue_page (HeapScanDesc scan, ScanDirection dir, int *linesleft, OffsetNumber *lineoff)
 
static void heapgettup (HeapScanDesc scan, ScanDirection dir, int nkeys, ScanKey key)
 
static void heapgettup_pagemode (HeapScanDesc scan, ScanDirection dir, int nkeys, ScanKey key)
 
TableScanDesc heap_beginscan (Relation relation, Snapshot snapshot, int nkeys, ScanKey key, ParallelTableScanDesc parallel_scan, uint32 flags)
 
void heap_rescan (TableScanDesc sscan, ScanKey key, bool set_params, bool allow_strat, bool allow_sync, bool allow_pagemode)
 
void heap_endscan (TableScanDesc sscan)
 
HeapTuple heap_getnext (TableScanDesc sscan, ScanDirection direction)
 
bool heap_getnextslot (TableScanDesc sscan, ScanDirection direction, TupleTableSlot *slot)
 
void heap_set_tidrange (TableScanDesc sscan, ItemPointer mintid, ItemPointer maxtid)
 
bool heap_getnextslot_tidrange (TableScanDesc sscan, ScanDirection direction, TupleTableSlot *slot)
 
bool heap_fetch (Relation relation, Snapshot snapshot, HeapTuple tuple, Buffer *userbuf, bool keep_buf)
 
bool heap_hot_search_buffer (ItemPointer tid, Relation relation, Buffer buffer, Snapshot snapshot, HeapTuple heapTuple, bool *all_dead, bool first_call)
 
void heap_get_latest_tid (TableScanDesc sscan, ItemPointer tid)
 
static void UpdateXmaxHintBits (HeapTupleHeader tuple, Buffer buffer, TransactionId xid)
 
BulkInsertState GetBulkInsertState (void)
 
void FreeBulkInsertState (BulkInsertState bistate)
 
void ReleaseBulkInsertStatePin (BulkInsertState bistate)
 
void heap_insert (Relation relation, HeapTuple tup, CommandId cid, int options, BulkInsertState bistate)
 
static int heap_multi_insert_pages (HeapTuple *heaptuples, int done, int ntuples, Size saveFreeSpace)
 
void heap_multi_insert (Relation relation, TupleTableSlot **slots, int ntuples, CommandId cid, int options, BulkInsertState bistate)
 
void simple_heap_insert (Relation relation, HeapTuple tup)
 
static uint8 compute_infobits (uint16 infomask, uint16 infomask2)
 
static bool xmax_infomask_changed (uint16 new_infomask, uint16 old_infomask)
 
TM_Result heap_delete (Relation relation, const ItemPointerData *tid, CommandId cid, Snapshot crosscheck, bool wait, TM_FailureData *tmfd, bool changingPart)
 
void simple_heap_delete (Relation relation, const ItemPointerData *tid)
 
TM_Result heap_update (Relation relation, const ItemPointerData *otid, HeapTuple newtup, CommandId cid, Snapshot crosscheck, bool wait, TM_FailureData *tmfd, LockTupleMode *lockmode, TU_UpdateIndexes *update_indexes)
 
static bool heap_attr_equals (TupleDesc tupdesc, int attrnum, Datum value1, Datum value2, bool isnull1, bool isnull2)
 
void simple_heap_update (Relation relation, const ItemPointerData *otid, HeapTuple tup, TU_UpdateIndexes *update_indexes)
 
static MultiXactStatus get_mxact_status_for_lock (LockTupleMode mode, bool is_update)
 
TM_Result heap_lock_tuple (Relation relation, HeapTuple tuple, CommandId cid, LockTupleMode mode, LockWaitPolicy wait_policy, bool follow_updates, Buffer *buffer, TM_FailureData *tmfd)
 
static TM_Result test_lockmode_for_conflict (MultiXactStatus status, TransactionId xid, LockTupleMode mode, HeapTuple tup, bool *needwait)
 
static TM_Result heap_lock_updated_tuple_rec (Relation rel, TransactionId priorXmax, const ItemPointerData *tid, TransactionId xid, LockTupleMode mode)
 
void heap_finish_speculative (Relation relation, const ItemPointerData *tid)
 
void heap_abort_speculative (Relation relation, const ItemPointerData *tid)
 
bool heap_inplace_lock (Relation relation, HeapTuple oldtup_ptr, Buffer buffer, void(*release_callback)(void *), void *arg)
 
void heap_inplace_update_and_unlock (Relation relation, HeapTuple oldtup, HeapTuple tuple, Buffer buffer)
 
void heap_inplace_unlock (Relation relation, HeapTuple oldtup, Buffer buffer)
 
static TransactionId FreezeMultiXactId (MultiXactId multi, uint16 t_infomask, const struct VacuumCutoffs *cutoffs, uint16 *flags, HeapPageFreeze *pagefrz)
 
bool heap_prepare_freeze_tuple (HeapTupleHeader tuple, const struct VacuumCutoffs *cutoffs, HeapPageFreeze *pagefrz, HeapTupleFreeze *frz, bool *totally_frozen)
 
void heap_pre_freeze_checks (Buffer buffer, HeapTupleFreeze *tuples, int ntuples)
 
void heap_freeze_prepared_tuples (Buffer buffer, HeapTupleFreeze *tuples, int ntuples)
 
bool heap_freeze_tuple (HeapTupleHeader tuple, TransactionId relfrozenxid, TransactionId relminmxid, TransactionId FreezeLimit, TransactionId MultiXactCutoff)
 
TransactionId HeapTupleGetUpdateXid (const HeapTupleHeaderData *tup)
 
static bool Do_MultiXactIdWait (MultiXactId multi, MultiXactStatus status, uint16 infomask, bool nowait, Relation rel, const ItemPointerData *ctid, XLTW_Oper oper, int *remaining, bool logLockFailure)
 
bool heap_tuple_needs_eventual_freeze (HeapTupleHeader tuple)
 
bool heap_tuple_should_freeze (HeapTupleHeader tuple, const struct VacuumCutoffs *cutoffs, TransactionId *NoFreezePageRelfrozenXid, MultiXactId *NoFreezePageRelminMxid)
 
void HeapTupleHeaderAdvanceConflictHorizon (HeapTupleHeader tuple, TransactionId *snapshotConflictHorizon)
 
static void index_delete_check_htid (TM_IndexDeleteOp *delstate, Page page, OffsetNumber maxoff, const ItemPointerData *htid, TM_IndexStatus *istatus)
 
TransactionId heap_index_delete_tuples (Relation rel, TM_IndexDeleteOp *delstate)
 
static int index_delete_sort_cmp (TM_IndexDelete *deltid1, TM_IndexDelete *deltid2)
 
static int bottomup_nblocksfavorable (IndexDeleteCounts *blockgroups, int nblockgroups, TM_IndexDelete *deltids)
 
static int bottomup_sort_and_shrink_cmp (const void *arg1, const void *arg2)
 
XLogRecPtr log_heap_visible (Relation rel, Buffer heap_buffer, Buffer vm_buffer, TransactionId snapshotConflictHorizon, uint8 vmflags)
 
void HeapCheckForSerializableConflictOut (bool visible, Relation relation, HeapTuple tuple, Buffer buffer, Snapshot snapshot)
 

Variables

struct { 
 
   LOCKMODE   hwlock 
 
   int   lockstatus 
 
   int   updstatus 
 
} tupleLockExtraInfo []
 
static const int MultiXactStatusLock [MaxMultiXactStatus+1]
 

Macro Definition Documentation

◆ BOTTOMUP_MAX_NBLOCKS

#define BOTTOMUP_MAX_NBLOCKS   6

Definition at line 188 of file heapam.c.

◆ BOTTOMUP_TOLERANCE_NBLOCKS

#define BOTTOMUP_TOLERANCE_NBLOCKS   3

Definition at line 189 of file heapam.c.

◆ ConditionalLockTupleTuplock

#define ConditionalLockTupleTuplock(rel, tup, mode, log)    ConditionalLockTuple((rel), (tup), tupleLockExtraInfo[mode].hwlock, (log))

Definition at line 170 of file heapam.c.
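
The *Tuplock macros and LOCKMODE_from_mxstatus() all resolve through the file-local tupleLockExtraInfo[] table, indexed by LockTupleMode. The sketch below is illustrative only: it would compile only inside heapam.c, where the table and macros are defined, and example_hwlock_for_mxstatus is a made-up name.

/*
 * Illustrative composition of the macros above: TUPLOCK_from_mxstatus()
 * maps a MultiXactStatus to a LockTupleMode, and tupleLockExtraInfo[]
 * then supplies the heavyweight LOCKMODE that LockTupleTuplock() and
 * UnlockTupleTuplock() hand to the lock manager.
 */
static LOCKMODE
example_hwlock_for_mxstatus(MultiXactStatus status)
{
    LockTupleMode mode = TUPLOCK_from_mxstatus(status);

    /* equivalent to LOCKMODE_from_mxstatus(status) */
    return tupleLockExtraInfo[mode].hwlock;
}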

178{
180 int next_item;
181 int ndeltids;
182 TM_IndexDelete *deltids;
184#endif
185
186/* heap_index_delete_tuples bottom-up index deletion costing constants */
187#define BOTTOMUP_MAX_NBLOCKS 6
188#define BOTTOMUP_TOLERANCE_NBLOCKS 3
189
190/*
191 * heap_index_delete_tuples uses this when determining which heap blocks it
192 * must visit to help its bottom-up index deletion caller
193 */
194typedef struct IndexDeleteCounts
195{
196 int16 npromisingtids; /* Number of "promising" TIDs in group */
197 int16 ntids; /* Number of TIDs in group */
198 int16 ifirsttid; /* Offset to group's first deltid */
199} IndexDeleteCounts;
200
201/*
202 * This table maps tuple lock strength values for each particular
203 * MultiXactStatus value.
204 */
205static const int MultiXactStatusLock[MaxMultiXactStatus + 1] =
206{
207 LockTupleKeyShare, /* ForKeyShare */
208 LockTupleShare, /* ForShare */
209 LockTupleNoKeyExclusive, /* ForNoKeyUpdate */
210 LockTupleExclusive, /* ForUpdate */
211 LockTupleNoKeyExclusive, /* NoKeyUpdate */
212 LockTupleExclusive /* Update */
213};
214
215/* Get the LockTupleMode for a given MultiXactStatus */
216#define TUPLOCK_from_mxstatus(status) \
217 (MultiXactStatusLock[(status)])
218
219/*
220 * Check that we have a valid snapshot if we might need TOAST access.
221 */
222static inline void
224{
225#ifdef USE_ASSERT_CHECKING
226
227 /* bootstrap mode in particular breaks this rule */
229 return;
230
231 /* if the relation doesn't have a TOAST table, we are good */
232 if (!OidIsValid(rel->rd_rel->reltoastrelid))
233 return;
234
236
237#endif /* USE_ASSERT_CHECKING */
238}
239
240/* ----------------------------------------------------------------
241 * heap support routines
242 * ----------------------------------------------------------------
243 */
244
245/*
246 * Streaming read API callback for parallel sequential scans. Returns the next
247 * block the caller wants from the read stream or InvalidBlockNumber when done.
248 */
249static BlockNumber
251 void *callback_private_data,
252 void *per_buffer_data)
253{
254 HeapScanDesc scan = (HeapScanDesc) callback_private_data;
255
258
259 if (unlikely(!scan->rs_inited))
260 {
261 /* parallel scan */
265 scan->rs_startblock,
266 scan->rs_numblocks);
267
268 /* may return InvalidBlockNumber if there are no more blocks */
272 scan->rs_inited = true;
273 }
274 else
275 {
278 scan->rs_base.rs_parallel);
279 }
280
281 return scan->rs_prefetch_block;
282}
283
284/*
285 * Streaming read API callback for serial sequential and TID range scans.
286 * Returns the next block the caller wants from the read stream or
287 * InvalidBlockNumber when done.
288 */
289static BlockNumber
291 void *callback_private_data,
292 void *per_buffer_data)
293{
294 HeapScanDesc scan = (HeapScanDesc) callback_private_data;
295
296 if (unlikely(!scan->rs_inited))
297 {
299 scan->rs_inited = true;
300 }
301 else
303 scan->rs_prefetch_block,
304 scan->rs_dir);
305
306 return scan->rs_prefetch_block;
307}
308
309/*
310 * Read stream API callback for bitmap heap scans.
311 * Returns the next block the caller wants from the read stream or
312 * InvalidBlockNumber when done.
313 */
314static BlockNumber
315bitmapheap_stream_read_next(ReadStream *pgsr, void *private_data,
316 void *per_buffer_data)
317{
318 TBMIterateResult *tbmres = per_buffer_data;
321 TableScanDesc sscan = &hscan->rs_base;
322
323 for (;;)
324 {
326
327 /* no more entries in the bitmap */
328 if (!tbm_iterate(&sscan->st.rs_tbmiterator, tbmres))
329 return InvalidBlockNumber;
330
331 /*
332 * Ignore any claimed entries past what we think is the end of the
333 * relation. It may have been extended after the start of our scan (we
334 * only hold an AccessShareLock, and it could be inserts from this
335 * backend). We don't take this optimization in SERIALIZABLE
336 * isolation though, as we need to examine all invisible tuples
337 * reachable by the index.
338 */
340 tbmres->blockno >= hscan->rs_nblocks)
341 continue;
342
343 return tbmres->blockno;
344 }
345
346 /* not reachable */
347 Assert(false);
348}
349
350/* ----------------
351 * initscan - scan code common to heap_beginscan and heap_rescan
352 * ----------------
353 */
354static void
356{
358 bool allow_strat;
359 bool allow_sync;
360
361 /*
362 * Determine the number of blocks we have to scan.
363 *
364 * It is sufficient to do this once at scan start, since any tuples added
365 * while the scan is in progress will be invisible to my snapshot anyway.
366 * (That is not true when using a non-MVCC snapshot. However, we couldn't
367 * guarantee to return tuples added after scan start anyway, since they
368 * might go into pages we already scanned. To guarantee consistent
369 * results for a non-MVCC snapshot, the caller must hold some higher-level
370 * lock that ensures the interesting tuple(s) won't change.)
371 */
372 if (scan->rs_base.rs_parallel != NULL)
373 {
375 scan->rs_nblocks = bpscan->phs_nblocks;
376 }
377 else
379
380 /*
381 * If the table is large relative to NBuffers, use a bulk-read access
382 * strategy and enable synchronized scanning (see syncscan.c). Although
383 * the thresholds for these features could be different, we make them the
384 * same so that there are only two behaviors to tune rather than four.
385 * (However, some callers need to be able to disable one or both of these
386 * behaviors, independently of the size of the table; also there is a GUC
387 * variable that can disable synchronized scanning.)
388 *
389 * Note that table_block_parallelscan_initialize has a very similar test;
390 * if you change this, consider changing that one, too.
391 */
393 scan->rs_nblocks > NBuffers / 4)
394 {
396 allow_sync = (scan->rs_base.rs_flags & SO_ALLOW_SYNC) != 0;
397 }
398 else
399 allow_strat = allow_sync = false;
400
401 if (allow_strat)
402 {
403 /* During a rescan, keep the previous strategy object. */
404 if (scan->rs_strategy == NULL)
406 }
407 else
408 {
409 if (scan->rs_strategy != NULL)
411 scan->rs_strategy = NULL;
412 }
413
414 if (scan->rs_base.rs_parallel != NULL)
415 {
416 /* For parallel scan, believe whatever ParallelTableScanDesc says. */
419 else
421
422 /*
423 * If not rescanning, initialize the startblock. Finding the actual
424 * start location is done in table_block_parallelscan_startblock_init,
425 * based on whether an alternative start location has been set with
426 * heap_setscanlimits, or using the syncscan location, when syncscan
427 * is enabled.
428 */
429 if (!keep_startblock)
431 }
432 else
433 {
434 if (keep_startblock)
435 {
436 /*
437 * When rescanning, we want to keep the previous startblock
438 * setting, so that rewinding a cursor doesn't generate surprising
439 * results. Reset the active syncscan setting, though.
440 */
443 else
445 }
447 {
450 }
451 else
452 {
454 scan->rs_startblock = 0;
455 }
456 }
457
459 scan->rs_inited = false;
460 scan->rs_ctup.t_data = NULL;
462 scan->rs_cbuf = InvalidBuffer;
464 scan->rs_ntuples = 0;
465 scan->rs_cindex = 0;
466
467 /*
468 * Initialize to ForwardScanDirection because it is most common and
469 * because heap scans go forward before going backward (e.g. CURSORs).
470 */
473
474 /* page-at-a-time fields are always invalid when not rs_inited */
475
476 /*
477 * copy the scan key, if appropriate
478 */
479 if (key != NULL && scan->rs_base.rs_nkeys > 0)
480 memcpy(scan->rs_base.rs_key, key, scan->rs_base.rs_nkeys * sizeof(ScanKeyData));
481
482 /*
483 * Currently, we only have a stats counter for sequential heap scans (but
484 * e.g for bitmap scans the underlying bitmap index scans will be counted,
485 * and for sample scans we update stats for tuple fetches).
486 */
487 if (scan->rs_base.rs_flags & SO_TYPE_SEQSCAN)
489}
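
A condensed, hedged restatement of the sizing rule described in the comment above; rel, nblocks, flags and strategy are stand-ins for fields the real code reads from the scan descriptor.

/* Sketch only: large scans use a bulk-read strategy and may join a syncscan. */
bool        allow_strat;
bool        allow_sync;

if (!RelationUsesLocalBuffers(rel) && nblocks > NBuffers / 4)
{
    allow_strat = (flags & SO_ALLOW_STRAT) != 0;
    allow_sync = (flags & SO_ALLOW_SYNC) != 0;
}
else
    allow_strat = allow_sync = false;

if (allow_strat && strategy == NULL)
    strategy = GetAccessStrategy(BAS_BULKREAD);    /* kept across rescans */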
490
491/*
492 * heap_setscanlimits - restrict range of a heapscan
493 *
494 * startBlk is the page to start at
495 * numBlks is number of pages to scan (InvalidBlockNumber means "all")
496 */
497void
499{
501
502 Assert(!scan->rs_inited); /* else too late to change */
503 /* else rs_startblock is significant */
505
506 /* Check startBlk is valid (but allow case of zero blocks...) */
507 Assert(startBlk == 0 || startBlk < scan->rs_nblocks);
508
509 scan->rs_startblock = startBlk;
510 scan->rs_numblocks = numBlks;
511}
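
A hypothetical caller-side sketch (the relation, snapshot and the 8-block limit are invented): heap_setscanlimits() must run after heap_beginscan() but before the first fetch, and only on scans that did not request SO_ALLOW_SYNC, because the start block is significant.

HeapTuple   tuple;
TableScanDesc sscan;

sscan = heap_beginscan(rel, snapshot, 0, NULL, NULL,
                       SO_TYPE_SEQSCAN | SO_ALLOW_STRAT | SO_ALLOW_PAGEMODE);

/* restrict the scan to blocks 0..7; must happen before the first fetch */
heap_setscanlimits(sscan, 0, 8);

while ((tuple = heap_getnext(sscan, ForwardScanDirection)) != NULL)
{
    /* process tuple, which lives in the scan's currently pinned buffer */
}

heap_endscan(sscan);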
512
513/*
514 * Per-tuple loop for heap_prepare_pagescan(). Pulled out so it can be called
515 * multiple times, with constant arguments for all_visible,
516 * check_serializable.
517 */
519static int
521 Page page, Buffer buffer,
522 BlockNumber block, int lines,
523 bool all_visible, bool check_serializable)
524{
525 Oid relid = RelationGetRelid(scan->rs_base.rs_rd);
526 int ntup = 0;
527 int nvis = 0;
529
530 /* page at a time should have been disabled otherwise */
531 Assert(IsMVCCSnapshot(snapshot));
532
533 /* first find all tuples on the page */
535 {
538
540 continue;
541
542 /*
543 * If the page is not all-visible or we need to check serializability,
544 * maintain enough state to be able to refind the tuple efficiently,
545 * without again first needing to fetch the item and then via that the
546 * tuple.
547 */
548 if (!all_visible || check_serializable)
549 {
550 tup = &batchmvcc.tuples[ntup];
551
553 tup->t_len = ItemIdGetLength(lpp);
554 tup->t_tableOid = relid;
555 ItemPointerSet(&(tup->t_self), block, lineoff);
556 }
557
558 /*
559 * If the page is all visible, these fields otherwise won't be
560 * populated in loop below.
561 */
562 if (all_visible)
563 {
565 {
566 batchmvcc.visible[ntup] = true;
567 }
568 scan->rs_vistuples[ntup] = lineoff;
569 }
570
571 ntup++;
572 }
573
575
576 /*
577 * Unless the page is all visible, test visibility for all tuples in one go.
578 * That is considerably more efficient than calling
579 * HeapTupleSatisfiesMVCC() one-by-one.
580 */
581 if (all_visible)
582 nvis = ntup;
583 else
584 nvis = HeapTupleSatisfiesMVCCBatch(snapshot, buffer,
585 ntup,
586 &batchmvcc,
587 scan->rs_vistuples);
588
589 /*
590 * So far we don't have a batch API for testing serializability, so do so
591 * one-by-one.
592 */
594 {
595 for (int i = 0; i < ntup; i++)
596 {
598 scan->rs_base.rs_rd,
599 &batchmvcc.tuples[i],
600 buffer, snapshot);
601 }
602 }
603
604 return nvis;
605}
606
607/*
608 * heap_prepare_pagescan - Prepare current scan page to be scanned in pagemode
609 *
610 * Preparation currently consists of 1. prune the scan's rs_cbuf page, and 2.
611 * fill the rs_vistuples[] array with the OffsetNumbers of visible tuples.
612 */
613void
615{
617 Buffer buffer = scan->rs_cbuf;
618 BlockNumber block = scan->rs_cblock;
619 Snapshot snapshot;
620 Page page;
621 int lines;
622 bool all_visible;
624
625 Assert(BufferGetBlockNumber(buffer) == block);
626
627 /* ensure we're not accidentally being used when not in pagemode */
629 snapshot = scan->rs_base.rs_snapshot;
630
631 /*
632 * Prune and repair fragmentation for the whole page, if possible.
633 */
634 heap_page_prune_opt(scan->rs_base.rs_rd, buffer);
635
636 /*
637 * We must hold share lock on the buffer content while examining tuple
638 * visibility. Afterwards, however, the tuples we have found to be
639 * visible are guaranteed good as long as we hold the buffer pin.
640 */
642
643 page = BufferGetPage(buffer);
644 lines = PageGetMaxOffsetNumber(page);
645
646 /*
647 * If the all-visible flag indicates that all tuples on the page are
648 * visible to everyone, we can skip the per-tuple visibility tests.
649 *
650 * Note: In hot standby, a tuple that's already visible to all
651 * transactions on the primary might still be invisible to a read-only
652 * transaction in the standby. We partly handle this problem by tracking
653 * the minimum xmin of visible tuples as the cut-off XID while marking a
654 * page all-visible on the primary and WAL log that along with the
655 * visibility map SET operation. In hot standby, we wait for (or abort)
656 * all transactions that might not see one or more tuples on
657 * the page. That's how index-only scans work fine in hot standby. A
658 * crucial difference between index-only scans and heap scans is that the
659 * index-only scan completely relies on the visibility map, whereas a heap
660 * scan looks at the page-level PD_ALL_VISIBLE flag. We are not sure if
661 * the page-level flag can be trusted in the same way, because it might
662 * get propagated somehow without being explicitly WAL-logged, e.g. via a
663 * full page write. Until we can prove that beyond doubt, let's check each
664 * tuple for visibility the hard way.
665 */
666 all_visible = PageIsAllVisible(page) && !snapshot->takenDuringRecovery;
669
670 /*
671 * We call page_collect_tuples() with constant arguments, to get the
672 * compiler to constant fold the constant arguments. Separate calls with
673 * constant arguments, rather than variables, are needed on several
674 * compilers to actually perform constant folding.
675 */
676 if (likely(all_visible))
677 {
679 scan->rs_ntuples = page_collect_tuples(scan, snapshot, page, buffer,
680 block, lines, true, false);
681 else
682 scan->rs_ntuples = page_collect_tuples(scan, snapshot, page, buffer,
683 block, lines, true, true);
684 }
685 else
686 {
688 scan->rs_ntuples = page_collect_tuples(scan, snapshot, page, buffer,
689 block, lines, false, false);
690 else
691 scan->rs_ntuples = page_collect_tuples(scan, snapshot, page, buffer,
692 block, lines, false, true);
693 }
694
696}
697
698/*
699 * heap_fetch_next_buffer - read and pin the next block from MAIN_FORKNUM.
700 *
701 * Read the next block of the scan relation from the read stream and save it
702 * in the scan descriptor. It is already pinned.
703 */
704static inline void
706{
707 Assert(scan->rs_read_stream);
708
709 /* release previous scan buffer, if any */
710 if (BufferIsValid(scan->rs_cbuf))
711 {
712 ReleaseBuffer(scan->rs_cbuf);
713 scan->rs_cbuf = InvalidBuffer;
714 }
715
716 /*
717 * Be sure to check for interrupts at least once per page. Checks at
718 * higher code levels won't be able to stop a seqscan that encounters many
719 * pages' worth of consecutive dead tuples.
720 */
722
723 /*
724 * If the scan direction is changing, reset the prefetch block to the
725 * current block. Otherwise, we will incorrectly prefetch the blocks
726 * between the prefetch block and the current block again before
727 * prefetching blocks in the new, correct scan direction.
728 */
729 if (unlikely(scan->rs_dir != dir))
730 {
731 scan->rs_prefetch_block = scan->rs_cblock;
733 }
734
735 scan->rs_dir = dir;
736
738 if (BufferIsValid(scan->rs_cbuf))
740}
741
742/*
743 * heapgettup_initial_block - return the first BlockNumber to scan
744 *
745 * Returns InvalidBlockNumber when there are no blocks to scan. This can
746 * occur with empty tables and in parallel scans when parallel workers get all
747 * of the pages before we can get a chance to get our first page.
748 */
751{
752 Assert(!scan->rs_inited);
753 Assert(scan->rs_base.rs_parallel == NULL);
754
755 /* When there are no pages to scan, return InvalidBlockNumber */
756 if (scan->rs_nblocks == 0 || scan->rs_numblocks == 0)
757 return InvalidBlockNumber;
758
759 if (ScanDirectionIsForward(dir))
760 {
761 return scan->rs_startblock;
762 }
763 else
764 {
765 /*
766 * Disable reporting to syncscan logic in a backwards scan; it's not
767 * very likely anyone else is doing the same thing at the same time,
768 * and much more likely that we'll just bollix things for forward
769 * scanners.
770 */
772
773 /*
774 * Start from last page of the scan. Ensure we take into account
775 * rs_numblocks if it's been adjusted by heap_setscanlimits().
776 */
777 if (scan->rs_numblocks != InvalidBlockNumber)
778 return (scan->rs_startblock + scan->rs_numblocks - 1) % scan->rs_nblocks;
779
780 if (scan->rs_startblock > 0)
781 return scan->rs_startblock - 1;
782
783 return scan->rs_nblocks - 1;
784 }
785}
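
A worked example of the backward-scan arithmetic above, with invented numbers.

/*
 * With rs_startblock = 3, rs_numblocks = 4 and rs_nblocks = 10, a backward
 * scan begins at (3 + 4 - 1) % 10 = block 6, the last block of the limited
 * range.  Without a limit it would begin at rs_startblock - 1 = 2, or at
 * rs_nblocks - 1 whenever rs_startblock is 0.
 */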
786
787
788/*
789 * heapgettup_start_page - helper function for heapgettup()
790 *
791 * Return the next page to scan based on the scan->rs_cbuf and set *linesleft
792 * to the number of tuples on this page. Also set *lineoff to the first
793 * offset to scan with forward scans getting the first offset and backward
794 * getting the final offset on the page.
795 */
796static Page
799{
800 Page page;
801
802 Assert(scan->rs_inited);
804
805 /* Caller is responsible for ensuring buffer is locked if needed */
806 page = BufferGetPage(scan->rs_cbuf);
807
809
810 if (ScanDirectionIsForward(dir))
812 else
814
815 /* lineoff now references the physically previous or next tid */
816 return page;
817}
818
819
820/*
821 * heapgettup_continue_page - helper function for heapgettup()
822 *
823 * Return the next page to scan based on the scan->rs_cbuf and set *linesleft
824 * to the number of tuples left to scan on this page. Also set *lineoff to
825 * the next offset to scan according to the ScanDirection in 'dir'.
826 */
827static inline Page
830{
831 Page page;
832
833 Assert(scan->rs_inited);
835
836 /* Caller is responsible for ensuring buffer is locked if needed */
837 page = BufferGetPage(scan->rs_cbuf);
838
839 if (ScanDirectionIsForward(dir))
840 {
842 *linesleft = PageGetMaxOffsetNumber(page) - (*lineoff) + 1;
843 }
844 else
845 {
846 /*
847 * The previous returned tuple may have been vacuumed since the
848 * previous scan when we use a non-MVCC snapshot, so we must
849 * re-establish the lineoff <= PageGetMaxOffsetNumber(page) invariant
850 */
852 *linesleft = *lineoff;
853 }
854
855 /* lineoff now references the physically previous or next tid */
856 return page;
857}
858
859/*
860 * heapgettup_advance_block - helper for heap_fetch_next_buffer()
861 *
862 * Given the current block number, the scan direction, and various information
863 * contained in the scan descriptor, calculate the BlockNumber to scan next
864 * and return it. If there are no further blocks to scan, return
865 * InvalidBlockNumber to indicate this fact to the caller.
866 *
867 * This should not be called to determine the initial block number -- only for
868 * subsequent blocks.
869 *
870 * This also adjusts rs_numblocks when a limit has been imposed by
871 * heap_setscanlimits().
872 */
873static inline BlockNumber
875{
876 Assert(scan->rs_base.rs_parallel == NULL);
877
879 {
880 block++;
881
882 /* wrap back to the start of the heap */
883 if (block >= scan->rs_nblocks)
884 block = 0;
885
886 /*
887 * Report our new scan position for synchronization purposes. We don't
888 * do that when moving backwards, however. That would just mess up any
889 * other forward-moving scanners.
890 *
891 * Note: we do this before checking for end of scan so that the final
892 * state of the position hint is back at the start of the rel. That's
893 * not strictly necessary, but otherwise when you run the same query
894 * multiple times the starting position would shift a little bit
895 * backwards on every invocation, which is confusing. We don't
896 * guarantee any specific ordering in general, though.
897 */
898 if (scan->rs_base.rs_flags & SO_ALLOW_SYNC)
899 ss_report_location(scan->rs_base.rs_rd, block);
900
901 /* we're done if we're back at where we started */
902 if (block == scan->rs_startblock)
903 return InvalidBlockNumber;
904
905 /* check if the limit imposed by heap_setscanlimits() is met */
906 if (scan->rs_numblocks != InvalidBlockNumber)
907 {
908 if (--scan->rs_numblocks == 0)
909 return InvalidBlockNumber;
910 }
911
912 return block;
913 }
914 else
915 {
916 /* we're done if the last block is the start position */
917 if (block == scan->rs_startblock)
918 return InvalidBlockNumber;
919
920 /* check if the limit imposed by heap_setscanlimits() is met */
921 if (scan->rs_numblocks != InvalidBlockNumber)
922 {
923 if (--scan->rs_numblocks == 0)
924 return InvalidBlockNumber;
925 }
926
927 /* wrap to the end of the heap when the last page was page 0 */
928 if (block == 0)
929 block = scan->rs_nblocks;
930
931 block--;
932
933 return block;
934 }
935}
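
Another invented example, this time for the wraparound logic above.

/*
 * A forward scan of an 8-block table that started at block 5 (for example
 * because of syncscan) visits blocks 5, 6, 7, 0, 1, 2, 3, 4; advancing past
 * block 4 wraps back to 5 == rs_startblock, so InvalidBlockNumber is
 * returned and the scan ends.  A backward scan walks the same cycle in
 * reverse.
 */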
936
937/* ----------------
938 * heapgettup - fetch next heap tuple
939 *
940 * Initialize the scan if not already done; then advance to the next
941 * tuple as indicated by "dir"; return the next tuple in scan->rs_ctup,
942 * or set scan->rs_ctup.t_data = NULL if no more tuples.
943 *
944 * Note: the reason nkeys/key are passed separately, even though they are
945 * kept in the scan descriptor, is that the caller may not want us to check
946 * the scankeys.
947 *
948 * Note: when we fall off the end of the scan in either direction, we
949 * reset rs_inited. This means that a further request with the same
950 * scan direction will restart the scan, which is a bit odd, but a
951 * request with the opposite scan direction will start a fresh scan
952 * in the proper direction. The latter is required behavior for cursors,
953 * while the former case is generally undefined behavior in Postgres
954 * so we don't care too much.
955 * ----------------
956 */
957static void
959 ScanDirection dir,
960 int nkeys,
961 ScanKey key)
962{
963 HeapTuple tuple = &(scan->rs_ctup);
964 Page page;
966 int linesleft;
967
968 if (likely(scan->rs_inited))
969 {
970 /* continue from previously returned page/tuple */
972 page = heapgettup_continue_page(scan, dir, &linesleft, &lineoff);
973 goto continue_page;
974 }
975
976 /*
977 * advance the scan until we find a qualifying tuple or run out of stuff
978 * to scan
979 */
980 while (true)
981 {
982 heap_fetch_next_buffer(scan, dir);
983
984 /* did we run out of blocks to scan? */
985 if (!BufferIsValid(scan->rs_cbuf))
986 break;
987
989
991 page = heapgettup_start_page(scan, dir, &linesleft, &lineoff);
993
994 /*
995 * Only continue scanning the page while we have lines left.
996 *
997 * Note that this protects us from accessing line pointers past
998 * PageGetMaxOffsetNumber(); both for forward scans when we resume the
999 * table scan, and for when we start scanning a new page.
1000 */
1001 for (; linesleft > 0; linesleft--, lineoff += dir)
1002 {
1003 bool visible;
1005
1006 if (!ItemIdIsNormal(lpp))
1007 continue;
1008
1009 tuple->t_data = (HeapTupleHeader) PageGetItem(page, lpp);
1010 tuple->t_len = ItemIdGetLength(lpp);
1011 ItemPointerSet(&(tuple->t_self), scan->rs_cblock, lineoff);
1012
1013 visible = HeapTupleSatisfiesVisibility(tuple,
1014 scan->rs_base.rs_snapshot,
1015 scan->rs_cbuf);
1016
1018 tuple, scan->rs_cbuf,
1019 scan->rs_base.rs_snapshot);
1020
1021 /* skip tuples not visible to this snapshot */
1022 if (!visible)
1023 continue;
1024
1025 /* skip any tuples that don't match the scan key */
1026 if (key != NULL &&
1028 nkeys, key))
1029 continue;
1030
1032 scan->rs_coffset = lineoff;
1033 return;
1034 }
1035
1036 /*
1037 * if we get here, it means we've exhausted the items on this page and
1038 * it's time to move to the next.
1039 */
1041 }
1042
1043 /* end of scan */
1044 if (BufferIsValid(scan->rs_cbuf))
1045 ReleaseBuffer(scan->rs_cbuf);
1046
1047 scan->rs_cbuf = InvalidBuffer;
1050 tuple->t_data = NULL;
1051 scan->rs_inited = false;
1052}
1053
1054/* ----------------
1055 * heapgettup_pagemode - fetch next heap tuple in page-at-a-time mode
1056 *
1057 * Same API as heapgettup, but used in page-at-a-time mode
1058 *
1059 * The internal logic is much the same as heapgettup's too, but there are some
1060 * differences: we do not take the buffer content lock (that only needs to
1061 * happen inside heap_prepare_pagescan), and we iterate through just the
1062 * tuples listed in rs_vistuples[] rather than all tuples on the page. Notice
1063 * that lineindex is 0-based, where the corresponding loop variable lineoff in
1064 * heapgettup is 1-based.
1065 * ----------------
1066 */
1067static void
1069 ScanDirection dir,
1070 int nkeys,
1071 ScanKey key)
1072{
1073 HeapTuple tuple = &(scan->rs_ctup);
1074 Page page;
1077
1078 if (likely(scan->rs_inited))
1079 {
1080 /* continue from previously returned page/tuple */
1081 page = BufferGetPage(scan->rs_cbuf);
1082
1083 lineindex = scan->rs_cindex + dir;
1084 if (ScanDirectionIsForward(dir))
1085 linesleft = scan->rs_ntuples - lineindex;
1086 else
1087 linesleft = scan->rs_cindex;
1088 /* lineindex now references the next or previous visible tid */
1089
1090 goto continue_page;
1091 }
1092
1093 /*
1094 * advance the scan until we find a qualifying tuple or run out of stuff
1095 * to scan
1096 */
1097 while (true)
1098 {
1099 heap_fetch_next_buffer(scan, dir);
1100
1101 /* did we run out of blocks to scan? */
1102 if (!BufferIsValid(scan->rs_cbuf))
1103 break;
1104
1106
1107 /* prune the page and determine visible tuple offsets */
1109 page = BufferGetPage(scan->rs_cbuf);
1110 linesleft = scan->rs_ntuples;
1112
1113 /* block is the same for all tuples, set it once outside the loop */
1115
1116 /* lineindex now references the next or previous visible tid */
1118
1119 for (; linesleft > 0; linesleft--, lineindex += dir)
1120 {
1121 ItemId lpp;
1123
1124 Assert(lineindex < scan->rs_ntuples);
1126 lpp = PageGetItemId(page, lineoff);
1128
1129 tuple->t_data = (HeapTupleHeader) PageGetItem(page, lpp);
1130 tuple->t_len = ItemIdGetLength(lpp);
1132
1133 /* skip any tuples that don't match the scan key */
1134 if (key != NULL &&
1136 nkeys, key))
1137 continue;
1138
1139 scan->rs_cindex = lineindex;
1140 return;
1141 }
1142 }
1143
1144 /* end of scan */
1145 if (BufferIsValid(scan->rs_cbuf))
1146 ReleaseBuffer(scan->rs_cbuf);
1147 scan->rs_cbuf = InvalidBuffer;
1150 tuple->t_data = NULL;
1151 scan->rs_inited = false;
1152}
1153
1154
1155/* ----------------------------------------------------------------
1156 * heap access method interface
1157 * ----------------------------------------------------------------
1158 */
1159
1160
1162heap_beginscan(Relation relation, Snapshot snapshot,
1163 int nkeys, ScanKey key,
1164 ParallelTableScanDesc parallel_scan,
1165 uint32 flags)
1166{
1167 HeapScanDesc scan;
1168
1169 /*
1170 * increment relation ref count while scanning relation
1171 *
1172 * This is just to make really sure the relcache entry won't go away while
1173 * the scan has a pointer to it. Caller should be holding the rel open
1174 * anyway, so this is redundant in all normal scenarios...
1175 */
1177
1178 /*
1179 * allocate and initialize scan descriptor
1180 */
1181 if (flags & SO_TYPE_BITMAPSCAN)
1182 {
1184
1185 /*
1186 * Bitmap Heap scans do not have any fields that a normal Heap Scan
1187 * does not have, so no special initializations required here.
1188 */
1189 scan = (HeapScanDesc) bscan;
1190 }
1191 else
1193
1194 scan->rs_base.rs_rd = relation;
1195 scan->rs_base.rs_snapshot = snapshot;
1196 scan->rs_base.rs_nkeys = nkeys;
1197 scan->rs_base.rs_flags = flags;
1198 scan->rs_base.rs_parallel = parallel_scan;
1199 scan->rs_strategy = NULL; /* set in initscan */
1200 scan->rs_cbuf = InvalidBuffer;
1201
1202 /*
1203 * Disable page-at-a-time mode if it's not a MVCC-safe snapshot.
1204 */
1205 if (!(snapshot && IsMVCCSnapshot(snapshot)))
1207
1208 /* Check that a historic snapshot is not used for non-catalog tables */
1209 if (snapshot &&
1210 IsHistoricMVCCSnapshot(snapshot) &&
1212 {
1213 ereport(ERROR,
1215 errmsg("cannot query non-catalog table \"%s\" during logical decoding",
1216 RelationGetRelationName(relation))));
1217 }
1218
1219 /*
1220 * For seqscan and sample scans in a serializable transaction, acquire a
1221 * predicate lock on the entire relation. This is required not only to
1222 * lock all the matching tuples, but also to conflict with new insertions
1223 * into the table. In an indexscan, we take page locks on the index pages
1224 * covering the range specified in the scan qual, but in a heap scan there
1225 * is nothing more fine-grained to lock. A bitmap scan is a different
1226 * story, there we have already scanned the index and locked the index
1227 * pages covering the predicate. But in that case we still have to lock
1228 * any matching heap tuples. For sample scan we could optimize the locking
1229 * to be at least page-level granularity, but we'd need to add per-tuple
1230 * locking for that.
1231 */
1233 {
1234 /*
1235 * Ensure a missing snapshot is noticed reliably, even if the
1236 * isolation mode means predicate locking isn't performed (and
1237 * therefore the snapshot isn't used here).
1238 */
1239 Assert(snapshot);
1240 PredicateLockRelation(relation, snapshot);
1241 }
1242
1243 /* we only need to set this up once */
1244 scan->rs_ctup.t_tableOid = RelationGetRelid(relation);
1245
1246 /*
1247 * Allocate memory to keep track of page allocation for parallel workers
1248 * when doing a parallel scan.
1249 */
1250 if (parallel_scan != NULL)
1252 else
1254
1255 /*
1256 * we do this here instead of in initscan() because heap_rescan also calls
1257 * initscan() and we don't want to allocate memory again
1258 */
1259 if (nkeys > 0)
1260 scan->rs_base.rs_key = palloc_array(ScanKeyData, nkeys);
1261 else
1262 scan->rs_base.rs_key = NULL;
1263
1264 initscan(scan, key, false);
1265
1266 scan->rs_read_stream = NULL;
1267
1268 /*
1269 * Set up a read stream for sequential scans and TID range scans. This
1270 * should be done after initscan() because initscan() allocates the
1271 * BufferAccessStrategy object passed to the read stream API.
1272 */
1273 if (scan->rs_base.rs_flags & SO_TYPE_SEQSCAN ||
1275 {
1277
1278 if (scan->rs_base.rs_parallel)
1280 else
1282
1283 /* ---
1284 * It is safe to use batchmode as the only locks taken by `cb`
1285 * are never taken while waiting for IO:
1286 * - SyncScanLock is used in the non-parallel case
1287 * - in the parallel case, only spinlocks and atomics are used
1288 * ---
1289 */
1292 scan->rs_strategy,
1293 scan->rs_base.rs_rd,
1295 cb,
1296 scan,
1297 0);
1298 }
1299 else if (scan->rs_base.rs_flags & SO_TYPE_BITMAPSCAN)
1300 {
1303 scan->rs_strategy,
1304 scan->rs_base.rs_rd,
1307 scan,
1308 sizeof(TBMIterateResult));
1309 }
1310
1311
1312 return (TableScanDesc) scan;
1313}
1314
1315void
1317 bool allow_strat, bool allow_sync, bool allow_pagemode)
1318{
1320
1321 if (set_params)
1322 {
1323 if (allow_strat)
1325 else
1327
1328 if (allow_sync)
1330 else
1332
1333 if (allow_pagemode && scan->rs_base.rs_snapshot &&
1336 else
1338 }
1339
1340 /*
1341 * unpin scan buffers
1342 */
1343 if (BufferIsValid(scan->rs_cbuf))
1344 {
1345 ReleaseBuffer(scan->rs_cbuf);
1346 scan->rs_cbuf = InvalidBuffer;
1347 }
1348
1349 /*
1350 * SO_TYPE_BITMAPSCAN would be cleaned up here, but it does not hold any
1351 * additional data vs a normal HeapScan
1352 */
1353
1354 /*
1355 * The read stream is reset on rescan. This must be done before
1356 * initscan(), as some state referred to by read_stream_reset() is reset
1357 * in initscan().
1358 */
1359 if (scan->rs_read_stream)
1361
1362 /*
1363 * reinitialize scan descriptor
1364 */
1365 initscan(scan, key, true);
1366}
1367
1368void
1370{
1372
1373 /* Note: no locking manipulations needed */
1374
1375 /*
1376 * unpin scan buffers
1377 */
1378 if (BufferIsValid(scan->rs_cbuf))
1379 ReleaseBuffer(scan->rs_cbuf);
1380
1381 /*
1382 * Must free the read stream before freeing the BufferAccessStrategy.
1383 */
1384 if (scan->rs_read_stream)
1386
1387 /*
1388 * decrement relation reference count and free scan descriptor storage
1389 */
1391
1392 if (scan->rs_base.rs_key)
1393 pfree(scan->rs_base.rs_key);
1394
1395 if (scan->rs_strategy != NULL)
1397
1398 if (scan->rs_parallelworkerdata != NULL)
1400
1401 if (scan->rs_base.rs_flags & SO_TEMP_SNAPSHOT)
1403
1404 pfree(scan);
1405}
1406
1409{
1411
1412 /*
1413 * This is still widely used directly, without going through table AM, so
1414 * add a safety check. It's possible we should, at a later point,
1415 * downgrade this to an assert. The reason for checking the AM routine,
1416 * rather than the AM oid, is that this allows writing regression tests
1417 * that create another AM reusing the heap handler.
1418 */
1419 if (unlikely(sscan->rs_rd->rd_tableam != GetHeapamTableAmRoutine()))
1420 ereport(ERROR,
1422 errmsg_internal("only heap AM is supported")));
1423
1424 /* Note: no locking manipulations needed */
1425
1427 heapgettup_pagemode(scan, direction,
1428 scan->rs_base.rs_nkeys, scan->rs_base.rs_key);
1429 else
1430 heapgettup(scan, direction,
1431 scan->rs_base.rs_nkeys, scan->rs_base.rs_key);
1432
1433 if (scan->rs_ctup.t_data == NULL)
1434 return NULL;
1435
1436 /*
1437 * if we get here it means we have a new current scan tuple, so point to
1438 * the proper return buffer and return the tuple.
1439 */
1440
1442
1443 return &scan->rs_ctup;
1444}
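
A self-contained usage sketch, not taken from PostgreSQL itself: count_heap_tuples() is a hypothetical helper that shows the begin/next/end cycle around heap_getnext(), assuming relid names a plain heap table and a snapshot is already active (as in normal query execution). New code should normally go through the tableam.h wrappers rather than calling these functions directly.

#include "postgres.h"

#include "access/heapam.h"
#include "access/table.h"
#include "access/tableam.h"
#include "utils/snapmgr.h"

static uint64
count_heap_tuples(Oid relid)
{
    Relation    rel = table_open(relid, AccessShareLock);
    Snapshot    snapshot = GetActiveSnapshot();
    TableScanDesc scan;
    uint64      ntuples = 0;

    scan = heap_beginscan(rel, snapshot, 0, NULL, NULL,
                          SO_TYPE_SEQSCAN | SO_ALLOW_STRAT |
                          SO_ALLOW_SYNC | SO_ALLOW_PAGEMODE);

    while (heap_getnext(scan, ForwardScanDirection) != NULL)
        ntuples++;

    heap_endscan(scan);
    table_close(rel, AccessShareLock);

    return ntuples;
}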
1445
1446bool
1448{
1450
1451 /* Note: no locking manipulations needed */
1452
1453 if (sscan->rs_flags & SO_ALLOW_PAGEMODE)
1454 heapgettup_pagemode(scan, direction, sscan->rs_nkeys, sscan->rs_key);
1455 else
1456 heapgettup(scan, direction, sscan->rs_nkeys, sscan->rs_key);
1457
1458 if (scan->rs_ctup.t_data == NULL)
1459 {
1460 ExecClearTuple(slot);
1461 return false;
1462 }
1463
1464 /*
1465 * if we get here it means we have a new current scan tuple, so point to
1466 * the proper return buffer and return the tuple.
1467 */
1468
1470
1471 ExecStoreBufferHeapTuple(&scan->rs_ctup, slot,
1472 scan->rs_cbuf);
1473 return true;
1474}
1475
1476void
1479{
1485
1486 /*
1487 * For relations without any pages, we can simply leave the TID range
1488 * unset. There will be no tuples to scan, therefore no tuples outside
1489 * the given TID range.
1490 */
1491 if (scan->rs_nblocks == 0)
1492 return;
1493
1494 /*
1495 * Set up some ItemPointers which point to the first and last possible
1496 * tuples in the heap.
1497 */
1500
1501 /*
1502 * If the given maximum TID is below the highest possible TID in the
1503 * relation, then restrict the range to that, otherwise we scan to the end
1504 * of the relation.
1505 */
1508
1509 /*
1510 * If the given minimum TID is above the lowest possible TID in the
1511 * relation, then restrict the range to only scan for TIDs above that.
1512 */
1515
1516 /*
1517 * Check for an empty range and protect against would-be negative results
1518 * from the numBlks calculation below.
1519 */
1521 {
1522 /* Set an empty range of blocks to scan */
1524 return;
1525 }
1526
1527 /*
1528 * Calculate the first block and the number of blocks we must scan. We
1529 * could be more aggressive here and perform some more validation to try
1530 * and further narrow the scope of blocks to scan by checking if the
1531 * lowestItem has an offset above MaxOffsetNumber. In this case, we could
1532 * advance startBlk by one. Likewise, if highestItem has an offset of 0
1533 * we could scan one fewer blocks. However, such an optimization does not
1534 * seem worth troubling over, currently.
1535 */
1537
1540
1541 /* Set the start block and number of blocks to scan */
1543
1544 /* Finally, set the TID range in sscan */
1545 ItemPointerCopy(&lowestItem, &sscan->st.tidrange.rs_mintid);
1546 ItemPointerCopy(&highestItem, &sscan->st.tidrange.rs_maxtid);
1547}
1548
1549bool
1551 TupleTableSlot *slot)
1552{
1554 ItemPointer mintid = &sscan->st.tidrange.rs_mintid;
1555 ItemPointer maxtid = &sscan->st.tidrange.rs_maxtid;
1556
1557 /* Note: no locking manipulations needed */
1558 for (;;)
1559 {
1560 if (sscan->rs_flags & SO_ALLOW_PAGEMODE)
1561 heapgettup_pagemode(scan, direction, sscan->rs_nkeys, sscan->rs_key);
1562 else
1563 heapgettup(scan, direction, sscan->rs_nkeys, sscan->rs_key);
1564
1565 if (scan->rs_ctup.t_data == NULL)
1566 {
1567 ExecClearTuple(slot);
1568 return false;
1569 }
1570
1571 /*
1572 * heap_set_tidrange will have used heap_setscanlimits to limit the
1573 * range of pages we scan to only ones that can contain the TID range
1574 * we're scanning for. Here we must filter out any tuples from these
1575 * pages that are outside of that range.
1576 */
1577 if (ItemPointerCompare(&scan->rs_ctup.t_self, mintid) < 0)
1578 {
1579 ExecClearTuple(slot);
1580
1581 /*
1582 * When scanning backwards, the TIDs will be in descending order.
1583 * Future tuples in this direction will be lower still, so we can
1584 * just return false to indicate there will be no more tuples.
1585 */
1586 if (ScanDirectionIsBackward(direction))
1587 return false;
1588
1589 continue;
1590 }
1591
1592 /*
1593 * Likewise for the final page, we must filter out TIDs greater than
1594 * maxtid.
1595 */
1596 if (ItemPointerCompare(&scan->rs_ctup.t_self, maxtid) > 0)
1597 {
1598 ExecClearTuple(slot);
1599
1600 /*
1601 * When scanning forward, the TIDs will be in ascending order.
1602 * Future tuples in this direction will be higher still, so we can
1603 * just return false to indicate there will be no more tuples.
1604 */
1605 if (ScanDirectionIsForward(direction))
1606 return false;
1607 continue;
1608 }
1609
1610 break;
1611 }
1612
1613 /*
1614 * if we get here it means we have a new current scan tuple, so point to
1615 * the proper return buffer and return the tuple.
1616 */
1618
1619 ExecStoreBufferHeapTuple(&scan->rs_ctup, slot, scan->rs_cbuf);
1620 return true;
1621}
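
A hypothetical sketch of using the TID-range pair above; rel and snapshot are assumed to be set up as in the earlier example, and the TIDs are arbitrary. The flags deliberately omit SO_ALLOW_SYNC, since heap_set_tidrange() relies on heap_setscanlimits().

ItemPointerData mintid;
ItemPointerData maxtid;
TupleTableSlot *slot = table_slot_create(rel, NULL);
TableScanDesc scan = heap_beginscan(rel, snapshot, 0, NULL, NULL,
                                    SO_TYPE_TIDRANGESCAN | SO_ALLOW_PAGEMODE);

ItemPointerSet(&mintid, 0, 1);      /* block 0, offset 1 */
ItemPointerSet(&maxtid, 3, 42);     /* block 3, offset 42 */
heap_set_tidrange(scan, &mintid, &maxtid);

while (heap_getnextslot_tidrange(scan, ForwardScanDirection, slot))
{
    /* every returned slot satisfies mintid <= t_self <= maxtid */
}

heap_endscan(scan);
ExecDropSingleTupleTableSlot(slot);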
1622
1623/*
1624 * heap_fetch - retrieve tuple with given tid
1625 *
1626 * On entry, tuple->t_self is the TID to fetch. We pin the buffer holding
1627 * the tuple, fill in the remaining fields of *tuple, and check the tuple
1628 * against the specified snapshot.
1629 *
1630 * If successful (tuple found and passes snapshot time qual), then *userbuf
1631 * is set to the buffer holding the tuple and true is returned. The caller
1632 * must unpin the buffer when done with the tuple.
1633 *
1634 * If the tuple is not found (ie, item number references a deleted slot),
1635 * then tuple->t_data is set to NULL, *userbuf is set to InvalidBuffer,
1636 * and false is returned.
1637 *
1638 * If the tuple is found but fails the time qual check, then the behavior
1639 * depends on the keep_buf parameter. If keep_buf is false, the results
1640 * are the same as for the tuple-not-found case. If keep_buf is true,
1641 * then tuple->t_data and *userbuf are returned as for the success case,
1642 * and again the caller must unpin the buffer; but false is returned.
1643 *
1644 * heap_fetch does not follow HOT chains: only the exact TID requested will
1645 * be fetched.
1646 *
1647 * It is somewhat inconsistent that we ereport() on invalid block number but
1648 * return false on invalid item number. There are a couple of reasons though.
1649 * One is that the caller can relatively easily check the block number for
1650 * validity, but cannot check the item number without reading the page
1651 * himself. Another is that when we are following a t_ctid link, we can be
1652 * reasonably confident that the page number is valid (since VACUUM shouldn't
1653 * truncate off the destination page without having killed the referencing
1654 * tuple first), but the item number might well not be good.
1655 */
1656bool
1657heap_fetch(Relation relation,
1658 Snapshot snapshot,
1659 HeapTuple tuple,
1660 Buffer *userbuf,
1661 bool keep_buf)
1662{
1663 ItemPointer tid = &(tuple->t_self);
1664 ItemId lp;
1665 Buffer buffer;
1666 Page page;
1667 OffsetNumber offnum;
1668 bool valid;
1669
1670 /*
1671 * Fetch and pin the appropriate page of the relation.
1672 */
1673 buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid));
1674
1675 /*
1676 * Need share lock on buffer to examine tuple commit status.
1677 */
1679 page = BufferGetPage(buffer);
1680
1681 /*
1682 * We'd better check for out-of-range offnum in case of VACUUM since the
1683 * TID was obtained.
1684 */
1685 offnum = ItemPointerGetOffsetNumber(tid);
1687 {
1689 ReleaseBuffer(buffer);
1691 tuple->t_data = NULL;
1692 return false;
1693 }
1694
1695 /*
1696 * get the item line pointer corresponding to the requested tid
1697 */
1698 lp = PageGetItemId(page, offnum);
1699
1700 /*
1701 * Must check for deleted tuple.
1702 */
1703 if (!ItemIdIsNormal(lp))
1704 {
1706 ReleaseBuffer(buffer);
1708 tuple->t_data = NULL;
1709 return false;
1710 }
1711
1712 /*
1713 * fill in *tuple fields
1714 */
1715 tuple->t_data = (HeapTupleHeader) PageGetItem(page, lp);
1716 tuple->t_len = ItemIdGetLength(lp);
1717 tuple->t_tableOid = RelationGetRelid(relation);
1718
1719 /*
1720 * check tuple visibility, then release lock
1721 */
1722 valid = HeapTupleSatisfiesVisibility(tuple, snapshot, buffer);
1723
1724 if (valid)
1725 PredicateLockTID(relation, &(tuple->t_self), snapshot,
1727
1728 HeapCheckForSerializableConflictOut(valid, relation, tuple, buffer, snapshot);
1729
1731
1732 if (valid)
1733 {
1734 /*
1735 * All checks passed, so return the tuple as valid. Caller is now
1736 * responsible for releasing the buffer.
1737 */
1738 *userbuf = buffer;
1739
1740 return true;
1741 }
1742
1743 /* Tuple failed time qual, but maybe caller wants to see it anyway. */
1744 if (keep_buf)
1745 *userbuf = buffer;
1746 else
1747 {
1748 ReleaseBuffer(buffer);
1750 tuple->t_data = NULL;
1751 }
1752
1753 return false;
1754}
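
A hedged caller-side sketch of the contract described above; rel, snapshot, blkno and offnum are assumed to exist, and keep_buf is false, so no pin survives a failed fetch.

HeapTupleData tuple;
Buffer      buf;

ItemPointerSet(&tuple.t_self, blkno, offnum);

if (heap_fetch(rel, snapshot, &tuple, &buf, false))
{
    /* tuple.t_data points into the pinned buffer; use it ... */

    ReleaseBuffer(buf);         /* ... then drop the pin ourselves */
}
else
{
    /*
     * Not found, or failed the snapshot test: buf is InvalidBuffer,
     * tuple.t_data is NULL, and there is nothing to release.
     */
}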
1755
1756/*
1757 * heap_hot_search_buffer - search HOT chain for tuple satisfying snapshot
1758 *
1759 * On entry, *tid is the TID of a tuple (either a simple tuple, or the root
1760 * of a HOT chain), and buffer is the buffer holding this tuple. We search
1761 * for the first chain member satisfying the given snapshot. If one is
1762 * found, we update *tid to reference that tuple's offset number, and
1763 * return true. If no match, return false without modifying *tid.
1764 *
1765 * heapTuple is a caller-supplied buffer. When a match is found, we return
1766 * the tuple here, in addition to updating *tid. If no match is found, the
1767 * contents of this buffer on return are undefined.
1768 *
1769 * If all_dead is not NULL, we check non-visible tuples to see if they are
1770 * globally dead; *all_dead is set true if all members of the HOT chain
1771 * are vacuumable, false if not.
1772 *
1773 * Unlike heap_fetch, the caller must already have pin and (at least) share
1774 * lock on the buffer; it is still pinned/locked at exit.
1775 */
1776bool
1778 Snapshot snapshot, HeapTuple heapTuple,
1779 bool *all_dead, bool first_call)
1780{
1781 Page page = BufferGetPage(buffer);
1783 BlockNumber blkno;
1784 OffsetNumber offnum;
1785 bool at_chain_start;
1786 bool valid;
1787 bool skip;
1788 GlobalVisState *vistest = NULL;
1789
1790 /* If this is not the first call, previous call returned a (live!) tuple */
1791 if (all_dead)
1793
1794 blkno = ItemPointerGetBlockNumber(tid);
1795 offnum = ItemPointerGetOffsetNumber(tid);
1797 skip = !first_call;
1798
1799 /* XXX: we should assert that a snapshot is pushed or registered */
1801 Assert(BufferGetBlockNumber(buffer) == blkno);
1802
1803 /* Scan through possible multiple members of HOT-chain */
1804 for (;;)
1805 {
1806 ItemId lp;
1807
1808 /* check for bogus TID */
1810 break;
1811
1812 lp = PageGetItemId(page, offnum);
1813
1814 /* check for unused, dead, or redirected items */
1815 if (!ItemIdIsNormal(lp))
1816 {
1817 /* We should only see a redirect at start of chain */
1819 {
1820 /* Follow the redirect */
1821 offnum = ItemIdGetRedirect(lp);
1822 at_chain_start = false;
1823 continue;
1824 }
1825 /* else must be end of chain */
1826 break;
1827 }
1828
1829 /*
1830 * Update heapTuple to point to the element of the HOT chain we're
1831 * currently investigating. Having t_self set correctly is important
1832 * because the SSI checks and the *Satisfies routine for historical
1833 * MVCC snapshots need the correct tid to decide about the visibility.
1834 */
1835 heapTuple->t_data = (HeapTupleHeader) PageGetItem(page, lp);
1836 heapTuple->t_len = ItemIdGetLength(lp);
1837 heapTuple->t_tableOid = RelationGetRelid(relation);
1838 ItemPointerSet(&heapTuple->t_self, blkno, offnum);
1839
1840 /*
1841 * Shouldn't see a HEAP_ONLY tuple at chain start.
1842 */
1844 break;
1845
1846 /*
1847 * The xmin should match the previous xmax value, else chain is
1848 * broken.
1849 */
1853 break;
1854
1855 /*
1856 * When first_call is true (and thus, skip is initially false) we'll
1857 * return the first tuple we find. But on later passes, heapTuple
1858 * will initially be pointing to the tuple we returned last time.
1859 * Returning it again would be incorrect (and would loop forever), so
1860 * we skip it and return the next match we find.
1861 */
1862 if (!skip)
1863 {
1864 /* If it's visible per the snapshot, we must return it */
1865 valid = HeapTupleSatisfiesVisibility(heapTuple, snapshot, buffer);
1867 buffer, snapshot);
1868
1869 if (valid)
1870 {
1871 ItemPointerSetOffsetNumber(tid, offnum);
1872 PredicateLockTID(relation, &heapTuple->t_self, snapshot,
1874 if (all_dead)
1875 *all_dead = false;
1876 return true;
1877 }
1878 }
1879 skip = false;
1880
1881 /*
1882 * If we can't see it, maybe no one else can either. At caller
1883 * request, check whether all chain members are dead to all
1884 * transactions.
1885 *
1886 * Note: if you change the criterion here for what is "dead", fix the
1887 * planner's get_actual_variable_range() function to match.
1888 */
1889 if (all_dead && *all_dead)
1890 {
1891 if (!vistest)
1892 vistest = GlobalVisTestFor(relation);
1893
1894 if (!HeapTupleIsSurelyDead(heapTuple, vistest))
1895 *all_dead = false;
1896 }
1897
1898 /*
1899 * Check to see if HOT chain continues past this tuple; if so fetch
1900 * the next offnum and loop around.
1901 */
1903 {
1904 Assert(ItemPointerGetBlockNumber(&heapTuple->t_data->t_ctid) ==
1905 blkno);
1906 offnum = ItemPointerGetOffsetNumber(&heapTuple->t_data->t_ctid);
1907 at_chain_start = false;
1909 }
1910 else
1911 break; /* end of chain */
1912 }
1913
1914 return false;
1915}
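
In contrast to heap_fetch(), the caller owns the pin and the content lock here. A hypothetical sketch, with rel, snapshot and tid assumed; tid should reference the root of a HOT chain.

Buffer      buf = ReadBuffer(rel, ItemPointerGetBlockNumber(&tid));
HeapTupleData heapTuple;
bool        all_dead = true;
bool        found;

LockBuffer(buf, BUFFER_LOCK_SHARE);
found = heap_hot_search_buffer(&tid, rel, buf, snapshot, &heapTuple,
                               &all_dead, true);
if (found)
{
    /*
     * tid now points at the visible chain member, and heapTuple.t_data
     * points into the still pinned and locked buffer, so use it here.
     */
}
LockBuffer(buf, BUFFER_LOCK_UNLOCK);
ReleaseBuffer(buf);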
1916
1917/*
1918 * heap_get_latest_tid - get the latest tid of a specified tuple
1919 *
1920 * Actually, this gets the latest version that is visible according to the
1921 * scan's snapshot. Create a scan using SnapshotDirty to get the very latest,
1922 * possibly uncommitted version.
1923 *
1924 * *tid is both an input and an output parameter: it is updated to
1925 * show the latest version of the row. Note that it will not be changed
1926 * if no version of the row passes the snapshot test.
1927 */
1928void
1929heap_get_latest_tid(TableScanDesc sscan,
1930 ItemPointer tid)
1931{
1932 Relation relation = sscan->rs_rd;
1933 Snapshot snapshot = sscan->rs_snapshot;
1934 ItemPointerData ctid;
1935 TransactionId priorXmax;
1936
1937 /*
1938 * table_tuple_get_latest_tid() verified that the passed in tid is valid.
1939 * Assume that t_ctid links are valid however - there shouldn't be invalid
1940 * ones in the table.
1941 */
1942 Assert(ItemPointerIsValid(tid));
1943
1944 /*
1945 * Loop to chase down t_ctid links. At top of loop, ctid is the tuple we
1946 * need to examine, and *tid is the TID we will return if ctid turns out
1947 * to be bogus.
1948 *
1949 * Note that we will loop until we reach the end of the t_ctid chain.
1950 * Depending on the snapshot passed, there might be at most one visible
1951 * version of the row, but we don't try to optimize for that.
1952 */
1953 ctid = *tid;
1954 priorXmax = InvalidTransactionId; /* cannot check first XMIN */
1955 for (;;)
1956 {
1957 Buffer buffer;
1958 Page page;
1959 OffsetNumber offnum;
1960 ItemId lp;
1961 HeapTupleData tp;
1962 bool valid;
1963
1964 /*
1965 * Read, pin, and lock the page.
1966 */
1967 buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(&ctid));
1968 LockBuffer(buffer, BUFFER_LOCK_SHARE);
1969 page = BufferGetPage(buffer);
1970
1971 /*
1972 * Check for bogus item number. This is not treated as an error
1973 * condition because it can happen while following a t_ctid link. We
1974 * just assume that the prior tid is OK and return it unchanged.
1975 */
1976 offnum = ItemPointerGetOffsetNumber(&ctid);
1977 if (offnum < FirstOffsetNumber || offnum > PageGetMaxOffsetNumber(page))
1978 {
1979 UnlockReleaseBuffer(buffer);
1980 break;
1981 }
1982 lp = PageGetItemId(page, offnum);
1983 if (!ItemIdIsNormal(lp))
1984 {
1985 UnlockReleaseBuffer(buffer);
1986 break;
1987 }
1988
1989 /* OK to access the tuple */
1990 tp.t_self = ctid;
1991 tp.t_data = (HeapTupleHeader) PageGetItem(page, lp);
1992 tp.t_len = ItemIdGetLength(lp);
1993 tp.t_tableOid = RelationGetRelid(relation);
1994
1995 /*
1996 * After following a t_ctid link, we might arrive at an unrelated
1997 * tuple. Check for XMIN match.
1998 */
1999 if (TransactionIdIsValid(priorXmax) &&
2000 !TransactionIdEquals(priorXmax, HeapTupleHeaderGetXmin(tp.t_data)))
2001 {
2002 UnlockReleaseBuffer(buffer);
2003 break;
2004 }
2005
2006 /*
2007 * Check tuple visibility; if visible, set it as the new result
2008 * candidate.
2009 */
2010 valid = HeapTupleSatisfiesVisibility(&tp, snapshot, buffer);
2011 HeapCheckForSerializableConflictOut(valid, relation, &tp, buffer, snapshot);
2012 if (valid)
2013 *tid = ctid;
2014
2015 /*
2016 * If there's a valid t_ctid link, follow it, else we're done.
2017 */
2018 if ((tp.t_data->t_infomask & HEAP_XMAX_INVALID) ||
2019 HeapTupleHeaderIsOnlyLocked(tp.t_data) ||
2020 HeapTupleHeaderIndicatesMovedPartitions(tp.t_data) ||
2021 ItemPointerEquals(&tp.t_self, &tp.t_data->t_ctid))
2022 {
2023 UnlockReleaseBuffer(buffer);
2024 break;
2025 }
2026
2027 ctid = tp.t_data->t_ctid;
2028 priorXmax = HeapTupleHeaderGetUpdateXid(tp.t_data);
2029 UnlockReleaseBuffer(buffer);
2030 } /* end of loop */
2031}
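/*
 * Editorial sketch (not part of heapam.c): the usual way to reach the code
 * above is through the table AM wrapper, as done for example by the currtid
 * functions in utils/adt/tid.c.  The starting tid value is elided.
 *
 *	Snapshot	snapshot = RegisterSnapshot(GetLatestSnapshot());
 *	TableScanDesc scan = table_beginscan_tid(rel, snapshot);
 *
 *	table_tuple_get_latest_tid(scan, &tid);	-- dispatches to heap_get_latest_tid
 *	table_endscan(scan);
 *	UnregisterSnapshot(snapshot);
 *
 * As the header comment notes, using a dirty snapshot instead (see
 * InitDirtySnapshot()) would chase the chain to the very latest, possibly
 * uncommitted, version.
 */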
2032
2033
2034/*
2035 * UpdateXmaxHintBits - update tuple hint bits after xmax transaction ends
2036 *
2037 * This is called after we have waited for the XMAX transaction to terminate.
2038 * If the transaction aborted, we guarantee the XMAX_INVALID hint bit will
2039 * be set on exit. If the transaction committed, we set the XMAX_COMMITTED
2040 * hint bit if possible --- but beware that that may not yet be possible,
2041 * if the transaction committed asynchronously.
2042 *
2043 * Note that if the transaction was a locker only, we set HEAP_XMAX_INVALID
2044 * even if it commits.
2045 *
2046 * Hence callers should look only at XMAX_INVALID.
2047 *
2048 * Note this is not allowed for tuples whose xmax is a multixact.
2049 */
2050static void
2051UpdateXmaxHintBits(HeapTupleHeader tuple, Buffer buffer, TransactionId xid)
2052{
2053 Assert(TransactionIdEquals(HeapTupleHeaderGetRawXmax(tuple), xid));
2054 Assert(!(tuple->t_infomask & HEAP_XMAX_IS_MULTI));
2055
2056 if (!(tuple->t_infomask & (HEAP_XMAX_COMMITTED | HEAP_XMAX_INVALID)))
2057 {
2058 if (!HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask) &&
2059 TransactionIdDidCommit(xid))
2060 HeapTupleSetHintBits(tuple, buffer, HEAP_XMAX_COMMITTED,
2061 xid);
2062 else
2063 HeapTupleSetHintBits(tuple, buffer, HEAP_XMAX_INVALID,
2064 InvalidTransactionId);
2065 }
2066}
2067
2068
2069/*
2070 * GetBulkInsertState - prepare status object for a bulk insert
2071 */
2072BulkInsertState
2073GetBulkInsertState(void)
2074{
2075 BulkInsertState bistate;
2076
2077 bistate = (BulkInsertState) palloc(sizeof(BulkInsertStateData));
2078 bistate->strategy = GetAccessStrategy(BAS_BULKWRITE);
2079 bistate->current_buf = InvalidBuffer;
2080 bistate->next_free = InvalidBlockNumber;
2081 bistate->last_free = InvalidBlockNumber;
2082 bistate->already_extended_by = 0;
2083 return bistate;
2084}
2085
2086/*
2087 * FreeBulkInsertState - clean up after finishing a bulk insert
2088 */
2089void
2091{
2092 if (bistate->current_buf != InvalidBuffer)
2093 ReleaseBuffer(bistate->current_buf);
2094 FreeAccessStrategy(bistate->strategy);
2095 pfree(bistate);
2096}
2097
2098/*
2099 * ReleaseBulkInsertStatePin - release a buffer currently held in bistate
2100 */
2101void
2103{
2104 if (bistate->current_buf != InvalidBuffer)
2105 ReleaseBuffer(bistate->current_buf);
2106 bistate->current_buf = InvalidBuffer;
2107
2108 /*
2109 * Despite the name, we also reset bulk relation extension state.
2110 * Otherwise we can end up erroring out due to looking for free space in
2111 * ->next_free of one partition, even though ->next_free was set when
2112 * extending another partition. It could obviously also be bad for
2113 * efficiency to look at existing blocks at offsets from another
2114 * partition, even if we don't error out.
2115 */
2116 bistate->next_free = InvalidBlockNumber;
2117 bistate->last_free = InvalidBlockNumber;
2118}
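/*
 * Editorial sketch (not part of heapam.c): typical lifecycle of the bulk
 * insert state managed by the three functions above.  The loop body is
 * hypothetical; real users such as COPY go through the table AM layer.
 *
 *	BulkInsertState bistate = GetBulkInsertState();
 *
 *	for (int i = 0; i < ntuples; i++)
 *		heap_insert(rel, tuples[i], GetCurrentCommandId(true),
 *					0, bistate);
 *
 *	FreeBulkInsertState(bistate);		-- releases the kept pin and strategy
 *
 * ReleaseBulkInsertStatePin() is for the narrower case where the caller must
 * drop the currently pinned buffer (for instance when switching partitions)
 * but wants to keep using the same BulkInsertState afterwards.
 */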
2119
2120
2121/*
2122 * heap_insert - insert tuple into a heap
2123 *
2124 * The new tuple is stamped with current transaction ID and the specified
2125 * command ID.
2126 *
2127 * See table_tuple_insert for comments about most of the input flags, except
2128 * that this routine directly takes a tuple rather than a slot.
2129 *
2130 * There's corresponding HEAP_INSERT_ options to all the TABLE_INSERT_
2131 * options, and there additionally is HEAP_INSERT_SPECULATIVE which is used to
2132 * implement table_tuple_insert_speculative().
2133 *
2134 * On return the header fields of *tup are updated to match the stored tuple;
2135 * in particular tup->t_self receives the actual TID where the tuple was
2136 * stored. But note that any toasting of fields within the tuple data is NOT
2137 * reflected into *tup.
2138 */
2139void
2140heap_insert(Relation relation, HeapTuple tup, CommandId cid,
2141 int options, BulkInsertState bistate)
2142{
2143 TransactionId xid = GetCurrentTransactionId();
2144 HeapTuple heaptup;
2145 Buffer buffer;
2146 Buffer vmbuffer = InvalidBuffer;
2147 bool all_visible_cleared = false;
2148
2149 /* Cheap, simplistic check that the tuple matches the rel's rowtype. */
2152
2153 AssertHasSnapshotForToast(relation);
2154
2155 /*
2156 * Fill in tuple header fields and toast the tuple if necessary.
2157 *
2158 * Note: below this point, heaptup is the data we actually intend to store
2159 * into the relation; tup is the caller's original untoasted data.
2160 */
2161 heaptup = heap_prepare_insert(relation, tup, xid, cid, options);
2162
2163 /*
2164 * Find buffer to insert this tuple into. If the page is all visible,
2165 * this will also pin the requisite visibility map page.
2166 */
2167 buffer = RelationGetBufferForTuple(relation, heaptup->t_len,
2168 InvalidBuffer, options, bistate,
2169 &vmbuffer, NULL,
2170 0);
2171
2172 /*
2173 * We're about to do the actual insert -- but check for conflict first, to
2174 * avoid possibly having to roll back work we've just done.
2175 *
2176 * This is safe without a recheck as long as there is no possibility of
2177 * another process scanning the page between this check and the insert
2178 * being visible to the scan (i.e., an exclusive buffer content lock is
2179 * continuously held from this point until the tuple insert is visible).
2180 *
2181 * For a heap insert, we only need to check for table-level SSI locks. Our
2182 * new tuple can't possibly conflict with existing tuple locks, and heap
2183 * page locks are only consolidated versions of tuple locks; they do not
2184 * lock "gaps" as index page locks do. So we don't need to specify a
2185 * buffer when making the call, which makes for a faster check.
2186 */
2188
2189 /* NO EREPORT(ERROR) from here till changes are logged */
2191
2192 RelationPutHeapTuple(relation, buffer, heaptup,
2194
2195 if (PageIsAllVisible(BufferGetPage(buffer)))
2196 {
2197 all_visible_cleared = true;
2199 visibilitymap_clear(relation,
2201 vmbuffer, VISIBILITYMAP_VALID_BITS);
2202 }
2203
2204 /*
2205 * XXX Should we set PageSetPrunable on this page ?
2206 *
2207 * The inserting transaction may eventually abort thus making this tuple
2208 * DEAD and hence available for pruning. Though we don't want to optimize
2209 * for aborts, if no other tuple in this page is UPDATEd/DELETEd, the
2210 * aborted tuple will never be pruned until next vacuum is triggered.
2211 *
2212 * If you do add PageSetPrunable here, add it in heap_xlog_insert too.
2213 */
2214
2215 MarkBufferDirty(buffer);
2216
2217 /* XLOG stuff */
2218 if (RelationNeedsWAL(relation))
2219 {
2223 Page page = BufferGetPage(buffer);
2224 uint8 info = XLOG_HEAP_INSERT;
2225 int bufflags = 0;
2226
2227 /*
2228 * If this is a catalog, we need to transmit combo CIDs to properly
2229 * decode, so log that as well.
2230 */
2232 log_heap_new_cid(relation, heaptup);
2233
2234 /*
2235 * If this is the single and first tuple on page, we can reinit the
2236 * page instead of restoring the whole thing. Set flag, and hide
2237 * buffer references from XLogInsert.
2238 */
2241 {
2242 info |= XLOG_HEAP_INIT_PAGE;
2244 }
2245
2246 xlrec.offnum = ItemPointerGetOffsetNumber(&heaptup->t_self);
2247 xlrec.flags = 0;
2253
2254 /*
2255 * For logical decoding, we need the tuple even if we're doing a full
2256 * page write, so make sure it's included even if we take a full-page
2257 * image. (XXX We could alternatively store a pointer into the FPW).
2258 */
2259 if (RelationIsLogicallyLogged(relation) &&
2261 {
2264
2265 if (IsToastRelation(relation))
2267 }
2268
2271
2272 xlhdr.t_infomask2 = heaptup->t_data->t_infomask2;
2273 xlhdr.t_infomask = heaptup->t_data->t_infomask;
2274 xlhdr.t_hoff = heaptup->t_data->t_hoff;
2275
2276 /*
2277 * note we mark xlhdr as belonging to buffer; if XLogInsert decides to
2278 * write the whole page to the xlog, we don't need to store
2279 * xl_heap_header in the xlog.
2280 */
2283 /* PG73FORMAT: write bitmap [+ padding] [+ oid] + data */
2285 (char *) heaptup->t_data + SizeofHeapTupleHeader,
2287
2288 /* filtering by origin on a row level is much more efficient */
2290
2291 recptr = XLogInsert(RM_HEAP_ID, info);
2292
2293 PageSetLSN(page, recptr);
2294 }
2295
2297
2298 UnlockReleaseBuffer(buffer);
2299 if (vmbuffer != InvalidBuffer)
2300 ReleaseBuffer(vmbuffer);
2301
2302 /*
2303 * If tuple is cacheable, mark it for invalidation from the caches in case
2304 * we abort. Note it is OK to do this after releasing the buffer, because
2305 * the heaptup data structure is all in local memory, not in the shared
2306 * buffer.
2307 */
2309
2310 /* Note: speculative insertions are counted too, even if aborted later */
2311 pgstat_count_heap_insert(relation, 1);
2312
2313 /*
2314 * If heaptup is a private copy, release it. Don't forget to copy t_self
2315 * back to the caller's image, too.
2316 */
2317 if (heaptup != tup)
2318 {
2319 tup->t_self = heaptup->t_self;
2321 }
2322}
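/*
 * Editorial sketch (not part of heapam.c): a condensed, illustrative view of
 * the WAL registration sequence in the block above.  Placeholder names such
 * as tuple_data/tuple_data_len stand in for the actual buffers, and the
 * exact register calls made by heap_insert() differ in detail.
 *
 *	XLogBeginInsert();
 *	XLogRegisterData(&xlrec, SizeOfHeapInsert);		-- fixed-size record part
 *	XLogRegisterBuffer(0, buffer, REGBUF_STANDARD | bufflags);
 *	XLogRegisterBufData(0, &xlhdr, SizeOfHeapHeader);	-- tuple header copy
 *	XLogRegisterBufData(0, tuple_data, tuple_data_len);	-- bitmap [+ padding] + data
 *	XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN);
 *	recptr = XLogInsert(RM_HEAP_ID, info);
 *	PageSetLSN(page, recptr);
 */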
2323
2324/*
2325 * Subroutine for heap_insert(). Prepares a tuple for insertion. This sets the
2326 * tuple header fields and toasts the tuple if necessary. Returns a toasted
2327 * version of the tuple if it was toasted, or the original tuple if not. Note
2328 * that in any case, the header fields are also set in the original tuple.
2329 */
2330static HeapTuple
2331heap_prepare_insert(Relation relation, HeapTuple tup, TransactionId xid,
2332 CommandId cid, int options)
2333{
2334 /*
2335 * To allow parallel inserts, we need to ensure that they are safe to be
2336 * performed in workers. We have the infrastructure to allow parallel
2337 * inserts in general except for the cases where inserts generate a new
2338 * CommandId (eg. inserts into a table having a foreign key column).
2339 */
2340 if (IsParallelWorker())
2341 ereport(ERROR,
2343 errmsg("cannot insert tuples in a parallel worker")));
2344
2345 tup->t_data->t_infomask &= ~(HEAP_XACT_MASK);
2346 tup->t_data->t_infomask2 &= ~(HEAP2_XACT_MASK);
2347 tup->t_data->t_infomask |= HEAP_XMAX_INVALID;
2348 HeapTupleHeaderSetXmin(tup->t_data, xid);
2351
2352 HeapTupleHeaderSetCmin(tup->t_data, cid);
2353 HeapTupleHeaderSetXmax(tup->t_data, 0); /* for cleanliness */
2354 tup->t_tableOid = RelationGetRelid(relation);
2355
2356 /*
2357 * If the new tuple is too big for storage or contains already toasted
2358 * out-of-line attributes from some other relation, invoke the toaster.
2359 */
2360 if (relation->rd_rel->relkind != RELKIND_RELATION &&
2361 relation->rd_rel->relkind != RELKIND_MATVIEW)
2362 {
2363 /* toast table entries should never be recursively toasted */
2365 return tup;
2366 }
2367 else if (HeapTupleHasExternal(tup) || tup->t_len > TOAST_TUPLE_THRESHOLD)
2368 return heap_toast_insert_or_update(relation, tup, NULL, options);
2369 else
2370 return tup;
2371}
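/*
 * Editorial note (not part of heapam.c): as an illustration of the decision
 * above, on a plain heap relation a row whose formed tuple is, say, 3 kB
 * exceeds TOAST_TUPLE_THRESHOLD (roughly 2 kB with the default 8 kB block
 * size) and is handed to heap_toast_insert_or_update(); a 100-byte row, or
 * any row in a toast table itself, is returned unchanged.
 */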
2372
2373/*
2374 * Helper for heap_multi_insert() that computes the number of entire pages
2375 * that inserting the remaining heaptuples requires. Used to determine how
2376 * much the relation needs to be extended by.
2377 */
2378static int
2379heap_multi_insert_pages(HeapTuple *heaptuples, int done, int ntuples, Size saveFreeSpace)
2380{
2381 size_t page_avail = BLCKSZ - SizeOfPageHeaderData - saveFreeSpace;
2382 int npages = 1;
2383
2384 for (int i = done; i < ntuples; i++)
2385 {
2386 size_t tup_sz = sizeof(ItemIdData) + MAXALIGN(heaptuples[i]->t_len);
2387
2388 if (page_avail < tup_sz)
2389 {
2390 npages++;
2391 page_avail = BLCKSZ - SizeOfPageHeaderData - saveFreeSpace;
2392 }
2393 page_avail -= tup_sz;
2394 }
2395
2396 return npages;
2397}
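/*
 * Editorial note (not part of heapam.c): a worked example of the computation
 * above.  With 8 kB blocks, a 24-byte page header and saveFreeSpace = 0,
 * page_avail starts at 8168.  Three remaining tuples that each need 2800
 * bytes (line pointer plus MAXALIGN'd data) consume 2800 + 2800 on the first
 * page; the third no longer fits, so npages is bumped to 2 and page_avail is
 * reset.  The result (2), minus npages_used, is then handed to
 * RelationGetBufferForTuple() as the extension hint in heap_multi_insert().
 */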
2398
2399/*
2400 * heap_multi_insert - insert multiple tuples into a heap
2401 *
2402 * This is like heap_insert(), but inserts multiple tuples in one operation.
2403 * That's faster than calling heap_insert() in a loop, because when multiple
2404 * tuples can be inserted on a single page, we can write just a single WAL
2405 * record covering all of them, and only need to lock/unlock the page once.
2406 *
2407 * Note: this leaks memory into the current memory context. You can create a
2408 * temporary context before calling this, if that's a problem.
2409 */
2410void
2411heap_multi_insert(Relation relation, TupleTableSlot **slots, int ntuples,
2412 CommandId cid, int options, BulkInsertState bistate)
2413{
2416 int i;
2417 int ndone;
2419 Page page;
2420 Buffer vmbuffer = InvalidBuffer;
2421 bool needwal;
2425 bool starting_with_empty_page = false;
2426 int npages = 0;
2427 int npages_used = 0;
2428
2429 /* currently not needed (thus unsupported) for heap_multi_insert() */
2431
2432 AssertHasSnapshotForToast(relation);
2433
2434 needwal = RelationNeedsWAL(relation);
2437
2438 /* Toast and set header data in all the slots */
2439 heaptuples = palloc(ntuples * sizeof(HeapTuple));
2440 for (i = 0; i < ntuples; i++)
2441 {
2442 HeapTuple tuple;
2443
2444 tuple = ExecFetchSlotHeapTuple(slots[i], true, NULL);
2445 slots[i]->tts_tableOid = RelationGetRelid(relation);
2446 tuple->t_tableOid = slots[i]->tts_tableOid;
2447 heaptuples[i] = heap_prepare_insert(relation, tuple, xid, cid,
2448 options);
2449 }
2450
2451 /*
2452 * We're about to do the actual inserts -- but check for conflict first,
2453 * to minimize the possibility of having to roll back work we've just
2454 * done.
2455 *
2456 * A check here does not definitively prevent a serialization anomaly;
2457 * that check MUST be done at least past the point of acquiring an
2458 * exclusive buffer content lock on every buffer that will be affected,
2459 * and MAY be done after all inserts are reflected in the buffers and
2460 * those locks are released; otherwise there is a race condition. Since
2461 * multiple buffers can be locked and unlocked in the loop below, and it
2462 * would not be feasible to identify and lock all of those buffers before
2463 * the loop, we must do a final check at the end.
2464 *
2465 * The check here could be omitted with no loss of correctness; it is
2466 * present strictly as an optimization.
2467 *
2468 * For heap inserts, we only need to check for table-level SSI locks. Our
2469 * new tuples can't possibly conflict with existing tuple locks, and heap
2470 * page locks are only consolidated versions of tuple locks; they do not
2471 * lock "gaps" as index page locks do. So we don't need to specify a
2472 * buffer when making the call, which makes for a faster check.
2473 */
2475
2476 ndone = 0;
2477 while (ndone < ntuples)
2478 {
2479 Buffer buffer;
2480 bool all_visible_cleared = false;
2481 bool all_frozen_set = false;
2482 int nthispage;
2483
2485
2486 /*
2487 * Compute number of pages needed to fit the to-be-inserted tuples in
2488 * the worst case. This will be used to determine how much to extend
2489 * the relation by in RelationGetBufferForTuple(), if needed. If we
2490 * filled a prior page from scratch, we can just update our last
2491 * computation, but if we started with a partially filled page,
2492 * recompute from scratch, the number of potentially required pages
2493 * can vary due to tuples needing to fit onto the page, page headers
2494 * etc.
2495 */
2496 if (ndone == 0 || !starting_with_empty_page)
2497 {
2498 npages = heap_multi_insert_pages(heaptuples, ndone, ntuples,
2500 npages_used = 0;
2501 }
2502 else
2503 npages_used++;
2504
2505 /*
2506 * Find buffer where at least the next tuple will fit. If the page is
2507 * all-visible, this will also pin the requisite visibility map page.
2508 *
2509 * Also pin visibility map page if COPY FREEZE inserts tuples into an
2510 * empty page. See all_frozen_set below.
2511 */
2512 buffer = RelationGetBufferForTuple(relation, heaptuples[ndone]->t_len,
2513 InvalidBuffer, options, bistate,
2514 &vmbuffer, NULL,
2515 npages - npages_used);
2516 page = BufferGetPage(buffer);
2517
2519
2521 {
2522 all_frozen_set = true;
2523 /* Lock the vmbuffer before entering the critical section */
2525 }
2526
2527 /* NO EREPORT(ERROR) from here till changes are logged */
2529
2530 /*
2531 * RelationGetBufferForTuple has ensured that the first tuple fits.
2532 * Put that on the page, and then as many other tuples as fit.
2533 */
2534 RelationPutHeapTuple(relation, buffer, heaptuples[ndone], false);
2535
2536 /*
2537 * For logical decoding we need combo CIDs to properly decode the
2538 * catalog.
2539 */
2540 if (needwal && need_cids)
2541 log_heap_new_cid(relation, heaptuples[ndone]);
2542
2543 for (nthispage = 1; ndone + nthispage < ntuples; nthispage++)
2544 {
2546
2547 if (PageGetHeapFreeSpace(page) < MAXALIGN(heaptup->t_len) + saveFreeSpace)
2548 break;
2549
2550 RelationPutHeapTuple(relation, buffer, heaptup, false);
2551
2552 /*
2553 * For logical decoding we need combo CIDs to properly decode the
2554 * catalog.
2555 */
2556 if (needwal && need_cids)
2557 log_heap_new_cid(relation, heaptup);
2558 }
2559
2560 /*
2561 * If the page is all visible, need to clear that, unless we're only
2562 * going to add further frozen rows to it.
2563 *
2564 * If we're only adding already frozen rows to a previously empty
2565 * page, mark it as all-frozen and update the visibility map. We're
2566 * already holding a pin on the vmbuffer.
2567 */
2569 {
2570 all_visible_cleared = true;
2571 PageClearAllVisible(page);
2572 visibilitymap_clear(relation,
2573 BufferGetBlockNumber(buffer),
2574 vmbuffer, VISIBILITYMAP_VALID_BITS);
2575 }
2576 else if (all_frozen_set)
2577 {
2578 PageSetAllVisible(page);
2580 vmbuffer,
2583 relation->rd_locator);
2584 }
2585
2586 /*
2587 * XXX Should we set PageSetPrunable on this page ? See heap_insert()
2588 */
2589
2590 MarkBufferDirty(buffer);
2591
2592 /* XLOG stuff */
2593 if (needwal)
2594 {
2598 char *tupledata;
2599 int totaldatalen;
2600 char *scratchptr = scratch.data;
2601 bool init;
2602 int bufflags = 0;
2603
2604 /*
2605 * If the page was previously empty, we can reinit the page
2606 * instead of restoring the whole thing.
2607 */
2609
2610 /* allocate xl_heap_multi_insert struct from the scratch area */
2613
2614 /*
2615 * Allocate offsets array. Unless we're reinitializing the page,
2616 * in that case the tuples are stored in order starting at
2617 * FirstOffsetNumber and we don't need to store the offsets
2618 * explicitly.
2619 */
2620 if (!init)
2621 scratchptr += nthispage * sizeof(OffsetNumber);
2622
2623 /* the rest of the scratch space is used for tuple data */
2624 tupledata = scratchptr;
2625
2626 /* check that the mutually exclusive flags are not both set */
2628
2629 xlrec->flags = 0;
2632
2633 /*
2634 * We don't have to worry about including a conflict xid in the
2635 * WAL record, as HEAP_INSERT_FROZEN intentionally violates
2636 * visibility rules.
2637 */
2638 if (all_frozen_set)
2640
2641 xlrec->ntuples = nthispage;
2642
2643 /*
2644 * Write out an xl_multi_insert_tuple and the tuple data itself
2645 * for each tuple.
2646 */
2647 for (i = 0; i < nthispage; i++)
2648 {
2650 xl_multi_insert_tuple *tuphdr;
2651 int datalen;
2652
2653 if (!init)
2654 xlrec->offsets[i] = ItemPointerGetOffsetNumber(&heaptup->t_self);
2655 /* xl_multi_insert_tuple needs two-byte alignment. */
2657 scratchptr = ((char *) tuphdr) + SizeOfMultiInsertTuple;
2658
2659 tuphdr->t_infomask2 = heaptup->t_data->t_infomask2;
2660 tuphdr->t_infomask = heaptup->t_data->t_infomask;
2661 tuphdr->t_hoff = heaptup->t_data->t_hoff;
2662
2663 /* write bitmap [+ padding] [+ oid] + data */
2664 datalen = heaptup->t_len - SizeofHeapTupleHeader;
2666 (char *) heaptup->t_data + SizeofHeapTupleHeader,
2667 datalen);
2668 tuphdr->datalen = datalen;
2669 scratchptr += datalen;
2670 }
2671 totaldatalen = scratchptr - tupledata;
2672 Assert((scratchptr - scratch.data) < BLCKSZ);
2673
2674 if (need_tuple_data)
2676
2677 /*
2678 * Signal that this is the last xl_heap_multi_insert record
2679 * emitted by this call to heap_multi_insert(). Needed for logical
2680 * decoding so it knows when to cleanup temporary data.
2681 */
2682 if (ndone + nthispage == ntuples)
2684
2685 if (init)
2686 {
2687 info |= XLOG_HEAP_INIT_PAGE;
2689 }
2690
2691 /*
2692 * If we're doing logical decoding, include the new tuple data
2693 * even if we take a full-page image of the page.
2694 */
2695 if (need_tuple_data)
2697
2699 XLogRegisterData(xlrec, tupledata - scratch.data);
2701 if (all_frozen_set)
2702 XLogRegisterBuffer(1, vmbuffer, 0);
2703
2704 XLogRegisterBufData(0, tupledata, totaldatalen);
2705
2706 /* filtering by origin on a row level is much more efficient */
2708
2709 recptr = XLogInsert(RM_HEAP2_ID, info);
2710
2711 PageSetLSN(page, recptr);
2712 if (all_frozen_set)
2713 {
2714 Assert(BufferIsDirty(vmbuffer));
2715 PageSetLSN(BufferGetPage(vmbuffer), recptr);
2716 }
2717 }
2718
2720
2721 if (all_frozen_set)
2722 LockBuffer(vmbuffer, BUFFER_LOCK_UNLOCK);
2723
2724 UnlockReleaseBuffer(buffer);
2725 ndone += nthispage;
2726
2727 /*
2728 * NB: Only release vmbuffer after inserting all tuples - it's fairly
2729 * likely that we'll insert into subsequent heap pages that are likely
2730 * to use the same vm page.
2731 */
2732 }
2733
2734 /* We're done with inserting all tuples, so release the last vmbuffer. */
2735 if (vmbuffer != InvalidBuffer)
2736 ReleaseBuffer(vmbuffer);
2737
2738 /*
2739 * We're done with the actual inserts. Check for conflicts again, to
2740 * ensure that all rw-conflicts in to these inserts are detected. Without
2741 * this final check, a sequential scan of the heap may have locked the
2742 * table after the "before" check, missing one opportunity to detect the
2743 * conflict, and then scanned the table before the new tuples were there,
2744 * missing the other chance to detect the conflict.
2745 *
2746 * For heap inserts, we only need to check for table-level SSI locks. Our
2747 * new tuples can't possibly conflict with existing tuple locks, and heap
2748 * page locks are only consolidated versions of tuple locks; they do not
2749 * lock "gaps" as index page locks do. So we don't need to specify a
2750 * buffer when making the call.
2751 */
2753
2754 /*
2755 * If tuples are cacheable, mark them for invalidation from the caches in
2756 * case we abort. Note it is OK to do this after releasing the buffer,
2757 * because the heaptuples data structure is all in local memory, not in
2758 * the shared buffer.
2759 */
2760 if (IsCatalogRelation(relation))
2761 {
2762 for (i = 0; i < ntuples; i++)
2764 }
2765
2766 /* copy t_self fields back to the caller's slots */
2767 for (i = 0; i < ntuples; i++)
2768 slots[i]->tts_tid = heaptuples[i]->t_self;
2769
2770 pgstat_count_heap_insert(relation, ntuples);
2771}
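/*
 * Editorial sketch (not part of heapam.c): a hypothetical caller of
 * heap_multi_insert() above.  In-tree callers such as COPY FROM reach it
 * through table_multi_insert(); slot construction is elided here.
 *
 *	TupleTableSlot **slots;			-- nslots slots filled by the caller
 *	BulkInsertState bistate = GetBulkInsertState();
 *
 *	heap_multi_insert(rel, slots, nslots,
 *					  GetCurrentCommandId(true), 0, bistate);
 *	FreeBulkInsertState(bistate);
 *
 * After the call, each slots[i]->tts_tid holds the TID at which the i'th
 * tuple was stored.
 */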
2772
2773/*
2774 * simple_heap_insert - insert a tuple
2775 *
2776 * Currently, this routine differs from heap_insert only in supplying
2777 * a default command ID and not allowing access to the speedup options.
2778 *
2779 * This should be used rather than using heap_insert directly in most places
2780 * where we are modifying system catalogs.
2781 */
2782void
2783simple_heap_insert(Relation relation, HeapTuple tup)
2784{
2785 heap_insert(relation, tup, GetCurrentCommandId(true), 0, NULL);
2786}
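/*
 * Editorial sketch (not part of heapam.c): a hypothetical catalog insertion
 * built on simple_heap_insert() above.  In-tree catalog code normally calls
 * CatalogTupleInsert() (see catalog/indexing.c), which performs this insert
 * and also maintains the catalog's indexes.
 *
 *	HeapTuple	tup = heap_form_tuple(RelationGetDescr(catrel), values, nulls);
 *
 *	simple_heap_insert(catrel, tup);
 *	-- tup->t_self now contains the assigned TID; index entries still have
 *	-- to be created separately.
 *	heap_freetuple(tup);
 */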
2787
2788/*
2789 * Given infomask/infomask2, compute the bits that must be saved in the
2790 * "infobits" field of xl_heap_delete, xl_heap_update, xl_heap_lock,
2791 * xl_heap_lock_updated WAL records.
2792 *
2793 * See fix_infomask_from_infobits.
2794 */
2795static uint8
2796compute_infobits(uint16 infomask, uint16 infomask2)
2797{
2798 return
2799 ((infomask & HEAP_XMAX_IS_MULTI) != 0 ? XLHL_XMAX_IS_MULTI : 0) |
2800 ((infomask & HEAP_XMAX_LOCK_ONLY) != 0 ? XLHL_XMAX_LOCK_ONLY : 0) |
2801 ((infomask & HEAP_XMAX_EXCL_LOCK) != 0 ? XLHL_XMAX_EXCL_LOCK : 0) |
2802 /* note we ignore HEAP_XMAX_SHR_LOCK here */
2803 ((infomask & HEAP_XMAX_KEYSHR_LOCK) != 0 ? XLHL_XMAX_KEYSHR_LOCK : 0) |
2804 ((infomask2 & HEAP_KEYS_UPDATED) != 0 ?
2805 XLHL_KEYS_UPDATED : 0);
2806}
2807
2808/*
2809 * Given two versions of the same t_infomask for a tuple, compare them and
2810 * return whether the relevant status for a tuple Xmax has changed. This is
2811 * used after a buffer lock has been released and reacquired: we want to ensure
2812 * that the tuple state continues to be the same it was when we previously
2813 * examined it.
2814 *
2815 * Note the Xmax field itself must be compared separately.
2816 */
2817static inline bool
2818xmax_infomask_changed(uint16 new_infomask, uint16 old_infomask)
2819{
2820 const uint16 interesting =
2821 HEAP_XMAX_IS_MULTI | HEAP_XMAX_LOCK_ONLY | HEAP_LOCK_MASK | HEAP_XMAX_INVALID;
2822
2823 if ((new_infomask & interesting) != (old_infomask & interesting))
2824 return true;
2825
2826 return false;
2827}
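/*
 * Editorial sketch (not part of heapam.c): the recheck pattern in which
 * xmax_infomask_changed() is used by heap_delete()/heap_update() below,
 * shown in isolation.  Variable names mirror those functions.
 *
 *	-- before releasing the buffer lock
 *	xwait = HeapTupleHeaderGetRawXmax(tp.t_data);
 *	infomask = tp.t_data->t_infomask;
 *	LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
 *	... sleep on the conflicting locker ...
 *	LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
 *
 *	-- after re-acquiring it, start over if the tuple's xmax state moved
 *	if (xmax_infomask_changed(tp.t_data->t_infomask, infomask) ||
 *		!TransactionIdEquals(HeapTupleHeaderGetRawXmax(tp.t_data), xwait))
 *		goto l1;			-- heap_update uses its l2 label analogously
 */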
2828
2829/*
2830 * heap_delete - delete a tuple
2831 *
2832 * See table_tuple_delete() for an explanation of the parameters, except that
2833 * this routine directly takes a tuple rather than a slot.
2834 *
2835 * In the failure cases, the routine fills *tmfd with the tuple's t_ctid,
2836 * t_xmax (resolving a possible MultiXact, if necessary), and t_cmax (the last
2837 * only for TM_SelfModified, since we cannot obtain cmax from a combo CID
2838 * generated by another transaction).
2839 */
2840TM_Result
2841heap_delete(Relation relation, const ItemPointerData *tid,
2842 CommandId cid, Snapshot crosscheck, bool wait,
2843 TM_FailureData *tmfd, bool changingPart)
2844{
2845 TM_Result result;
2846 TransactionId xid = GetCurrentTransactionId();
2847 ItemId lp;
2848 HeapTupleData tp;
2849 Page page;
2850 BlockNumber block;
2851 Buffer buffer;
2852 Buffer vmbuffer = InvalidBuffer;
2853 TransactionId new_xmax;
2856 bool have_tuple_lock = false;
2857 bool iscombo;
2858 bool all_visible_cleared = false;
2859 HeapTuple old_key_tuple = NULL; /* replica identity of the tuple */
2860 bool old_key_copied = false;
2861
2863
2864 AssertHasSnapshotForToast(relation);
2865
2866 /*
2867 * Forbid this during a parallel operation, lest it allocate a combo CID.
2868 * Other workers might need that combo CID for visibility checks, and we
2869 * have no provision for broadcasting it to them.
2870 */
2871 if (IsInParallelMode())
2872 ereport(ERROR,
2874 errmsg("cannot delete tuples during a parallel operation")));
2875
2876 block = ItemPointerGetBlockNumber(tid);
2877 buffer = ReadBuffer(relation, block);
2878 page = BufferGetPage(buffer);
2879
2880 /*
2881 * Before locking the buffer, pin the visibility map page if it appears to
2882 * be necessary. Since we haven't got the lock yet, someone else might be
2883 * in the middle of changing this, so we'll need to recheck after we have
2884 * the lock.
2885 */
2886 if (PageIsAllVisible(page))
2887 visibilitymap_pin(relation, block, &vmbuffer);
2888
2889 LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
2890
2891 lp = PageGetItemId(page, ItemPointerGetOffsetNumber(tid));
2892 Assert(ItemIdIsNormal(lp));
2893
2894 tp.t_tableOid = RelationGetRelid(relation);
2895 tp.t_data = (HeapTupleHeader) PageGetItem(page, lp);
2896 tp.t_len = ItemIdGetLength(lp);
2897 tp.t_self = *tid;
2898
2899l1:
2900
2901 /*
2902 * If we didn't pin the visibility map page and the page has become all
2903 * visible while we were busy locking the buffer, we'll have to unlock and
2904 * re-lock, to avoid holding the buffer lock across an I/O. That's a bit
2905 * unfortunate, but hopefully shouldn't happen often.
2906 */
2907 if (vmbuffer == InvalidBuffer && PageIsAllVisible(page))
2908 {
2910 visibilitymap_pin(relation, block, &vmbuffer);
2912 }
2913
2914 result = HeapTupleSatisfiesUpdate(&tp, cid, buffer);
2915
2916 if (result == TM_Invisible)
2917 {
2918 UnlockReleaseBuffer(buffer);
2919 ereport(ERROR,
2921 errmsg("attempted to delete invisible tuple")));
2922 }
2923 else if (result == TM_BeingModified && wait)
2924 {
2927
2928 /* must copy state data before unlocking buffer */
2931
2932 /*
2933 * Sleep until concurrent transaction ends -- except when there's a
2934 * single locker and it's our own transaction. Note we don't care
2935 * which lock mode the locker has, because we need the strongest one.
2936 *
2937 * Before sleeping, we need to acquire tuple lock to establish our
2938 * priority for the tuple (see heap_lock_tuple). LockTuple will
2939 * release us when we are next-in-line for the tuple.
2940 *
2941 * If we are forced to "start over" below, we keep the tuple lock;
2942 * this arranges that we stay at the head of the line while rechecking
2943 * tuple state.
2944 */
2946 {
2947 bool current_is_member = false;
2948
2951 {
2953
2954 /*
2955 * Acquire the lock, if necessary (but skip it when we're
2956 * requesting a lock and already have one; avoids deadlock).
2957 */
2958 if (!current_is_member)
2961
2962 /* wait for multixact */
2964 relation, &(tp.t_self), XLTW_Delete,
2965 NULL);
2967
2968 /*
2969 * If xwait had just locked the tuple then some other xact
2970 * could update this tuple before we get to this point. Check
2971 * for xmax change, and start over if so.
2972 *
2973 * We also must start over if we didn't pin the VM page, and
2974 * the page has become all visible.
2975 */
2976 if ((vmbuffer == InvalidBuffer && PageIsAllVisible(page)) ||
2979 xwait))
2980 goto l1;
2981 }
2982
2983 /*
2984 * You might think the multixact is necessarily done here, but not
2985 * so: it could have surviving members, namely our own xact or
2986 * other subxacts of this backend. It is legal for us to delete
2987 * the tuple in either case, however (the latter case is
2988 * essentially a situation of upgrading our former shared lock to
2989 * exclusive). We don't bother changing the on-disk hint bits
2990 * since we are about to overwrite the xmax altogether.
2991 */
2992 }
2994 {
2995 /*
2996 * Wait for regular transaction to end; but first, acquire tuple
2997 * lock.
2998 */
3002 XactLockTableWait(xwait, relation, &(tp.t_self), XLTW_Delete);
3004
3005 /*
3006 * xwait is done, but if xwait had just locked the tuple then some
3007 * other xact could update this tuple before we get to this point.
3008 * Check for xmax change, and start over if so.
3009 *
3010 * We also must start over if we didn't pin the VM page, and the
3011 * page has become all visible.
3012 */
3013 if ((vmbuffer == InvalidBuffer && PageIsAllVisible(page)) ||
3016 xwait))
3017 goto l1;
3018
3019 /* Otherwise check if it committed or aborted */
3020 UpdateXmaxHintBits(tp.t_data, buffer, xwait);
3021 }
3022
3023 /*
3024 * We may overwrite if previous xmax aborted, or if it committed but
3025 * only locked the tuple without updating it.
3026 */
3027 if ((tp.t_data->t_infomask & HEAP_XMAX_INVALID) ||
3030 result = TM_Ok;
3031 else if (!ItemPointerEquals(&tp.t_self, &tp.t_data->t_ctid))
3032 result = TM_Updated;
3033 else
3034 result = TM_Deleted;
3035 }
3036
3037 /* sanity check the result HeapTupleSatisfiesUpdate() and the logic above */
3038 if (result != TM_Ok)
3039 {
3040 Assert(result == TM_SelfModified ||
3041 result == TM_Updated ||
3042 result == TM_Deleted ||
3043 result == TM_BeingModified);
3045 Assert(result != TM_Updated ||
3047 }
3048
3049 if (crosscheck != InvalidSnapshot && result == TM_Ok)
3050 {
3051 /* Perform additional check for transaction-snapshot mode RI updates */
3052 if (!HeapTupleSatisfiesVisibility(&tp, crosscheck, buffer))
3053 result = TM_Updated;
3054 }
3055
3056 if (result != TM_Ok)
3057 {
3058 tmfd->ctid = tp.t_data->t_ctid;
3060 if (result == TM_SelfModified)
3062 else
3063 tmfd->cmax = InvalidCommandId;
3064 UnlockReleaseBuffer(buffer);
3065 if (have_tuple_lock)
3067 if (vmbuffer != InvalidBuffer)
3068 ReleaseBuffer(vmbuffer);
3069 return result;
3070 }
3071
3072 /*
3073 * We're about to do the actual delete -- check for conflict first, to
3074 * avoid possibly having to roll back work we've just done.
3075 *
3076 * This is safe without a recheck as long as there is no possibility of
3077 * another process scanning the page between this check and the delete
3078 * being visible to the scan (i.e., an exclusive buffer content lock is
3079 * continuously held from this point until the tuple delete is visible).
3080 */
3082
3083 /* replace cid with a combo CID if necessary */
3085
3086 /*
3087 * Compute replica identity tuple before entering the critical section so
3088 * we don't PANIC upon a memory allocation failure.
3089 */
3090 old_key_tuple = ExtractReplicaIdentity(relation, &tp, true, &old_key_copied);
3091
3092 /*
3093 * If this is the first possibly-multixact-able operation in the current
3094 * transaction, set my per-backend OldestMemberMXactId setting. We can be
3095 * certain that the transaction will never become a member of any older
3096 * MultiXactIds than that. (We have to do this even if we end up just
3097 * using our own TransactionId below, since some other backend could
3098 * incorporate our XID into a MultiXact immediately afterwards.)
3099 */
3101
3104 xid, LockTupleExclusive, true,
3105 &new_xmax, &new_infomask, &new_infomask2);
3106
3108
3109 /*
3110 * If this transaction commits, the tuple will become DEAD sooner or
3111 * later. Set flag that this page is a candidate for pruning once our xid
3112 * falls below the OldestXmin horizon. If the transaction finally aborts,
3113 * the subsequent page pruning will be a no-op and the hint will be
3114 * cleared.
3115 */
3116 PageSetPrunable(page, xid);
3117
3118 if (PageIsAllVisible(page))
3119 {
3120 all_visible_cleared = true;
3121 PageClearAllVisible(page);
3122 visibilitymap_clear(relation, BufferGetBlockNumber(buffer),
3123 vmbuffer, VISIBILITYMAP_VALID_BITS);
3124 }
3125
3126 /* store transaction information of xact deleting the tuple */
3132 HeapTupleHeaderSetXmax(tp.t_data, new_xmax);
3134 /* Make sure there is no forward chain link in t_ctid */
3135 tp.t_data->t_ctid = tp.t_self;
3136
3137 /* Signal that this is actually a move into another partition */
3138 if (changingPart)
3140
3141 MarkBufferDirty(buffer);
3142
3143 /*
3144 * XLOG stuff
3145 *
3146 * NB: heap_abort_speculative() uses the same xlog record and replay
3147 * routines.
3148 */
3149 if (RelationNeedsWAL(relation))
3150 {
3154
3155 /*
3156 * For logical decode we need combo CIDs to properly decode the
3157 * catalog
3158 */
3160 log_heap_new_cid(relation, &tp);
3161
3162 xlrec.flags = 0;
3165 if (changingPart)
3167 xlrec.infobits_set = compute_infobits(tp.t_data->t_infomask,
3168 tp.t_data->t_infomask2);
3170 xlrec.xmax = new_xmax;
3171
3172 if (old_key_tuple != NULL)
3173 {
3174 if (relation->rd_rel->relreplident == REPLICA_IDENTITY_FULL)
3176 else
3178 }
3179
3182
3184
3185 /*
3186 * Log replica identity of the deleted tuple if there is one
3187 */
3188 if (old_key_tuple != NULL)
3189 {
3190 xlhdr.t_infomask2 = old_key_tuple->t_data->t_infomask2;
3191 xlhdr.t_infomask = old_key_tuple->t_data->t_infomask;
3192 xlhdr.t_hoff = old_key_tuple->t_data->t_hoff;
3193
3195 XLogRegisterData((char *) old_key_tuple->t_data
3197 old_key_tuple->t_len
3199 }
3200
3201 /* filtering by origin on a row level is much more efficient */
3203
3205
3206 PageSetLSN(page, recptr);
3207 }
3208
3210
3212
3213 if (vmbuffer != InvalidBuffer)
3214 ReleaseBuffer(vmbuffer);
3215
3216 /*
3217 * If the tuple has toasted out-of-line attributes, we need to delete
3218 * those items too. We have to do this before releasing the buffer
3219 * because we need to look at the contents of the tuple, but it's OK to
3220 * release the content lock on the buffer first.
3221 */
3222 if (relation->rd_rel->relkind != RELKIND_RELATION &&
3223 relation->rd_rel->relkind != RELKIND_MATVIEW)
3224 {
3225 /* toast table entries should never be recursively toasted */
3227 }
3228 else if (HeapTupleHasExternal(&tp))
3229 heap_toast_delete(relation, &tp, false);
3230
3231 /*
3232 * Mark tuple for invalidation from system caches at next command
3233 * boundary. We have to do this before releasing the buffer because we
3234 * need to look at the contents of the tuple.
3235 */
3236 CacheInvalidateHeapTuple(relation, &tp, NULL);
3237
3238 /* Now we can release the buffer */
3239 ReleaseBuffer(buffer);
3240
3241 /*
3242 * Release the lmgr tuple lock, if we had it.
3243 */
3244 if (have_tuple_lock)
3246
3247 pgstat_count_heap_delete(relation);
3248
3251
3252 return TM_Ok;
3253}
3254
3255/*
3256 * simple_heap_delete - delete a tuple
3257 *
3258 * This routine may be used to delete a tuple when concurrent updates of
3259 * the target tuple are not expected (for example, because we have a lock
3260 * on the relation associated with the tuple). Any failure is reported
3261 * via ereport().
3262 */
3263void
3264simple_heap_delete(Relation relation, const ItemPointerData *tid)
3265{
3266 TM_Result result;
3267 TM_FailureData tmfd;
3268
3269 result = heap_delete(relation, tid,
3270 GetCurrentCommandId(true), InvalidSnapshot,
3271 true /* wait for commit */ ,
3272 &tmfd, false /* changingPart */ );
3273 switch (result)
3274 {
3275 case TM_SelfModified:
3276 /* Tuple was already updated in current command? */
3277 elog(ERROR, "tuple already updated by self");
3278 break;
3279
3280 case TM_Ok:
3281 /* done successfully */
3282 break;
3283
3284 case TM_Updated:
3285 elog(ERROR, "tuple concurrently updated");
3286 break;
3287
3288 case TM_Deleted:
3289 elog(ERROR, "tuple concurrently deleted");
3290 break;
3291
3292 default:
3293 elog(ERROR, "unrecognized heap_delete status: %u", result);
3294 break;
3295 }
3296}
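/*
 * Editorial sketch (not part of heapam.c): a hypothetical catalog deletion
 * built on simple_heap_delete() above.  In-tree code normally goes through
 * CatalogTupleDelete() (catalog/indexing.c), a thin wrapper around this
 * function.
 *
 *	HeapTuple	tup = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(reloid));
 *
 *	if (!HeapTupleIsValid(tup))
 *		elog(ERROR, "cache lookup failed for relation %u", reloid);
 *	simple_heap_delete(catrel, &tup->t_self);
 *	heap_freetuple(tup);
 */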
3297
3298/*
3299 * heap_update - replace a tuple
3300 *
3301 * See table_tuple_update() for an explanation of the parameters, except that
3302 * this routine directly takes a tuple rather than a slot.
3303 *
3304 * In the failure cases, the routine fills *tmfd with the tuple's t_ctid,
3305 * t_xmax (resolving a possible MultiXact, if necessary), and t_cmax (the last
3306 * only for TM_SelfModified, since we cannot obtain cmax from a combo CID
3307 * generated by another transaction).
3308 */
3309TM_Result
3310heap_update(Relation relation, const ItemPointerData *otid, HeapTuple newtup,
3311 CommandId cid, Snapshot crosscheck, bool wait,
3312 TM_FailureData *tmfd, LockTupleMode *lockmode,
3313 TU_UpdateIndexes *update_indexes)
3314{
3315 TM_Result result;
3323 ItemId lp;
3327 bool old_key_copied = false;
3328 Page page;
3329 BlockNumber block;
3331 Buffer buffer,
3332 newbuf,
3333 vmbuffer = InvalidBuffer,
3335 bool need_toast;
3337 pagefree;
3338 bool have_tuple_lock = false;
3339 bool iscombo;
3340 bool use_hot_update = false;
3341 bool summarized_update = false;
3342 bool key_intact;
3343 bool all_visible_cleared = false;
3344 bool all_visible_cleared_new = false;
3345 bool checked_lockers;
3346 bool locker_remains;
3347 bool id_has_external = false;
3354
3356
3357 /* Cheap, simplistic check that the tuple matches the rel's rowtype. */
3360
3361 AssertHasSnapshotForToast(relation);
3362
3363 /*
3364 * Forbid this during a parallel operation, lest it allocate a combo CID.
3365 * Other workers might need that combo CID for visibility checks, and we
3366 * have no provision for broadcasting it to them.
3367 */
3368 if (IsInParallelMode())
3369 ereport(ERROR,
3371 errmsg("cannot update tuples during a parallel operation")));
3372
3373#ifdef USE_ASSERT_CHECKING
3375#endif
3376
3377 /*
3378 * Fetch the list of attributes to be checked for various operations.
3379 *
3380 * For HOT considerations, this is wasted effort if we fail to update or
3381 * have to put the new tuple on a different page. But we must compute the
3382 * list before obtaining buffer lock --- in the worst case, if we are
3383 * doing an update on one of the relevant system catalogs, we could
3384 * deadlock if we try to fetch the list later. In any case, the relcache
3385 * caches the data so this is usually pretty cheap.
3386 *
3387 * We also need columns used by the replica identity and columns that are
3388 * considered the "key" of rows in the table.
3389 *
3390 * Note that we get copies of each bitmap, so we need not worry about
3391 * relcache flush happening midway through.
3392 */
3405
3407 INJECTION_POINT("heap_update-before-pin", NULL);
3408 buffer = ReadBuffer(relation, block);
3409 page = BufferGetPage(buffer);
3410
3411 /*
3412 * Before locking the buffer, pin the visibility map page if it appears to
3413 * be necessary. Since we haven't got the lock yet, someone else might be
3414 * in the middle of changing this, so we'll need to recheck after we have
3415 * the lock.
3416 */
3417 if (PageIsAllVisible(page))
3418 visibilitymap_pin(relation, block, &vmbuffer);
3419
3420 LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3421
3422 lp = PageGetItemId(page, ItemPointerGetOffsetNumber(otid));
3423
3424 /*
3425 * Usually, a buffer pin and/or snapshot blocks pruning of otid, ensuring
3426 * we see LP_NORMAL here. When the otid origin is a syscache, we may have
3427 * neither a pin nor a snapshot. Hence, we may see other LP_ states, each
3428 * of which indicates concurrent pruning.
3429 *
3430 * Failing with TM_Updated would be most accurate. However, unlike other
3431 * TM_Updated scenarios, we don't know the successor ctid in LP_UNUSED and
3432 * LP_DEAD cases. While the distinction between TM_Updated and TM_Deleted
3433 * does matter to SQL statements UPDATE and MERGE, those SQL statements
3434 * hold a snapshot that ensures LP_NORMAL. Hence, the choice between
3435 * TM_Updated and TM_Deleted affects only the wording of error messages.
3436 * Settle on TM_Deleted, for two reasons. First, it avoids complicating
3437 * the specification of when tmfd->ctid is valid. Second, it creates
3438 * error log evidence that we took this branch.
3439 *
3440 * Since it's possible to see LP_UNUSED at otid, it's also possible to see
3441 * LP_NORMAL for a tuple that replaced LP_UNUSED. If it's a tuple for an
3442 * unrelated row, we'll fail with "duplicate key value violates unique".
3443 * XXX if otid is the live, newer version of the newtup row, we'll discard
3444 * changes originating in versions of this catalog row after the version
3445 * the caller got from syscache. See syscache-update-pruned.spec.
3446 */
3447 if (!ItemIdIsNormal(lp))
3448 {
3450
3451 UnlockReleaseBuffer(buffer);
3453 if (vmbuffer != InvalidBuffer)
3454 ReleaseBuffer(vmbuffer);
3455 tmfd->ctid = *otid;
3456 tmfd->xmax = InvalidTransactionId;
3457 tmfd->cmax = InvalidCommandId;
3459
3464 /* modified_attrs not yet initialized */
3466 return TM_Deleted;
3467 }
3468
3469 /*
3470 * Fill in enough data in oldtup for HeapDetermineColumnsInfo to work
3471 * properly.
3472 */
3473 oldtup.t_tableOid = RelationGetRelid(relation);
3474 oldtup.t_data = (HeapTupleHeader) PageGetItem(page, lp);
3475 oldtup.t_len = ItemIdGetLength(lp);
3476 oldtup.t_self = *otid;
3477
3478 /* the new tuple is ready, except for this: */
3479 newtup->t_tableOid = RelationGetRelid(relation);
3480
3481 /*
3482 * Determine columns modified by the update. Additionally, identify
3483 * whether any of the unmodified replica identity key attributes in the
3484 * old tuple is externally stored or not. This is required because for
3485 * such attributes the flattened value won't be WAL logged as part of the
3486 * new tuple so we must include it as part of the old_key_tuple. See
3487 * ExtractReplicaIdentity.
3488 */
3490 id_attrs, &oldtup,
3492
3493 /*
3494 * If we're not updating any "key" column, we can grab a weaker lock type.
3495 * This allows for more concurrency when we are running simultaneously
3496 * with foreign key checks.
3497 *
3498 * Note that if a column gets detoasted while executing the update, but
3499 * the value ends up being the same, this test will fail and we will use
3500 * the stronger lock. This is acceptable; the important case to optimize
3501 * is updates that don't manipulate key columns, not those that
3502 * serendipitously arrive at the same key values.
3503 */
3505 {
3506 *lockmode = LockTupleNoKeyExclusive;
3508 key_intact = true;
3509
3510 /*
3511 * If this is the first possibly-multixact-able operation in the
3512 * current transaction, set my per-backend OldestMemberMXactId
3513 * setting. We can be certain that the transaction will never become a
3514 * member of any older MultiXactIds than that. (We have to do this
3515 * even if we end up just using our own TransactionId below, since
3516 * some other backend could incorporate our XID into a MultiXact
3517 * immediately afterwards.)
3518 */
3520 }
3521 else
3522 {
3523 *lockmode = LockTupleExclusive;
3525 key_intact = false;
3526 }
3527
3528 /*
3529 * Note: beyond this point, use oldtup not otid to refer to old tuple.
3530 * otid may very well point at newtup->t_self, which we will overwrite
3531 * with the new tuple's location, so there's great risk of confusion if we
3532 * use otid anymore.
3533 */
3534
3535l2:
3536 checked_lockers = false;
3537 locker_remains = false;
3538 result = HeapTupleSatisfiesUpdate(&oldtup, cid, buffer);
3539
3540 /* see below about the "no wait" case */
3541 Assert(result != TM_BeingModified || wait);
3542
3543 if (result == TM_Invisible)
3544 {
3545 UnlockReleaseBuffer(buffer);
3546 ereport(ERROR,
3548 errmsg("attempted to update invisible tuple")));
3549 }
3550 else if (result == TM_BeingModified && wait)
3551 {
3554 bool can_continue = false;
3555
3556 /*
3557 * XXX note that we don't consider the "no wait" case here. This
3558 * isn't a problem currently because no caller uses that case, but it
3559 * should be fixed if such a caller is introduced. It wasn't a
3560 * problem previously because this code would always wait, but now
3561 * that some tuple locks do not conflict with one of the lock modes we
3562 * use, it is possible that this case is interesting to handle
3563 * specially.
3564 *
3565 * This may cause failures with third-party code that calls
3566 * heap_update directly.
3567 */
3568
3569 /* must copy state data before unlocking buffer */
3571 infomask = oldtup.t_data->t_infomask;
3572
3573 /*
3574 * Now we have to do something about the existing locker. If it's a
3575 * multi, sleep on it; we might be awakened before it is completely
3576 * gone (or even not sleep at all in some cases); we need to preserve
3577 * it as locker, unless it is gone completely.
3578 *
3579 * If it's not a multi, we need to check for sleeping conditions
3580 * before actually going to sleep. If the update doesn't conflict
3581 * with the locks, we just continue without sleeping (but making sure
3582 * it is preserved).
3583 *
3584 * Before sleeping, we need to acquire tuple lock to establish our
3585 * priority for the tuple (see heap_lock_tuple). LockTuple will
3586 * release us when we are next-in-line for the tuple. Note we must
3587 * not acquire the tuple lock until we're sure we're going to sleep;
3588 * otherwise we're open for race conditions with other transactions
3589 * holding the tuple lock which sleep on us.
3590 *
3591 * If we are forced to "start over" below, we keep the tuple lock;
3592 * this arranges that we stay at the head of the line while rechecking
3593 * tuple state.
3594 */
3596 {
3598 int remain;
3599 bool current_is_member = false;
3600
3602 *lockmode, &current_is_member))
3603 {
3605
3606 /*
3607 * Acquire the lock, if necessary (but skip it when we're
3608 * requesting a lock and already have one; avoids deadlock).
3609 */
3610 if (!current_is_member)
3611 heap_acquire_tuplock(relation, &(oldtup.t_self), *lockmode,
3613
3614 /* wait for multixact */
3616 relation, &oldtup.t_self, XLTW_Update,
3617 &remain);
3618 checked_lockers = true;
3619 locker_remains = remain != 0;
3621
3622 /*
3623 * If xwait had just locked the tuple then some other xact
3624 * could update this tuple before we get to this point. Check
3625 * for xmax change, and start over if so.
3626 */
3627 if (xmax_infomask_changed(oldtup.t_data->t_infomask,
3628 infomask) ||
3630 xwait))
3631 goto l2;
3632 }
3633
3634 /*
3635 * Note that the multixact may not be done by now. It could have
3636 * surviving members; our own xact or other subxacts of this
3637 * backend, and also any other concurrent transaction that locked
3638 * the tuple with LockTupleKeyShare if we only got
3639 * LockTupleNoKeyExclusive. If this is the case, we have to be
3640 * careful to mark the updated tuple with the surviving members in
3641 * Xmax.
3642 *
3643 * Note that there could have been another update in the
3644 * MultiXact. In that case, we need to check whether it committed
3645 * or aborted. If it aborted we are safe to update it again;
3646 * otherwise there is an update conflict, and we have to return
3647 * TableTuple{Deleted, Updated} below.
3648 *
3649 * In the LockTupleExclusive case, we still need to preserve the
3650 * surviving members: those would include the tuple locks we had
3651 * before this one, which are important to keep in case this
3652 * subxact aborts.
3653 */
3654 if (!HEAP_XMAX_IS_LOCKED_ONLY(oldtup.t_data->t_infomask))
3656 else
3658
3659 /*
3660 * There was no UPDATE in the MultiXact; or it aborted. No
3661 * TransactionIdIsInProgress() call needed here, since we called
3662 * MultiXactIdWait() above.
3663 */
3666 can_continue = true;
3667 }
3669 {
3670 /*
3671 * The only locker is ourselves; we can avoid grabbing the tuple
3672 * lock here, but must preserve our locking information.
3673 */
3674 checked_lockers = true;
3675 locker_remains = true;
3676 can_continue = true;
3677 }
3679 {
3680 /*
3681 * If it's just a key-share locker, and we're not changing the key
3682 * columns, we don't need to wait for it to end; but we need to
3683 * preserve it as locker.
3684 */
3685 checked_lockers = true;
3686 locker_remains = true;
3687 can_continue = true;
3688 }
3689 else
3690 {
3691 /*
3692 * Wait for regular transaction to end; but first, acquire tuple
3693 * lock.
3694 */
3696 heap_acquire_tuplock(relation, &(oldtup.t_self), *lockmode,
3698 XactLockTableWait(xwait, relation, &oldtup.t_self,
3699 XLTW_Update);
3700 checked_lockers = true;
3702
3703 /*
3704 * xwait is done, but if xwait had just locked the tuple then some
3705 * other xact could update this tuple before we get to this point.
3706 * Check for xmax change, and start over if so.
3707 */
3708 if (xmax_infomask_changed(oldtup.t_data->t_infomask, infomask) ||
3711 goto l2;
3712
3713 /* Otherwise check if it committed or aborted */
3714 UpdateXmaxHintBits(oldtup.t_data, buffer, xwait);
3715 if (oldtup.t_data->t_infomask & HEAP_XMAX_INVALID)
3716 can_continue = true;
3717 }
3718
3719 if (can_continue)
3720 result = TM_Ok;
3721 else if (!ItemPointerEquals(&oldtup.t_self, &oldtup.t_data->t_ctid))
3722 result = TM_Updated;
3723 else
3724 result = TM_Deleted;
3725 }
3726
3727 /* Sanity check the result HeapTupleSatisfiesUpdate() and the logic above */
3728 if (result != TM_Ok)
3729 {
3730 Assert(result == TM_SelfModified ||
3731 result == TM_Updated ||
3732 result == TM_Deleted ||
3733 result == TM_BeingModified);
3734 Assert(!(oldtup.t_data->t_infomask & HEAP_XMAX_INVALID));
3735 Assert(result != TM_Updated ||
3736 !ItemPointerEquals(&oldtup.t_self, &oldtup.t_data->t_ctid));
3737 }
3738
3739 if (crosscheck != InvalidSnapshot && result == TM_Ok)
3740 {
3741 /* Perform additional check for transaction-snapshot mode RI updates */
3743 result = TM_Updated;
3744 }
3745
3746 if (result != TM_Ok)
3747 {
3748 tmfd->ctid = oldtup.t_data->t_ctid;
3749 tmfd->xmax = HeapTupleHeaderGetUpdateXid(oldtup.t_data);
3750 if (result == TM_SelfModified)
3751 tmfd->cmax = HeapTupleHeaderGetCmax(oldtup.t_data);
3752 else
3753 tmfd->cmax = InvalidCommandId;
3754 UnlockReleaseBuffer(buffer);
3755 if (have_tuple_lock)
3756 UnlockTupleTuplock(relation, &(oldtup.t_self), *lockmode);
3757 if (vmbuffer != InvalidBuffer)
3758 ReleaseBuffer(vmbuffer);
3760
3767 return result;
3768 }
3769
3770 /*
3771 * If we didn't pin the visibility map page and the page has become all
3772 * visible while we were busy locking the buffer, or during some
3773 * subsequent window during which we had it unlocked, we'll have to unlock
3774 * and re-lock, to avoid holding the buffer lock across an I/O. That's a
3775 * bit unfortunate, especially since we'll now have to recheck whether the
3776 * tuple has been locked or updated under us, but hopefully it won't
3777 * happen very often.
3778 */
3779 if (vmbuffer == InvalidBuffer && PageIsAllVisible(page))
3780 {
3782 visibilitymap_pin(relation, block, &vmbuffer);
3784 goto l2;
3785 }
3786
3787 /* Fill in transaction status data */
3788
3789 /*
3790 * If the tuple we're updating is locked, we need to preserve the locking
3791 * info in the old tuple's Xmax. Prepare a new Xmax value for this.
3792 */
3794 oldtup.t_data->t_infomask,
3795 oldtup.t_data->t_infomask2,
3796 xid, *lockmode, true,
3799
3800 /*
3801 * And also prepare an Xmax value for the new copy of the tuple. If there
3802 * was no xmax previously, or there was one but all lockers are now gone,
3803 * then use InvalidTransactionId; otherwise, get the xmax from the old
3804 * tuple. (In rare cases that might also be InvalidTransactionId and yet
3805 * not have the HEAP_XMAX_INVALID bit set; that's fine.)
3806 */
3807 if ((oldtup.t_data->t_infomask & HEAP_XMAX_INVALID) ||
3808 HEAP_LOCKED_UPGRADED(oldtup.t_data->t_infomask) ||
3811 else
3813
3815 {
3818 }
3819 else
3820 {
3821 /*
3822 * If we found a valid Xmax for the new tuple, then the infomask bits
3823 * to use on the new tuple depend on what was there on the old one.
3824 * Note that since we're doing an update, the only possibility is that
3825 * the lockers had FOR KEY SHARE lock.
3826 */
3827 if (oldtup.t_data->t_infomask & HEAP_XMAX_IS_MULTI)
3828 {
3831 }
3832 else
3833 {
3836 }
3837 }
3838
3839 /*
3840 * Prepare the new tuple with the appropriate initial values of Xmin and
3841 * Xmax, as well as initial infomask bits as computed above.
3842 */
3843 newtup->t_data->t_infomask &= ~(HEAP_XACT_MASK);
3844 newtup->t_data->t_infomask2 &= ~(HEAP2_XACT_MASK);
3845 HeapTupleHeaderSetXmin(newtup->t_data, xid);
3847 newtup->t_data->t_infomask |= HEAP_UPDATED | infomask_new_tuple;
3848 newtup->t_data->t_infomask2 |= infomask2_new_tuple;
3850
3851 /*
3852 * Replace cid with a combo CID if necessary. Note that we already put
3853 * the plain cid into the new tuple.
3854 */
3856
3857 /*
3858 * If the toaster needs to be activated, OR if the new tuple will not fit
3859 * on the same page as the old, then we need to release the content lock
3860 * (but not the pin!) on the old tuple's buffer while we are off doing
3861 * TOAST and/or table-file-extension work. We must mark the old tuple to
3862 * show that it's locked, else other processes may try to update it
3863 * themselves.
3864 *
3865 * We need to invoke the toaster if there are already any out-of-line
3866 * toasted values present, or if the new tuple is over-threshold.
3867 */
3868 if (relation->rd_rel->relkind != RELKIND_RELATION &&
3869 relation->rd_rel->relkind != RELKIND_MATVIEW)
3870 {
3871 /* toast table entries should never be recursively toasted */
3874 need_toast = false;
3875 }
3876 else
3879 newtup->t_len > TOAST_TUPLE_THRESHOLD);
3880
3882
3883 newtupsize = MAXALIGN(newtup->t_len);
3884
3886 {
3890 bool cleared_all_frozen = false;
3891
3892 /*
3893 * To prevent concurrent sessions from updating the tuple, we have to
3894 * temporarily mark it locked, while we release the page-level lock.
3895 *
3896 * To satisfy the rule that any xid potentially appearing in a buffer
3897 * written out to disk, we unfortunately have to WAL log this
3898 * temporary modification. We can reuse xl_heap_lock for this
3899 * purpose. If we crash/error before following through with the
3900 * actual update, xmax will be of an aborted transaction, allowing
3901 * other sessions to proceed.
3902 */
3903
3904 /*
3905 * Compute xmax / infomask appropriate for locking the tuple. This has
3906 * to be done separately from the combo that's going to be used for
3907 * updating, because the potentially created multixact would otherwise
3908 * be wrong.
3909 */
3911 oldtup.t_data->t_infomask,
3912 oldtup.t_data->t_infomask2,
3913 xid, *lockmode, false,
3916
3918
3920
3921 /* Clear obsolete visibility flags ... */
3922 oldtup.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
3923 oldtup.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
3925 /* ... and store info about transaction updating this tuple */
3928 oldtup.t_data->t_infomask |= infomask_lock_old_tuple;
3929 oldtup.t_data->t_infomask2 |= infomask2_lock_old_tuple;
3931
3932 /* temporarily make it look not-updated, but locked */
3933 oldtup.t_data->t_ctid = oldtup.t_self;
3934
3935 /*
3936 * Clear all-frozen bit on visibility map if needed. We could
3937 * immediately reset ALL_VISIBLE, but given that the WAL logging
3938 * overhead would be unchanged, that doesn't necessarily seem
3939 * worthwhile.
3940 */
3941 if (PageIsAllVisible(page) &&
3942 visibilitymap_clear(relation, block, vmbuffer,
3944 cleared_all_frozen = true;
3945
3946 MarkBufferDirty(buffer);
3947
3948 if (RelationNeedsWAL(relation))
3949 {
3952
3955
3956 xlrec.offnum = ItemPointerGetOffsetNumber(&oldtup.t_self);
3958 xlrec.infobits_set = compute_infobits(oldtup.t_data->t_infomask,
3959 oldtup.t_data->t_infomask2);
3960 xlrec.flags =
3964 PageSetLSN(page, recptr);
3965 }
3966
3968
3970
3971 /*
3972 * Let the toaster do its thing, if needed.
3973 *
3974 * Note: below this point, heaptup is the data we actually intend to
3975 * store into the relation; newtup is the caller's original untoasted
3976 * data.
3977 */
3978 if (need_toast)
3979 {
3980 /* Note we always use WAL and FSM during updates */
3982 newtupsize = MAXALIGN(heaptup->t_len);
3983 }
3984 else
3985 heaptup = newtup;
3986
3987 /*
3988 * Now, do we need a new page for the tuple, or not? This is a bit
3989 * tricky since someone else could have added tuples to the page while
3990 * we weren't looking. We have to recheck the available space after
3991 * reacquiring the buffer lock. But don't bother to do that if the
3992 * former amount of free space is still not enough; it's unlikely
3993 * there's more free now than before.
3994 *
3995 * What's more, if we need to get a new page, we will need to acquire
3996 * buffer locks on both old and new pages. To avoid deadlock against
3997 * some other backend trying to get the same two locks in the other
3998 * order, we must be consistent about the order we get the locks in.
3999 * We use the rule "lock the lower-numbered page of the relation
4000 * first". To implement this, we must do RelationGetBufferForTuple
4001 * while not holding the lock on the old page, and we must rely on it
4002 * to get the locks on both pages in the correct order.
4003 *
4004 * Another consideration is that we need visibility map page pin(s) if
4005 * we will have to clear the all-visible flag on either page. If we
4006 * call RelationGetBufferForTuple, we rely on it to acquire any such
4007 * pins; but if we don't, we have to handle that here. Hence we need
4008 * a loop.
4009 */
4010 for (;;)
4011 {
4012 if (newtupsize > pagefree)
4013 {
4014 /* It doesn't fit, must use RelationGetBufferForTuple. */
4015 newbuf = RelationGetBufferForTuple(relation, heaptup->t_len,
4016 buffer, 0, NULL,
4017 &vmbuffer_new, &vmbuffer,
4018 0);
4019 /* We're all done. */
4020 break;
4021 }
4022 /* Acquire VM page pin if needed and we don't have it. */
4023 if (vmbuffer == InvalidBuffer && PageIsAllVisible(page))
4024 visibilitymap_pin(relation, block, &vmbuffer);
4025 /* Re-acquire the lock on the old tuple's page. */
4027 /* Re-check using the up-to-date free space */
4029 if (newtupsize > pagefree ||
4030 (vmbuffer == InvalidBuffer && PageIsAllVisible(page)))
4031 {
4032 /*
4033 * Rats, it doesn't fit anymore, or somebody just now set the
4034 * all-visible flag. We must now unlock and loop to avoid
4035 * deadlock. Fortunately, this path should seldom be taken.
4036 */
4038 }
4039 else
4040 {
4041 /* We're all done. */
4042 newbuf = buffer;
4043 break;
4044 }
4045 }
4046 }
4047 else
4048 {
4049 /* No TOAST work needed, and it'll fit on same page */
4050 newbuf = buffer;
4051 heaptup = newtup;
4052 }
4053
4054 /*
4055 * We're about to do the actual update -- check for conflict first, to
4056 * avoid possibly having to roll back work we've just done.
4057 *
4058 * This is safe without a recheck as long as there is no possibility of
4059 * another process scanning the pages between this check and the update
4060 * being visible to the scan (i.e., exclusive buffer content lock(s) are
4061 * continuously held from this point until the tuple update is visible).
4062 *
4063 * For the new tuple the only check needed is at the relation level, but
4064 * since both tuples are in the same relation and the check for oldtup
4065 * will include checking the relation level, there is no benefit to a
4066 * separate check for the new tuple.
4067 */
4068 CheckForSerializableConflictIn(relation, &oldtup.t_self,
4069 BufferGetBlockNumber(buffer));
4070
4071 /*
4072 * At this point newbuf and buffer are both pinned and locked, and newbuf
4073 * has enough space for the new tuple. If they are the same buffer, only
4074 * one pin is held.
4075 */
4076
4077 if (newbuf == buffer)
4078 {
4079 /*
4080 * Since the new tuple is going into the same page, we might be able
4081 * to do a HOT update. Check if any of the index columns have been
4082 * changed.
4083 */
4085 {
4086 use_hot_update = true;
4087
4088 /*
4089 * If none of the columns used in HOT-blocking indexes were
4090 * updated, we can apply HOT. But we still need to check whether
4091 * any columns of summarizing indexes were updated and, if so,
4092 * update those indexes, or we may fail to detect e.g. value
4093 * bound changes in BRIN minmax indexes.
4094 */
4096 summarized_update = true;
4097 }
4098 }
4099 else
4100 {
4101 /* Set a hint that the old page could use prune/defrag */
4102 PageSetFull(page);
4103 }
4104
4105 /*
4106 * Compute replica identity tuple before entering the critical section so
4107 * we don't PANIC upon a memory allocation failure.
4108 * ExtractReplicaIdentity() will return NULL if nothing needs to be
4109 * logged. Pass old key required as true only if the replica identity key
4110 * columns are modified or the old tuple has external data.
4111 */
4116
4117 /* NO EREPORT(ERROR) from here till changes are logged */
4119
4120 /*
4121 * If this transaction commits, the old tuple will become DEAD sooner or
4122 * later. Set flag that this page is a candidate for pruning once our xid
4123 * falls below the OldestXmin horizon. If the transaction finally aborts,
4124 * the subsequent page pruning will be a no-op and the hint will be
4125 * cleared.
4126 *
4127 * XXX Should we set hint on newbuf as well? If the transaction aborts,
4128 * there would be a prunable tuple in the newbuf; but for now we choose
4129 * not to optimize for aborts. Note that heap_xlog_update must be kept in
4130 * sync if this decision changes.
4131 */
4132 PageSetPrunable(page, xid);
4133
4134 if (use_hot_update)
4135 {
4136 /* Mark the old tuple as HOT-updated */
4138 /* And mark the new tuple as heap-only */
4140 /* Mark the caller's copy too, in case different from heaptup */
4142 }
4143 else
4144 {
4145 /* Make sure tuples are correctly marked as not-HOT */
4149 }
4150
4151 RelationPutHeapTuple(relation, newbuf, heaptup, false); /* insert new tuple */
4152
4153
4154 /* Clear obsolete visibility flags, possibly set by ourselves above... */
4155 oldtup.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
4156 oldtup.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
4157 /* ... and store info about transaction updating this tuple */
4160 oldtup.t_data->t_infomask |= infomask_old_tuple;
4161 oldtup.t_data->t_infomask2 |= infomask2_old_tuple;
4163
4164 /* record address of new tuple in t_ctid of old one */
4165 oldtup.t_data->t_ctid = heaptup->t_self;
4166
4167 /* clear PD_ALL_VISIBLE flags, reset all visibilitymap bits */
4168 if (PageIsAllVisible(BufferGetPage(buffer)))
4169 {
4170 all_visible_cleared = true;
4172 visibilitymap_clear(relation, BufferGetBlockNumber(buffer),
4173 vmbuffer, VISIBILITYMAP_VALID_BITS);
4174 }
4175 if (newbuf != buffer && PageIsAllVisible(BufferGetPage(newbuf)))
4176 {
4181 }
4182
4183 if (newbuf != buffer)
4185 MarkBufferDirty(buffer);
4186
4187 /* XLOG stuff */
4188 if (RelationNeedsWAL(relation))
4189 {
4191
4192 /*
4193 * For logical decoding we need combo CIDs to properly decode the
4194 * catalog.
4195 */
4197 {
4198 log_heap_new_cid(relation, &oldtup);
4199 log_heap_new_cid(relation, heaptup);
4200 }
4201
4202 recptr = log_heap_update(relation, buffer,
4207 if (newbuf != buffer)
4208 {
4210 }
4212 }
4213
4215
4216 if (newbuf != buffer)
4219
4220 /*
4221 * Mark old tuple for invalidation from system caches at next command
4222 * boundary, and mark the new tuple for invalidation in case we abort. We
4223 * have to do this before releasing the buffer because oldtup is in the
4224 * buffer. (heaptup is all in local memory, but it's necessary to process
4225 * both tuple versions in one call to inval.c so we can avoid redundant
4226 * sinval messages.)
4227 */
4229
4230 /* Now we can release the buffer(s) */
4231 if (newbuf != buffer)
4233 ReleaseBuffer(buffer);
4236 if (BufferIsValid(vmbuffer))
4237 ReleaseBuffer(vmbuffer);
4238
4239 /*
4240 * Release the lmgr tuple lock, if we had it.
4241 */
4242 if (have_tuple_lock)
4243 UnlockTupleTuplock(relation, &(oldtup.t_self), *lockmode);
4244
4245 pgstat_count_heap_update(relation, use_hot_update, newbuf != buffer);
4246
4247 /*
4248 * If heaptup is a private copy, release it. Don't forget to copy t_self
4249 * back to the caller's image, too.
4250 */
4251 if (heaptup != newtup)
4252 {
4253 newtup->t_self = heaptup->t_self;
4255 }
4256
4257 /*
4258 * If it is a HOT update, the update may still need to update summarized
4259 * indexes, lest we fail to update those summaries and get incorrect
4260 * results (for example, minmax bounds of the block may change with this
4261 * update).
4262 */
4263 if (use_hot_update)
4264 {
4267 else
4269 }
4270 else
4272
4275
4282
4283 return TM_Ok;
4284}
4285
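/*
 * [Editor's illustrative sketch -- not part of heapam.c.]  heap_update()
 * above relies on the rule "lock the lower-numbered page of the relation
 * first" to avoid deadlocks when two heap pages must be content-locked.
 * This hypothetical helper shows the rule in isolation; it assumes both
 * buffers are already pinned and refer to distinct blocks.
 */
static void
lock_two_pages_in_order(Buffer a, Buffer b)
{
	if (BufferGetBlockNumber(a) < BufferGetBlockNumber(b))
	{
		LockBuffer(a, BUFFER_LOCK_EXCLUSIVE);
		LockBuffer(b, BUFFER_LOCK_EXCLUSIVE);
	}
	else
	{
		LockBuffer(b, BUFFER_LOCK_EXCLUSIVE);
		LockBuffer(a, BUFFER_LOCK_EXCLUSIVE);
	}
}
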
4286#ifdef USE_ASSERT_CHECKING
4287/*
4288 * Confirm adequate lock held during heap_update(), per rules from
4289 * README.tuplock section "Locking to write inplace-updated tables".
4290 */
4291static void
4293 const ItemPointerData *otid,
4295{
4296 /* LOCKTAG_TUPLE acceptable for any catalog */
4297 switch (RelationGetRelid(relation))
4298 {
4299 case RelationRelationId:
4300 case DatabaseRelationId:
4301 {
4303
4305 relation->rd_lockInfo.lockRelId.dbId,
4306 relation->rd_lockInfo.lockRelId.relId,
4310 return;
4311 }
4312 break;
4313 default:
4314 Assert(!IsInplaceUpdateRelation(relation));
4315 return;
4316 }
4317
4318 switch (RelationGetRelid(relation))
4319 {
4320 case RelationRelationId:
4321 {
4322 /* LOCKTAG_TUPLE or LOCKTAG_RELATION ok */
4324 Oid relid = classForm->oid;
4325 Oid dbid;
4326 LOCKTAG tag;
4327
4328 if (IsSharedRelation(relid))
4329 dbid = InvalidOid;
4330 else
4331 dbid = MyDatabaseId;
4332
4333 if (classForm->relkind == RELKIND_INDEX)
4334 {
4335 Relation irel = index_open(relid, AccessShareLock);
4336
4337 SET_LOCKTAG_RELATION(tag, dbid, irel->rd_index->indrelid);
4339 }
4340 else
4341 SET_LOCKTAG_RELATION(tag, dbid, relid);
4342
4343 if (!LockHeldByMe(&tag, ShareUpdateExclusiveLock, false) &&
4344 !LockHeldByMe(&tag, ShareRowExclusiveLock, true))
4345 elog(WARNING,
4346 "missing lock for relation \"%s\" (OID %u, relkind %c) @ TID (%u,%u)",
4347 NameStr(classForm->relname),
4348 relid,
4349 classForm->relkind,
4352 }
4353 break;
4354 case DatabaseRelationId:
4355 {
4356 /* LOCKTAG_TUPLE required */
4358
4359 elog(WARNING,
4360 "missing lock on database \"%s\" (OID %u) @ TID (%u,%u)",
4361 NameStr(dbForm->datname),
4362 dbForm->oid,
4365 }
4366 break;
4367 }
4368}
4369
4370/*
4371 * Confirm adequate relation lock held, per rules from README.tuplock section
4372 * "Locking to write inplace-updated tables".
4373 */
4374static void
4376{
4378 Oid relid = classForm->oid;
4379 Oid dbid;
4380 LOCKTAG tag;
4381
4382 if (IsSharedRelation(relid))
4383 dbid = InvalidOid;
4384 else
4385 dbid = MyDatabaseId;
4386
4387 if (classForm->relkind == RELKIND_INDEX)
4388 {
4389 Relation irel = index_open(relid, AccessShareLock);
4390
4391 SET_LOCKTAG_RELATION(tag, dbid, irel->rd_index->indrelid);
4393 }
4394 else
4395 SET_LOCKTAG_RELATION(tag, dbid, relid);
4396
4397 if (!LockHeldByMe(&tag, ShareUpdateExclusiveLock, true))
4398 elog(WARNING,
4399 "missing lock for relation \"%s\" (OID %u, relkind %c) @ TID (%u,%u)",
4400 NameStr(classForm->relname),
4401 relid,
4402 classForm->relkind,
4405}
4406#endif
4407
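/*
 * [Editor's illustrative sketch -- not part of heapam.c.]  The assertion
 * helpers above build a heavyweight lock tag for a relation and ask the
 * lock manager whether this backend already holds a sufficient lock.  This
 * hypothetical helper shows that pattern for a plain (non-shared) relation
 * identified by OID.
 */
static bool
holds_share_update_exclusive(Oid relid)
{
	LOCKTAG		tag;

	SET_LOCKTAG_RELATION(tag, MyDatabaseId, relid);

	/* the final "true" accepts any stronger lock mode as well */
	return LockHeldByMe(&tag, ShareUpdateExclusiveLock, true);
}
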
4408/*
4409 * Check if the specified attribute's values are the same. Subroutine for
4410 * HeapDetermineColumnsInfo.
4411 */
4412static bool
4413heap_attr_equals(TupleDesc tupdesc, int attrnum, Datum value1, Datum value2,
4414 bool isnull1, bool isnull2)
4415{
4416 /*
4417 * If one value is NULL and other is not, then they are certainly not
4418 * equal
4419 */
4420 if (isnull1 != isnull2)
4421 return false;
4422
4423 /*
4424 * If both are NULL, they can be considered equal.
4425 */
4426 if (isnull1)
4427 return true;
4428
4429 /*
4430 * We do simple binary comparison of the two datums. This may be overly
4431 * strict because there can be multiple binary representations for the
4432 * same logical value. But we should be OK as long as there are no false
4433 * positives. Using a type-specific equality operator is messy because
4434 * there could be multiple notions of equality in different operator
4435 * classes; furthermore, we cannot safely invoke user-defined functions
4436 * while holding exclusive buffer lock.
4437 */
4438 if (attrnum <= 0)
4439 {
4440 /* The only allowed system columns are OIDs, so do this */
4442 }
4443 else
4444 {
4446
4448 att = TupleDescCompactAttr(tupdesc, attrnum - 1);
4449 return datumIsEqual(value1, value2, att->attbyval, att->attlen);
4450 }
4451}
4452
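/*
 * [Editor's illustrative sketch -- not part of heapam.c.]  heap_attr_equals()
 * boils down to datumIsEqual(), a raw binary comparison driven by the
 * attribute's by-value flag and length.  For a pass-by-value int4 column
 * that looks like this; the helper name is hypothetical.
 */
static bool
int4_datums_binary_equal(int32 a, int32 b)
{
	/* attbyval = true, attlen = sizeof(int32) for an int4 attribute */
	return datumIsEqual(Int32GetDatum(a), Int32GetDatum(b),
						true, sizeof(int32));
}
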
4453/*
4454 * Check which columns are being updated.
4455 *
4456 * Given an updated tuple, determine (and return into the output bitmapset),
4457 * from those listed as interesting, the set of columns that changed.
4458 *
4459 * has_external indicates if any of the unmodified attributes (from those
4460 * listed as interesting) of the old tuple is a member of external_cols and is
4461 * stored externally.
4462 */
4463static Bitmapset *
4468 bool *has_external)
4469{
4470 int attidx;
4472 TupleDesc tupdesc = RelationGetDescr(relation);
4473
4474 attidx = -1;
4475 while ((attidx = bms_next_member(interesting_cols, attidx)) >= 0)
4476 {
4477 /* attidx is zero-based, attrnum is the normal attribute number */
4479 Datum value1,
4480 value2;
4481 bool isnull1,
4482 isnull2;
4483
4484 /*
4485 * If it's a whole-tuple reference, say "not equal". It's not really
4486 * worth supporting this case, since it could only succeed after a
4487 * no-op update, which is hardly a case worth optimizing for.
4488 */
4489 if (attrnum == 0)
4490 {
4491 modified = bms_add_member(modified, attidx);
4492 continue;
4493 }
4494
4495 /*
4496 * Likewise, automatically say "not equal" for any system attribute
4497 * other than tableOID; we cannot expect these to be consistent in a
4498 * HOT chain, or even to be set correctly yet in the new tuple.
4499 */
4500 if (attrnum < 0)
4501 {
4502 if (attrnum != TableOidAttributeNumber)
4503 {
4504 modified = bms_add_member(modified, attidx);
4505 continue;
4506 }
4507 }
4508
4509 /*
4510 * Extract the corresponding values. XXX this is pretty inefficient
4511 * if there are many indexed columns. Should we do a single
4512 * heap_deform_tuple call on each tuple, instead? But that doesn't
4513 * work for system columns ...
4514 */
4515 value1 = heap_getattr(oldtup, attrnum, tupdesc, &isnull1);
4516 value2 = heap_getattr(newtup, attrnum, tupdesc, &isnull2);
4517
4518 if (!heap_attr_equals(tupdesc, attrnum, value1,
4519 value2, isnull1, isnull2))
4520 {
4521 modified = bms_add_member(modified, attidx);
4522 continue;
4523 }
4524
4525 /*
4526 * No need to check attributes that can't be stored externally. Note
4527 * that system attributes can't be stored externally.
4528 */
4529 if (attrnum < 0 || isnull1 ||
4530 TupleDescCompactAttr(tupdesc, attrnum - 1)->attlen != -1)
4531 continue;
4532
4533 /*
4534 * Check if the old tuple's attribute is stored externally and is a
4535 * member of external_cols.
4536 */
4539 *has_external = true;
4540 }
4541
4542 return modified;
4543}
4544
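/*
 * [Editor's illustrative sketch -- not part of heapam.c.]  The bitmapset
 * returned above stores attribute numbers offset by
 * FirstLowInvalidHeapAttributeNumber, the same convention used for the
 * "interesting" column sets passed in.  A caller could walk it like this;
 * the helper name is hypothetical.
 */
static void
report_modified_columns(Bitmapset *modified)
{
	int			attidx = -1;

	while ((attidx = bms_next_member(modified, attidx)) >= 0)
	{
		AttrNumber	attnum = attidx + FirstLowInvalidHeapAttributeNumber;

		elog(DEBUG1, "attribute %d was modified", attnum);
	}
}
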
4545/*
4546 * simple_heap_update - replace a tuple
4547 *
4548 * This routine may be used to update a tuple when concurrent updates of
4549 * the target tuple are not expected (for example, because we have a lock
4550 * on the relation associated with the tuple). Any failure is reported
4551 * via ereport().
4552 */
4553void
4556{
4557 TM_Result result;
4558 TM_FailureData tmfd;
4559 LockTupleMode lockmode;
4560
4561 result = heap_update(relation, otid, tup,
4563 true /* wait for commit */ ,
4564 &tmfd, &lockmode, update_indexes);
4565 switch (result)
4566 {
4567 case TM_SelfModified:
4568 /* Tuple was already updated in current command? */
4569 elog(ERROR, "tuple already updated by self");
4570 break;
4571
4572 case TM_Ok:
4573 /* done successfully */
4574 break;
4575
4576 case TM_Updated:
4577 elog(ERROR, "tuple concurrently updated");
4578 break;
4579
4580 case TM_Deleted:
4581 elog(ERROR, "tuple concurrently deleted");
4582 break;
4583
4584 default:
4585 elog(ERROR, "unrecognized heap_update status: %u", result);
4586 break;
4587 }
4588}
4589
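/*
 * [Editor's illustrative sketch -- not part of heapam.c.]  Minimal use of
 * simple_heap_update(): the caller must already hold a lock that rules out
 * concurrent updates of the target row, and remains responsible for index
 * maintenance.  The helper and variable names are hypothetical; index
 * updates and error handling are omitted.
 */
static void
overwrite_tuple_example(Relation rel, HeapTuple oldtup, HeapTuple newtup)
{
	TU_UpdateIndexes update_indexes;

	/* replace the old version; any failure is reported via ereport() */
	simple_heap_update(rel, &oldtup->t_self, newtup, &update_indexes);

	/* make the new version visible to later commands of this transaction */
	CommandCounterIncrement();
}
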
4590
4591/*
4592 * Return the MultiXactStatus corresponding to the given tuple lock mode.
4593 */
4594static MultiXactStatus
4596{
4597 int retval;
4598
4599 if (is_update)
4600 retval = tupleLockExtraInfo[mode].updstatus;
4601 else
4602 retval = tupleLockExtraInfo[mode].lockstatus;
4603
4604 if (retval == -1)
4605 elog(ERROR, "invalid lock tuple mode %d/%s", mode,
4606 is_update ? "true" : "false");
4607
4608 return (MultiXactStatus) retval;
4609}
4610
4611/*
4612 * heap_lock_tuple - lock a tuple in shared or exclusive mode
4613 *
4614 * Note that this acquires a buffer pin, which the caller must release.
4615 *
4616 * Input parameters:
4617 * relation: relation containing tuple (caller must hold suitable lock)
4618 * cid: current command ID (used for visibility test, and stored into
4619 * tuple's cmax if lock is successful)
4620 * mode: indicates if shared or exclusive tuple lock is desired
4621 * wait_policy: what to do if tuple lock is not available
4622 * follow_updates: if true, follow the update chain to also lock descendant
4623 * tuples.
4624 *
4625 * Output parameters:
4626 * *tuple: all fields filled in
4627 * *buffer: set to buffer holding tuple (pinned but not locked at exit)
4628 * *tmfd: filled in failure cases (see below)
4629 *
4630 * Function results are the same as the ones for table_tuple_lock().
4631 *
4632 * In the failure cases other than TM_Invisible, the routine fills
4633 * *tmfd with the tuple's t_ctid, t_xmax (resolving a possible MultiXact,
4634 * if necessary), and t_cmax (the last only for TM_SelfModified,
4635 * since we cannot obtain cmax from a combo CID generated by another
4636 * transaction).
4637 * See comments for struct TM_FailureData for additional info.
4638 *
4639 * See README.tuplock for a thorough explanation of this mechanism.
4640 */
4642heap_lock_tuple(Relation relation, HeapTuple tuple,
4644 bool follow_updates,
4645 Buffer *buffer, TM_FailureData *tmfd)
4646{
4647 TM_Result result;
4648 ItemPointer tid = &(tuple->t_self);
4649 ItemId lp;
4650 Page page;
4651 Buffer vmbuffer = InvalidBuffer;
4652 BlockNumber block;
4653 TransactionId xid,
4654 xmax;
4658 bool first_time = true;
4659 bool skip_tuple_lock = false;
4660 bool have_tuple_lock = false;
4661 bool cleared_all_frozen = false;
4662
4663 *buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid));
4664 block = ItemPointerGetBlockNumber(tid);
4665
4666 /*
4667 * Before locking the buffer, pin the visibility map page if it appears to
4668 * be necessary. Since we haven't got the lock yet, someone else might be
4669 * in the middle of changing this, so we'll need to recheck after we have
4670 * the lock.
4671 */
4672 if (PageIsAllVisible(BufferGetPage(*buffer)))
4673 visibilitymap_pin(relation, block, &vmbuffer);
4674
4676
4677 page = BufferGetPage(*buffer);
4680
4681 tuple->t_data = (HeapTupleHeader) PageGetItem(page, lp);
4682 tuple->t_len = ItemIdGetLength(lp);
4683 tuple->t_tableOid = RelationGetRelid(relation);
4684
4685l3:
4686 result = HeapTupleSatisfiesUpdate(tuple, cid, *buffer);
4687
4688 if (result == TM_Invisible)
4689 {
4690 /*
4691 * This is possible, but only when locking a tuple for ON CONFLICT
4692 * UPDATE. We return this value here rather than throwing an error in
4693 * order to give that case the opportunity to throw a more specific
4694 * error.
4695 */
4696 result = TM_Invisible;
4697 goto out_locked;
4698 }
4699 else if (result == TM_BeingModified ||
4700 result == TM_Updated ||
4701 result == TM_Deleted)
4702 {
4706 bool require_sleep;
4707 ItemPointerData t_ctid;
4708
4709 /* must copy state data before unlocking buffer */
4711 infomask = tuple->t_data->t_infomask;
4712 infomask2 = tuple->t_data->t_infomask2;
4713 ItemPointerCopy(&tuple->t_data->t_ctid, &t_ctid);
4714
4716
4717 /*
4718 * If any subtransaction of the current top transaction already holds
4719 * a lock as strong as or stronger than what we're requesting, we
4720 * effectively hold the desired lock already. We *must* succeed
4721 * without trying to take the tuple lock, else we will deadlock
4722 * against anyone wanting to acquire a stronger lock.
4723 *
4724 * Note we only do this the first time we loop on the HTSU result;
4725 * there is no point in testing in subsequent passes, because
4726 * evidently our own transaction cannot have acquired a new lock after
4727 * the first time we checked.
4728 */
4729 if (first_time)
4730 {
4731 first_time = false;
4732
4734 {
4735 int i;
4736 int nmembers;
4737 MultiXactMember *members;
4738
4739 /*
4740 * We don't need to allow old multixacts here; if that had
4741 * been the case, HeapTupleSatisfiesUpdate would have returned
4742 * TM_Ok and we wouldn't be here.
4743 */
4744 nmembers =
4745 GetMultiXactIdMembers(xwait, &members, false,
4747
4748 for (i = 0; i < nmembers; i++)
4749 {
4750 /* only consider members of our own transaction */
4751 if (!TransactionIdIsCurrentTransactionId(members[i].xid))
4752 continue;
4753
4754 if (TUPLOCK_from_mxstatus(members[i].status) >= mode)
4755 {
4756 pfree(members);
4757 result = TM_Ok;
4758 goto out_unlocked;
4759 }
4760 else
4761 {
4762 /*
4763 * Disable acquisition of the heavyweight tuple lock.
4764 * Otherwise, when promoting a weaker lock, we might
4765 * deadlock with another locker that has acquired the
4766 * heavyweight tuple lock and is waiting for our
4767 * transaction to finish.
4768 *
4769 * Note that in this case we still need to wait for
4770 * the multixact if required, to avoid acquiring
4771 * conflicting locks.
4772 */
4773 skip_tuple_lock = true;
4774 }
4775 }
4776
4777 if (members)
4778 pfree(members);
4779 }
4781 {
4782 switch (mode)
4783 {
4784 case LockTupleKeyShare:
4788 result = TM_Ok;
4789 goto out_unlocked;
4790 case LockTupleShare:
4793 {
4794 result = TM_Ok;
4795 goto out_unlocked;
4796 }
4797 break;
4800 {
4801 result = TM_Ok;
4802 goto out_unlocked;
4803 }
4804 break;
4805 case LockTupleExclusive:
4808 {
4809 result = TM_Ok;
4810 goto out_unlocked;
4811 }
4812 break;
4813 }
4814 }
4815 }
4816
4817 /*
4818 * Initially assume that we will have to wait for the locking
4819 * transaction(s) to finish. We check various cases below in which
4820 * this can be turned off.
4821 */
4822 require_sleep = true;
4823 if (mode == LockTupleKeyShare)
4824 {
4825 /*
4826 * If we're requesting KeyShare, and there's no update present, we
4827 * don't need to wait. Even if there is an update, we can still
4828 * continue if the key hasn't been modified.
4829 *
4830 * However, if there are updates, we need to walk the update chain
4831 * to mark future versions of the row as locked, too. That way,
4832 * if somebody deletes that future version, we're protected
4833 * against the key going away. This locking of future versions
4834 * could block momentarily, if a concurrent transaction is
4835 * deleting a key; or it could return a value to the effect that
4836 * the transaction deleting the key has already committed. So we
4837 * do this before re-locking the buffer; otherwise this would be
4838 * prone to deadlocks.
4839 *
4840 * Note that the TID we're locking was grabbed before we unlocked
4841 * the buffer. For it to change while we're not looking, the
4842 * other properties we're testing for below after re-locking the
4843 * buffer would also change, in which case we would restart this
4844 * loop above.
4845 */
4847 {
4848 bool updated;
4849
4851
4852 /*
4853 * If there are updates, follow the update chain; bail out if
4854 * that cannot be done.
4855 */
4856 if (follow_updates && updated &&
4857 !ItemPointerEquals(&tuple->t_self, &t_ctid))
4858 {
4859 TM_Result res;
4860
4861 res = heap_lock_updated_tuple(relation,
4862 infomask, xwait, &t_ctid,
4864 mode);
4865 if (res != TM_Ok)
4866 {
4867 result = res;
4868 /* recovery code expects to have buffer lock held */
4870 goto failed;
4871 }
4872 }
4873
4875
4876 /*
4877 * Make sure it's still an appropriate lock, else start over.
4878 * Also, if it wasn't updated before we released the lock, but
4879 * is updated now, we start over too; the reason is that we
4880 * now need to follow the update chain to lock the new
4881 * versions.
4882 */
4883 if (!HeapTupleHeaderIsOnlyLocked(tuple->t_data) &&
4884 ((tuple->t_data->t_infomask2 & HEAP_KEYS_UPDATED) ||
4885 !updated))
4886 goto l3;
4887
4888 /* Things look okay, so we can skip sleeping */
4889 require_sleep = false;
4890
4891 /*
4892 * Note we allow Xmax to change here; other updaters/lockers
4893 * could have modified it before we grabbed the buffer lock.
4894 * However, this is not a problem, because with the recheck we
4895 * just did we ensure that they still don't conflict with the
4896 * lock we want.
4897 */
4898 }
4899 }
4900 else if (mode == LockTupleShare)
4901 {
4902 /*
4903 * If we're requesting Share, we can similarly avoid sleeping if
4904 * there's no update and no exclusive lock present.
4905 */
4908 {
4910
4911 /*
4912 * Make sure it's still an appropriate lock, else start over.
4913 * See above about allowing xmax to change.
4914 */
4917 goto l3;
4918 require_sleep = false;
4919 }
4920 }
4921 else if (mode == LockTupleNoKeyExclusive)
4922 {
4923 /*
4924 * If we're requesting NoKeyExclusive, we might also be able to
4925 * avoid sleeping; just ensure that there is no conflicting lock
4926 * already acquired.
4927 */
4929 {
4931 mode, NULL))
4932 {
4933 /*
4934 * No conflict, but if the xmax changed under us in the
4935 * meantime, start over.
4936 */
4940 xwait))
4941 goto l3;
4942
4943 /* otherwise, we're good */
4944 require_sleep = false;
4945 }
4946 }
4948 {
4950
4951 /* if the xmax changed in the meantime, start over */
4954 xwait))
4955 goto l3;
4956 /* otherwise, we're good */
4957 require_sleep = false;
4958 }
4959 }
4960
4961 /*
4962 * As a check independent from those above, we can also avoid sleeping
4963 * if the current transaction is the sole locker of the tuple. Note
4964 * that the strength of the lock already held is irrelevant; this is
4965 * not about recording the lock in Xmax (which will be done regardless
4966 * of this optimization, below). Also, note that the cases where we
4967 * hold a lock stronger than we are requesting are already handled
4968 * above by not doing anything.
4969 *
4970 * Note we only deal with the non-multixact case here; MultiXactIdWait
4971 * is well equipped to deal with this situation on its own.
4972 */
4975 {
4976 /* ... but if the xmax changed in the meantime, start over */
4980 xwait))
4981 goto l3;
4983 require_sleep = false;
4984 }
4985
4986 /*
4987 * Time to sleep on the other transaction/multixact, if necessary.
4988 *
4989 * If the other transaction is an update/delete that's already
4990 * committed, then sleeping cannot possibly do any good: if we're
4991 * required to sleep, get out to raise an error instead.
4992 *
4993 * By here, we either have already acquired the buffer exclusive lock,
4994 * or we must wait for the locking transaction or multixact; so below
4995 * we ensure that we grab buffer lock after the sleep.
4996 */
4997 if (require_sleep && (result == TM_Updated || result == TM_Deleted))
4998 {
5000 goto failed;
5001 }
5002 else if (require_sleep)
5003 {
5004 /*
5005 * Acquire tuple lock to establish our priority for the tuple, or
5006 * die trying. LockTuple will release us when we are next-in-line
5007 * for the tuple. We must do this even if we are share-locking,
5008 * but not if we already have a weaker lock on the tuple.
5009 *
5010 * If we are forced to "start over" below, we keep the tuple lock;
5011 * this arranges that we stay at the head of the line while
5012 * rechecking tuple state.
5013 */
5014 if (!skip_tuple_lock &&
5015 !heap_acquire_tuplock(relation, tid, mode, wait_policy,
5017 {
5018 /*
5019 * This can only happen if wait_policy is Skip and the lock
5020 * couldn't be obtained.
5021 */
5022 result = TM_WouldBlock;
5023 /* recovery code expects to have buffer lock held */
5025 goto failed;
5026 }
5027
5029 {
5031
5032 /* We only ever lock tuples, never update them */
5033 if (status >= MultiXactStatusNoKeyUpdate)
5034 elog(ERROR, "invalid lock mode in heap_lock_tuple");
5035
5036 /* wait for multixact to end, or die trying */
5037 switch (wait_policy)
5038 {
5039 case LockWaitBlock:
5041 relation, &tuple->t_self, XLTW_Lock, NULL);
5042 break;
5043 case LockWaitSkip:
5045 status, infomask, relation,
5046 NULL, false))
5047 {
5048 result = TM_WouldBlock;
5049 /* recovery code expects to have buffer lock held */
5051 goto failed;
5052 }
5053 break;
5054 case LockWaitError:
5056 status, infomask, relation,
5058 ereport(ERROR,
5060 errmsg("could not obtain lock on row in relation \"%s\"",
5061 RelationGetRelationName(relation))));
5062
5063 break;
5064 }
5065
5066 /*
5067 * Of course, the multixact might not be done here: if we're
5068 * requesting a light lock mode, other transactions with light
5069 * locks could still be alive, as well as locks owned by our
5070 * own xact or other subxacts of this backend. We need to
5071 * preserve the surviving MultiXact members. Note that it
5072 * isn't absolutely necessary in the latter case, but doing so
5073 * is simpler.
5074 */
5075 }
5076 else
5077 {
5078 /* wait for regular transaction to end, or die trying */
5079 switch (wait_policy)
5080 {
5081 case LockWaitBlock:
5082 XactLockTableWait(xwait, relation, &tuple->t_self,
5083 XLTW_Lock);
5084 break;
5085 case LockWaitSkip:
5087 {
5088 result = TM_WouldBlock;
5089 /* recovery code expects to have buffer lock held */
5091 goto failed;
5092 }
5093 break;
5094 case LockWaitError:
5096 ereport(ERROR,
5098 errmsg("could not obtain lock on row in relation \"%s\"",
5099 RelationGetRelationName(relation))));
5100 break;
5101 }
5102 }
5103
5104 /* if there are updates, follow the update chain */
5106 !ItemPointerEquals(&tuple->t_self, &t_ctid))
5107 {
5108 TM_Result res;
5109
5110 res = heap_lock_updated_tuple(relation,
5111 infomask, xwait, &t_ctid,
5113 mode);
5114 if (res != TM_Ok)
5115 {
5116 result = res;
5117 /* recovery code expects to have buffer lock held */
5119 goto failed;
5120 }
5121 }
5122
5124
5125 /*
5126 * xwait is done, but if xwait had just locked the tuple then some
5127 * other xact could update this tuple before we get to this point.
5128 * Check for xmax change, and start over if so.
5129 */
5132 xwait))
5133 goto l3;
5134
5136 {
5137 /*
5138 * Otherwise check if it committed or aborted. Note we cannot
5139 * be here if the tuple was only locked by somebody who didn't
5140 * conflict with us; that would have been handled above. So
5141 * that transaction must necessarily be gone by now. But
5142 * don't check for this in the multixact case, because some
5143 * locker transactions might still be running.
5144 */
5145 UpdateXmaxHintBits(tuple->t_data, *buffer, xwait);
5146 }
5147 }
5148
5149 /* By here, we're certain that we hold buffer exclusive lock again */
5150
5151 /*
5152 * We may lock if previous xmax aborted, or if it committed but only
5153 * locked the tuple without updating it; or if we didn't have to wait
5154 * at all for whatever reason.
5155 */
5156 if (!require_sleep ||
5157 (tuple->t_data->t_infomask & HEAP_XMAX_INVALID) ||
5160 result = TM_Ok;
5161 else if (!ItemPointerEquals(&tuple->t_self, &tuple->t_data->t_ctid))
5162 result = TM_Updated;
5163 else
5164 result = TM_Deleted;
5165 }
5166
5167failed:
5168 if (result != TM_Ok)
5169 {
5170 Assert(result == TM_SelfModified || result == TM_Updated ||
5171 result == TM_Deleted || result == TM_WouldBlock);
5172
5173 /*
5174 * When locking a tuple under LockWaitSkip semantics and we fail with
5175 * TM_WouldBlock above, it's possible for concurrent transactions to
5176 * release the lock and set HEAP_XMAX_INVALID in the meantime. So
5177 * this assert is slightly different from the equivalent one in
5178 * heap_delete and heap_update.
5179 */
5180 Assert((result == TM_WouldBlock) ||
5181 !(tuple->t_data->t_infomask & HEAP_XMAX_INVALID));
5182 Assert(result != TM_Updated ||
5183 !ItemPointerEquals(&tuple->t_self, &tuple->t_data->t_ctid));
5184 tmfd->ctid = tuple->t_data->t_ctid;
5185 tmfd->xmax = HeapTupleHeaderGetUpdateXid(tuple->t_data);
5186 if (result == TM_SelfModified)
5187 tmfd->cmax = HeapTupleHeaderGetCmax(tuple->t_data);
5188 else
5189 tmfd->cmax = InvalidCommandId;
5190 goto out_locked;
5191 }
5192
5193 /*
5194 * If we didn't pin the visibility map page and the page has become all
5195 * visible while we were busy locking the buffer, or during some
5196 * subsequent window during which we had it unlocked, we'll have to unlock
5197 * and re-lock, to avoid holding the buffer lock across I/O. That's a bit
5198 * unfortunate, especially since we'll now have to recheck whether the
5199 * tuple has been locked or updated under us, but hopefully it won't
5200 * happen very often.
5201 */
5202 if (vmbuffer == InvalidBuffer && PageIsAllVisible(page))
5203 {
5205 visibilitymap_pin(relation, block, &vmbuffer);
5207 goto l3;
5208 }
5209
5210 xmax = HeapTupleHeaderGetRawXmax(tuple->t_data);
5211 old_infomask = tuple->t_data->t_infomask;
5212
5213 /*
5214 * If this is the first possibly-multixact-able operation in the current
5215 * transaction, set my per-backend OldestMemberMXactId setting. We can be
5216 * certain that the transaction will never become a member of any older
5217 * MultiXactIds than that. (We have to do this even if we end up just
5218 * using our own TransactionId below, since some other backend could
5219 * incorporate our XID into a MultiXact immediately afterwards.)
5220 */
5222
5223 /*
5224 * Compute the new xmax and infomask to store into the tuple. Note we do
5225 * not modify the tuple just yet, because that would leave it in the wrong
5226 * state if multixact.c elogs.
5227 */
5229 GetCurrentTransactionId(), mode, false,
5230 &xid, &new_infomask, &new_infomask2);
5231
5233
5234 /*
5235 * Store transaction information of xact locking the tuple.
5236 *
5237 * Note: Cmax is meaningless in this context, so don't set it; this avoids
5238 * possibly generating a useless combo CID. Moreover, if we're locking a
5239 * previously updated tuple, it's important to preserve the Cmax.
5240 *
5241 * Also reset the HOT UPDATE bit, but only if there's no update; otherwise
5242 * we would break the HOT chain.
5243 */
5246 tuple->t_data->t_infomask |= new_infomask;
5247 tuple->t_data->t_infomask2 |= new_infomask2;
5250 HeapTupleHeaderSetXmax(tuple->t_data, xid);
5251
5252 /*
5253 * Make sure there is no forward chain link in t_ctid. Note that in the
5254 * cases where the tuple has been updated, we must not overwrite t_ctid,
5255 * because it was set by the updater. Moreover, if the tuple has been
5256 * updated, we need to follow the update chain to lock the new versions of
5257 * the tuple as well.
5258 */
5260 tuple->t_data->t_ctid = *tid;
5261
5262 /* Clear only the all-frozen bit on visibility map if needed */
5263 if (PageIsAllVisible(page) &&
5264 visibilitymap_clear(relation, block, vmbuffer,
5266 cleared_all_frozen = true;
5267
5268
5269 MarkBufferDirty(*buffer);
5270
5271 /*
5272 * XLOG stuff. You might think that we don't need an XLOG record because
5273 * there is no state change worth restoring after a crash. You would be
5274 * wrong however: we have just written either a TransactionId or a
5275 * MultiXactId that may never have been seen on disk before, and we need
5276 * to make sure that there are XLOG entries covering those ID numbers.
5277 * Else the same IDs might be re-used after a crash, which would be
5278 * disastrous if this page made it to disk before the crash. Essentially
5279 * we have to enforce the WAL log-before-data rule even in this case.
5280 * (Also, in a PITR log-shipping or 2PC environment, we have to have XLOG
5281 * entries for everything anyway.)
5282 */
5283 if (RelationNeedsWAL(relation))
5284 {
5287
5290
5291 xlrec.offnum = ItemPointerGetOffsetNumber(&tuple->t_self);
5292 xlrec.xmax = xid;
5293 xlrec.infobits_set = compute_infobits(new_infomask,
5294 tuple->t_data->t_infomask2);
5297
5298 /* we don't decode row locks atm, so no need to log the origin */
5299
5301
5302 PageSetLSN(page, recptr);
5303 }
5304
5306
5307 result = TM_Ok;
5308
5311
5313 if (BufferIsValid(vmbuffer))
5314 ReleaseBuffer(vmbuffer);
5315
5316 /*
5317 * Don't update the visibility map here. Locking a tuple doesn't change
5318 * visibility info.
5319 */
5320
5321 /*
5322 * Now that we have successfully marked the tuple as locked, we can
5323 * release the lmgr tuple lock, if we had it.
5324 */
5325 if (have_tuple_lock)
5326 UnlockTupleTuplock(relation, tid, mode);
5327
5328 return result;
5329}
5330
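/*
 * [Editor's illustrative sketch -- not part of heapam.c.]  Locking a single
 * known row version by TID, roughly what a FOR UPDATE row mark does at this
 * level.  The caller fills in tuple->t_self; heap_lock_tuple() fills in the
 * rest and returns a pinned (but unlocked) buffer that must be released.
 * The helper name is hypothetical and failure handling is reduced to an
 * error.
 */
static void
lock_row_exclusively(Relation rel, ItemPointerData tid)
{
	HeapTupleData tuple;
	Buffer		buffer;
	TM_FailureData tmfd;
	TM_Result	result;

	tuple.t_self = tid;
	result = heap_lock_tuple(rel, &tuple,
							 GetCurrentCommandId(true),
							 LockTupleExclusive, LockWaitBlock,
							 true /* follow_updates */ ,
							 &buffer, &tmfd);

	/* the buffer comes back pinned in both the success and failure cases */
	ReleaseBuffer(buffer);

	if (result != TM_Ok)
		elog(ERROR, "could not lock tuple: status %d", (int) result);
}
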
5331/*
5332 * Acquire heavyweight lock on the given tuple, in preparation for acquiring
5333 * its normal, Xmax-based tuple lock.
5334 *
5335 * have_tuple_lock is an input and output parameter: on input, it indicates
5336 * whether the lock has previously been acquired (and this function does
5337 * nothing in that case). If this function returns success, have_tuple_lock
5338 * has been flipped to true.
5339 *
5340 * Returns false if it was unable to obtain the lock; this can only happen if
5341 * wait_policy is Skip.
5342 */
5343static bool
5346{
5347 if (*have_tuple_lock)
5348 return true;
5349
5350 switch (wait_policy)
5351 {
5352 case LockWaitBlock:
5353 LockTupleTuplock(relation, tid, mode);
5354 break;
5355
5356 case LockWaitSkip:
5357 if (!ConditionalLockTupleTuplock(relation, tid, mode, false))
5358 return false;
5359 break;
5360
5361 case LockWaitError:
5363 ereport(ERROR,
5365 errmsg("could not obtain lock on row in relation \"%s\"",
5366 RelationGetRelationName(relation))));
5367 break;
5368 }
5369 *have_tuple_lock = true;
5370
5371 return true;
5372}
5373
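/*
 * [Editor's illustrative sketch -- not part of heapam.c.]  How the callers
 * above treat heap_acquire_tuplock() under LockWaitSkip semantics: a false
 * return can only mean the heavyweight tuple lock was unavailable, which is
 * reported to the caller as TM_WouldBlock.  The helper name is hypothetical.
 */
static TM_Result
try_tuple_lock_no_wait(Relation rel, ItemPointer tid, LockTupleMode mode,
					   bool *have_tuple_lock)
{
	if (!heap_acquire_tuplock(rel, tid, mode, LockWaitSkip, have_tuple_lock))
		return TM_WouldBlock;	/* somebody else holds the tuple lock */

	return TM_Ok;
}
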
5374/*
5375 * Given an original set of Xmax and infomask, and a transaction (identified by
5376 * add_to_xmax) acquiring a new lock of some mode, compute the new Xmax and
5377 * corresponding infomasks to use on the tuple.
5378 *
5379 * Note that this might have side effects such as creating a new MultiXactId.
5380 *
5381 * Most callers will have called HeapTupleSatisfiesUpdate before this function;
5382 * that will have set the HEAP_XMAX_INVALID bit if the xmax was a MultiXactId
5383 * but it was not running anymore. There is a race condition, which is that the
5384 * MultiXactId may have finished since then, but that uncommon case is handled
5385 * either here, or within MultiXactIdExpand.
5386 *
5387 * There is a similar race condition possible when the old xmax was a regular
5388 * TransactionId. We test TransactionIdIsInProgress again just to narrow the
5389 * window, but it's still possible to end up creating an unnecessary
5390 * MultiXactId. Fortunately this is harmless.
5391 */
5392static void
5398{
5399 TransactionId new_xmax;
5402
5404
5405l5:
5406 new_infomask = 0;
5407 new_infomask2 = 0;
5409 {
5410 /*
5411 * No previous locker; we just insert our own TransactionId.
5412 *
5413 * Note that it's critical that this case be the first one checked,
5414 * because there are several blocks below that come back to this one
5415 * to implement certain optimizations; old_infomask might contain
5416 * other dirty bits in those cases, but we don't really care.
5417 */
5418 if (is_update)
5419 {
5420 new_xmax = add_to_xmax;
5421 if (mode == LockTupleExclusive)
5423 }
5424 else
5425 {
5427 switch (mode)
5428 {
5429 case LockTupleKeyShare:
5430 new_xmax = add_to_xmax;
5432 break;
5433 case LockTupleShare:
5434 new_xmax = add_to_xmax;
5436 break;
5438 new_xmax = add_to_xmax;
5440 break;
5441 case LockTupleExclusive:
5442 new_xmax = add_to_xmax;
5445 break;
5446 default:
5447 new_xmax = InvalidTransactionId; /* silence compiler */
5448 elog(ERROR, "invalid lock mode");
5449 }
5450 }
5451 }
5453 {
5455
5456 /*
5457 * Currently we don't allow XMAX_COMMITTED to be set for multis, so
5458 * cross-check.
5459 */
5461
5462 /*
5463 * A multixact together with LOCK_ONLY set but neither lock bit set
5464 * (i.e. a pg_upgraded share locked tuple) cannot possibly be running
5465 * anymore. This check is critical for databases upgraded by
5466 * pg_upgrade; both MultiXactIdIsRunning and MultiXactIdExpand assume
5467 * that such multis are never passed.
5468 */
5470 {
5473 goto l5;
5474 }
5475
5476 /*
5477 * If the XMAX is already a MultiXactId, then we need to expand it to
5478 * include add_to_xmax; but if all the members were lockers and are
5479 * all gone, we can do away with the IS_MULTI bit and just set
5480 * add_to_xmax as the only locker/updater. If all lockers are gone
5481 * and we have an updater that aborted, we can also do without a
5482 * multi.
5483 *
5484 * The cost of doing GetMultiXactIdMembers would be paid by
5485 * MultiXactIdExpand if we weren't to do this, so this check is not
5486 * incurring extra work anyhow.
5487 */
5489 {
5492 old_infomask)))
5493 {
5494 /*
5495 * Reset these bits and restart; otherwise fall through to
5496 * create a new multi below.
5497 */
5500 goto l5;
5501 }
5502 }
5503
5505
5506 new_xmax = MultiXactIdExpand((MultiXactId) xmax, add_to_xmax,
5507 new_status);
5509 }
5511 {
5512 /*
5513 * It's a committed update, so we need to preserve it as the updater of
5514 * the tuple.
5515 */
5516 MultiXactStatus status;
5518
5520 status = MultiXactStatusUpdate;
5521 else
5523
5525
5526 /*
5527 * since it's not running, it's obviously impossible for the old
5528 * updater to be identical to the current one, so we need not check
5529 * for that case as we do in the block above.
5530 */
5531 new_xmax = MultiXactIdCreate(xmax, status, add_to_xmax, new_status);
5533 }
5534 else if (TransactionIdIsInProgress(xmax))
5535 {
5536 /*
5537 * If the XMAX is a valid, in-progress TransactionId, then we need to
5538 * create a new MultiXactId that includes both the old locker or
5539 * updater and our own TransactionId.
5540 */
5544
5546 {
5552 {
5555 else
5557 }
5558 else
5559 {
5560 /*
5561 * LOCK_ONLY can be present alone only when a page has been
5562 * upgraded by pg_upgrade. But in that case,
5563 * TransactionIdIsInProgress() should have returned false. We
5564 * assume it's no longer locked in this case.
5565 */
5566 elog(WARNING, "LOCK_ONLY found for Xid in progress %u", xmax);
5569 goto l5;
5570 }
5571 }
5572 else
5573 {
5574 /* it's an update, but which kind? */
5577 else
5579 }
5580
5582
5583 /*
5584 * If the lock to be acquired is for the same TransactionId as the
5585 * existing lock, there's an optimization possible: consider only the
5586 * strongest of both locks as the only one present, and restart.
5587 */
5588 if (xmax == add_to_xmax)
5589 {
5590 /*
5591 * Note that it's not possible for the original tuple to be
5592 * updated: we wouldn't be here because the tuple would have been
5593 * invisible and we wouldn't try to update it. As a subtlety,
5594 * this code can also run when traversing an update chain to lock
5595 * future versions of a tuple. But we wouldn't be here either,
5596 * because the add_to_xmax would be different from the original
5597 * updater.
5598 */
5600
5601 /* acquire the strongest of both */
5602 if (mode < old_mode)
5603 mode = old_mode;
5604 /* mustn't touch is_update */
5605
5607 goto l5;
5608 }
5609
5610 /* otherwise, just fall back to creating a new multixact */
5612 new_xmax = MultiXactIdCreate(xmax, old_status,
5615 }
5618 {
5619 /*
5620 * It's a committed update, so we have to preserve it as the updater of
5621 * the tuple.
5622 */
5623 MultiXactStatus status;
5625
5627 status = MultiXactStatusUpdate;
5628 else
5630
5632
5633 /*
5634 * since it's not running, it's obviously impossible for the old
5635 * updater to be identical to the current one, so we need not check
5636 * for that case as we do in the block above.
5637 */
5638 new_xmax = MultiXactIdCreate(xmax, status, add_to_xmax, new_status);
5640 }
5641 else
5642 {
5643 /*
5644 * Can get here iff the locking/updating transaction was running when
5645 * the infomask was extracted from the tuple, but finished before
5646 * TransactionIdIsInProgress got to run. Deal with it as if there was
5647 * no locker at all in the first place.
5648 */
5650 goto l5;
5651 }
5652
5655 *result_xmax = new_xmax;
5656}
5657
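/*
 * [Editor's illustrative sketch -- not part of heapam.c.]  A pattern used
 * around the code above: given a tuple header, determine which transaction,
 * if any, is recorded in xmax as the *updater*.  Lock-only xmax values
 * (plain or multixact) carry no updater; a multixact with an update member
 * is resolved by HeapTupleHeaderGetUpdateXid().  The helper name is
 * hypothetical.
 */
static TransactionId
get_updater_xid(HeapTupleHeader tup)
{
	uint16		infomask = tup->t_infomask;

	if ((infomask & HEAP_XMAX_INVALID) || HEAP_XMAX_IS_LOCKED_ONLY(infomask))
		return InvalidTransactionId;	/* nobody updated this tuple */

	/* for a multixact xmax this resolves to the updating member */
	return HeapTupleHeaderGetUpdateXid(tup);
}
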
5658/*
5659 * Subroutine for heap_lock_updated_tuple_rec.
5660 *
5661 * Given a hypothetical multixact status held by the transaction identified
5662 * with the given xid, does the current transaction need to wait, fail, or can
5663 * it continue if it wanted to acquire a lock of the given mode? "needwait"
5664 * is set to true if waiting is necessary; if it can continue, then TM_Ok is
5665 * returned. If the lock is already held by the current transaction, return
5666 * TM_SelfModified. In case of a conflict with another transaction, a
5667 * different HeapTupleSatisfiesUpdate return code is returned.
5668 *
5669 * The held status is said to be hypothetical because it might correspond to a
5670 * lock held by a single Xid, i.e. not a real MultiXactId; we express it this
5671 * way for simplicity of API.
5672 */
5673static TM_Result
5676 bool *needwait)
5677{
5679
5680 *needwait = false;
5682
5683 /*
5684 * Note: we *must* check TransactionIdIsInProgress before
5685 * TransactionIdDidAbort/Commit; see comment at top of heapam_visibility.c
5686 * for an explanation.
5687 */
5689 {
5690 /*
5691 * The tuple has already been locked by our own transaction. This is
5692 * very rare but can happen if multiple transactions are trying to
5693 * lock an ancient version of the same tuple.
5694 */
5695 return TM_SelfModified;
5696 }
5697 else if (TransactionIdIsInProgress(xid))
5698 {
5699 /*
5700 * If the locking transaction is running, what we do depends on
5701 * whether the lock modes conflict: if they do, then we must wait for
5702 * it to finish; otherwise we can fall through to lock this tuple
5703 * version without waiting.
5704 */
5707 {
5708 *needwait = true;
5709 }
5710
5711 /*
5712 * If we set needwait above, then this value doesn't matter;
5713 * otherwise, this value signals to caller that it's okay to proceed.
5714 */
5715 return TM_Ok;
5716 }
5717 else if (TransactionIdDidAbort(xid))
5718 return TM_Ok;
5719 else if (TransactionIdDidCommit(xid))
5720 {
5721 /*
5722 * The other transaction committed. If it was only a locker, then the
5723 * lock is completely gone now and we can return success; but if it
5724 * was an update, then what we do depends on whether the two lock
5725 * modes conflict. If they conflict, then we must report error to
5726 * caller. But if they don't, we can fall through to allow the current
5727 * transaction to lock the tuple.
5728 *
5729 * Note: the reason we worry about ISUPDATE here is because as soon as
5730 * a transaction ends, all its locks are gone and meaningless, and
5731 * thus we can ignore them; whereas its updates persist. In the
5732 * TransactionIdIsInProgress case, above, we don't need to check
5733 * because we know the lock is still "alive" and thus a conflict needs
5734 * always be checked.
5735 */
5736 if (!ISUPDATE_from_mxstatus(status))
5737 return TM_Ok;
5738
5741 {
5742 /* bummer */
5743 if (!ItemPointerEquals(&tup->t_self, &tup->t_data->t_ctid))
5744 return TM_Updated;
5745 else
5746 return TM_Deleted;
5747 }
5748
5749 return TM_Ok;
5750 }
5751
5752 /* Not in progress, not aborted, not committed -- must have crashed */
5753 return TM_Ok;
5754}
5755
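/*
 * [Editor's illustrative sketch -- not part of heapam.c.]  The conflict test
 * used above maps both the held and the wanted tuple-lock strengths to
 * regular heavyweight lock modes and lets the lock manager decide.  For
 * example, a held FOR KEY SHARE does not conflict with another FOR KEY
 * SHARE, but it does conflict with FOR UPDATE.  The helper name is
 * hypothetical.
 */
static bool
mxact_statuses_conflict(MultiXactStatus held, MultiXactStatus wanted)
{
	return DoLockModesConflict(LOCKMODE_from_mxstatus(held),
							   LOCKMODE_from_mxstatus(wanted));
}
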
5756
5757/*
5758 * Recursive part of heap_lock_updated_tuple
5759 *
5760 * Fetch the tuple pointed to by tid in rel, and mark it as locked by the given
5761 * xid with the given mode; if this tuple is updated, recurse to lock the new
5762 * version as well.
5763 */
5764static TM_Result
5766 const ItemPointerData *tid, TransactionId xid,
5768{
5769 TM_Result result;
5772 Buffer buf;
5777 TransactionId xmax,
5778 new_xmax;
5779 bool cleared_all_frozen = false;
5781 Buffer vmbuffer = InvalidBuffer;
5782 BlockNumber block;
5783
5784 ItemPointerCopy(tid, &tupid);
5785
5786 for (;;)
5787 {
5788 new_infomask = 0;
5789 new_xmax = InvalidTransactionId;
5791 ItemPointerCopy(&tupid, &(mytup.t_self));
5792
5793 if (!heap_fetch(rel, SnapshotAny, &mytup, &buf, false))
5794 {
5795 /*
5796 * if we fail to find the updated version of the tuple, it's
5797 * because it was vacuumed/pruned away after its creator
5798 * transaction aborted. So behave as if we got to the end of the
5799 * chain, and there's no further tuple to lock: return success to
5800 * caller.
5801 */
5802 result = TM_Ok;
5803 goto out_unlocked;
5804 }
5805
5806l4:
5808
5809 /*
5810 * Before locking the buffer, pin the visibility map page if it
5811 * appears to be necessary. Since we haven't got the lock yet,
5812 * someone else might be in the middle of changing this, so we'll need
5813 * to recheck after we have the lock.
5814 */
5816 {
5817 visibilitymap_pin(rel, block, &vmbuffer);
5818 pinned_desired_page = true;
5819 }
5820 else
5821 pinned_desired_page = false;
5822
5824
5825 /*
5826 * If we didn't pin the visibility map page and the page has become
5827 * all visible while we were busy locking the buffer, we'll have to
5828 * unlock and re-lock, to avoid holding the buffer lock across I/O.
5829 * That's a bit unfortunate, but hopefully shouldn't happen often.
5830 *
5831 * Note: in some paths through this function, we will reach here
5832 * holding a pin on a vm page that may or may not be the one matching
5833 * this page. If this page isn't all-visible, we won't use the vm
5834 * page, but we hold onto such a pin till the end of the function.
5835 */
5837 {
5839 visibilitymap_pin(rel, block, &vmbuffer);
5841 }
5842
5843 /*
5844 * Check the tuple XMIN against prior XMAX, if any. If we reached the
5845 * end of the chain, we're done, so return success.
5846 */
5849 priorXmax))
5850 {
5851 result = TM_Ok;
5852 goto out_locked;
5853 }
5854
5855 /*
5856 * Also check Xmin: if this tuple was created by an aborted
5857 * (sub)transaction, then we already locked the last live one in the
5858 * chain, thus we're done, so return success.
5859 */
5861 {
5862 result = TM_Ok;
5863 goto out_locked;
5864 }
5865
5866 old_infomask = mytup.t_data->t_infomask;
5867 old_infomask2 = mytup.t_data->t_infomask2;
5868 xmax = HeapTupleHeaderGetRawXmax(mytup.t_data);
5869
5870 /*
5871 * If this tuple version has been updated or locked by some concurrent
5872 * transaction(s), what we do depends on whether our lock mode
5873 * conflicts with what those other transactions hold, and also on the
5874 * status of them.
5875 */
5877 {
5879 bool needwait;
5880
5883 {
5884 int nmembers;
5885 int i;
5886 MultiXactMember *members;
5887
5888 /*
5889 * We don't need a test for pg_upgrade'd tuples: this is only
5890 * applied to tuples after the first in an update chain. Said
5891 * first tuple in the chain may well be locked-in-9.2-and-
5892 * pg_upgraded, but that one was already locked by our caller,
5893 * not us; and any subsequent ones cannot be because our
5894 * caller must necessarily have obtained a snapshot later than
5895 * the pg_upgrade itself.
5896 */
5897 Assert(!HEAP_LOCKED_UPGRADED(mytup.t_data->t_infomask));
5898
5899 nmembers = GetMultiXactIdMembers(rawxmax, &members, false,
5901 for (i = 0; i < nmembers; i++)
5902 {
5903 result = test_lockmode_for_conflict(members[i].status,
5904 members[i].xid,
5905 mode,
5906 &mytup,
5907 &needwait);
5908
5909 /*
5910 * If the tuple was already locked by ourselves in a
5911 * previous iteration of this (say heap_lock_tuple was
5912 * forced to restart the locking loop because of a change
5913 * in xmax), then we hold the lock already on this tuple
5914 * version and we don't need to do anything; and this is
5915 * not an error condition either. We just need to skip
5916 * this tuple and continue locking the next version in the
5917 * update chain.
5918 */
5919 if (result == TM_SelfModified)
5920 {
5921 pfree(members);
5922 goto next;
5923 }
5924
5925 if (needwait)
5926 {
5928 XactLockTableWait(members[i].xid, rel,
5929 &mytup.t_self,
5931 pfree(members);
5932 goto l4;
5933 }
5934 if (result != TM_Ok)
5935 {
5936 pfree(members);
5937 goto out_locked;
5938 }
5939 }
5940 if (members)
5941 pfree(members);
5942 }
5943 else
5944 {
5945 MultiXactStatus status;
5946
5947 /*
5948 * For a non-multi Xmax, we first need to compute the
5949 * corresponding MultiXactStatus by using the infomask bits.
5950 */
5952 {
5956 status = MultiXactStatusForShare;
5958 {
5960 status = MultiXactStatusForUpdate;
5961 else
5963 }
5964 else
5965 {
5966 /*
5967 * LOCK_ONLY present alone (a pg_upgraded tuple marked
5968 * as share-locked in the old cluster) shouldn't be
5969 * seen in the middle of an update chain.
5970 */
5971 elog(ERROR, "invalid lock status in tuple");
5972 }
5973 }
5974 else
5975 {
5976 /* it's an update, but which kind? */
5978 status = MultiXactStatusUpdate;
5979 else
5981 }
5982
5983 result = test_lockmode_for_conflict(status, rawxmax, mode,
5984 &mytup, &needwait);
5985
5986 /*
5987 * If the tuple was already locked by ourselves in a previous
5988 * iteration of this (say heap_lock_tuple was forced to
5989 * restart the locking loop because of a change in xmax), then
5990 * we hold the lock already on this tuple version and we don't
5991 * need to do anything; and this is not an error condition
5992 * either. We just need to skip this tuple and continue
5993 * locking the next version in the update chain.
5994 */
5995 if (result == TM_SelfModified)
5996 goto next;
5997
5998 if (needwait)
5999 {
6001 XactLockTableWait(rawxmax, rel, &mytup.t_self,
6003 goto l4;
6004 }
6005 if (result != TM_Ok)
6006 {
6007 goto out_locked;
6008 }
6009 }
6010 }
6011
6012 /* compute the new Xmax and infomask values for the tuple ... */
6013 compute_new_xmax_infomask(xmax, old_infomask, mytup.t_data->t_infomask2,
6014 xid, mode, false,
6015 &new_xmax, &new_infomask, &new_infomask2);
6016
6018 visibilitymap_clear(rel, block, vmbuffer,
6020 cleared_all_frozen = true;
6021
6023
6024 /* ... and set them */
6025 HeapTupleHeaderSetXmax(mytup.t_data, new_xmax);
6026 mytup.t_data->t_infomask &= ~HEAP_XMAX_BITS;
6027 mytup.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
6028 mytup.t_data->t_infomask |= new_infomask;
6029 mytup.t_data->t_infomask2 |= new_infomask2;
6030
6032
6033 /* XLOG stuff */
6034 if (RelationNeedsWAL(rel))
6035 {
6038 Page page = BufferGetPage(buf);
6039
6042
6043 xlrec.offnum = ItemPointerGetOffsetNumber(&mytup.t_self);
6044 xlrec.xmax = new_xmax;
6046 xlrec.flags =
6048
6050
6052
6053 PageSetLSN(page, recptr);
6054 }
6055
6057
6058next:
6059 /* if we find the end of update chain, we're done. */
6060 if (mytup.t_data->t_infomask & HEAP_XMAX_INVALID ||
6062 ItemPointerEquals(&mytup.t_self, &mytup.t_data->t_ctid) ||
6064 {
6065 result = TM_Ok;
6066 goto out_locked;
6067 }
6068
6069 /* tail recursion */
6071 ItemPointerCopy(&(mytup.t_data->t_ctid), &tupid);
6073 }
6074
6075 result = TM_Ok;
6076
6079
6081 if (vmbuffer != InvalidBuffer)
6082 ReleaseBuffer(vmbuffer);
6083
6084 return result;
6085}
6086
6087/*
6088 * heap_lock_updated_tuple
6089 * Follow update chain when locking an updated tuple, acquiring locks (row
6090 * marks) on the updated versions.
6091 *
6092 * 'prior_infomask', 'prior_raw_xmax' and 'prior_ctid' are the corresponding
6093 * fields from the initial tuple. We will lock the tuples starting from the
6094 * one that 'prior_ctid' points to. Note: This function does not lock the
6095 * initial tuple itself.
6096 *
6097 * This function doesn't check visibility, it just unconditionally marks the
6098 * tuple(s) as locked. If any tuple in the updated chain is being deleted
6099 * concurrently (or updated with the key being modified), sleep until the
6100 * transaction doing it is finished.
6101 *
6102 * Note that we don't acquire heavyweight tuple locks on the tuples we walk
6103 * when we have to wait for other transactions to release them, as opposed to
6104 * what heap_lock_tuple does. The reason is that having more than one
6105 * transaction walking the chain is probably uncommon enough that the risk
6106 * of starvation is low: one of the preconditions for being here is that
6107 * the snapshot in use predates the update that created this tuple (because we
6108 * started at an earlier version of the tuple), but at the same time such a
6109 * transaction cannot be using repeatable read or serializable isolation
6110 * levels, because that would lead to a serializability failure.
6111 */
6112static TM_Result
6113heap_lock_updated_tuple(Relation rel, uint16 prior_infomask,
6114 TransactionId prior_raw_xmax,
6115 const ItemPointerData *prior_ctid,
6116 TransactionId xid,
6117 LockTupleMode mode)
6118{
6119 INJECTION_POINT("heap_lock_updated_tuple", NULL);
6120
6121 /*
6122 * If the tuple has moved into another partition (effectively a delete)
6123 * stop here.
6124 */
6125 if (!ItemPointerIndicatesMovedPartitions(prior_ctid))
6126 {
6128
6129 /*
6130 * If this is the first possibly-multixact-able operation in the
6131 * current transaction, set my per-backend OldestMemberMXactId
6132 * setting. We can be certain that the transaction will never become a
6133 * member of any older MultiXactIds than that. (We have to do this
6134 * even if we end up just using our own TransactionId below, since
6135 * some other backend could incorporate our XID into a MultiXact
6136 * immediately afterwards.)
6137 */
6138 MultiXactIdSetOldestMember();
6139
6143 }
6144
6145 /* nothing to lock */
6146 return TM_Ok;
6147}
6148
6149/*
6150 * heap_finish_speculative - mark speculative insertion as successful
6151 *
6152 * To successfully finish a speculative insertion we have to clear speculative
6153 * token from tuple. To do so the t_ctid field, which will contain a
6154 * speculative token value, is modified in place to point to the tuple itself,
6155 * which is characteristic of a newly inserted ordinary tuple.
6156 *
6157 * NB: It is not ok to commit without either finishing or aborting a
6158 * speculative insertion. We could treat speculative tuples of committed
6159 * transactions implicitly as completed, but then we would have to be prepared
6160 * to deal with speculative tokens on committed tuples. That wouldn't be
6161 * difficult - no-one looks at the ctid field of a tuple with invalid xmax -
6162 * but clearing the token at completion isn't very expensive either.
6163 * An explicit confirmation WAL record also makes logical decoding simpler.
6164 */
6165void
6167{
6168 Buffer buffer;
6169 Page page;
6170 OffsetNumber offnum;
6171 ItemId lp;
6172 HeapTupleHeader htup;
6173
6174 buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid));
6175 LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
6176 page = BufferGetPage(buffer);
6177
6178 offnum = ItemPointerGetOffsetNumber(tid);
6179 if (PageGetMaxOffsetNumber(page) < offnum)
6180 elog(ERROR, "offnum out of range");
6181 lp = PageGetItemId(page, offnum);
6182 if (!ItemIdIsNormal(lp))
6183 elog(ERROR, "invalid lp");
6184
6185 htup = (HeapTupleHeader) PageGetItem(page, lp);
6186
6187 /* NO EREPORT(ERROR) from here till changes are logged */
6188 START_CRIT_SECTION();
6189
6190 Assert(HeapTupleHeaderIsSpeculative(htup));
6191
6192 MarkBufferDirty(buffer);
6193
6194 /*
6195 * Replace the speculative insertion token with a real t_ctid, pointing to
6196 * itself like it does on regular tuples.
6197 */
6198 htup->t_ctid = *tid;
6199
6200 /* XLOG stuff */
6201 if (RelationNeedsWAL(relation))
6202 {
6203 xl_heap_confirm xlrec;
6204 XLogRecPtr recptr;
6205
6206 xlrec.offnum = ItemPointerGetOffsetNumber(tid);
6207
6208 XLogBeginInsert();
6209
6210 /* We want the same filtering on this as on a plain insert */
6211 XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN);
6212
6213 XLogRegisterData(&xlrec, SizeOfHeapConfirm);
6214 XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
6215
6216 recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_CONFIRM);
6217
6218 PageSetLSN(page, recptr);
6219 }
6220
6221 END_CRIT_SECTION();
6222
6223 UnlockReleaseBuffer(buffer);
6224}
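/*
 * Editor's illustrative sketch only: a toy model of "confirming" a
 * speculative insertion by overwriting the token stored in the ctid slot
 * with the tuple's own TID, so it looks like an ordinary newly inserted
 * tuple.  ToySlot and TOY_SPECULATIVE_TOKEN are hypothetical; the real
 * code edits t_ctid inside a critical section and emits XLOG_HEAP_CONFIRM.
 */
#include <assert.h>
#include <stdio.h>

#define TOY_SPECULATIVE_TOKEN  (-1)     /* stand-in for a speculative token */

typedef struct ToySlot
{
    int         self_tid;   /* where the tuple lives */
    int         ctid;       /* token while speculative, self_tid once confirmed */
} ToySlot;

static void
toy_finish_speculative(ToySlot *slot)
{
    assert(slot->ctid == TOY_SPECULATIVE_TOKEN);    /* must still be speculative */
    slot->ctid = slot->self_tid;                    /* now looks like a plain insert */
}

int
main(void)
{
    ToySlot     slot = {.self_tid = 42, .ctid = TOY_SPECULATIVE_TOKEN};

    toy_finish_speculative(&slot);
    printf("ctid now %d (points at itself)\n", slot.ctid);
    return 0;
}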
6225
6226/*
6227 * heap_abort_speculative - kill a speculatively inserted tuple
6228 *
6229 * Marks a tuple that was speculatively inserted in the same command as dead,
6230 * by setting its xmin as invalid. That makes it immediately appear as dead
6231 * to all transactions, including our own. In particular, it makes
6232 * HeapTupleSatisfiesDirty() regard the tuple as dead, so that another backend
6233 * inserting a duplicate key value won't unnecessarily wait for our whole
6234 * transaction to finish (it'll just wait for our speculative insertion to
6235 * finish).
6236 *
6237 * Killing the tuple prevents "unprincipled deadlocks", which are deadlocks
6238 * that arise due to a mutual dependency that is not user visible. By
6239 * definition, unprincipled deadlocks cannot be prevented by the user
6240 * reordering lock acquisition in client code, because the implementation level
6241 * lock acquisitions are not under the user's direct control. If speculative
6242 * inserters did not take this precaution, then under high concurrency they
6243 * could deadlock with each other, which would not be acceptable.
6244 *
6245 * This is somewhat redundant with heap_delete, but we prefer to have a
6246 * dedicated routine with stripped down requirements. Note that this is also
6247 * used to delete the TOAST tuples created during speculative insertion.
6248 *
6249 * This routine does not affect logical decoding as it only looks at
6250 * confirmation records.
6251 */
6252void
6254{
6255 TransactionId xid = GetCurrentTransactionId();
6256 ItemId lp;
6257 HeapTupleData tp;
6258 Page page;
6259 BlockNumber block;
6260 Buffer buffer;
6261
6262 Assert(ItemPointerIsValid(tid));
6263
6264 block = ItemPointerGetBlockNumber(tid);
6265 buffer = ReadBuffer(relation, block);
6266 page = BufferGetPage(buffer);
6267
6268 LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
6269
6270 /*
6271 * Page can't be all visible, we just inserted into it, and are still
6272 * running.
6273 */
6274 Assert(!PageIsAllVisible(page));
6275
6276 lp = PageGetItemId(page, ItemPointerGetOffsetNumber(tid));
6277 Assert(ItemIdIsNormal(lp));
6278
6279 tp.t_tableOid = RelationGetRelid(relation);
6280 tp.t_data = (HeapTupleHeader) PageGetItem(page, lp);
6281 tp.t_len = ItemIdGetLength(lp);
6282 tp.t_self = *tid;
6283
6284 /*
6285 * Sanity check that the tuple really is a speculatively inserted tuple,
6286 * inserted by us.
6287 */
6288 if (tp.t_data->t_choice.t_heap.t_xmin != xid)
6289 elog(ERROR, "attempted to kill a tuple inserted by another transaction");
6290 if (!(IsToastRelation(relation) || HeapTupleHeaderIsSpeculative(tp.t_data)))
6291 elog(ERROR, "attempted to kill a non-speculative tuple");
6292 Assert(!HeapTupleHeaderIsHeapOnly(tp.t_data));
6293
6294 /*
6295 * No need to check for serializable conflicts here. There is never a
6296 * need for a combo CID, either. No need to extract replica identity, or
6297 * do anything special with infomask bits.
6298 */
6299
6300 START_CRIT_SECTION();
6301
6302 /*
6303 * The tuple will become DEAD immediately. Flag that this page is a
6304 * candidate for pruning by setting xmin to TransactionXmin. While not
6305 * immediately prunable, it is the oldest xid we can cheaply determine
6306 * that's safe against wraparound / being older than the table's
6307 * relfrozenxid. To defend against the unlikely case of a new relation
6308 * having a newer relfrozenxid than our TransactionXmin, use relfrozenxid
6309 * if so (vacuum can't subsequently move relfrozenxid to beyond
6310 * TransactionXmin, so there's no race here).
6311 */
6313 {
6314 TransactionId relfrozenxid = relation->rd_rel->relfrozenxid;
6315 TransactionId prune_xid;
6316
6317 if (TransactionIdPrecedes(TransactionXmin, relfrozenxid))
6318 prune_xid = relfrozenxid;
6319 else
6320 prune_xid = TransactionXmin;
6321 PageSetPrunable(page, prune_xid);
6322 }
6323
6324 /* store transaction information of xact deleting the tuple */
6325 tp.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
6326 tp.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
6327
6328 /*
6329 * Set the tuple header xmin to InvalidTransactionId. This makes the
6330 * tuple immediately invisible to everyone. (In particular, to any
6331 * transactions waiting on the speculative token, woken up later.)
6332 */
6333 HeapTupleHeaderSetXmin(tp.t_data, InvalidTransactionId);
6334
6335 /* Clear the speculative insertion token too */
6336 tp.t_data->t_ctid = tp.t_self;
6337
6338 MarkBufferDirty(buffer);
6339
6340 /*
6341 * XLOG stuff
6342 *
6343 * The WAL records generated here match heap_delete(). The same recovery
6344 * routines are used.
6345 */
6346 if (RelationNeedsWAL(relation))
6347 {
6348 xl_heap_delete xlrec;
6349 XLogRecPtr recptr;
6350
6351 xlrec.flags = XLH_DELETE_IS_SUPER;
6352 xlrec.infobits_set = compute_infobits(tp.t_data->t_infomask,
6353 tp.t_data->t_infomask2);
6354 xlrec.offnum = ItemPointerGetOffsetNumber(&tp.t_self);
6355 xlrec.xmax = xid;
6356
6357 XLogBeginInsert();
6358 XLogRegisterData(&xlrec, SizeOfHeapDelete);
6359 XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
6360
6361 /* No replica identity & replication origin logged */
6362
6363 recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_DELETE);
6364
6365 PageSetLSN(page, recptr);
6366 }
6367
6368 END_CRIT_SECTION();
6369
6370 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
6371
6372 if (HeapTupleHasExternal(&tp))
6373 {
6374 Assert(!IsToastRelation(relation));
6375 heap_toast_delete(relation, &tp, true);
6376 }
6377
6378 /*
6379 * Never need to mark tuple for invalidation, since catalogs don't support
6380 * speculative insertion
6381 */
6382
6383 /* Now we can release the buffer */
6384 ReleaseBuffer(buffer);
6385
6386 /* count deletion, as we counted the insertion too */
6387 pgstat_count_heap_delete(relation);
6388}
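/*
 * Editor's illustrative sketch only: a toy visibility test showing why
 * clearing xmin makes a speculatively inserted row appear dead to everyone
 * at once, so a backend probing for a duplicate key need not wait for the
 * whole inserting transaction.  The constant and the one-line visibility
 * rule below are hypothetical simplifications of the real
 * HeapTupleSatisfiesDirty() logic the comment above refers to.
 */
#include <stdbool.h>
#include <stdio.h>

#define TOY_INVALID_XID 0

typedef struct ToyHeader
{
    unsigned    xmin;       /* inserting transaction, 0 = invalid */
} ToyHeader;

/* toy rule: a row with invalid xmin never existed as far as readers care */
static bool
toy_tuple_visible(const ToyHeader *tup)
{
    return tup->xmin != TOY_INVALID_XID;
}

int
main(void)
{
    ToyHeader   tup = {.xmin = 1234};

    printf("before abort: visible=%d\n", toy_tuple_visible(&tup));
    tup.xmin = TOY_INVALID_XID;     /* what the abort effectively does */
    printf("after abort:  visible=%d\n", toy_tuple_visible(&tup));
    return 0;
}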
6389
6390/*
6391 * heap_inplace_lock - protect inplace update from concurrent heap_update()
6392 *
6393 * Evaluate whether the tuple's state is compatible with a no-key update.
6394 * Current transaction rowmarks are fine, as is KEY SHARE from any
6395 * transaction. If compatible, return true with the buffer exclusive-locked,
6396 * and the caller must release that by calling
6397 * heap_inplace_update_and_unlock(), calling heap_inplace_unlock(), or raising
6398 * an error. Otherwise, call release_callback(arg), wait for blocking
6399 * transactions to end, and return false.
6400 *
6401 * Since this is intended for system catalogs and SERIALIZABLE doesn't cover
6402 * DDL, this doesn't guarantee any particular predicate locking.
6403 *
6404 * heap_delete() is a rarer source of blocking transactions (xwait). We'll
6405 * wait for such a transaction just like for the normal heap_update() case.
6406 * Normal concurrent DROP commands won't cause that, because all inplace
6407 * updaters take some lock that conflicts with DROP. An explicit SQL "DELETE
6408 * FROM pg_class" can cause it. By waiting, if the concurrent transaction
6409 * executed both "DELETE FROM pg_class" and "INSERT INTO pg_class", our caller
6410 * can find the successor tuple.
6411 *
6412 * Readers of inplace-updated fields expect changes to those fields are
6413 * durable. For example, vac_truncate_clog() reads datfrozenxid from
6414 * pg_database tuples via catalog snapshots. A future snapshot must not
6415 * return a lower datfrozenxid for the same database OID (lower in the
6416 * FullTransactionIdPrecedes() sense). We achieve that since no update of a
6417 * tuple can start while we hold a lock on its buffer. In cases like
6418 * BEGIN;GRANT;CREATE INDEX;COMMIT we're inplace-updating a tuple visible only
6419 * to this transaction. ROLLBACK then is one case where it's okay to lose
6420 * inplace updates. (Restoring relhasindex=false on ROLLBACK is fine, since
6421 * any concurrent CREATE INDEX would have blocked, then inplace-updated the
6422 * committed tuple.)
6423 *
6424 * In principle, we could avoid waiting by overwriting every tuple in the
6425 * updated tuple chain. Reader expectations permit updating a tuple only if
6426 * it's aborted, is the tail of the chain, or we already updated the tuple
6427 * referenced in its t_ctid. Hence, we would need to overwrite the tuples in
6428 * order from tail to head. That would imply either (a) mutating all tuples
6429 * in one critical section or (b) accepting a chance of partial completion.
6430 * Partial completion of a relfrozenxid update would have the weird
6431 * consequence that the table's next VACUUM could see the table's relfrozenxid
6432 * move forward between vacuum_get_cutoffs() and finishing.
6433 */
6434bool
6435heap_inplace_lock(Relation relation,
6436 HeapTuple oldtup_ptr, Buffer buffer,
6437 void (*release_callback) (void *), void *arg)
6438{
6439 HeapTupleData oldtup = *oldtup_ptr; /* minimize diff vs. heap_update() */
6440 TM_Result result;
6441 bool ret;
6442
6443#ifdef USE_ASSERT_CHECKING
6444 if (RelationGetRelid(relation) == RelationRelationId)
6445 check_inplace_rel_lock(oldtup_ptr);
6446#endif
6447
6448 Assert(BufferIsValid(buffer));
6449
6450 /*
6451 * Register shared cache invals if necessary. Other sessions may finish
6452 * inplace updates of this tuple between this step and LockTuple(). Since
6453 * inplace updates don't change cache keys, that's harmless.
6454 *
6455 * While it's tempting to register invals only after confirming we can
6456 * return true, the following obstacle precludes reordering steps that
6457 * way. Registering invals might reach a CatalogCacheInitializeCache()
6458 * that locks "buffer". That would hang indefinitely if running after our
6459 * own LockBuffer(). Hence, we must register invals before LockBuffer().
6460 */
6461 CacheInvalidateHeapTupleInplace(relation, oldtup_ptr, NULL);
6462
6463 LockTuple(relation, &oldtup.t_self, InplaceUpdateTupleLock);
6464 LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
6465
6466 /*----------
6467 * Interpret HeapTupleSatisfiesUpdate() like heap_update() does, except:
6468 *
6469 * - wait unconditionally
6470 * - already locked tuple above, since inplace needs that unconditionally
6471 * - don't recheck header after wait: simpler to defer to next iteration
6472 * - don't try to continue even if the updater aborts: likewise
6473 * - no crosscheck
6474 */
6475 result = HeapTupleSatisfiesUpdate(&oldtup, GetCurrentCommandId(false),
6476 buffer);
6477
6478 if (result == TM_Invisible)
6479 {
6480 /* no known way this can happen */
6481 ereport(ERROR,
6483 errmsg_internal("attempted to overwrite invisible tuple")));
6484 }
6485 else if (result == TM_SelfModified)
6486 {
6487 /*
6488 * CREATE INDEX might reach this if an expression is silly enough to
6489 * call e.g. SELECT ... FROM pg_class FOR SHARE. C code of other SQL
6490 * statements might get here after a heap_update() of the same row, in
6491 * the absence of an intervening CommandCounterIncrement().
6492 */
6493 ereport(ERROR,
6494 (errcode(ERRCODE_TRIGGERED_DATA_CHANGE_VIOLATION),
6495 errmsg("tuple to be updated was already modified by an operation triggered by the current command")));
6496 }
6497 else if (result == TM_BeingModified)
6498 {
6499 TransactionId xwait;
6500 uint16 infomask;
6501
6502 xwait = HeapTupleHeaderGetRawXmax(oldtup.t_data);
6503 infomask = oldtup.t_data->t_infomask;
6504
6505 if (infomask & HEAP_XMAX_IS_MULTI)
6506 {
6507 LockTupleMode lockmode = LockTupleNoKeyExclusive;
6508 MultiXactStatus mxact_status = MultiXactStatusNoKeyUpdate;
6509 int remain;
6510
6511 if (DoesMultiXactIdConflict((MultiXactId) xwait, infomask,
6512 lockmode, NULL))
6513 {
6514 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
6515 release_callback(arg);
6516 ret = false;
6517 MultiXactIdWait((MultiXactId) xwait, mxact_status, infomask,
6518 relation, &oldtup.t_self, XLTW_Update,
6519 &remain);
6520 }
6521 else
6522 ret = true;
6523 }
6524 else if (TransactionIdIsCurrentTransactionId(xwait))
6525 ret = true;
6526 else if (HEAP_XMAX_IS_KEYSHR_LOCKED(infomask))
6527 ret = true;
6528 else
6529 {
6530 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
6531 release_callback(arg);
6532 ret = false;
6533 XactLockTableWait(xwait, relation, &oldtup.t_self,
6534 XLTW_Update);
6535 }
6536 }
6537 else
6538 {
6539 ret = (result == TM_Ok);
6540 if (!ret)
6541 {
6542 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
6543 release_callback(arg);
6544 }
6545 }
6546
6547 /*
6548 * GetCatalogSnapshot() relies on invalidation messages to know when to
6549 * take a new snapshot. COMMIT of xwait is responsible for sending the
6550 * invalidation. We're not acquiring heavyweight locks sufficient to
6551 * block if not yet sent, so we must take a new snapshot to ensure a later
6552 * attempt has a fair chance. While we don't need this if xwait aborted,
6553 * don't bother optimizing that.
6554 */
6555 if (!ret)
6556 {
6557 UnlockTuple(relation, &oldtup.t_self, InplaceUpdateTupleLock);
6558 ForgetInplace_Inval();
6559 InvalidateCatalogSnapshot();
6560 }
6561 return ret;
6562}
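/*
 * Editor's illustrative sketch only: a toy version of the compatibility
 * test that the function above applies to a tuple's xmax before allowing an
 * inplace update.  Locks held by our own transaction and KEY SHARE locks
 * from any transaction are compatible; anything else means "wait, then the
 * caller retries".  The enum and struct are hypothetical simplifications of
 * the real infomask checks (HEAP_XMAX_IS_MULTI, HEAP_XMAX_IS_KEYSHR_LOCKED,
 * and so on).
 */
#include <stdbool.h>
#include <stdio.h>

typedef enum ToyXmaxKind
{
    TOY_XMAX_NONE,          /* no locker or updater */
    TOY_XMAX_KEYSHARE,      /* KEY SHARE lock, any transaction */
    TOY_XMAX_OTHER_LOCK,    /* stronger lock or pending update */
} ToyXmaxKind;

typedef struct ToyTupleState
{
    ToyXmaxKind kind;
    bool        xmax_is_our_xact;   /* does xmax belong to our transaction? */
} ToyTupleState;

static bool
toy_inplace_compatible(const ToyTupleState *state)
{
    if (state->kind == TOY_XMAX_NONE)
        return true;
    if (state->xmax_is_our_xact)
        return true;                            /* our own rowmark is fine */
    return state->kind == TOY_XMAX_KEYSHARE;    /* KEY SHARE from anyone is fine */
}

int
main(void)
{
    ToyTupleState blocked = {TOY_XMAX_OTHER_LOCK, false};
    ToyTupleState keyshare = {TOY_XMAX_KEYSHARE, false};

    printf("other lock: proceed=%d\n", toy_inplace_compatible(&blocked));
    printf("key share:  proceed=%d\n", toy_inplace_compatible(&keyshare));
    return 0;
}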
6563
6564/*
6565 * heap_inplace_update_and_unlock - core of systable_inplace_update_finish
6566 *
6567 * The tuple cannot change size, and therefore its header fields and null
6568 * bitmap (if any) don't change either.
6569 *
6570 * Since we hold LOCKTAG_TUPLE, no updater has a local copy of this tuple.
6571 */
6572void
6573heap_inplace_update_and_unlock(Relation relation,
6574 HeapTuple oldtup, HeapTuple tuple,
6575 Buffer buffer)
6576{
6577 HeapTupleHeader htup = oldtup->t_data;
6578 uint32 oldlen;
6579 uint32 newlen;
6580 char *dst;
6581 char *src;
6582 int nmsgs = 0;
6583 SharedInvalidationMessage *invalMessages = NULL;
6584 bool RelcacheInitFileInval = false;
6585
6586 Assert(ItemPointerEquals(&oldtup->t_self, &tuple->t_self));
6587 oldlen = oldtup->t_len - htup->t_hoff;
6588 newlen = tuple->t_len - tuple->t_data->t_hoff;
6589 if (oldlen != newlen || htup->t_hoff != tuple->t_data->t_hoff)
6590 elog(ERROR, "wrong tuple length");
6591
6592 dst = (char *) htup + htup->t_hoff;
6593 src = (char *) tuple->t_data + tuple->t_data->t_hoff;
6594
6595 /* Like RecordTransactionCommit(), log only if needed */
6596 if (XLogStandbyInfoActive())
6597 nmsgs = inplaceGetInvalidationMessages(&invalMessages,
6598 &RelcacheInitFileInval);
6599
6600 /*
6601 * Unlink relcache init files as needed. If unlinking, acquire
6602 * RelCacheInitLock until after associated invalidations. By doing this
6603 * in advance, if we checkpoint and then crash between inplace
6604 * XLogInsert() and inval, we don't rely on StartupXLOG() ->
6605 * RelationCacheInitFileRemove(). That uses elevel==LOG, so replay would
6606 * neglect to PANIC on EIO.
6607 */
6608 PreInplace_Inval();
6609
6610 /*----------
6611 * NO EREPORT(ERROR) from here till changes are complete
6612 *
6613 * Our buffer lock won't stop a reader having already pinned and checked
6614 * visibility for this tuple. Hence, we write WAL first, then mutate the
6615 * buffer. Like in MarkBufferDirtyHint() or RecordTransactionCommit(),
6616 * checkpoint delay makes that acceptable. With the usual order of
6617 * changes, a crash after memcpy() and before XLogInsert() could allow
6618 * datfrozenxid to overtake relfrozenxid:
6619 *
6620 * ["D" is a VACUUM (ONLY_DATABASE_STATS)]
6621 * ["R" is a VACUUM tbl]
6622 * D: vac_update_datfrozenxid() -> systable_beginscan(pg_class)
6623 * D: systable_getnext() returns pg_class tuple of tbl
6624 * R: memcpy() into pg_class tuple of tbl
6625 * D: raise pg_database.datfrozenxid, XLogInsert(), finish
6626 * [crash]
6627 * [recovery restores datfrozenxid w/o relfrozenxid]
6628 *
6629 * Mimic MarkBufferDirtyHint() subroutine XLogSaveBufferForHint().
6630 * Specifically, use DELAY_CHKPT_START, and copy the buffer to the stack.
6631 * The stack copy facilitates a FPI of the post-mutation block before we
6632 * accept other sessions seeing it. DELAY_CHKPT_START allows us to
6633 * XLogInsert() before MarkBufferDirty(). Since XLogSaveBufferForHint()
6634 * can operate under BUFFER_LOCK_SHARED, it can't avoid DELAY_CHKPT_START.
6635 * This function, however, likely could avoid it with the following order
6636 * of operations: MarkBufferDirty(), XLogInsert(), memcpy(). Opt to use
6637 * DELAY_CHKPT_START here, too, as a way to have fewer distinct code
6638 * patterns to analyze. Inplace update isn't so frequent that it should
6639 * pursue the small optimization of skipping DELAY_CHKPT_START.
6640 */
6644
6645 /* XLOG stuff */
6646 if (RelationNeedsWAL(relation))
6647 {
6650 char *origdata = (char *) BufferGetBlock(buffer);
6651 Page page = BufferGetPage(buffer);
6652 uint16 lower = ((PageHeader) page)->pd_lower;
6653 uint16 upper = ((PageHeader) page)->pd_upper;
6655 RelFileLocator rlocator;
6656 ForkNumber forkno;
6657 BlockNumber blkno;
6659
6660 xlrec.offnum = ItemPointerGetOffsetNumber(&tuple->t_self);
6661 xlrec.dbId = MyDatabaseId;
6663 xlrec.relcacheInitFileInval = RelcacheInitFileInval;
6664 xlrec.nmsgs = nmsgs;
6665
6668 if (nmsgs != 0)
6670 nmsgs * sizeof(SharedInvalidationMessage));
6671
6672 /* register block matching what buffer will look like after changes */
6677 BufferGetTag(buffer, &rlocator, &forkno, &blkno);
6678 Assert(forkno == MAIN_FORKNUM);
6679 XLogRegisterBlock(0, &rlocator, forkno, blkno, copied_buffer.data,
6681 XLogRegisterBufData(0, src, newlen);
6682
6683 /* inplace updates aren't decoded atm, don't log the origin */
6684
6685 recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_INPLACE);
6686
6687 PageSetLSN(page, recptr);
6688 }
6689
6690 memcpy(dst, src, newlen);
6691
6692 MarkBufferDirty(buffer);
6693
6694 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
6695
6696 /*
6697 * Send invalidations to shared queue. SearchSysCacheLocked1() assumes we
6698 * do this before UnlockTuple().
6699 */
6700 AtInplace_Inval();
6701
6702 MyProc->delayChkptFlags &= ~DELAY_CHKPT_START;
6703 END_CRIT_SECTION();
6704 UnlockTuple(relation, &tuple->t_self, InplaceUpdateTupleLock);
6705
6706 AcceptInvalidationMessages(); /* local processing of just-sent inval */
6707
6708 /*
6709 * Queue a transactional inval, for logical decoding and for third-party
6710 * code that might have been relying on it since long before inplace
6711 * update adopted immediate invalidation. See README.tuplock section
6712 * "Reading inplace-updated columns" for logical decoding details.
6713 */
6714 */
6715 CacheInvalidateHeapTuple(relation, tuple, NULL);
6716}
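/*
 * Editor's illustrative sketch only: the "log first, then mutate" ordering
 * described in the comments above, modeled with plain memory and stub
 * functions.  toy_wal_log() and the sizes are hypothetical; the real code
 * uses a PGAlignedBlock stack copy, XLogRegisterBlock(), and
 * DELAY_CHKPT_START around the sequence.
 */
#include <stdio.h>
#include <string.h>

#define TOY_PAGE_SIZE 64

static void
toy_wal_log(const char *image, size_t len)
{
    /* stand-in for XLogInsert() of a full post-change page image */
    printf("WAL: logged %zu-byte post-change image: %.20s\n", len, image);
}

int
main(void)
{
    char        shared_page[TOY_PAGE_SIZE] = "datfrozenxid=100";
    char        stack_copy[TOY_PAGE_SIZE];
    const char *newval = "datfrozenxid=200";

    /* 1. build the post-mutation image on the stack */
    memcpy(stack_copy, shared_page, TOY_PAGE_SIZE);
    memcpy(stack_copy, newval, strlen(newval) + 1);

    /* 2. WAL-log that image before anyone can see the mutated buffer */
    toy_wal_log(stack_copy, TOY_PAGE_SIZE);

    /* 3. only now mutate the shared buffer itself */
    memcpy(shared_page, newval, strlen(newval) + 1);

    printf("buffer: %s\n", shared_page);
    return 0;
}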
6717
6718/*
6719 * heap_inplace_unlock - reverse of heap_inplace_lock
6720 */
6721void
6722heap_inplace_unlock(Relation relation,
6723 HeapTuple oldtup, Buffer buffer)
6724{
6725 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
6726 UnlockTuple(relation, &oldtup->t_self, InplaceUpdateTupleLock);
6727 ForgetInplace_Inval();
6728}
6729
6730#define FRM_NOOP 0x0001
6731#define FRM_INVALIDATE_XMAX 0x0002
6732#define FRM_RETURN_IS_XID 0x0004
6733#define FRM_RETURN_IS_MULTI 0x0008
6734#define FRM_MARK_COMMITTED 0x0010
6735
6736/*
6737 * FreezeMultiXactId
6738 * Determine what to do during freezing when a tuple is marked by a
6739 * MultiXactId.
6740 *
6741 * "flags" is an output value; it's used to tell caller what to do on return.
6742 * "pagefrz" is an input/output value, used to manage page level freezing.
6743 *
6744 * Possible values that we can set in "flags":
6745 * FRM_NOOP
6746 * don't do anything -- keep existing Xmax
6747 * FRM_INVALIDATE_XMAX
6748 * mark Xmax as InvalidTransactionId and set XMAX_INVALID flag.
6749 * FRM_RETURN_IS_XID
6750 * The Xid return value is a single update Xid to set as xmax.
6751 * FRM_MARK_COMMITTED
6752 * Xmax can be marked as HEAP_XMAX_COMMITTED
6753 * FRM_RETURN_IS_MULTI
6754 * The return value is a new MultiXactId to set as new Xmax.
6755 * (caller must obtain proper infomask bits using GetMultiXactIdHintBits)
6756 *
6757 * Caller delegates control of page freezing to us. In practice we always
6758 * force freezing of caller's page unless FRM_NOOP processing is indicated.
6759 * We help caller ensure that XIDs < FreezeLimit and MXIDs < MultiXactCutoff
6760 * can never be left behind. We freely choose when and how to process each
6761 * Multi, without ever violating the cutoff postconditions for freezing.
6762 *
6763 * It's useful to remove Multis on a proactive timeline (relative to freezing
6764 * XIDs) to keep MultiXact member SLRU buffer misses to a minimum. It can also
6765 * be cheaper in the short run, for us, since we too can avoid SLRU buffer
6766 * misses through eager processing.
6767 *
6768 * NB: Creates a _new_ MultiXactId when FRM_RETURN_IS_MULTI is set, though only
6769 * when FreezeLimit and/or MultiXactCutoff cutoffs leave us with no choice.
6770 * This can usually be put off, which is often enough to avoid it altogether.
6771 * Allocating new multis during VACUUM should be avoided on general principle;
6772 * only VACUUM can advance relminmxid, so allocating new Multis here comes with
6773 * its own special risks.
6774 *
6775 * NB: Caller must maintain "no freeze" NewRelfrozenXid/NewRelminMxid trackers
6776 * using heap_tuple_should_freeze when we haven't forced page-level freezing.
6777 *
6778 * NB: Caller should avoid needlessly calling heap_tuple_should_freeze when we
6779 * have already forced page-level freezing, since that might incur the same
6780 * SLRU buffer misses that we specifically intended to avoid by freezing.
6781 */
6782static TransactionId
6783FreezeMultiXactId(MultiXactId multi, uint16 t_infomask,
6784 const struct VacuumCutoffs *cutoffs, uint16 *flags,
6785 HeapPageFreeze *pagefrz)
6786{
6787 TransactionId newxmax;
6788 MultiXactMember *members;
6789 int nmembers;
6790 bool need_replace;
6791 int nnewmembers;
6792 MultiXactMember *newmembers;
6793 bool has_lockers;
6794 TransactionId update_xid;
6795 bool update_committed;
6796 TransactionId FreezePageRelfrozenXid;
6797
6798 *flags = 0;
6799
6800 /* We should only be called in Multis */
6801 Assert(t_infomask & HEAP_XMAX_IS_MULTI);
6802
6803 if (!MultiXactIdIsValid(multi) ||
6804 HEAP_LOCKED_UPGRADED(t_infomask))
6805 {
6806 *flags |= FRM_INVALIDATE_XMAX;
6807 pagefrz->freeze_required = true;
6808 return InvalidTransactionId;
6809 }
6810 else if (MultiXactIdPrecedes(multi, cutoffs->relminmxid))
6811 ereport(ERROR,
6812 (errcode(ERRCODE_DATA_CORRUPTED),
6813 errmsg_internal("found multixact %u from before relminmxid %u",
6814 multi, cutoffs->relminmxid)));
6815 else if (MultiXactIdPrecedes(multi, cutoffs->OldestMxact))
6816 {
6817 TransactionId update_xact;
6818
6819 /*
6820 * This old multi cannot possibly have members still running, but
6821 * verify just in case. If it was a locker only, it can be removed
6822 * without any further consideration; but if it contained an update,
6823 * we might need to preserve it.
6824 */
6825 if (MultiXactIdIsRunning(multi,
6826 HEAP_XMAX_IS_LOCKED_ONLY(t_infomask)))
6827 ereport(ERROR,
6828 (errcode(ERRCODE_DATA_CORRUPTED),
6829 errmsg_internal("multixact %u from before multi freeze cutoff %u found to be still running",
6830 multi, cutoffs->OldestMxact)));
6831
6832 if (HEAP_XMAX_IS_LOCKED_ONLY(t_infomask))
6833 {
6834 *flags |= FRM_INVALIDATE_XMAX;
6835 pagefrz->freeze_required = true;
6836 return InvalidTransactionId;
6837 }
6838
6839 /* replace multi with single XID for its updater? */
6840 update_xact = MultiXactIdGetUpdateXid(multi, t_infomask);
6841 if (TransactionIdPrecedes(update_xact, cutoffs->relfrozenxid))
6842 ereport(ERROR,
6843 (errcode(ERRCODE_DATA_CORRUPTED),
6844 errmsg_internal("multixact %u contains update XID %u from before relfrozenxid %u",
6845 multi, update_xact,
6846 cutoffs->relfrozenxid)));
6847 else if (TransactionIdPrecedes(update_xact, cutoffs->OldestXmin))
6848 {
6849 /*
6850 * Updater XID has to have aborted (otherwise the tuple would have
6851 * been pruned away instead, since updater XID is < OldestXmin).
6852 * Just remove xmax.
6853 */
6854 if (TransactionIdDidCommit(update_xact))
6855 ereport(ERROR,
6856 (errcode(ERRCODE_DATA_CORRUPTED),
6857 errmsg_internal("multixact %u contains committed update XID %u from before removable cutoff %u",
6858 multi, update_xact,
6859 cutoffs->OldestXmin)));
6860 *flags |= FRM_INVALIDATE_XMAX;
6861 pagefrz->freeze_required = true;
6862 return InvalidTransactionId;
6863 }
6864
6865 /* Have to keep updater XID as new xmax */
6866 *flags |= FRM_RETURN_IS_XID;
6867 pagefrz->freeze_required = true;
6868 return update_xact;
6869 }
6870
6871 /*
6872 * Some member(s) of this Multi may be below FreezeLimit xid cutoff, so we
6873 * need to walk the whole members array to figure out what to do, if
6874 * anything.
6875 */
6876 nmembers =
6877 GetMultiXactIdMembers(multi, &members, false,
6878 HEAP_XMAX_IS_LOCKED_ONLY(t_infomask));
6879 if (nmembers <= 0)
6880 {
6881 /* Nothing worth keeping */
6882 *flags |= FRM_INVALIDATE_XMAX;
6883 pagefrz->freeze_required = true;
6884 return InvalidTransactionId;
6885 }
6886
6887 /*
6888 * The FRM_NOOP case is the only case where we might need to ratchet back
6889 * FreezePageRelfrozenXid or FreezePageRelminMxid. It is also the only
6890 * case where our caller might ratchet back its NoFreezePageRelfrozenXid
6891 * or NoFreezePageRelminMxid "no freeze" trackers to deal with a multi.
6892 * FRM_NOOP handling should result in the NewRelfrozenXid/NewRelminMxid
6893 * trackers managed by VACUUM being ratcheted back by xmax to the degree
6894 * required to make it safe to leave xmax undisturbed, independent of
6895 * whether or not page freezing is triggered somewhere else.
6896 *
6897 * Our policy is to force freezing in every case other than FRM_NOOP,
6898 * which obviates the need to maintain either set of trackers, anywhere.
6899 * Every other case will reliably execute a freeze plan for xmax that
6900 * either replaces xmax with an XID/MXID >= OldestXmin/OldestMxact, or
6901 * sets xmax to an InvalidTransactionId XID, rendering xmax fully frozen.
6902 * (VACUUM's NewRelfrozenXid/NewRelminMxid trackers are initialized with
6903 * OldestXmin/OldestMxact, so later values never need to be tracked here.)
6904 */
6905 need_replace = false;
6906 FreezePageRelfrozenXid = pagefrz->FreezePageRelfrozenXid;
6907 for (int i = 0; i < nmembers; i++)
6908 {
6909 TransactionId xid = members[i].xid;
6910
6911 Assert(!TransactionIdPrecedes(xid, cutoffs->relfrozenxid));
6912
6913 if (TransactionIdPrecedes(xid, cutoffs->FreezeLimit))
6914 {
6915 /* Can't violate the FreezeLimit postcondition */
6916 need_replace = true;
6917 break;
6918 }
6919 if (TransactionIdPrecedes(xid, FreezePageRelfrozenXid))
6920 FreezePageRelfrozenXid = xid;
6921 }
6922
6923 /* Can't violate the MultiXactCutoff postcondition, either */
6924 if (!need_replace)
6925 need_replace = MultiXactIdPrecedes(multi, cutoffs->MultiXactCutoff);
6926
6927 if (!need_replace)
6928 {
6929 /*
6930 * vacuumlazy.c might ratchet back NewRelminMxid, NewRelfrozenXid, or
6931 * both together to make it safe to retain this particular multi after
6932 * freezing its page
6933 */
6934 *flags |= FRM_NOOP;
6935 pagefrz->FreezePageRelfrozenXid = FreezePageRelfrozenXid;
6936 if (MultiXactIdPrecedes(multi, pagefrz->FreezePageRelminMxid))
6937 pagefrz->FreezePageRelminMxid = multi;
6938 pfree(members);
6939 return multi;
6940 }
6941
6942 /*
6943 * Do a more thorough second pass over the multi to figure out which
6944 * member XIDs actually need to be kept. Checking the precise status of
6945 * individual members might even show that we don't need to keep anything.
6946 * That is quite possible even though the Multi must be >= OldestMxact,
6947 * since our second pass only keeps member XIDs when it's truly necessary;
6948 * even member XIDs >= OldestXmin often won't be kept by second pass.
6949 */
6950 nnewmembers = 0;
6951 newmembers = palloc(sizeof(MultiXactMember) * nmembers);
6952 has_lockers = false;
6953 update_xid = InvalidTransactionId;
6954 update_committed = false;
6955
6956 /*
6957 * Determine whether to keep each member xid, or to ignore it instead
6958 */
6959 for (int i = 0; i < nmembers; i++)
6960 {
6961 TransactionId xid = members[i].xid;
6962 MultiXactStatus mstatus = members[i].status;
6963
6964 Assert(!TransactionIdPrecedes(xid, cutoffs->relfrozenxid));
6965
6966 if (!ISUPDATE_from_mxstatus(mstatus))
6967 {
6968 /*
6969 * Locker XID (not updater XID). We only keep lockers that are
6970 * still running.
6971 */
6972 if (TransactionIdIsCurrentTransactionId(xid) ||
6973 TransactionIdIsInProgress(xid))
6974 {
6975 if (TransactionIdPrecedes(xid, cutoffs->OldestXmin))
6976 ereport(ERROR,
6977 (errcode(ERRCODE_DATA_CORRUPTED),
6978 errmsg_internal("multixact %u contains running locker XID %u from before removable cutoff %u",
6979 multi, xid,
6980 cutoffs->OldestXmin)));
6981 newmembers[nnewmembers++] = members[i];
6982 has_lockers = true;
6983 }
6984
6985 continue;
6986 }
6987
6988 /*
6989 * Updater XID (not locker XID). Should we keep it?
6990 *
6991 * Since the tuple wasn't totally removed when vacuum pruned, the
6992 * update Xid cannot possibly be older than OldestXmin cutoff unless
6993 * the updater XID aborted. If the updater transaction is known
6994 * aborted or crashed then it's okay to ignore it, otherwise not.
6995 *
6996 * In any case the Multi should never contain two updaters, whatever
6997 * their individual commit status. Check for that first, in passing.
6998 */
6999 if (TransactionIdIsValid(update_xid))
7000 ereport(ERROR,
7001 (errcode(ERRCODE_DATA_CORRUPTED),
7002 errmsg_internal("multixact %u has two or more updating members",
7003 multi),
7004 errdetail_internal("First updater XID=%u second updater XID=%u.",
7005 update_xid, xid)));
7006
7007 /*
7008 * As with all tuple visibility routines, it's critical to test
7009 * TransactionIdIsInProgress before TransactionIdDidCommit, because of
7010 * race conditions explained in detail in heapam_visibility.c.
7011 */
7014 update_xid = xid;
7015 else if (TransactionIdDidCommit(xid))
7016 {
7017 /*
7018 * The transaction committed, so we can tell caller to set
7019 * HEAP_XMAX_COMMITTED. (We can only do this because we know the
7020 * transaction is not running.)
7021 */
7022 update_committed = true;
7023 update_xid = xid;
7024 }
7025 else
7026 {
7027 /*
7028 * Not in progress, not committed -- must be aborted or crashed;
7029 * we can ignore it.
7030 */
7031 continue;
7032 }
7033
7034 /*
7035 * We determined that updater must be kept -- add it to pending new
7036 * members list
7037 */
7038 if (TransactionIdPrecedes(xid, cutoffs->OldestXmin))
7039 ereport(ERROR,
7040 (errcode(ERRCODE_DATA_CORRUPTED),
7041 errmsg_internal("multixact %u contains committed update XID %u from before removable cutoff %u",
7042 multi, xid, cutoffs->OldestXmin)));
7043 newmembers[nnewmembers++] = members[i];
7044 }
7045
7046 pfree(members);
7047
7048 /*
7049 * Determine what to do with caller's multi based on information gathered
7050 * during our second pass
7051 */
7052 if (nnewmembers == 0)
7053 {
7054 /* Nothing worth keeping */
7055 *flags |= FRM_INVALIDATE_XMAX;
7056 newxmax = InvalidTransactionId;
7057 }
7058 else if (TransactionIdIsValid(update_xid) && !has_lockers)
7059 {
7060 /*
7061 * If there's a single member and it's an update, pass it back alone
7062 * without creating a new Multi. (XXX we could do this when there's a
7063 * single remaining locker, too, but that would complicate the API too
7064 * much; moreover, the case with the single updater is more
7065 * interesting, because those are longer-lived.)
7066 */
7067 Assert(nnewmembers == 1);
7068 *flags |= FRM_RETURN_IS_XID;
7069 if (update_committed)
7070 *flags |= FRM_MARK_COMMITTED;
7071 newxmax = update_xid;
7072 }
7073 else
7074 {
7075 /*
7076 * Create a new multixact with the surviving members of the previous
7077 * one, to set as new Xmax in the tuple
7078 */
7079 newxmax = MultiXactIdCreateFromMembers(nnewmembers, newmembers);
7080 *flags |= FRM_RETURN_IS_MULTI;
7081 }
7082
7083 pfree(newmembers);
7084
7085 pagefrz->freeze_required = true;
7086 return newxmax;
7087}
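/*
 * Editor's illustrative sketch only: how a caller might interpret the FRM_*
 * flags that the function above reports.  The defines are copied from this
 * file; describe_frm_flags() and its strings are hypothetical, and the real
 * caller (heap_prepare_freeze_tuple) instead turns them into a freeze plan.
 */
#include <stdio.h>

#define FRM_NOOP             0x0001
#define FRM_INVALIDATE_XMAX  0x0002
#define FRM_RETURN_IS_XID    0x0004
#define FRM_RETURN_IS_MULTI  0x0008
#define FRM_MARK_COMMITTED   0x0010

static void
describe_frm_flags(unsigned flags)
{
    if (flags & FRM_NOOP)
        printf("keep existing xmax unchanged\n");
    else if (flags & FRM_INVALIDATE_XMAX)
        printf("clear xmax and set XMAX_INVALID\n");
    else if (flags & FRM_RETURN_IS_XID)
        printf("replace xmax with the single updater XID%s\n",
               (flags & FRM_MARK_COMMITTED) ? " (mark HEAP_XMAX_COMMITTED)" : "");
    else if (flags & FRM_RETURN_IS_MULTI)
        printf("replace xmax with a newly created MultiXactId\n");
}

int
main(void)
{
    describe_frm_flags(FRM_RETURN_IS_XID | FRM_MARK_COMMITTED);
    describe_frm_flags(FRM_INVALIDATE_XMAX);
    return 0;
}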
7088
7089/*
7090 * heap_prepare_freeze_tuple
7091 *
7092 * Check to see whether any of the XID fields of a tuple (xmin, xmax, xvac)
7093 * are older than the OldestXmin and/or OldestMxact freeze cutoffs. If so,
7094 * setup enough state (in the *frz output argument) to enable caller to
7095 * process this tuple as part of freezing its page, and return true. Return
7096 * false if nothing can be changed about the tuple right now.
7097 *
7098 * Also sets *totally_frozen to true if the tuple will be totally frozen once
7099 * caller executes returned freeze plan (or if the tuple was already totally
7100 * frozen by an earlier VACUUM). This indicates that there are no remaining
7101 * XIDs or MultiXactIds that will need to be processed by a future VACUUM.
7102 *
7103 * VACUUM caller must assemble HeapTupleFreeze freeze plan entries for every
7104 * tuple that we returned true for, and then execute freezing. Caller must
7105 * initialize pagefrz fields for page as a whole before first call here for
7106 * each heap page.
7107 *
7108 * VACUUM caller decides on whether or not to freeze the page as a whole.
7109 * We'll often prepare freeze plans for a page that caller just discards.
7110 * However, VACUUM doesn't always get to make a choice; it must freeze when
7111 * pagefrz.freeze_required is set, to ensure that any XIDs < FreezeLimit (and
7112 * MXIDs < MultiXactCutoff) can never be left behind. We help to make sure
7113 * that VACUUM always follows that rule.
7114 *
7115 * We sometimes force freezing of xmax MultiXactId values long before it is
7116 * strictly necessary to do so just to ensure the FreezeLimit postcondition.
7117 * It's worth processing MultiXactIds proactively when it is cheap to do so,
7118 * and it's convenient to make that happen by piggy-backing it on the "force
7119 * freezing" mechanism. Conversely, we sometimes delay freezing MultiXactIds
7120 * because it is expensive right now (though only when it's still possible to
7121 * do so without violating the FreezeLimit/MultiXactCutoff postcondition).
7122 *
7123 * It is assumed that the caller has checked the tuple with
7124 * HeapTupleSatisfiesVacuum() and determined that it is not HEAPTUPLE_DEAD
7125 * (else we should be removing the tuple, not freezing it).
7126 *
7127 * NB: This function has side effects: it might allocate a new MultiXactId.
7128 * It will be set as tuple's new xmax when our *frz output is processed within
7129 * heap_execute_freeze_tuple later on. If the tuple is in a shared buffer
7130 * then caller had better have an exclusive lock on it already.
7131 */
7132bool
7133heap_prepare_freeze_tuple(HeapTupleHeader tuple,
7134 const struct VacuumCutoffs *cutoffs,
7135 HeapPageFreeze *pagefrz,
7136 HeapTupleFreeze *frz, bool *totally_frozen)
7137{
7138 bool xmin_already_frozen = false,
7139 xmax_already_frozen = false;
7140 bool freeze_xmin = false,
7141 replace_xvac = false,
7142 replace_xmax = false,
7143 freeze_xmax = false;
7144 TransactionId xid;
7145
7146 frz->xmax = HeapTupleHeaderGetRawXmax(tuple);
7147 frz->t_infomask2 = tuple->t_infomask2;
7148 frz->t_infomask = tuple->t_infomask;
7149 frz->frzflags = 0;
7150 frz->checkflags = 0;
7151
7152 /*
7153 * Process xmin, while keeping track of whether it's already frozen, or
7154 * will become frozen iff our freeze plan is executed by caller (could be
7155 * neither).
7156 */
7157 xid = HeapTupleHeaderGetXmin(tuple);
7158 if (!TransactionIdIsNormal(xid))
7159 xmin_already_frozen = true;
7160 else
7161 {
7162 if (TransactionIdPrecedes(xid, cutoffs->relfrozenxid))
7163 ereport(ERROR,
7164 (errcode(ERRCODE_DATA_CORRUPTED),
7165 errmsg_internal("found xmin %u from before relfrozenxid %u",
7166 xid, cutoffs->relfrozenxid)));
7167
7168 /* Will set freeze_xmin flags in freeze plan below */
7169 freeze_xmin = TransactionIdPrecedes(xid, cutoffs->OldestXmin);
7170
7171 /* Verify that xmin committed if and when freeze plan is executed */
7172 if (freeze_xmin)
7173 frz->checkflags |= HEAP_FREEZE_CHECK_XMIN_COMMITTED;
7174 }
7175
7176 /*
7177 * Old-style VACUUM FULL is gone, but we have to process xvac for as long
7178 * as we support having MOVED_OFF/MOVED_IN tuples in the database
7179 */
7180 xid = HeapTupleHeaderGetXvac(tuple);
7181 if (TransactionIdIsNormal(xid))
7182 {
7184 Assert(TransactionIdPrecedes(xid, cutoffs->OldestXmin));
7185
7186 /*
7187 * For Xvac, we always freeze proactively. This allows totally_frozen
7188 * tracking to ignore xvac.
7189 */
7190 replace_xvac = pagefrz->freeze_required = true;
7191
7192 /* Will set replace_xvac flags in freeze plan below */
7193 }
7194
7195 /* Now process xmax */
7196 xid = frz->xmax;
7197 if (tuple->t_infomask & HEAP_XMAX_IS_MULTI)
7198 {
7199 /* Raw xmax is a MultiXactId */
7200 TransactionId newxmax;
7201 uint16 flags;
7202
7203 /*
7204 * We will either remove xmax completely (in the "freeze_xmax" path),
7205 * process xmax by replacing it (in the "replace_xmax" path), or
7206 * perform no-op xmax processing. The only constraint is that the
7207 * FreezeLimit/MultiXactCutoff postcondition must never be violated.
7208 */
7209 newxmax = FreezeMultiXactId(xid, tuple->t_infomask, cutoffs,
7210 &flags, pagefrz);
7211
7212 if (flags & FRM_NOOP)
7213 {
7214 /*
7215 * xmax is a MultiXactId, and nothing about it changes for now.
7216 * This is the only case where 'freeze_required' won't have been
7217 * set for us by FreezeMultiXactId, as well as the only case where
7218 * neither freeze_xmax nor replace_xmax are set (given a multi).
7219 *
7220 * This is a no-op, but the call to FreezeMultiXactId might have
7221 * ratcheted back NewRelfrozenXid and/or NewRelminMxid trackers
7222 * for us (the "freeze page" variants, specifically). That'll
7223 * make it safe for our caller to freeze the page later on, while
7224 * leaving this particular xmax undisturbed.
7225 *
7226 * FreezeMultiXactId is _not_ responsible for the "no freeze"
7227 * NewRelfrozenXid/NewRelminMxid trackers, though -- that's our
7228 * job. A call to heap_tuple_should_freeze for this same tuple
7229 * will take place below if 'freeze_required' isn't set already.
7230 * (This repeats work from FreezeMultiXactId, but allows "no
7231 * freeze" tracker maintenance to happen in only one place.)
7232 */
7235 }
7236 else if (flags & FRM_RETURN_IS_XID)
7237 {
7238 /*
7239 * xmax will become an updater Xid (original MultiXact's updater
7240 * member Xid will be carried forward as a simple Xid in Xmax).
7241 */
7243
7244 /*
7245 * NB -- some of these transformations are only valid because we
7246 * know the return Xid is a tuple updater (i.e. not merely a
7247 * locker.) Also note that the only reason we don't explicitly
7248 * worry about HEAP_KEYS_UPDATED is because it lives in
7249 * t_infomask2 rather than t_infomask.
7250 */
7251 frz->t_infomask &= ~HEAP_XMAX_BITS;
7252 frz->xmax = newxmax;
7253 if (flags & FRM_MARK_COMMITTED)
7254 frz->t_infomask |= HEAP_XMAX_COMMITTED;
7255 replace_xmax = true;
7256 }
7257 else if (flags & FRM_RETURN_IS_MULTI)
7258 {
7259 uint16 newbits;
7260 uint16 newbits2;
7261
7262 /*
7263 * xmax is an old MultiXactId that we have to replace with a new
7264 * MultiXactId, to carry forward two or more original member XIDs.
7265 */
7267
7268 /*
7269 * We can't use GetMultiXactIdHintBits directly on the new multi
7270 * here; that routine initializes the masks to all zeroes, which
7271 * would lose other bits we need. Doing it this way ensures all
7272 * unrelated bits remain untouched.
7273 */
7274 frz->t_infomask &= ~HEAP_XMAX_BITS;
7275 frz->t_infomask2 &= ~HEAP_KEYS_UPDATED;
7276 GetMultiXactIdHintBits(newxmax, &newbits, &newbits2);
7277 frz->t_infomask |= newbits;
7278 frz->t_infomask2 |= newbits2;
7279 frz->xmax = newxmax;
7280 replace_xmax = true;
7281 }
7282 else
7283 {
7284 /*
7285 * Freeze plan for tuple "freezes xmax" in the strictest sense:
7286 * it'll leave nothing in xmax (neither an Xid nor a MultiXactId).
7287 */
7288 Assert(flags & FRM_INVALIDATE_XMAX);
7290
7291 /* Will set freeze_xmax flags in freeze plan below */
7292 freeze_xmax = true;
7293 }
7294
7295 /* MultiXactId processing forces freezing (barring FRM_NOOP case) */
7296 Assert(pagefrz->freeze_required || (!freeze_xmax && !replace_xmax));
7297 }
7298 else if (TransactionIdIsNormal(xid))
7299 {
7300 /* Raw xmax is normal XID */
7301 if (TransactionIdPrecedes(xid, cutoffs->relfrozenxid))
7302 ereport(ERROR,
7304 errmsg_internal("found xmax %u from before relfrozenxid %u",
7305 xid, cutoffs->relfrozenxid)));
7306
7307 /* Will set freeze_xmax flags in freeze plan below */
7308 freeze_xmax = TransactionIdPrecedes(xid, cutoffs->OldestXmin);
7309
7310 /*
7311 * Verify that xmax aborted if and when freeze plan is executed,
7312 * provided it's from an update. (A lock-only xmax can be removed
7313 * independent of this, since the lock is released at xact end.)
7314 */
7315 if (freeze_xmax && !HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask))
7316 frz->checkflags |= HEAP_FREEZE_CHECK_XMAX_ABORTED;
7317 }
7318 else if (!TransactionIdIsValid(xid))
7319 {
7320 /* Raw xmax is InvalidTransactionId XID */
7321 Assert((tuple->t_infomask & HEAP_XMAX_IS_MULTI) == 0);
7322 xmax_already_frozen = true;
7323 }
7324 else
7325 ereport(ERROR,
7327 errmsg_internal("found raw xmax %u (infomask 0x%04x) not invalid and not multi",
7328 xid, tuple->t_infomask)));
7329
7330 if (freeze_xmin)
7331 {
7332 Assert(!xmin_already_frozen);
7333
7334 frz->t_infomask |= HEAP_XMIN_FROZEN;
7335 }
7336 if (replace_xvac)
7337 {
7338 /*
7339 * If a MOVED_OFF tuple is not dead, the xvac transaction must have
7340 * failed; whereas a non-dead MOVED_IN tuple must mean the xvac
7341 * transaction succeeded.
7342 */
7343 Assert(pagefrz->freeze_required);
7344 if (tuple->t_infomask & HEAP_MOVED_OFF)
7345 frz->frzflags |= XLH_INVALID_XVAC;
7346 else
7347 frz->frzflags |= XLH_FREEZE_XVAC;
7348 }
7349 if (replace_xmax)
7350 {
7352 Assert(pagefrz->freeze_required);
7353
7354 /* Already set replace_xmax flags in freeze plan earlier */
7355 }
7356 if (freeze_xmax)
7357 {
7358 Assert(!xmax_already_frozen && !replace_xmax);
7359
7360 frz->xmax = InvalidTransactionId;
7361
7362 /*
7363 * The tuple might be marked either XMAX_INVALID or XMAX_COMMITTED +
7364 * LOCKED. Normalize to INVALID just to be sure no one gets confused.
7365 * Also get rid of the HEAP_KEYS_UPDATED bit.
7366 */
7367 frz->t_infomask &= ~HEAP_XMAX_BITS;
7368 frz->t_infomask |= HEAP_XMAX_INVALID;
7369 frz->t_infomask2 &= ~HEAP_HOT_UPDATED;
7370 frz->t_infomask2 &= ~HEAP_KEYS_UPDATED;
7371 }
7372
7373 /*
7374 * Determine if this tuple is already totally frozen, or will become
7375 * totally frozen (provided caller executes freeze plans for the page)
7376 */
7377 *totally_frozen = ((freeze_xmin || xmin_already_frozen) &&
7378 (freeze_xmax || xmax_already_frozen));
7379
7380 if (!pagefrz->freeze_required && !(xmin_already_frozen &&
7381 xmax_already_frozen))
7382 {
7383 /*
7384 * So far no previous tuple from the page made freezing mandatory.
7385 * Does this tuple force caller to freeze the entire page?
7386 */
7387 pagefrz->freeze_required =
7388 heap_tuple_should_freeze(tuple, cutoffs,
7389 &pagefrz->NoFreezePageRelfrozenXid,
7390 &pagefrz->NoFreezePageRelminMxid);
7391 }
7392
7393 /* Tell caller if this tuple has a usable freeze plan set in *frz */
7394 return freeze_xmin || replace_xvac || replace_xmax || freeze_xmax;
7395}
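/*
 * Editor's illustrative sketch only: the page-level calling pattern the
 * comments above describe, modeled with toy types.  Plans are prepared for
 * every tuple, but they are executed only if some tuple set freeze_required
 * (or the caller decides freezing is worthwhile anyway).  ToyPageFreeze,
 * ToyPlan, toy_prepare() and toy_execute() are hypothetical stand-ins for
 * HeapPageFreeze, HeapTupleFreeze, heap_prepare_freeze_tuple() and
 * heap_freeze_prepared_tuples().
 */
#include <stdbool.h>
#include <stdio.h>

typedef struct ToyPlan { int offset; } ToyPlan;
typedef struct ToyPageFreeze { bool freeze_required; } ToyPageFreeze;

/* pretend every third tuple is old enough to force freezing */
static bool
toy_prepare(int offset, ToyPageFreeze *pagefrz, ToyPlan *plan)
{
    plan->offset = offset;
    if (offset % 3 == 0)
        pagefrz->freeze_required = true;
    return true;                    /* this tuple has a usable plan */
}

static void
toy_execute(const ToyPlan *plans, int nplans)
{
    printf("executing %d freeze plans (first offset %d)\n",
           nplans, plans[0].offset);
}

int
main(void)
{
    ToyPageFreeze pagefrz = {.freeze_required = false};
    ToyPlan     plans[10];
    int         nplans = 0;

    for (int offset = 1; offset <= 10; offset++)
        if (toy_prepare(offset, &pagefrz, &plans[nplans]))
            nplans++;

    if (pagefrz.freeze_required && nplans > 0)
        toy_execute(plans, nplans);     /* no choice: cutoffs would be violated */
    else
        printf("page left unfrozen; plans discarded\n");
    return 0;
}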
7396
7397/*
7398 * Perform xmin/xmax XID status sanity checks before actually executing freeze
7399 * plans.
7400 *
7401 * heap_prepare_freeze_tuple doesn't perform these checks directly because
7402 * pg_xact lookups are relatively expensive. They shouldn't be repeated by
7403 * successive VACUUMs that each decide against freezing the same page.
7404 */
7405void
7406heap_pre_freeze_checks(Buffer buffer,
7407 HeapTupleFreeze *tuples, int ntuples)
7408{
7409 Page page = BufferGetPage(buffer);
7410
7411 for (int i = 0; i < ntuples; i++)
7412 {
7413 HeapTupleFreeze *frz = tuples + i;
7414 ItemId itemid = PageGetItemId(page, frz->offset);
7415 HeapTupleHeader htup;
7416
7417 htup = (HeapTupleHeader) PageGetItem(page, itemid);
7418
7419 /* Deliberately avoid relying on tuple hint bits here */
7420 if (frz->checkflags & HEAP_FREEZE_CHECK_XMIN_COMMITTED)
7421 {
7422 TransactionId xmin = HeapTupleHeaderGetRawXmin(htup);
7423
7424 Assert(!HeapTupleHeaderXminFrozen(htup));
7425 if (unlikely(!TransactionIdDidCommit(xmin)))
7426 ereport(ERROR,
7427 (errcode(ERRCODE_DATA_CORRUPTED),
7428 errmsg_internal("uncommitted xmin %u needs to be frozen",
7429 xmin)));
7430 }
7431
7432 /*
7433 * TransactionIdDidAbort won't work reliably in the presence of XIDs
7434 * left behind by transactions that were in progress during a crash,
7435 * so we can only check that xmax didn't commit
7436 */
7437 if (frz->checkflags & HEAP_FREEZE_CHECK_XMAX_ABORTED)
7438 {
7439 TransactionId xmax = HeapTupleHeaderGetRawXmax(htup);
7440
7441 Assert(TransactionIdIsNormal(xmax));
7442 if (unlikely(TransactionIdDidCommit(xmax)))
7443 ereport(ERROR,
7444 (errcode(ERRCODE_DATA_CORRUPTED),
7445 errmsg_internal("cannot freeze committed xmax %u",
7446 xmax)));
7447 }
7448 }
7449}
7450
7451/*
7452 * Helper which executes freezing of one or more heap tuples on a page on
7453 * behalf of caller. Caller passes an array of tuple plans from
7454 * heap_prepare_freeze_tuple. Caller must set 'offset' in each plan for us.
7455 * Must be called in a critical section that also marks the buffer dirty and,
7456 * if needed, emits WAL.
7457 */
7458void
7459heap_freeze_prepared_tuples(Buffer buffer, HeapTupleFreeze *tuples, int ntuples)
7460{
7461 Page page = BufferGetPage(buffer);
7462
7463 for (int i = 0; i < ntuples; i++)
7464 {
7465 HeapTupleFreeze *frz = tuples + i;
7466 ItemId itemid = PageGetItemId(page, frz->offset);
7467 HeapTupleHeader htup;
7468
7469 htup = (HeapTupleHeader) PageGetItem(page, itemid);
7470 heap_execute_freeze_tuple(htup, frz);
7471 }
7472}
7473
7474/*
7475 * heap_freeze_tuple
7476 * Freeze tuple in place, without WAL logging.
7477 *
7478 * Useful for callers like CLUSTER that perform their own WAL logging.
7479 */
7480bool
7481heap_freeze_tuple(HeapTupleHeader tuple,
7482 TransactionId relfrozenxid, TransactionId relminmxid,
7483 TransactionId FreezeLimit, TransactionId MultiXactCutoff)
7484{
7485 HeapTupleFreeze frz;
7486 bool do_freeze;
7487 bool totally_frozen;
7488 struct VacuumCutoffs cutoffs;
7489 HeapPageFreeze pagefrz;
7490
7491 cutoffs.relfrozenxid = relfrozenxid;
7492 cutoffs.relminmxid = relminmxid;
7493 cutoffs.OldestXmin = FreezeLimit;
7494 cutoffs.OldestMxact = MultiXactCutoff;
7495 cutoffs.FreezeLimit = FreezeLimit;
7496 cutoffs.MultiXactCutoff = MultiXactCutoff;
7497
7498 pagefrz.freeze_required = true;
7499 pagefrz.FreezePageRelfrozenXid = FreezeLimit;
7500 pagefrz.FreezePageRelminMxid = MultiXactCutoff;
7501 pagefrz.NoFreezePageRelfrozenXid = FreezeLimit;
7502 pagefrz.NoFreezePageRelminMxid = MultiXactCutoff;
7503
7504 do_freeze = heap_prepare_freeze_tuple(tuple, &cutoffs,
7505 &pagefrz, &frz, &totally_frozen);
7506
7507 /*
7508 * Note that because this is not a WAL-logged operation, we don't need to
7509 * fill in the offset in the freeze record.
7510 */
7511
7512 if (do_freeze)
7513 heap_execute_freeze_tuple(tuple, &frz);
7514 return do_freeze;
7515}
7516
7517/*
7518 * For a given MultiXactId, return the hint bits that should be set in the
7519 * tuple's infomask.
7520 *
7521 * Normally this should be called for a multixact that was just created, and
7522 * so is on our local cache, so the GetMembers call is fast.
7523 */
7524static void
7525GetMultiXactIdHintBits(MultiXactId multi, uint16 *new_infomask,
7526 uint16 *new_infomask2)
7527{
7528 int nmembers;
7529 MultiXactMember *members;
7530 int i;
7531 uint16 bits = HEAP_XMAX_IS_MULTI;
7532 uint16 bits2 = 0;
7533 bool has_update = false;
7534 LockTupleMode strongest = LockTupleKeyShare;
7535
7536 /*
7537 * We only use this in multis we just created, so they cannot be values
7538 * pre-pg_upgrade.
7539 */
7540 nmembers = GetMultiXactIdMembers(multi, &members, false, false);
7541
7542 for (i = 0; i < nmembers; i++)
7543 {
7544 LockTupleMode mode;
7545
7546 /*
7547 * Remember the strongest lock mode held by any member of the
7548 * multixact.
7549 */
7550 mode = TUPLOCK_from_mxstatus(members[i].status);
7551 if (mode > strongest)
7552 strongest = mode;
7553
7554 /* See what other bits we need */
7555 switch (members[i].status)
7556 {
7557 case MultiXactStatusForKeyShare:
7558 case MultiXactStatusForShare:
7559 case MultiXactStatusForNoKeyUpdate:
7560 break;
7561
7562 case MultiXactStatusForUpdate:
7563 bits2 |= HEAP_KEYS_UPDATED;
7564 break;
7565
7566 case MultiXactStatusNoKeyUpdate:
7567 has_update = true;
7568 break;
7569
7570 case MultiXactStatusUpdate:
7571 bits2 |= HEAP_KEYS_UPDATED;
7572 has_update = true;
7573 break;
7574 }
7575 }
7576
7578 if (strongest == LockTupleExclusive || strongest == LockTupleNoKeyExclusive)
7579 bits |= HEAP_XMAX_EXCL_LOCK;
7580 else if (strongest == LockTupleShare)
7581 bits |= HEAP_XMAX_SHR_LOCK;
7582 else if (strongest == LockTupleKeyShare)
7583 bits |= HEAP_XMAX_KEYSHR_LOCK;
7584
7585 if (!has_update)
7586 bits |= HEAP_XMAX_LOCK_ONLY;
7587
7588 if (nmembers > 0)
7589 pfree(members);
7590
7591 *new_infomask = bits;
7592 *new_infomask2 = bits2;
7593}
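/*
 * Editor's illustrative sketch only: reducing a set of multixact members to
 * the strongest lock mode held by any of them, as the function above does
 * before translating that mode into HEAP_XMAX_* hint bits.  The enum
 * ordering and the member array are hypothetical; the real code maps
 * MultiXactStatus values through TUPLOCK_from_mxstatus().
 */
#include <stdio.h>

typedef enum ToyLockMode        /* ordered weakest to strongest */
{
    TOY_KEYSHARE,
    TOY_SHARE,
    TOY_NOKEY_EXCLUSIVE,
    TOY_EXCLUSIVE,
} ToyLockMode;

static ToyLockMode
toy_strongest(const ToyLockMode *members, int n)
{
    ToyLockMode strongest = TOY_KEYSHARE;

    for (int i = 0; i < n; i++)
        if (members[i] > strongest)
            strongest = members[i];
    return strongest;
}

int
main(void)
{
    ToyLockMode members[] = {TOY_KEYSHARE, TOY_NOKEY_EXCLUSIVE, TOY_SHARE};

    printf("strongest mode = %d\n", toy_strongest(members, 3));
    return 0;
}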
7594
7595/*
7596 * MultiXactIdGetUpdateXid
7597 *
7598 * Given a multixact Xmax and corresponding infomask, which does not have the
7599 * HEAP_XMAX_LOCK_ONLY bit set, obtain and return the Xid of the updating
7600 * transaction.
7601 *
7602 * Caller is expected to check the status of the updating transaction, if
7603 * necessary.
7604 */
7605static TransactionId
7606MultiXactIdGetUpdateXid(TransactionId xmax, uint16 t_infomask)
7607{
7608 TransactionId update_xact = InvalidTransactionId;
7609 MultiXactMember *members;
7610 int nmembers;
7611
7612 Assert(!(t_infomask & HEAP_XMAX_LOCK_ONLY));
7613 Assert(t_infomask & HEAP_XMAX_IS_MULTI);
7614
7615 /*
7616 * Since we know the LOCK_ONLY bit is not set, this cannot be a multi from
7617 * pre-pg_upgrade.
7618 */
7619 nmembers = GetMultiXactIdMembers(xmax, &members, false, false);
7620
7621 if (nmembers > 0)
7622 {
7623 int i;
7624
7625 for (i = 0; i < nmembers; i++)
7626 {
7627 /* Ignore lockers */
7628 if (!ISUPDATE_from_mxstatus(members[i].status))
7629 continue;
7630
7631 /* there can be at most one updater */
7632 Assert(update_xact == InvalidTransactionId);
7633 update_xact = members[i].xid;
7634#ifndef USE_ASSERT_CHECKING
7635
7636 /*
7637 * in an assert-enabled build, walk the whole array to ensure
7638 * there's no other updater.
7639 */
7640 break;
7641#endif
7642 }
7643
7644 pfree(members);
7645 }
7646
7647 return update_xact;
7648}
7649
7650/*
7651 * HeapTupleGetUpdateXid
7652 * As above, but use a HeapTupleHeader
7653 *
7654 * See also HeapTupleHeaderGetUpdateXid, which can be used without previously
7655 * checking the hint bits.
7656 */
7657TransactionId
7658HeapTupleGetUpdateXid(const HeapTupleHeaderData *tup)
7659{
7660 return MultiXactIdGetUpdateXid(HeapTupleHeaderGetRawXmax(tup),
7661 tup->t_infomask);
7662}
7663
7664/*
7665 * Does the given multixact conflict with the current transaction grabbing a
7666 * tuple lock of the given strength?
7667 *
7668 * The passed infomask pairs up with the given multixact in the tuple header.
7669 *
7670 * If current_is_member is not NULL, it is set to 'true' if the current
7671 * transaction is a member of the given multixact.
7672 */
7673static bool
7674DoesMultiXactIdConflict(MultiXactId multi, uint16 infomask,
7675 LockTupleMode lockmode, bool *current_is_member)
7676{
7677 int nmembers;
7678 MultiXactMember *members;
7679 bool result = false;
7680 LOCKMODE wanted = tupleLockExtraInfo[lockmode].hwlock;
7681
7682 if (HEAP_LOCKED_UPGRADED(infomask))
7683 return false;
7684
7685 nmembers = GetMultiXactIdMembers(multi, &members, false,
7686 HEAP_XMAX_IS_LOCKED_ONLY(infomask));
7687 if (nmembers >= 0)
7688 {
7689 int i;
7690
7691 for (i = 0; i < nmembers; i++)
7692 {
7693 TransactionId memxid;
7694 LOCKMODE memlockmode;
7695
7696 if (result && (current_is_member == NULL || *current_is_member))
7697 break;
7698
7699 memlockmode = LOCKMODE_from_mxstatus(members[i].status);
7700
7701 /* ignore members from current xact (but track their presence) */
7702 memxid = members[i].xid;
7703 if (TransactionIdIsCurrentTransactionId(memxid))
7704 {
7705 if (current_is_member != NULL)
7706 *current_is_member = true;
7707 continue;
7708 }
7709 else if (result)
7710 continue;
7711
7712 /* ignore members that don't conflict with the lock we want */
7713 if (!DoLockModesConflict(memlockmode, wanted))
7714 continue;
7715
7716 if (ISUPDATE_from_mxstatus(members[i].status))
7717 {
7718 /* ignore aborted updaters */
7719 if (TransactionIdDidAbort(memxid))
7720 continue;
7721 }
7722 else
7723 {
7724 /* ignore lockers-only that are no longer in progress */
7725 if (!TransactionIdIsInProgress(memxid))
7726 continue;
7727 }
7728
7729 /*
7730 * Whatever remains are either live lockers that conflict with our
7731 * wanted lock, and updaters that are not aborted. Those conflict
7732 * with what we want. Set up to return true, but keep going to
7733 * look for the current transaction among the multixact members,
7734 * if needed.
7735 */
7736 result = true;
7737 }
7738 pfree(members);
7739 }
7740
7741 return result;
7742}
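/*
 * Editor's illustrative sketch only: checking each remaining member against
 * the lock mode we want, using a toy conflict matrix.  The matrix is a
 * hypothetical 4x4 simplification of the heavyweight-lock conflict rules
 * the real code consults via DoLockModesConflict(); members that belong to
 * our own transaction or are no longer running are assumed to have been
 * filtered out already, as the loop above does.
 */
#include <stdbool.h>
#include <stdio.h>

enum { KEYSHARE, SHARE, NOKEY_EXCL, EXCL, NMODES };

/* conflicts[held][wanted]: does a held mode block the wanted mode? */
static const bool conflicts[NMODES][NMODES] = {
    /*                KEYSHARE SHARE  NOKEY_EXCL EXCL */
    /* KEYSHARE   */ {false,   false, false,     true},
    /* SHARE      */ {false,   false, true,      true},
    /* NOKEY_EXCL */ {false,   true,  true,      true},
    /* EXCL       */ {true,    true,  true,      true},
};

static bool
toy_multi_conflicts(const int *held_modes, int nmembers, int wanted)
{
    for (int i = 0; i < nmembers; i++)
        if (conflicts[held_modes[i]][wanted])
            return true;
    return false;
}

int
main(void)
{
    int         members[] = {KEYSHARE, SHARE};

    printf("want NOKEY_EXCL: conflict=%d\n",
           toy_multi_conflicts(members, 2, NOKEY_EXCL));
    printf("want KEYSHARE:   conflict=%d\n",
           toy_multi_conflicts(members, 2, KEYSHARE));
    return 0;
}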
7743
7744/*
7745 * Do_MultiXactIdWait
7746 * Actual implementation for the two functions below.
7747 *
7748 * 'multi', 'status' and 'infomask' indicate what to sleep on (the status is
7749 * needed to ensure we only sleep on conflicting members, and the infomask is
7750 * used to optimize multixact access in case it's a lock-only multi); 'nowait'
7751 * indicates whether to use conditional lock acquisition, to allow callers to
7752 * fail if lock is unavailable. 'rel', 'ctid' and 'oper' are used to set up
7753 * context information for error messages. 'remaining', if not NULL, receives
7754 * the number of members that are still running, including any (non-aborted)
7755 * subtransactions of our own transaction. 'logLockFailure' indicates whether
7756 * to log details when a lock acquisition fails with 'nowait' enabled.
7757 *
7758 * We do this by sleeping on each member using XactLockTableWait. Any
7759 * members that belong to the current backend are *not* waited for, however;
7760 * this would not merely be useless but would lead to Assert failure inside
7761 * XactLockTableWait. By the time this returns, it is certain that all
7762 * transactions *of other backends* that were members of the MultiXactId
7763 * that conflict with the requested status are dead (and no new ones can have
7764 * been added, since it is not legal to add members to an existing
7765 * MultiXactId).
7766 *
7767 * But by the time we finish sleeping, someone else may have changed the Xmax
7768 * of the containing tuple, so the caller needs to iterate on us somehow.
7769 *
7770 * Note that in case we return false, the number of remaining members is
7771 * not to be trusted.
7772 */
7773static bool
7774Do_MultiXactIdWait(MultiXactId multi, MultiXactStatus status,
7775 uint16 infomask, bool nowait,
7776 Relation rel, const ItemPointerData *ctid, XLTW_Oper oper,
7777 int *remaining, bool logLockFailure)
7778{
7779 bool result = true;
7780 MultiXactMember *members;
7781 int nmembers;
7782 int remain = 0;
7783
7784 /* for pre-pg_upgrade tuples, no need to sleep at all */
7785 nmembers = HEAP_LOCKED_UPGRADED(infomask) ? -1 :
7786 GetMultiXactIdMembers(multi, &members, false,
7787 HEAP_XMAX_IS_LOCKED_ONLY(infomask));
7788
7789 if (nmembers >= 0)
7790 {
7791 int i;
7792
7793 for (i = 0; i < nmembers; i++)
7794 {
7795 TransactionId memxid = members[i].xid;
7796 MultiXactStatus memstatus = members[i].status;
7797
7798 if (TransactionIdIsCurrentTransactionId(memxid))
7799 {
7800 remain++;
7801 continue;
7802 }
7803
7804 if (!DoLockModesConflict(LOCKMODE_from_mxstatus(memstatus),
7805 LOCKMODE_from_mxstatus(status)))
7806 {
7807 if (remaining && TransactionIdIsInProgress(memxid))
7808 remain++;
7809 continue;
7810 }
7811
7812 /*
7813 * This member conflicts with our multi, so we have to sleep (or
7814 * return failure, if asked to avoid waiting.)
7815 *
7816 * Note that we don't set up an error context callback ourselves,
7817 * but instead we pass the info down to XactLockTableWait. This
7818 * might seem a bit wasteful because the context is set up and
7819 * tore down for each member of the multixact, but in reality it
7820 * should be barely noticeable, and it avoids duplicate code.
7821 */
7822 if (nowait)
7823 {
7824 result = ConditionalXactLockTableWait(memxid, logLockFailure);
7825 if (!result)
7826 break;
7827 }
7828 else
7829 XactLockTableWait(memxid, rel, ctid, oper);
7830 }
7831
7832 pfree(members);
7833 }
7834
7835 if (remaining)
7836 *remaining = remain;
7837
7838 return result;
7839}
7840
7841/*
7842 * MultiXactIdWait
7843 * Sleep on a MultiXactId.
7844 *
7845 * By the time we finish sleeping, someone else may have changed the Xmax
7846 * of the containing tuple, so the caller needs to iterate on us somehow.
7847 *
7848 * We return (in *remaining, if not NULL) the number of members that are still
7849 * running, including any (non-aborted) subtransactions of our own transaction.
7850 */
7851static void
7853 Relation rel, const ItemPointerData *ctid, XLTW_Oper oper,
7854 int *remaining)
7855{
7856 (void) Do_MultiXactIdWait(multi, status, infomask, false,
7857 rel, ctid, oper, remaining, false);
7858}
7859
7860/*
7861 * ConditionalMultiXactIdWait
7862 * As above, but only lock if we can get the lock without blocking.
7863 *
7864 * By the time we finish sleeping, someone else may have changed the Xmax
7865 * of the containing tuple, so the caller needs to iterate on us somehow.
7866 *
7867 * If the multixact is now all gone, return true. Returns false if some
7868 * transactions might still be running.
7869 *
7870 * We return (in *remaining, if not NULL) the number of members that are still
7871 * running, including any (non-aborted) subtransactions of our own transaction.
7872 */
7873static bool
7874ConditionalMultiXactIdWait(MultiXactId multi, MultiXactStatus status,
7875 uint16 infomask, Relation rel, int *remaining,
7876 bool logLockFailure)
7877{
7878 return Do_MultiXactIdWait(multi, status, infomask, true,
7879 rel, NULL, XLTW_None, remaining, logLockFailure);
7880}
7881
7882/*
7883 * heap_tuple_needs_eventual_freeze
7884 *
7885 * Check to see whether any of the XID fields of a tuple (xmin, xmax, xvac)
7886 * will eventually require freezing (if tuple isn't removed by pruning first).
7887 */
7888bool
7889heap_tuple_needs_eventual_freeze(HeapTupleHeader tuple)
7890{
7891 TransactionId xid;
7892
7893 /*
7894 * If xmin is a normal transaction ID, this tuple is definitely not
7895 * frozen.
7896 */
7897 xid = HeapTupleHeaderGetXmin(tuple);
7898 if (TransactionIdIsNormal(xid))
7899 return true;
7900
7901 /*
7902 * If xmax is a valid xact or multixact, this tuple is also not frozen.
7903 */
7904 if (tuple->t_infomask & HEAP_XMAX_IS_MULTI)
7905 {
7906 MultiXactId multi;
7907
7908 multi = HeapTupleHeaderGetRawXmax(tuple);
7909 if (MultiXactIdIsValid(multi))
7910 return true;
7911 }
7912 else
7913 {
7914 xid = HeapTupleHeaderGetRawXmax(tuple);
7915 if (TransactionIdIsNormal(xid))
7916 return true;
7917 }
7918
7919 if (tuple->t_infomask & HEAP_MOVED)
7920 {
7921 xid = HeapTupleHeaderGetXvac(tuple);
7922 if (TransactionIdIsNormal(xid))
7923 return true;
7924 }
7925
7926 return false;
7927}
7928
7929/*
7930 * heap_tuple_should_freeze
7931 *
7932 * Return value indicates if heap_prepare_freeze_tuple sibling function would
7933 * (or should) force freezing of the heap page that contains caller's tuple.
7934 * Tuple header XIDs/MXIDs < FreezeLimit/MultiXactCutoff trigger freezing.
7935 * This includes (xmin, xmax, xvac) fields, as well as MultiXact member XIDs.
7936 *
7937 * The *NoFreezePageRelfrozenXid and *NoFreezePageRelminMxid input/output
7938 * arguments help VACUUM track the oldest extant XID/MXID remaining in rel.
7939 * Our working assumption is that caller won't decide to freeze this tuple.
7940 * It's up to caller to only ratchet back its own top-level trackers after the
7941 * point that it fully commits to not freezing the tuple/page in question.
7942 */
7943bool
7944heap_tuple_should_freeze(HeapTupleHeader tuple,
7945 const struct VacuumCutoffs *cutoffs,
7946 TransactionId *NoFreezePageRelfrozenXid,
7947 MultiXactId *NoFreezePageRelminMxid)
7948{
7949 TransactionId xid;
7950 MultiXactId multi;
7951 bool freeze = false;
7952
7953 /* First deal with xmin */
7954 xid = HeapTupleHeaderGetXmin(tuple);
7955 if (TransactionIdIsNormal(xid))
7956 {
7957 Assert(TransactionIdPrecedesOrEquals(cutoffs->relfrozenxid, xid));
7958 if (TransactionIdPrecedes(xid, *NoFreezePageRelfrozenXid))
7959 *NoFreezePageRelfrozenXid = xid;
7960 if (TransactionIdPrecedes(xid, cutoffs->FreezeLimit))
7961 freeze = true;
7962 }
7963
7964 /* Now deal with xmax */
7965 xid = InvalidTransactionId;
7966 multi = InvalidMultiXactId;
7967 if (tuple->t_infomask & HEAP_XMAX_IS_MULTI)
7968 multi = HeapTupleHeaderGetRawXmax(tuple);
7969 else
7970 xid = HeapTupleHeaderGetRawXmax(tuple);
7971
7972 if (TransactionIdIsNormal(xid))
7973 {
7975 /* xmax is a non-permanent XID */
7976 if (TransactionIdPrecedes(xid, *NoFreezePageRelfrozenXid))
7977 *NoFreezePageRelfrozenXid = xid;
7978 if (TransactionIdPrecedes(xid, cutoffs->FreezeLimit))
7979 freeze = true;
7980 }
7981 else if (!MultiXactIdIsValid(multi))
7982 {
7983 /* xmax is a permanent XID or invalid MultiXactId/XID */
7984 }
7985 else if (HEAP_LOCKED_UPGRADED(tuple->t_infomask))
7986 {
7987 /* xmax is a pg_upgrade'd MultiXact, which can't have updater XID */
7988 if (MultiXactIdPrecedes(multi, *NoFreezePageRelminMxid))
7989 *NoFreezePageRelminMxid = multi;
7990 /* heap_prepare_freeze_tuple always freezes pg_upgrade'd xmax */
7991 freeze = true;
7992 }
7993 else
7994 {
7995 /* xmax is a MultiXactId that may have an updater XID */
7996 MultiXactMember *members;
7997 int nmembers;
7998
8000 if (MultiXactIdPrecedes(multi, *NoFreezePageRelminMxid))
8001 *NoFreezePageRelminMxid = multi;
8002 if (MultiXactIdPrecedes(multi, cutoffs->MultiXactCutoff))
8003 freeze = true;
8004
8005 /* need to check whether any member of the mxact is old */
8006 nmembers = GetMultiXactIdMembers(multi, &members, false,
8008
8009 for (int i = 0; i < nmembers; i++)
8010 {
8011 xid = members[i].xid;
8013 if (TransactionIdPrecedes(xid, *NoFreezePageRelfrozenXid))
8014 *NoFreezePageRelfrozenXid = xid;
8015 if (TransactionIdPrecedes(xid, cutoffs->FreezeLimit))
8016 freeze = true;
8017 }
8018 if (nmembers > 0)
8019 pfree(members);
8020 }
8021
8022 if (tuple->t_infomask & HEAP_MOVED)
8023 {
8024 xid = HeapTupleHeaderGetXvac(tuple);
8025 if (TransactionIdIsNormal(xid))
8026 {
8028 if (TransactionIdPrecedes(xid, *NoFreezePageRelfrozenXid))
8029 *NoFreezePageRelfrozenXid = xid;
8030 /* heap_prepare_freeze_tuple forces xvac freezing */
8031 freeze = true;
8032 }
8033 }
8034
8035 return freeze;
8036}
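
heap_tuple_should_freeze() reports two things: whether freezing should be forced for the page, and, through the NoFreezePage* arguments, the oldest XID/MXID that would remain if the caller decided not to freeze. The following is a minimal sketch of that calling pattern, assuming a simplified caller; the helper name and control flow are hypothetical and only approximate what VACUUM's pruning/freezing code actually does.

/* Illustrative sketch only -- not part of heapam.c */
#include "postgres.h"
#include "access/heapam.h"
#include "commands/vacuum.h"
#include "storage/bufpage.h"

static void
scan_page_for_freezing(Page page, const struct VacuumCutoffs *cutoffs,
					   TransactionId *relfrozenxid_out,
					   MultiXactId *relminmxid_out)
{
	/* per-page trackers, working under the assumption that we won't freeze */
	TransactionId NoFreezePageRelfrozenXid = *relfrozenxid_out;
	MultiXactId NoFreezePageRelminMxid = *relminmxid_out;
	bool		should_freeze = false;
	OffsetNumber maxoff = PageGetMaxOffsetNumber(page);

	for (OffsetNumber off = FirstOffsetNumber; off <= maxoff; off++)
	{
		ItemId		lp = PageGetItemId(page, off);

		if (!ItemIdIsNormal(lp))
			continue;

		if (heap_tuple_should_freeze((HeapTupleHeader) PageGetItem(page, lp),
									 cutoffs,
									 &NoFreezePageRelfrozenXid,
									 &NoFreezePageRelminMxid))
			should_freeze = true;
	}

	/*
	 * Only ratchet the caller's top-level trackers back once it has fully
	 * committed to not freezing the page; a real caller would otherwise
	 * freeze the page and use post-freeze values instead.
	 */
	if (!should_freeze)
	{
		*relfrozenxid_out = NoFreezePageRelfrozenXid;
		*relminmxid_out = NoFreezePageRelminMxid;
	}
}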
8037
8038/*
8039 * Maintain snapshotConflictHorizon for caller by ratcheting forward its value
8040 * using any committed XIDs contained in 'tuple', an obsolescent heap tuple
8041 * that caller is in the process of physically removing, e.g. via HOT pruning
8042 * or index deletion.
8043 *
8044 * Caller must initialize its value to InvalidTransactionId, which is
8045 * generally interpreted as "definitely no need for a recovery conflict".
8046 * Final value must reflect all heap tuples that caller will physically remove
8047 * (or remove TID references to) via its ongoing pruning/deletion operation.
8048 * ResolveRecoveryConflictWithSnapshot() is passed the final value (taken from
8049 * caller's WAL record) by REDO routine when it replays caller's operation.
8050 */
8051void
8052HeapTupleHeaderAdvanceConflictHorizon(HeapTupleHeader tuple,
8053 TransactionId *snapshotConflictHorizon)
8054{
8055 TransactionId xmin = HeapTupleHeaderGetXmin(tuple);
8056 TransactionId xmax = HeapTupleHeaderGetUpdateXid(tuple);
8057 TransactionId xvac = HeapTupleHeaderGetXvac(tuple);
8058
8059 if (tuple->t_infomask & HEAP_MOVED)
8060 {
8061 if (TransactionIdPrecedes(*snapshotConflictHorizon, xvac))
8062 *snapshotConflictHorizon = xvac;
8063 }
8064
8065 /*
8066 * Ignore tuples inserted by an aborted transaction or if the tuple was
8067 * updated/deleted by the inserting transaction.
8068 *
8069 * Look for a committed hint bit, or if no xmin bit is set, check clog.
8070 */
8071 if (HeapTupleHeaderXminCommitted(tuple) ||
8072 (!HeapTupleHeaderXminInvalid(tuple) && TransactionIdDidCommit(xmin)))
8073 {
8074 if (xmax != xmin &&
8075 TransactionIdFollows(xmax, *snapshotConflictHorizon))
8076 *snapshotConflictHorizon = xmax;
8077 }
8078}
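
Per the header comment, the caller initializes its horizon to InvalidTransactionId and then ratchets it forward once per tuple it will physically remove. A minimal sketch of that loop follows; the helper name and the idea of a caller-supplied victim list are hypothetical, only HeapTupleHeaderAdvanceConflictHorizon() and the page accessors are real.

/* Illustrative sketch only -- not part of heapam.c */
#include "postgres.h"
#include "access/heapam.h"
#include "storage/bufpage.h"

static TransactionId
conflict_horizon_for_removal(Page page, const OffsetNumber *victims, int nvictims)
{
	/* "definitely no need for a recovery conflict" until proven otherwise */
	TransactionId snapshotConflictHorizon = InvalidTransactionId;

	for (int i = 0; i < nvictims; i++)
	{
		ItemId		lp = PageGetItemId(page, victims[i]);

		if (!ItemIdIsNormal(lp))
			continue;			/* LP_DEAD/LP_UNUSED items carry no header */

		HeapTupleHeaderAdvanceConflictHorizon((HeapTupleHeader) PageGetItem(page, lp),
											  &snapshotConflictHorizon);
	}

	/* the final value is what would go into the caller's WAL record */
	return snapshotConflictHorizon;
}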
8079
8080#ifdef USE_PREFETCH
8081/*
8082 * Helper function for heap_index_delete_tuples. Issues prefetch requests for
8083 * prefetch_count buffers. The prefetch_state keeps track of all the buffers
8084 * we can prefetch, and which have already been prefetched; each call to this
8085 * function picks up where the previous call left off.
8086 *
8087 * Note: we expect the deltids array to be sorted in an order that groups TIDs
8088 * by heap block, with all TIDs for each block appearing together in exactly
8089 * one group.
8090 */
8091static void
8092index_delete_prefetch_buffer(Relation rel,
8093 IndexDeletePrefetchState *prefetch_state,
8094 int prefetch_count)
8095{
8096 BlockNumber cur_hblkno = prefetch_state->cur_hblkno;
8097 int count = 0;
8098 int i;
8099 int ndeltids = prefetch_state->ndeltids;
8100 TM_IndexDelete *deltids = prefetch_state->deltids;
8101
8102 for (i = prefetch_state->next_item;
8103 i < ndeltids && count < prefetch_count;
8104 i++)
8105 {
8106 ItemPointer htid = &deltids[i].tid;
8107
8108 if (cur_hblkno == InvalidBlockNumber ||
8109 ItemPointerGetBlockNumber(htid) != cur_hblkno)
8110 {
8111 cur_hblkno = ItemPointerGetBlockNumber(htid);
8112 PrefetchBuffer(rel, MAIN_FORKNUM, cur_hblkno);
8113 count++;
8114 }
8115 }
8116
8117 /*
8118 * Save the prefetch position so that next time we can continue from that
8119 * position.
8120 */
8121 prefetch_state->next_item = i;
8122 prefetch_state->cur_hblkno = cur_hblkno;
8123}
8124#endif
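
Because the deltids array is sorted by heap block, issuing one prefetch request per distinct block is enough. The simplified sketch below shows that pattern in isolation, without the saved-position state the real helper keeps between calls; the function name is hypothetical, while PrefetchBuffer() and the TM_IndexDelete layout come from bufmgr.h and tableam.h.

/* Illustrative sketch only -- not part of heapam.c */
#include "postgres.h"
#include "access/tableam.h"
#include "storage/bufmgr.h"

static void
prefetch_distinct_blocks(Relation rel, TM_IndexDelete *deltids, int ndeltids,
						 int prefetch_count)
{
	BlockNumber lastblk = InvalidBlockNumber;
	int			count = 0;

	for (int i = 0; i < ndeltids && count < prefetch_count; i++)
	{
		BlockNumber blk = ItemPointerGetBlockNumber(&deltids[i].tid);

		/* sorted input: a new block number means a block not yet prefetched */
		if (blk != lastblk)
		{
			PrefetchBuffer(rel, MAIN_FORKNUM, blk);
			lastblk = blk;
			count++;
		}
	}
}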
8125
8126/*
8127 * Helper function for heap_index_delete_tuples. Checks for index corruption
8128 * involving an invalid TID in index AM caller's index page.
8129 *
8130 * This is an ideal place for these checks. The index AM must hold a buffer
8131 * lock on the index page containing the TIDs we examine here, so we don't
8132 * have to worry about concurrent VACUUMs at all. We can be sure that the
8133 * index is corrupt when htid points directly to an LP_UNUSED item or
8134 * heap-only tuple, which is not the case during standard index scans.
8135 */
8136static inline void
8137index_delete_check_htid(TM_IndexDeleteOp *delstate,
8138 Page page, OffsetNumber maxoff,
8139 const ItemPointerData *htid, TM_IndexStatus *istatus)
8140{
8141 OffsetNumber indexpagehoffnum = ItemPointerGetOffsetNumber(htid);
8142 ItemId iid;
8143
8144 Assert(OffsetNumberIsValid(istatus->idxoffnum));
8145
8146 if (unlikely(indexpagehoffnum > maxoff))
8147 ereport(ERROR,
8148 (errcode(ERRCODE_INDEX_CORRUPTED),
8149 errmsg_internal("heap tid from index tuple (%u,%u) points past end of heap page line pointer array at offset %u of block %u in index \"%s\"",
8150 ItemPointerGetBlockNumber(htid),
8151 indexpagehoffnum,
8152 istatus->idxoffnum, delstate->iblknum,
8153 RelationGetRelationName(delstate->irel))));
8154
8155 iid = PageGetItemId(page, indexpagehoffnum);
8156 if (unlikely(!ItemIdIsUsed(iid)))
8157 ereport(ERROR,
8158 (errcode(ERRCODE_INDEX_CORRUPTED),
8159 errmsg_internal("heap tid from index tuple (%u,%u) points to unused heap page item at offset %u of block %u in index \"%s\"",
8160 ItemPointerGetBlockNumber(htid),
8161 indexpagehoffnum,
8162 istatus->idxoffnum, delstate->iblknum,
8163 RelationGetRelationName(delstate->irel))));
8164
8165 if (ItemIdHasStorage(iid))
8166 {
8167 HeapTupleHeader htup;
8168
8169 Assert(ItemIdIsNormal(iid));
8170 htup = (HeapTupleHeader) PageGetItem(page, iid);
8171
8172 if (unlikely(HeapTupleHeaderIsHeapOnly(htup)))
8173 ereport(ERROR,
8174 (errcode(ERRCODE_INDEX_CORRUPTED),
8175 errmsg_internal("heap tid from index tuple (%u,%u) points to heap-only tuple at offset %u of block %u in index \"%s\"",
8176 ItemPointerGetBlockNumber(htid),
8177 indexpagehoffnum,
8178 istatus->idxoffnum, delstate->iblknum,
8179 RelationGetRelationName(delstate->irel))));
8180 }
8181}
8182
8183/*
8184 * heapam implementation of tableam's index_delete_tuples interface.
8185 *
8186 * This helper function is called by index AMs during index tuple deletion.
8187 * See tableam header comments for an explanation of the interface implemented
8188 * here and a general theory of operation. Note that each call here is either
8189 * a simple index deletion call, or a bottom-up index deletion call.
8190 *
8191 * It's possible for this to generate a fair amount of I/O, since we may be
8192 * deleting hundreds of tuples from a single index block. To amortize that
8193 * cost to some degree, this uses prefetching and combines repeat accesses to
8194 * the same heap block.
8195 */
8196TransactionId
8197heap_index_delete_tuples(Relation rel, TM_IndexDeleteOp *delstate)
8198{
8199 /* Initial assumption is that earlier pruning took care of conflict */
8200 TransactionId snapshotConflictHorizon = InvalidTransactionId;
8201 BlockNumber blkno = InvalidBlockNumber;
8202 Buffer buf = InvalidBuffer;
8203 Page page = NULL;
8204 OffsetNumber maxoff = InvalidOffsetNumber;
8205 TransactionId priorXmax;
8206#ifdef USE_PREFETCH
8207 IndexDeletePrefetchState prefetch_state;
8208 int prefetch_distance;
8209#endif
8210 SnapshotData SnapshotNonVacuumable;
8211 int finalndeltids = 0,
8212 nblocksaccessed = 0;
8213
8214 /* State that's only used in bottom-up index deletion case */
8215 int nblocksfavorable = 0;
8216 int curtargetfreespace = delstate->bottomupfreespace,
8217 lastfreespace = 0,
8218 actualfreespace = 0;
8219 bool bottomup_final_block = false;
8220
8222
8223 /* Sort caller's deltids array by TID for further processing */
8224 index_delete_sort(delstate);
8225
8226 /*
8227 * Bottom-up case: resort deltids array in an order attuned to where the
8228 * greatest number of promising TIDs are to be found, and determine how
8229 * many blocks from the start of sorted array should be considered
8230 * favorable. This will also shrink the deltids array in order to
8231 * eliminate completely unfavorable blocks up front.
8232 */
8233 if (delstate->bottomup)
8234 nblocksfavorable = bottomup_sort_and_shrink(delstate);
8235
8236#ifdef USE_PREFETCH
8237 /* Initialize prefetch state. */
8239 prefetch_state.next_item = 0;
8240 prefetch_state.ndeltids = delstate->ndeltids;
8241 prefetch_state.deltids = delstate->deltids;
8242
8243 /*
8244 * Determine the prefetch distance that we will attempt to maintain.
8245 *
8246 * Since the caller holds a buffer lock somewhere in rel, we'd better make
8247 * sure that isn't a catalog relation before we call code that does
8248 * syscache lookups, to avoid risk of deadlock.
8249 */
8250 if (IsCatalogRelation(rel))
8251 prefetch_distance = maintenance_io_concurrency;
8252 else
8253 prefetch_distance =
8254 get_tablespace_maintenance_io_concurrency(rel->rd_rel->reltablespace);
8255
8256 /* Cap initial prefetch distance for bottom-up deletion caller */
8257 if (delstate->bottomup)
8258 {
8259 Assert(nblocksfavorable >= 1);
8260 Assert(nblocksfavorable <= BOTTOMUP_MAX_NBLOCKS);
8261 prefetch_distance = Min(prefetch_distance, nblocksfavorable);
8262 }
8263
8264 /* Start prefetching. */
8265 index_delete_prefetch_buffer(rel, &prefetch_state, prefetch_distance);
8266#endif
8267
8268 /* Iterate over deltids, determine which to delete, check their horizon */
8269 Assert(delstate->ndeltids > 0);
8270 for (int i = 0; i < delstate->ndeltids; i++)
8271 {
8272 TM_IndexDelete *ideltid = &delstate->deltids[i];
8273 TM_IndexStatus *istatus = delstate->status + ideltid->id;
8274 ItemPointer htid = &ideltid->tid;
8275 OffsetNumber offnum;
8276
8277 /*
8278 * Read buffer, and perform required extra steps each time a new block
8279 * is encountered. Avoid refetching if it's the same block as the one
8280 * from the last htid.
8281 */
8282 if (blkno == InvalidBlockNumber ||
8284 {
8285 /*
8286 * Consider giving up early for bottom-up index deletion caller
8287 * first. (Only prefetch next-next block afterwards, when it
8288 * becomes clear that we're at least going to access the next
8289 * block in line.)
8290 *
8291 * Sometimes the first block frees so much space for bottom-up
8292 * caller that the deletion process can end without accessing any
8293 * more blocks. It is usually necessary to access 2 or 3 blocks
8294 * per bottom-up deletion operation, though.
8295 */
8296 if (delstate->bottomup)
8297 {
8298 /*
8299 * We often allow caller to delete a few additional items
8300 * whose entries we reached after the point that space target
8301 * from caller was satisfied. The cost of accessing the page
8302 * was already paid at that point, so it made sense to finish
8303 * it off. When that happened, we finalize everything here
8304 * (by finishing off the whole bottom-up deletion operation
8305 * without needlessly paying the cost of accessing any more
8306 * blocks).
8307 */
8309 break;
8310
8311 /*
8312 * Give up when we didn't enable our caller to free any
8313 * additional space as a result of processing the page that we
8314 * just finished up with. This rule is the main way in which
8315 * we keep the cost of bottom-up deletion under control.
8316 */
8318 break;
8319 lastfreespace = actualfreespace; /* for next time */
8320
8321 /*
8322 * Deletion operation (which is bottom-up) will definitely
8323 * access the next block in line. Prepare for that now.
8324 *
8325 * Decay target free space so that we don't hang on for too
8326 * long with a marginal case. (Space target is only truly
8327 * helpful when it allows us to recognize that we don't need
8328 * to access more than 1 or 2 blocks to satisfy caller due to
8329 * agreeable workload characteristics.)
8330 *
8331 * We are a bit more patient when we encounter contiguous
8332 * blocks, though: these are treated as favorable blocks. The
8333 * decay process is only applied when the next block in line
8334 * is not a favorable/contiguous block. This is not an
8335 * exception to the general rule; we still insist on finding
8336 * at least one deletable item per block accessed. See
8337 * bottomup_nblocksfavorable() for full details of the theory
8338 * behind favorable blocks and heap block locality in general.
8339 *
8340 * Note: The first block in line is always treated as a
8341 * favorable block, so the earliest possible point that the
8342 * decay can be applied is just before we access the second
8343 * block in line. The Assert() verifies this for us.
8344 */
8346 if (nblocksfavorable > 0)
8348 else
8349 curtargetfreespace /= 2;
8350 }
8351
8352 /* release old buffer */
8353 if (BufferIsValid(buf))
8355
8357 buf = ReadBuffer(rel, blkno);
8359 Assert(!delstate->bottomup ||
8361
8362#ifdef USE_PREFETCH
8363
8364 /*
8365 * To maintain the prefetch distance, prefetch one more page for
8366 * each page we read.
8367 */
8369#endif
8370
8372
8373 page = BufferGetPage(buf);
8374 maxoff = PageGetMaxOffsetNumber(page);
8375 }
8376
8377 /*
8378 * In passing, detect index corruption involving an index page with a
8379 * TID that points to a location in the heap that couldn't possibly be
8380 * correct. We only do this with actual TIDs from caller's index page
8381 * (not items reached by traversing through a HOT chain).
8382 */
8384
8385 if (istatus->knowndeletable)
8386 Assert(!delstate->bottomup && !istatus->promising);
8387 else
8388 {
8389 ItemPointerData tmp = *htid;
8391
8392 /* Are any tuples from this HOT chain non-vacuumable? */
8394 &heapTuple, NULL, true))
8395 continue; /* can't delete entry */
8396
8397 /* Caller will delete, since whole HOT chain is vacuumable */
8398 istatus->knowndeletable = true;
8399
8400 /* Maintain index free space info for bottom-up deletion case */
8401 if (delstate->bottomup)
8402 {
8403 Assert(istatus->freespace > 0);
8404 actualfreespace += istatus->freespace;
8406 bottomup_final_block = true;
8407 }
8408 }
8409
8410 /*
8411 * Maintain snapshotConflictHorizon value for deletion operation as a
8412 * whole by advancing current value using heap tuple headers. This is
8413 * loosely based on the logic for pruning a HOT chain.
8414 */
8416 priorXmax = InvalidTransactionId; /* cannot check first XMIN */
8417 for (;;)
8418 {
8419 ItemId lp;
8420 HeapTupleHeader htup;
8421
8422 /* Sanity check (pure paranoia) */
8423 if (offnum < FirstOffsetNumber)
8424 break;
8425
8426 /*
8427 * An offset past the end of page's line pointer array is possible
8428 * when the array was truncated
8429 */
8430 if (offnum > maxoff)
8431 break;
8432
8433 lp = PageGetItemId(page, offnum);
8435 {
8436 offnum = ItemIdGetRedirect(lp);
8437 continue;
8438 }
8439
8440 /*
8441 * We'll often encounter LP_DEAD line pointers (especially with an
8442 * entry marked knowndeletable by our caller up front). No heap
8443 * tuple headers get examined for an htid that leads us to an
8444 * LP_DEAD item. This is okay because the earlier pruning
8445 * operation that made the line pointer LP_DEAD in the first place
8446 * must have considered the original tuple header as part of
8447 * generating its own snapshotConflictHorizon value.
8448 *
8449 * Relying on XLOG_HEAP2_PRUNE_VACUUM_SCAN records like this is
8450 * the same strategy that index vacuuming uses in all cases. Index
8451 * VACUUM WAL records don't even have a snapshotConflictHorizon
8452 * field of their own for this reason.
8453 */
8454 if (!ItemIdIsNormal(lp))
8455 break;
8456
8457 htup = (HeapTupleHeader) PageGetItem(page, lp);
8458
8459 /*
8460 * Check the tuple XMIN against prior XMAX, if any
8461 */
8464 break;
8465
8467 &snapshotConflictHorizon);
8468
8469 /*
8470 * If the tuple is not HOT-updated, then we are at the end of this
8471 * HOT-chain. No need to visit later tuples from the same update
8472 * chain (they get their own index entries) -- just move on to
8473 * next htid from index AM caller.
8474 */
8475 if (!HeapTupleHeaderIsHotUpdated(htup))
8476 break;
8477
8478 /* Advance to next HOT chain member */
8479 Assert(ItemPointerGetBlockNumber(&htup->t_ctid) == blkno);
8480 offnum = ItemPointerGetOffsetNumber(&htup->t_ctid);
8482 }
8483
8484 /* Enable further/final shrinking of deltids for caller */
8485 finalndeltids = i + 1;
8486 }
8487
8489
8490 /*
8491 * Shrink deltids array to exclude non-deletable entries at the end. This
8492 * is not just a minor optimization. Final deltids array size might be
8493 * zero for a bottom-up caller. Index AM is explicitly allowed to rely on
8494 * ndeltids being zero in all cases with zero total deletable entries.
8495 */
8496 Assert(finalndeltids > 0 || delstate->bottomup);
8497 delstate->ndeltids = finalndeltids;
8498
8499 return snapshotConflictHorizon;
8500}
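
Index AMs reach this function through the table AM interface rather than calling it directly. The sketch below shows how a simple (non-bottom-up) deletion call might be assembled from an index AM's perspective, assuming the caller has already filled deltids/status from LP_DEAD-marked index items; the function name is hypothetical, while the TM_IndexDeleteOp fields and table_index_delete_tuples() come from tableam.h.

/* Illustrative sketch only -- not part of heapam.c */
#include "postgres.h"
#include "access/tableam.h"

static TransactionId
delete_known_dead_items(Relation heaprel, Relation indexrel,
						BlockNumber indexblk,
						TM_IndexDelete *deltids, TM_IndexStatus *status,
						int ndeltids)
{
	TM_IndexDeleteOp delstate;

	delstate.irel = indexrel;			/* for corruption reports */
	delstate.iblknum = indexblk;
	delstate.bottomup = false;			/* simple deletion, not bottom-up */
	delstate.bottomupfreespace = 0;
	delstate.ndeltids = ndeltids;
	delstate.deltids = deltids;
	delstate.status = status;

	/* returns the snapshotConflictHorizon for the index AM's WAL record */
	return table_index_delete_tuples(heaprel, &delstate);
}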
8501
8502/*
8503 * Specialized inlineable comparison function for index_delete_sort()
8504 */
8505static inline int
8506index_delete_sort_cmp(TM_IndexDelete *deltid1, TM_IndexDelete *deltid2)
8507{
8508 ItemPointer tid1 = &deltid1->tid;
8509 ItemPointer tid2 = &deltid2->tid;
8510
8511 {
8512 BlockNumber blk1 = ItemPointerGetBlockNumber(tid1);
8513 BlockNumber blk2 = ItemPointerGetBlockNumber(tid2);
8514
8515 if (blk1 != blk2)
8516 return (blk1 < blk2) ? -1 : 1;
8517 }
8518 {
8519 OffsetNumber pos1 = ItemPointerGetOffsetNumber(tid1);
8520 OffsetNumber pos2 = ItemPointerGetOffsetNumber(tid2);
8521
8522 if (pos1 != pos2)
8523 return (pos1 < pos2) ? -1 : 1;
8524 }
8525
8526 Assert(false);
8527
8528 return 0;
8529}
8530
8531/*
8532 * Sort deltids array from delstate by TID. This prepares it for further
8533 * processing by heap_index_delete_tuples().
8534 *
8535 * This operation becomes a noticeable consumer of CPU cycles with some
8536 * workloads, so we go to the trouble of specialization/micro optimization.
8537 * We use shellsort for this because it's easy to specialize, compiles to
8538 * relatively few instructions, and is adaptive to presorted inputs/subsets
8539 * (which are typical here).
8540 */
8541static void
8542index_delete_sort(TM_IndexDeleteOp *delstate)
8543{
8544 TM_IndexDelete *deltids = delstate->deltids;
8545 int ndeltids = delstate->ndeltids;
8546
8547 /*
8548 * Shellsort gap sequence (taken from Sedgewick-Incerpi paper).
8549 *
8550 * This implementation is fast with array sizes up to ~4500. This covers
8551 * all supported BLCKSZ values.
8552 */
8553 const int gaps[9] = {1968, 861, 336, 112, 48, 21, 7, 3, 1};
8554
8555 /* Think carefully before changing anything here -- keep swaps cheap */
8556 StaticAssertDecl(sizeof(TM_IndexDelete) <= 8,
8557 "element size exceeds 8 bytes");
8558
8559 for (int g = 0; g < lengthof(gaps); g++)
8560 {
8561 for (int hi = gaps[g], i = hi; i < ndeltids; i++)
8562 {
8563 TM_IndexDelete d = deltids[i];
8564 int j = i;
8565
8566 while (j >= hi && index_delete_sort_cmp(&deltids[j - hi], &d) >= 0)
8567 {
8568 deltids[j] = deltids[j - hi];
8569 j -= hi;
8570 }
8571 deltids[j] = d;
8572 }
8573 }
8574}
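
The shape of that specialized sort is easier to see on a plain array. The sketch below applies the same gap-sequence shellsort pattern to ints; it is illustrative only and not how heapam.c sorts TIDs, but the outer gap loop and the gapped insertion step match the code above.

/* Illustrative sketch only -- generic shellsort with the same gap sequence */
static void
shellsort_ints(int *a, int n)
{
	static const int gaps[] = {1968, 861, 336, 112, 48, 21, 7, 3, 1};

	for (int g = 0; g < (int) (sizeof(gaps) / sizeof(gaps[0])); g++)
	{
		for (int hi = gaps[g], i = hi; i < n; i++)
		{
			int			val = a[i];
			int			j = i;

			/* gapped insertion sort: shift larger elements up by 'hi' */
			while (j >= hi && a[j - hi] >= val)
			{
				a[j] = a[j - hi];
				j -= hi;
			}
			a[j] = val;
		}
	}
}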
8575
8576/*
8577 * Returns how many blocks should be considered favorable/contiguous for a
8578 * bottom-up index deletion pass. This is a number of heap blocks that starts
8579 * from and includes the first block in line.
8580 *
8581 * There is always at least one favorable block during bottom-up index
8582 * deletion. In the worst case (i.e. with totally random heap blocks) the
8583 * first block in line (the only favorable block) can be thought of as a
8584 * degenerate array of contiguous blocks that consists of a single block.
8585 * heap_index_delete_tuples() will expect this.
8586 *
8587 * Caller passes blockgroups, a description of the final order that deltids
8588 * will be sorted in for heap_index_delete_tuples() bottom-up index deletion
8589 * processing. Note that deltids need not actually be sorted just yet (caller
8590 * only passes deltids to us so that we can interpret blockgroups).
8591 *
8592 * You might guess that the existence of contiguous blocks cannot matter much,
8593 * since in general the main factor that determines which blocks we visit is
8594 * the number of promising TIDs, which is a fixed hint from the index AM.
8595 * We're not really targeting the general case, though -- the actual goal is
8596 * to adapt our behavior to a wide variety of naturally occurring conditions.
8597 * The effects of most of the heuristics we apply are only noticeable in the
8598 * aggregate, over time and across many _related_ bottom-up index deletion
8599 * passes.
8600 *
8601 * Deeming certain blocks favorable allows heapam to recognize and adapt to
8602 * workloads where heap blocks visited during bottom-up index deletion can be
8603 * accessed contiguously, in the sense that each newly visited block is the
8604 * neighbor of the block that bottom-up deletion just finished processing (or
8605 * close enough to it). It will likely be cheaper to access more favorable
8606 * blocks sooner rather than later (e.g. in this pass, not across a series of
8607 * related bottom-up passes). Either way it is probably only a matter of time
8608 * (or a matter of further correlated version churn) before all blocks that
8609 * appear together as a single large batch of favorable blocks get accessed by
8610 * _some_ bottom-up pass. Large batches of favorable blocks tend to either
8611 * appear almost constantly or not even once (it all depends on per-index
8612 * workload characteristics).
8613 *
8614 * Note that the blockgroups sort order applies a power-of-two bucketing
8615 * scheme that creates opportunities for contiguous groups of blocks to get
8616 * batched together, at least with workloads that are naturally amenable to
8617 * being driven by heap block locality. This doesn't just enhance the spatial
8618 * locality of bottom-up heap block processing in the obvious way. It also
8619 * enables temporal locality of access, since sorting by heap block number
8620 * naturally tends to make the bottom-up processing order deterministic.
8621 *
8622 * Consider the following example to get a sense of how temporal locality
8623 * might matter: There is a heap relation with several indexes, each of which
8624 * is low to medium cardinality. It is subject to constant non-HOT updates.
8625 * The updates are skewed (in one part of the primary key, perhaps). None of
8626 * the indexes are logically modified by the UPDATE statements (if they were
8627 * then bottom-up index deletion would not be triggered in the first place).
8628 * Naturally, each new round of index tuples (for each heap tuple that gets a
8629 * heap_update() call) will have the same heap TID in each and every index.
8630 * Since these indexes are low cardinality and never get logically modified,
8631 * heapam processing during bottom-up deletion passes will access heap blocks
8632 * in approximately sequential order. Temporal locality of access occurs due
8633 * to bottom-up deletion passes behaving very similarly across each of the
8634 * indexes at any given moment. This keeps the number of buffer misses needed
8635 * to visit heap blocks to a minimum.
8636 */
8637static int
8638bottomup_nblocksfavorable(IndexDeleteCounts *blockgroups, int nblockgroups,
8639 TM_IndexDelete *deltids)
8640{
8641 int64 lastblock = -1;
8642 int nblocksfavorable = 0;
8643
8644 Assert(nblockgroups >= 1);
8645 Assert(nblockgroups <= BOTTOMUP_MAX_NBLOCKS);
8646
8647 /*
8648 * We tolerate heap blocks that will be accessed only slightly out of
8649 * physical order. Small blips occur when a pair of almost-contiguous
8650 * blocks happen to fall into different buckets (perhaps due only to a
8651 * small difference in npromisingtids that the bucketing scheme didn't
8652 * quite manage to ignore). We effectively ignore these blips by applying
8653 * a small tolerance. The precise tolerance we use is a little arbitrary,
8654 * but it works well enough in practice.
8655 */
8656 for (int b = 0; b < nblockgroups; b++)
8657 {
8658 IndexDeleteCounts *group = blockgroups + b;
8659 TM_IndexDelete *firstdtid = deltids + group->ifirsttid;
8660 BlockNumber block = ItemPointerGetBlockNumber(&firstdtid->tid);
8661
8662 if (lastblock != -1 &&
8663 ((int64) block > lastblock + BOTTOMUP_TOLERANCE_NBLOCKS ||
8664 (int64) block < lastblock - BOTTOMUP_TOLERANCE_NBLOCKS))
8665 break;
8666
8667 nblocksfavorable++;
8668 lastblock = block;
8669 }
8670
8671 /* Always indicate that there is at least 1 favorable block */
8672 nblocksfavorable = Max(nblocksfavorable, 1);
8673
8674 return nblocksfavorable;
8675}
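
As a worked example of the tolerance rule: with BOTTOMUP_TOLERANCE_NBLOCKS = 3, the first-block sequence 200, 201, 204, 300 yields 3 favorable blocks, because 204 is within +/-3 of 201 while 300 is not. The standalone sketch below mirrors the loop above on a plain array of block numbers; the function name is hypothetical.

/* Illustrative sketch only -- counting "favorable" blocks with a tolerance */
#include "postgres.h"
#include "storage/block.h"

static int
count_favorable_blocks(const BlockNumber *blocks, int nblocks, int tolerance)
{
	int64		lastblock = -1;
	int			nfavorable = 0;

	for (int b = 0; b < nblocks; b++)
	{
		int64		block = (int64) blocks[b];

		/* stop at the first block that is clearly out of physical order */
		if (lastblock != -1 &&
			(block > lastblock + tolerance || block < lastblock - tolerance))
			break;

		nfavorable++;
		lastblock = block;
	}

	return Max(nfavorable, 1);	/* always at least one favorable block */
}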
8676
8677/*
8678 * qsort comparison function for bottomup_sort_and_shrink()
8679 */
8680static int
8681bottomup_sort_and_shrink_cmp(const void *arg1, const void *arg2)
8682{
8683 const IndexDeleteCounts *group1 = (const IndexDeleteCounts *) arg1;
8684 const IndexDeleteCounts *group2 = (const IndexDeleteCounts *) arg2;
8685
8686 /*
8687 * Most significant field is npromisingtids (which we invert the order of
8688 * so as to sort in desc order).
8689 *
8690 * Caller should have already normalized npromisingtids fields into
8691 * power-of-two values (buckets).
8692 */
8693 if (group1->npromisingtids > group2->npromisingtids)
8694 return -1;
8695 if (group1->npromisingtids < group2->npromisingtids)
8696 return 1;
8697
8698 /*
8699 * Tiebreak: desc ntids sort order.
8700 *
8701 * We cannot expect power-of-two values for ntids fields. We should
8702 * behave as if they were already rounded up for us instead.
8703 */
8704 if (group1->ntids != group2->ntids)
8705 {
8706 int ntids1 = pg_nextpower2_32((uint32) group1->ntids);
8707 int ntids2 = pg_nextpower2_32((uint32) group2->ntids);
8708
8709 if (ntids1 > ntids2)
8710 return -1;
8711 if (ntids1 < ntids2)
8712 return 1;
8713 }
8714
8715 /*
8716 * Tiebreak: asc offset-into-deltids-for-block (offset to first TID for
8717 * block in deltids array) order.
8718 *
8719 * This is equivalent to sorting in ascending heap block number order
8720 * (among otherwise equal subsets of the array). This approach allows us
8721 * to avoid accessing the out-of-line TID. (We rely on the assumption
8722 * that the deltids array was sorted in ascending heap TID order when
8723 * these offsets to the first TID from each heap block group were formed.)
8724 */
8725 if (group1->ifirsttid > group2->ifirsttid)
8726 return 1;
8727 if (group1->ifirsttid < group2->ifirsttid)
8728 return -1;
8729
8731
8732 return 0;
8733}
8734
8735/*
8736 * heap_index_delete_tuples() helper function for bottom-up deletion callers.
8737 *
8738 * Sorts deltids array in the order needed for useful processing by bottom-up
8739 * deletion. The array should already be sorted in TID order when we're
8740 * called. The sort process groups heap TIDs from deltids into heap block
8741 * groupings. Earlier/more-promising groups/blocks are usually those that are
8742 * known to have the most "promising" TIDs.
8743 *
8744 * Sets new size of deltids array (ndeltids) in state. deltids will only have
8745 * TIDs from the BOTTOMUP_MAX_NBLOCKS most promising heap blocks when we
8746 * return. This often means that deltids will be shrunk to a small fraction
8747 * of its original size (we eliminate many heap blocks from consideration for
8748 * caller up front).
8749 *
8750 * Returns the number of "favorable" blocks. See bottomup_nblocksfavorable()
8751 * for a definition and full details.
8752 */
8753static int
8754bottomup_sort_and_shrink(TM_IndexDeleteOp *delstate)
8755{
8756 IndexDeleteCounts *blockgroups;
8757 TM_IndexDelete *reordereddeltids;
8758 BlockNumber curblock = InvalidBlockNumber;
8759 int nblockgroups = 0;
8760 int ncopied = 0;
8761 int nblocksfavorable = 0;
8762
8763 Assert(delstate->bottomup);
8764 Assert(delstate->ndeltids > 0);
8765
8766 /* Calculate per-heap-block count of TIDs */
8768 for (int i = 0; i < delstate->ndeltids; i++)
8769 {
8770 TM_IndexDelete *ideltid = &delstate->deltids[i];
8771 TM_IndexStatus *istatus = delstate->status + ideltid->id;
8772 ItemPointer htid = &ideltid->tid;
8773 bool promising = istatus->promising;
8774
8776 {
8777 /* New block group */
8778 nblockgroups++;
8779
8782
8784 blockgroups[nblockgroups - 1].ifirsttid = i;
8785 blockgroups[nblockgroups - 1].ntids = 1;
8786 blockgroups[nblockgroups - 1].npromisingtids = 0;
8787 }
8788 else
8789 {
8790 blockgroups[nblockgroups - 1].ntids++;
8791 }
8792
8793 if (promising)
8794 blockgroups[nblockgroups - 1].npromisingtids++;
8795 }
8796
8797 /*
8798 * We're about ready to sort block groups to determine the optimal order
8799 * for visiting heap blocks. But before we do, round the number of
8800 * promising tuples for each block group up to the next power-of-two,
8801 * unless it is very low (less than 4), in which case we round up to 4.
8802 * npromisingtids is far too noisy to trust when choosing between a pair
8803 * of block groups that both have very low values.
8804 *
8805 * This scheme divides heap blocks/block groups into buckets. Each bucket
8806 * contains blocks that have _approximately_ the same number of promising
8807 * TIDs as each other. The goal is to ignore relatively small differences
8808 * in the total number of promising entries, so that the whole process can
8809 * give a little weight to heapam factors (like heap block locality)
8810 * instead. This isn't a trade-off, really -- we have nothing to lose. It
8811 * would be foolish to interpret small differences in npromisingtids
8812 * values as anything more than noise.
8813 *
8814 * We tiebreak on nhtids when sorting block group subsets that have the
8815 * same npromisingtids, but this has the same issues as npromisingtids,
8816 * and so nhtids is subject to the same power-of-two bucketing scheme. The
8817 * only reason that we don't fix nhtids in the same way here too is that
8818 * we'll need accurate nhtids values after the sort. We handle nhtids
8819 * bucketization dynamically instead (in the sort comparator).
8820 *
8821 * See bottomup_nblocksfavorable() for a full explanation of when and how
8822 * heap locality/favorable blocks can significantly influence when and how
8823 * heap blocks are accessed.
8824 */
8825 for (int b = 0; b < nblockgroups; b++)
8826 {
8827 IndexDeleteCounts *group = blockgroups + b;
8828
8829 /* Better off falling back on nhtids with low npromisingtids */
8830 if (group->npromisingtids <= 4)
8831 group->npromisingtids = 4;
8832 else
8833 group->npromisingtids =
8835 }
8836
8837 /* Sort groups and rearrange caller's deltids array */
8840 reordereddeltids = palloc(delstate->ndeltids * sizeof(TM_IndexDelete));
8841
8843 /* Determine number of favorable blocks at the start of final deltids */
8845 delstate->deltids);
8846
8847 for (int b = 0; b < nblockgroups; b++)
8848 {
8849 IndexDeleteCounts *group = blockgroups + b;
8850 TM_IndexDelete *firstdtid = delstate->deltids + group->ifirsttid;
8851
8853 sizeof(TM_IndexDelete) * group->ntids);
8854 ncopied += group->ntids;
8855 }
8856
8857 /* Copy final grouped and sorted TIDs back into start of caller's array */
8859 sizeof(TM_IndexDelete) * ncopied);
8860 delstate->ndeltids = ncopied;
8861
8864
8865 return nblocksfavorable;
8866}
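
The power-of-two bucketing applied before the sort is simple to state on its own: npromisingtids values of 4 or less become 4, and anything larger is rounded up to the next power of two (5 becomes 8, 9 becomes 16, 17 becomes 32, and so on). The sketch below expresses just that rule; the helper name is hypothetical, while pg_nextpower2_32() is the real routine from pg_bitutils.h used above.

/* Illustrative sketch only -- the bucketing rule applied to npromisingtids */
#include "postgres.h"
#include "port/pg_bitutils.h"

static int16
bucketize_promising(int16 npromisingtids)
{
	/* very low counts are too noisy to distinguish: lump them together */
	if (npromisingtids <= 4)
		return 4;

	return (int16) pg_nextpower2_32((uint32) npromisingtids);
}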
8867
8868/*
8869 * Perform XLogInsert for a heap-visible operation. 'block' is the block
8870 * being marked all-visible, and vm_buffer is the buffer containing the
8871 * corresponding visibility map block. Both should have already been modified
8872 * and dirtied.
8873 *
8874 * snapshotConflictHorizon comes from the largest xmin on the page being
8875 * marked all-visible. REDO routine uses it to generate recovery conflicts.
8876 *
8877 * If checksums or wal_log_hints are enabled, we may also generate a full-page
8878 * image of heap_buffer. Otherwise, we optimize away the FPI (by specifying
8879 * REGBUF_NO_IMAGE for the heap buffer), in which case the caller should *not*
8880 * update the heap page's LSN.
8881 */
8882XLogRecPtr
8883log_heap_visible(Relation rel, Buffer heap_buffer, Buffer vm_buffer,
8884 TransactionId snapshotConflictHorizon, uint8 vmflags)
8885{
8888 uint8 flags;
8889
8892
8893 xlrec.snapshotConflictHorizon = snapshotConflictHorizon;
8894 xlrec.flags = vmflags;
8899
8901
8902 flags = REGBUF_STANDARD;
8903 if (!XLogHintBitIsNeeded())
8904 flags |= REGBUF_NO_IMAGE;
8906
8908
8909 return recptr;
8910}
8911
8912/*
8913 * Perform XLogInsert for a heap-update operation. Caller must already
8914 * have modified the buffer(s) and marked them dirty.
8915 */
8916static XLogRecPtr
8917log_heap_update(Relation reln, Buffer oldbuf,
8918 Buffer newbuf, HeapTuple oldtup, HeapTuple newtup,
8919 HeapTuple old_key_tuple,
8920 bool all_visible_cleared, bool new_all_visible_cleared)
8921{
8925 uint8 info;
8927 uint16 prefixlen = 0,
8928 suffixlen = 0;
8930 Page page = BufferGetPage(newbuf);
8932 bool init;
8933 int bufflags;
8934
8935 /* Caller should not call me on a non-WAL-logged relation */
8937
8939
8941 info = XLOG_HEAP_HOT_UPDATE;
8942 else
8943 info = XLOG_HEAP_UPDATE;
8944
8945 /*
8946 * If the old and new tuple are on the same page, we only need to log the
8947 * parts of the new tuple that were changed. That saves on the amount of
8948 * WAL we need to write. Currently, we just count any unchanged bytes in
8949 * the beginning and end of the tuple. That's quick to check, and
8950 * perfectly covers the common case that only one field is updated.
8951 *
8952 * We could do this even if the old and new tuple are on different pages,
8953 * but only if we don't make a full-page image of the old page, which is
8954 * difficult to know in advance. Also, if the old tuple is corrupt for
8955 * some reason, it would allow the corruption to propagate the new page,
8956 * so it seems best to avoid. Under the general assumption that most
8957 * updates tend to create the new tuple version on the same page, there
8958 * isn't much to be gained by doing this across pages anyway.
8959 *
8960 * Skip this if we're taking a full-page image of the new page, as we
8961 * don't include the new tuple in the WAL record in that case. Also
8962 * disable if effective_wal_level='logical', as logical decoding needs to
8963 * be able to read the new tuple in whole from the WAL record alone.
8964 */
8965 if (oldbuf == newbuf && !need_tuple_data &&
8967 {
8968 char *oldp = (char *) oldtup->t_data + oldtup->t_data->t_hoff;
8969 char *newp = (char *) newtup->t_data + newtup->t_data->t_hoff;
8970 int oldlen = oldtup->t_len - oldtup->t_data->t_hoff;
8971 int newlen = newtup->t_len - newtup->t_data->t_hoff;
8972
8973 /* Check for common prefix between old and new tuple */
8974 for (prefixlen = 0; prefixlen < Min(oldlen, newlen); prefixlen++)
8975 {
8976 if (newp[prefixlen] != oldp[prefixlen])
8977 break;
8978 }
8979
8980 /*
8981 * Storing the length of the prefix takes 2 bytes, so we need to save
8982 * at least 3 bytes or there's no point.
8983 */
8984 if (prefixlen < 3)
8985 prefixlen = 0;
8986
8987 /* Same for suffix */
8989 {
8990 if (newp[newlen - suffixlen - 1] != oldp[oldlen - suffixlen - 1])
8991 break;
8992 }
8993 if (suffixlen < 3)
8994 suffixlen = 0;
8995 }
8996
8997 /* Prepare main WAL data chain */
8998 xlrec.flags = 0;
9003 if (prefixlen > 0)
9005 if (suffixlen > 0)
9007 if (need_tuple_data)
9008 {
9010 if (old_key_tuple)
9011 {
9012 if (reln->rd_rel->relreplident == REPLICA_IDENTITY_FULL)
9014 else
9016 }
9017 }
9018
9019 /* If new tuple is the single and first tuple on page... */
9022 {
9023 info |= XLOG_HEAP_INIT_PAGE;
9024 init = true;
9025 }
9026 else
9027 init = false;
9028
9029 /* Prepare WAL data for the old page */
9030 xlrec.old_offnum = ItemPointerGetOffsetNumber(&oldtup->t_self);
9031 xlrec.old_xmax = HeapTupleHeaderGetRawXmax(oldtup->t_data);
9032 xlrec.old_infobits_set = compute_infobits(oldtup->t_data->t_infomask,
9033 oldtup->t_data->t_infomask2);
9034
9035 /* Prepare WAL data for the new page */
9036 xlrec.new_offnum = ItemPointerGetOffsetNumber(&newtup->t_self);
9037 xlrec.new_xmax = HeapTupleHeaderGetRawXmax(newtup->t_data);
9038
9040 if (init)
9042 if (need_tuple_data)
9044
9046 if (oldbuf != newbuf)
9048
9050
9051 /*
9052 * Prepare WAL data for the new tuple.
9053 */
9054 if (prefixlen > 0 || suffixlen > 0)
9055 {
9056 if (prefixlen > 0 && suffixlen > 0)
9057 {
9060 XLogRegisterBufData(0, &prefix_suffix, sizeof(uint16) * 2);
9061 }
9062 else if (prefixlen > 0)
9063 {
9064 XLogRegisterBufData(0, &prefixlen, sizeof(uint16));
9065 }
9066 else
9067 {
9068 XLogRegisterBufData(0, &suffixlen, sizeof(uint16));
9069 }
9070 }
9071
9072 xlhdr.t_infomask2 = newtup->t_data->t_infomask2;
9073 xlhdr.t_infomask = newtup->t_data->t_infomask;
9074 xlhdr.t_hoff = newtup->t_data->t_hoff;
9076
9077 /*
9078 * PG73FORMAT: write bitmap [+ padding] [+ oid] + data
9079 *
9080 * The 'data' doesn't include the common prefix or suffix.
9081 */
9083 if (prefixlen == 0)
9084 {
9086 (char *) newtup->t_data + SizeofHeapTupleHeader,
9088 }
9089 else
9090 {
9091 /*
9092 * Have to write the null bitmap and data after the common prefix as
9093 * two separate rdata entries.
9094 */
9095 /* bitmap [+ padding] [+ oid] */
9096 if (newtup->t_data->t_hoff - SizeofHeapTupleHeader > 0)
9097 {
9099 (char *) newtup->t_data + SizeofHeapTupleHeader,
9100 newtup->t_data->t_hoff - SizeofHeapTupleHeader);
9101 }
9102
9103 /* data after common prefix */
9105 (char *) newtup->t_data + newtup->t_data->t_hoff + prefixlen,
9106 newtup->t_len - newtup->t_data->t_hoff - prefixlen - suffixlen);
9107 }
9108
9109 /* We need to log a tuple identity */
9111 {
9112 /* don't really need this, but its more comfy to decode */
9113 xlhdr_idx.t_infomask2 = old_key_tuple->t_data->t_infomask2;
9114 xlhdr_idx.t_infomask = old_key_tuple->t_data->t_infomask;
9115 xlhdr_idx.t_hoff = old_key_tuple->t_data->t_hoff;
9116
9118
9119 /* PG73FORMAT: write bitmap [+ padding] [+ oid] + data */
9122 }
9123
9124 /* filtering by origin on a row level is much more efficient */
9126
9127 recptr = XLogInsert(RM_HEAP_ID, info);
9128
9129 return recptr;
9130}
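
The prefix/suffix suppression above can be isolated into a small routine: walk inward from both ends of the old and new tuple data, and discard a common prefix or suffix shorter than 3 bytes because the 2-byte length field would not pay for itself. The sketch below is illustrative only, assuming raw data pointers and lengths as in the code above; the function name is hypothetical.

/* Illustrative sketch only -- common prefix/suffix lengths suppressed by
 * log_heap_update() */
#include "postgres.h"

static void
common_prefix_suffix(const char *oldp, int oldlen,
					 const char *newp, int newlen,
					 uint16 *prefixlen, uint16 *suffixlen)
{
	uint16		plen = 0;
	uint16		slen = 0;

	while (plen < Min(oldlen, newlen) && oldp[plen] == newp[plen])
		plen++;
	/* storing the prefix length takes 2 bytes: need to save at least 3 */
	if (plen < 3)
		plen = 0;

	/* suffix must not overlap the prefix already claimed */
	while (slen < Min(oldlen, newlen) - plen &&
		   oldp[oldlen - slen - 1] == newp[newlen - slen - 1])
		slen++;
	if (slen < 3)
		slen = 0;

	*prefixlen = plen;
	*suffixlen = slen;
}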
9131
9132/*
9133 * Perform XLogInsert of an XLOG_HEAP2_NEW_CID record
9134 *
9135 * This is only used when effective_wal_level is logical, and only for
9136 * catalog tuples.
9137 */
9138static XLogRecPtr
9139log_heap_new_cid(Relation relation, HeapTuple tup)
9140{
9142
9144 HeapTupleHeader hdr = tup->t_data;
9145
9146 Assert(ItemPointerIsValid(&tup->t_self));
9147 Assert(tup->t_tableOid != InvalidOid);
9148
9149 xlrec.top_xid = GetTopTransactionId();
9150 xlrec.target_locator = relation->rd_locator;
9151 xlrec.target_tid = tup->t_self;
9152
9153 /*
9154 * If the tuple got inserted & deleted in the same TX we definitely have a
9155 * combo CID, set cmin and cmax.
9156 */
9157 if (hdr->t_infomask & HEAP_COMBOCID)
9158 {
9161 xlrec.cmin = HeapTupleHeaderGetCmin(hdr);
9162 xlrec.cmax = HeapTupleHeaderGetCmax(hdr);
9163 xlrec.combocid = HeapTupleHeaderGetRawCommandId(hdr);
9164 }
9165 /* No combo CID, so only cmin or cmax can be set by this TX */
9166 else
9167 {
9168 /*
9169 * Tuple inserted.
9170 *
9171 * We need to check for LOCK ONLY because multixacts might be
9172 * transferred to the new tuple in case of FOR KEY SHARE updates in
9173 * which case there will be an xmax, although the tuple just got
9174 * inserted.
9175 */
9176 if (hdr->t_infomask & HEAP_XMAX_INVALID ||
9178 {
9180 xlrec.cmax = InvalidCommandId;
9181 }
9182 /* Tuple from a different tx updated or deleted. */
9183 else
9184 {
9185 xlrec.cmin = InvalidCommandId;
9187 }
9188 xlrec.combocid = InvalidCommandId;
9189 }
9190
9191 /*
9192 * Note that we don't need to register the buffer here, because this
9193 * operation does not modify the page. The insert/update/delete that
9194 * called us certainly did, but that's WAL-logged separately.
9195 */
9198
9199 /* will be looked at irrespective of origin */
9200
9202
9203 return recptr;
9204}
9205
9206/*
9207 * Build a heap tuple representing the configured REPLICA IDENTITY to represent
9208 * the old tuple in an UPDATE or DELETE.
9209 *
9210 * Returns NULL if there's no need to log an identity or if there's no suitable
9211 * key defined.
9212 *
9213 * Pass key_required true if any replica identity columns changed value, or if
9214 * any of them have any external data. Delete must always pass true.
9215 *
9216 * *copy is set to true if the returned tuple is a modified copy rather than
9217 * the same tuple that was passed in.
9218 */
9219static HeapTuple
9220ExtractReplicaIdentity(Relation relation, HeapTuple tp, bool key_required,
9221 bool *copy)
9222{
9223 TupleDesc desc = RelationGetDescr(relation);
9224 char replident = relation->rd_rel->relreplident;
9227 bool nulls[MaxHeapAttributeNumber];
9229
9230 *copy = false;
9231
9232 if (!RelationIsLogicallyLogged(relation))
9233 return NULL;
9234
9235 if (replident == REPLICA_IDENTITY_NOTHING)
9236 return NULL;
9237
9238 if (replident == REPLICA_IDENTITY_FULL)
9239 {
9240 /*
9241 * When logging the entire old tuple, it very well could contain
9242 * toasted columns. If so, force them to be inlined.
9243 */
9244 if (HeapTupleHasExternal(tp))
9245 {
9246 *copy = true;
9247 tp = toast_flatten_tuple(tp, desc);
9248 }
9249 return tp;
9250 }
9251
9252 /* if the key isn't required and we're only logging the key, we're done */
9253 if (!key_required)
9254 return NULL;
9255
9256 /* find out the replica identity columns */
9259
9260 /*
9261 * If there's no defined replica identity columns, treat as !key_required.
9262 * (This case should not be reachable from heap_update, since that should
9263 * calculate key_required accurately. But heap_delete just passes
9264 * constant true for key_required, so we can hit this case in deletes.)
9265 */
9266 if (bms_is_empty(idattrs))
9267 return NULL;
9268
9269 /*
9270 * Construct a new tuple containing only the replica identity columns,
9271 * with nulls elsewhere. While we're at it, assert that the replica
9272 * identity columns aren't null.
9273 */
9274 heap_deform_tuple(tp, desc, values, nulls);
9275
9276 for (int i = 0; i < desc->natts; i++)
9277 {
9279 idattrs))
9280 Assert(!nulls[i]);
9281 else
9282 nulls[i] = true;
9283 }
9284
9285 key_tuple = heap_form_tuple(desc, values, nulls);
9286 *copy = true;
9287
9289
9290 /*
9291 * If the tuple, which by here only contains indexed columns, still has
9292 * toasted columns, force them to be inlined. This is somewhat unlikely
9293 * since there's limits on the size of indexed columns, so we don't
9294 * duplicate toast_flatten_tuple()s functionality in the above loop over
9295 * the indexed columns, even if it would be more efficient.
9296 */
9298 {
9300
9303 }
9304
9305 return key_tuple;
9306}
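
A delete-style caller always passes key_required = true and must free the returned tuple only when *copy was set. The sketch below shows that calling pattern in the heapam.c context (the function is static, so this is not callable from elsewhere); the wrapper name and the elided WAL-registration step are hypothetical, while ExtractReplicaIdentity() and heap_freetuple() are real.

/* Illustrative sketch only -- not part of heapam.c */
static void
log_old_row_identity(Relation relation, HeapTuple tp)
{
	bool		copied = false;
	HeapTuple	key_tuple = ExtractReplicaIdentity(relation, tp, true, &copied);

	if (key_tuple != NULL)
	{
		/* ... register key_tuple in the WAL record being assembled ... */

		if (copied)
			heap_freetuple(key_tuple);
	}
}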
9307
9308/*
9309 * HeapCheckForSerializableConflictOut
9310 * We are reading a tuple. If it's not visible, there may be a
9311 * rw-conflict out with the inserter. Otherwise, if it is visible to us
9312 * but has been deleted, there may be a rw-conflict out with the deleter.
9313 *
9314 * We will determine the top level xid of the writing transaction with which
9315 * we may be in conflict, and ask CheckForSerializableConflictOut() to check
9316 * for overlap with our own transaction.
9317 *
9318 * This function should be called just about anywhere in heapam.c where a
9319 * tuple has been read. The caller must hold at least a shared lock on the
9320 * buffer, because this function might set hint bits on the tuple. There is
9321 * currently no known reason to call this function from an index AM.
9322 */
9323void
9324HeapCheckForSerializableConflictOut(bool visible, Relation relation,
9325 HeapTuple tuple, Buffer buffer,
9326 Snapshot snapshot)
9327{
9328 TransactionId xid;
9330
9331 if (!CheckForSerializableConflictOutNeeded(relation, snapshot))
9332 return;
9333
9334 /*
9335 * Check to see whether the tuple has been written to by a concurrent
9336 * transaction, either to create it not visible to us, or to delete it
9337 * while it is visible to us. The "visible" bool indicates whether the
9338 * tuple is visible to us, while HeapTupleSatisfiesVacuum checks what else
9339 * is going on with it.
9340 *
9341 * In the event of a concurrently inserted tuple that also happens to have
9342 * been concurrently updated (by a separate transaction), the xmin of the
9343 * tuple will be used -- not the updater's xid.
9344 */
9346 switch (htsvResult)
9347 {
9348 case HEAPTUPLE_LIVE:
9349 if (visible)
9350 return;
9351 xid = HeapTupleHeaderGetXmin(tuple->t_data);
9352 break;
9355 if (visible)
9356 xid = HeapTupleHeaderGetUpdateXid(tuple->t_data);
9357 else
9358 xid = HeapTupleHeaderGetXmin(tuple->t_data);
9359
9361 {
9362 /* This is like the HEAPTUPLE_DEAD case */
9363 Assert(!visible);
9364 return;
9365 }
9366 break;
9368 xid = HeapTupleHeaderGetXmin(tuple->t_data);
9369 break;
9370 case HEAPTUPLE_DEAD:
9371 Assert(!visible);
9372 return;
9373 default:
9374
9375 /*
9376 * The only way to get to this default clause is if a new value is
9377 * added to the enum type without adding it to this switch
9378 * statement. That's a bug, so elog.
9379 */
9380 elog(ERROR, "unrecognized return value from HeapTupleSatisfiesVacuum: %u", htsvResult);
9381
9382 /*
9383 * In spite of having all enum values covered and calling elog on
9384 * this default, some compilers think this is a code path which
9385 * allows xid to be used below without initialization. Silence
9386 * that warning.
9387 */
9389 }
9390
9393
9394 /*
9395 * Find top level xid. Bail out if xid is too early to be a conflict, or
9396 * if it's our own xid.
9397 */
9399 return;
9402 return;
9403
9404 CheckForSerializableConflictOut(relation, xid, snapshot);
9405}
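
The expected calling pattern is: with at least a share lock held on the buffer, compute the tuple's visibility under the scan snapshot, then pass that result here so any rw-conflict with the inserter or deleter can be recorded. A minimal sketch, assuming the caller already holds a pin on the buffer and the hypothetical helper name below:

/* Illustrative sketch only -- not part of heapam.c */
#include "postgres.h"
#include "access/heapam.h"
#include "storage/bufmgr.h"

static void
check_fetched_tuple(Relation relation, HeapTuple tuple, Buffer buffer,
					Snapshot snapshot)
{
	bool		visible;

	LockBuffer(buffer, BUFFER_LOCK_SHARE);
	visible = HeapTupleSatisfiesVisibility(tuple, snapshot, buffer);
	HeapCheckForSerializableConflictOut(visible, relation, tuple,
										buffer, snapshot);
	LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
}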
int16 AttrNumber
Definition attnum.h:21
int bms_next_member(const Bitmapset *a, int prevbit)
Definition bitmapset.c:1305
void bms_free(Bitmapset *a)
Definition bitmapset.c:239
bool bms_is_member(int x, const Bitmapset *a)
Definition bitmapset.c:510
Bitmapset * bms_add_member(Bitmapset *a, int x)
Definition bitmapset.c:814
Bitmapset * bms_add_members(Bitmapset *a, const Bitmapset *b)
Definition bitmapset.c:916
bool bms_overlap(const Bitmapset *a, const Bitmapset *b)
Definition bitmapset.c:581
#define bms_is_empty(a)
Definition bitmapset.h:118
uint32 BlockNumber
Definition block.h:31
#define InvalidBlockNumber
Definition block.h:33
static bool BlockNumberIsValid(BlockNumber blockNumber)
Definition block.h:71
static int32 next
Definition blutils.c:225
static Datum values[MAXATTR]
Definition bootstrap.c:155
int Buffer
Definition buf.h:23
#define InvalidBuffer
Definition buf.h:25
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition bufmgr.c:4356
PrefetchBufferResult PrefetchBuffer(Relation reln, ForkNumber forkNum, BlockNumber blockNum)
Definition bufmgr.c:772
void BufferGetTag(Buffer buffer, RelFileLocator *rlocator, ForkNumber *forknum, BlockNumber *blknum)
Definition bufmgr.c:4377
bool BufferIsDirty(Buffer buffer)
Definition bufmgr.c:3024
void ReleaseBuffer(Buffer buffer)
Definition bufmgr.c:5501
void UnlockReleaseBuffer(Buffer buffer)
Definition bufmgr.c:5518
void MarkBufferDirty(Buffer buffer)
Definition bufmgr.c:3056
int maintenance_io_concurrency
Definition bufmgr.c:191
Buffer ReadBuffer(Relation reln, BlockNumber blockNum)
Definition bufmgr.c:864
@ BAS_BULKREAD
Definition bufmgr.h:37
@ BAS_BULKWRITE
Definition bufmgr.h:39
#define RelationGetNumberOfBlocks(reln)
Definition bufmgr.h:307
static Page BufferGetPage(Buffer buffer)
Definition bufmgr.h:466
static Block BufferGetBlock(Buffer buffer)
Definition bufmgr.h:433
@ BUFFER_LOCK_SHARE
Definition bufmgr.h:210
@ BUFFER_LOCK_EXCLUSIVE
Definition bufmgr.h:220
@ BUFFER_LOCK_UNLOCK
Definition bufmgr.h:205
static void LockBuffer(Buffer buffer, BufferLockMode mode)
Definition bufmgr.h:328
static bool BufferIsValid(Buffer bufnum)
Definition bufmgr.h:417
Size PageGetHeapFreeSpace(const PageData *page)
Definition bufpage.c:990
PageHeaderData * PageHeader
Definition bufpage.h:173
static bool PageIsAllVisible(const PageData *page)
Definition bufpage.h:428
static void PageClearAllVisible(Page page)
Definition bufpage.h:438
#define SizeOfPageHeaderData
Definition bufpage.h:216
static void PageSetAllVisible(Page page)
Definition bufpage.h:433
static ItemId PageGetItemId(Page page, OffsetNumber offsetNumber)
Definition bufpage.h:243
static void * PageGetItem(PageData *page, const ItemIdData *itemId)
Definition bufpage.h:353
static void PageSetFull(Page page)
Definition bufpage.h:417
static void PageSetLSN(Page page, XLogRecPtr lsn)
Definition bufpage.h:390
PageData * Page
Definition bufpage.h:81
#define PageSetPrunable(page, xid)
Definition bufpage.h:446
static OffsetNumber PageGetMaxOffsetNumber(const PageData *page)
Definition bufpage.h:371
#define NameStr(name)
Definition c.h:765
#define InvalidCommandId
Definition c.h:683
#define pg_noinline
Definition c.h:295
#define Min(x, y)
Definition c.h:997
#define likely(x)
Definition c.h:411
#define MAXALIGN(LEN)
Definition c.h:826
uint8_t uint8
Definition c.h:544
#define Assert(condition)
Definition c.h:873
int64_t int64
Definition c.h:543
TransactionId MultiXactId
Definition c.h:676
#define pg_attribute_always_inline
Definition c.h:279
int16_t int16
Definition c.h:541
#define SHORTALIGN(LEN)
Definition c.h:822
uint16_t uint16
Definition c.h:545
#define pg_unreachable()
Definition c.h:341
#define unlikely(x)
Definition c.h:412
uint32_t uint32
Definition c.h:546
#define lengthof(array)
Definition c.h:803
#define StaticAssertDecl(condition, errmessage)
Definition c.h:942
uint32 CommandId
Definition c.h:680
uint32 TransactionId
Definition c.h:666
#define OidIsValid(objectId)
Definition c.h:788
size_t Size
Definition c.h:619
bool IsToastRelation(Relation relation)
Definition catalog.c:206
bool IsCatalogRelation(Relation relation)
Definition catalog.c:104
bool IsSharedRelation(Oid relationId)
Definition catalog.c:304
bool IsInplaceUpdateRelation(Relation relation)
Definition catalog.c:183
CommandId HeapTupleHeaderGetCmin(const HeapTupleHeaderData *tup)
Definition combocid.c:104
void HeapTupleHeaderAdjustCmax(const HeapTupleHeaderData *tup, CommandId *cmax, bool *iscombo)
Definition combocid.c:153
CommandId HeapTupleHeaderGetCmax(const HeapTupleHeaderData *tup)
Definition combocid.c:118
bool datumIsEqual(Datum value1, Datum value2, bool typByVal, int typLen)
Definition datum.c:223
int errmsg_internal(const char *fmt,...)
Definition elog.c:1170
int errdetail_internal(const char *fmt,...)
Definition elog.c:1243
int errcode(int sqlerrcode)
Definition elog.c:863
int errmsg(const char *fmt,...)
Definition elog.c:1080
#define WARNING
Definition elog.h:36
#define ERROR
Definition elog.h:39
#define elog(elevel,...)
Definition elog.h:226
#define ereport(elevel,...)
Definition elog.h:150
HeapTuple ExecFetchSlotHeapTuple(TupleTableSlot *slot, bool materialize, bool *shouldFree)
TupleTableSlot * ExecStoreBufferHeapTuple(HeapTuple tuple, TupleTableSlot *slot, Buffer buffer)
#define palloc_object(type)
Definition fe_memutils.h:74
#define palloc_array(type, count)
Definition fe_memutils.h:76
BufferAccessStrategy GetAccessStrategy(BufferAccessStrategyType btype)
Definition freelist.c:461
void FreeAccessStrategy(BufferAccessStrategy strategy)
Definition freelist.c:643
int NBuffers
Definition globals.c:142
Oid MyDatabaseTableSpace
Definition globals.c:96
Oid MyDatabaseId
Definition globals.c:94
void simple_heap_update(Relation relation, const ItemPointerData *otid, HeapTuple tup, TU_UpdateIndexes *update_indexes)
Definition heapam.c:4555
static bool DoesMultiXactIdConflict(MultiXactId multi, uint16 infomask, LockTupleMode lockmode, bool *current_is_member)
Definition heapam.c:7675
void heap_insert(Relation relation, HeapTuple tup, CommandId cid, int options, BulkInsertState bistate)
Definition heapam.c:2141
static XLogRecPtr log_heap_new_cid(Relation relation, HeapTuple tup)
Definition heapam.c:9140
XLogRecPtr log_heap_visible(Relation rel, Buffer heap_buffer, Buffer vm_buffer, TransactionId snapshotConflictHorizon, uint8 vmflags)
Definition heapam.c:8884
static void compute_new_xmax_infomask(TransactionId xmax, uint16 old_infomask, uint16 old_infomask2, TransactionId add_to_xmax, LockTupleMode mode, bool is_update, TransactionId *result_xmax, uint16 *result_infomask, uint16 *result_infomask2)
Definition heapam.c:5394
static TM_Result heap_lock_updated_tuple_rec(Relation rel, TransactionId priorXmax, const ItemPointerData *tid, TransactionId xid, LockTupleMode mode)
Definition heapam.c:5766
static void heap_fetch_next_buffer(HeapScanDesc scan, ScanDirection dir)
Definition heapam.c:706
bool heap_inplace_lock(Relation relation, HeapTuple oldtup_ptr, Buffer buffer, void(*release_callback)(void *), void *arg)
Definition heapam.c:6436
bool heap_fetch(Relation relation, Snapshot snapshot, HeapTuple tuple, Buffer *userbuf, bool keep_buf)
Definition heapam.c:1658
#define BOTTOMUP_TOLERANCE_NBLOCKS
Definition heapam.c:189
static HeapTuple heap_prepare_insert(Relation relation, HeapTuple tup, TransactionId xid, CommandId cid, int options)
Definition heapam.c:2332
static BlockNumber heap_scan_stream_read_next_parallel(ReadStream *stream, void *callback_private_data, void *per_buffer_data)
Definition heapam.c:251
static int bottomup_sort_and_shrink(TM_IndexDeleteOp *delstate)
Definition heapam.c:8755
static bool heap_acquire_tuplock(Relation relation, const ItemPointerData *tid, LockTupleMode mode, LockWaitPolicy wait_policy, bool *have_tuple_lock)
Definition heapam.c:5345
static int heap_multi_insert_pages(HeapTuple *heaptuples, int done, int ntuples, Size saveFreeSpace)
Definition heapam.c:2380
static pg_attribute_always_inline int page_collect_tuples(HeapScanDesc scan, Snapshot snapshot, Page page, Buffer buffer, BlockNumber block, int lines, bool all_visible, bool check_serializable)
Definition heapam.c:521
static BlockNumber heap_scan_stream_read_next_serial(ReadStream *stream, void *callback_private_data, void *per_buffer_data)
Definition heapam.c:291
static void GetMultiXactIdHintBits(MultiXactId multi, uint16 *new_infomask, uint16 *new_infomask2)
Definition heapam.c:7526
void heap_finish_speculative(Relation relation, const ItemPointerData *tid)
Definition heapam.c:6167
void HeapTupleHeaderAdvanceConflictHorizon(HeapTupleHeader tuple, TransactionId *snapshotConflictHorizon)
Definition heapam.c:8053
bool heap_getnextslot(TableScanDesc sscan, ScanDirection direction, TupleTableSlot *slot)
Definition heapam.c:1448
#define LOCKMODE_from_mxstatus(status)
Definition heapam.c:158
void heap_endscan(TableScanDesc sscan)
Definition heapam.c:1370
#define FRM_RETURN_IS_XID
Definition heapam.c:6733
#define TUPLOCK_from_mxstatus(status)
Definition heapam.c:217
void heap_rescan(TableScanDesc sscan, ScanKey key, bool set_params, bool allow_strat, bool allow_sync, bool allow_pagemode)
Definition heapam.c:1317
void heap_inplace_unlock(Relation relation, HeapTuple oldtup, Buffer buffer)
Definition heapam.c:6723
TM_Result heap_update(Relation relation, const ItemPointerData *otid, HeapTuple newtup, CommandId cid, Snapshot crosscheck, bool wait, TM_FailureData *tmfd, LockTupleMode *lockmode, TU_UpdateIndexes *update_indexes)
Definition heapam.c:3311
static int index_delete_sort_cmp(TM_IndexDelete *deltid1, TM_IndexDelete *deltid2)
Definition heapam.c:8507
static bool ConditionalMultiXactIdWait(MultiXactId multi, MultiXactStatus status, uint16 infomask, Relation rel, int *remaining, bool logLockFailure)
Definition heapam.c:7875
bool heap_tuple_needs_eventual_freeze(HeapTupleHeader tuple)
Definition heapam.c:7890
TM_Result heap_delete(Relation relation, const ItemPointerData *tid, CommandId cid, Snapshot crosscheck, bool wait, TM_FailureData *tmfd, bool changingPart)
Definition heapam.c:2842
static TransactionId FreezeMultiXactId(MultiXactId multi, uint16 t_infomask, const struct VacuumCutoffs *cutoffs, uint16 *flags, HeapPageFreeze *pagefrz)
Definition heapam.c:6784
static HeapTuple ExtractReplicaIdentity(Relation relation, HeapTuple tp, bool key_required, bool *copy)
Definition heapam.c:9221
static pg_noinline BlockNumber heapgettup_initial_block(HeapScanDesc scan, ScanDirection dir)
Definition heapam.c:751
static TM_Result heap_lock_updated_tuple(Relation rel, uint16 prior_infomask, TransactionId prior_raw_xmax, const ItemPointerData *prior_ctid, TransactionId xid, LockTupleMode mode)
Definition heapam.c:6114
#define LockTupleTuplock(rel, tup, mode)
Definition heapam.c:166
bool heap_tuple_should_freeze(HeapTupleHeader tuple, const struct VacuumCutoffs *cutoffs, TransactionId *NoFreezePageRelfrozenXid, MultiXactId *NoFreezePageRelminMxid)
Definition heapam.c:7945
bool heap_freeze_tuple(HeapTupleHeader tuple, TransactionId relfrozenxid, TransactionId relminmxid, TransactionId FreezeLimit, TransactionId MultiXactCutoff)
Definition heapam.c:7482
void heap_inplace_update_and_unlock(Relation relation, HeapTuple oldtup, HeapTuple tuple, Buffer buffer)
Definition heapam.c:6574
static BlockNumber heapgettup_advance_block(HeapScanDesc scan, BlockNumber block, ScanDirection dir)
Definition heapam.c:875
static TransactionId MultiXactIdGetUpdateXid(TransactionId xmax, uint16 t_infomask)
Definition heapam.c:7607
#define BOTTOMUP_MAX_NBLOCKS
Definition heapam.c:188
void ReleaseBulkInsertStatePin(BulkInsertState bistate)
Definition heapam.c:2103
#define FRM_MARK_COMMITTED
Definition heapam.c:6735
#define FRM_NOOP
Definition heapam.c:6731
static void index_delete_check_htid(TM_IndexDeleteOp *delstate, Page page, OffsetNumber maxoff, const ItemPointerData *htid, TM_IndexStatus *istatus)
Definition heapam.c:8138
HeapTuple heap_getnext(TableScanDesc sscan, ScanDirection direction)
Definition heapam.c:1409
bool heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer, Snapshot snapshot, HeapTuple heapTuple, bool *all_dead, bool first_call)
Definition heapam.c:1778
void heap_freeze_prepared_tuples(Buffer buffer, HeapTupleFreeze *tuples, int ntuples)
Definition heapam.c:7460
bool heap_getnextslot_tidrange(TableScanDesc sscan, ScanDirection direction, TupleTableSlot *slot)
Definition heapam.c:1551
static void MultiXactIdWait(MultiXactId multi, MultiXactStatus status, uint16 infomask, Relation rel, const ItemPointerData *ctid, XLTW_Oper oper, int *remaining)
Definition heapam.c:7853
void heap_set_tidrange(TableScanDesc sscan, ItemPointer mintid, ItemPointer maxtid)
Definition heapam.c:1478
void heap_abort_speculative(Relation relation, const ItemPointerData *tid)
Definition heapam.c:6254
static BlockNumber bitmapheap_stream_read_next(ReadStream *pgsr, void *private_data, void *per_buffer_data)
Definition heapam.c:316
TableScanDesc heap_beginscan(Relation relation, Snapshot snapshot, int nkeys, ScanKey key, ParallelTableScanDesc parallel_scan, uint32 flags)
Definition heapam.c:1163
static void heapgettup(HeapScanDesc scan, ScanDirection dir, int nkeys, ScanKey key)
Definition heapam.c:959
static Page heapgettup_continue_page(HeapScanDesc scan, ScanDirection dir, int *linesleft, OffsetNumber *lineoff)
Definition heapam.c:829
static uint8 compute_infobits(uint16 infomask, uint16 infomask2)
Definition heapam.c:2797
#define FRM_RETURN_IS_MULTI
Definition heapam.c:6734
#define FRM_INVALIDATE_XMAX
Definition heapam.c:6732
static bool heap_attr_equals(TupleDesc tupdesc, int attrnum, Datum value1, Datum value2, bool isnull1, bool isnull2)
Definition heapam.c:4414
static void index_delete_sort(TM_IndexDeleteOp *delstate)
Definition heapam.c:8543
void heap_prepare_pagescan(TableScanDesc sscan)
Definition heapam.c:615
static Bitmapset * HeapDetermineColumnsInfo(Relation relation, Bitmapset *interesting_cols, Bitmapset *external_cols, HeapTuple oldtup, HeapTuple newtup, bool *has_external)
Definition heapam.c:4465
static const int MultiXactStatusLock[MaxMultiXactStatus+1]
Definition heapam.c:206
void simple_heap_insert(Relation relation, HeapTuple tup)
Definition heapam.c:2784
static bool xmax_infomask_changed(uint16 new_infomask, uint16 old_infomask)
Definition heapam.c:2819
#define UnlockTupleTuplock(rel, tup, mode)
Definition heapam.c:168
static TM_Result test_lockmode_for_conflict(MultiXactStatus status, TransactionId xid, LockTupleMode mode, HeapTuple tup, bool *needwait)
Definition heapam.c:5675
bool heap_prepare_freeze_tuple(HeapTupleHeader tuple, const struct VacuumCutoffs *cutoffs, HeapPageFreeze *pagefrz, HeapTupleFreeze *frz, bool *totally_frozen)
Definition heapam.c:7134
static void AssertHasSnapshotForToast(Relation rel)
Definition heapam.c:224
void simple_heap_delete(Relation relation, const ItemPointerData *tid)
Definition heapam.c:3265
static const struct @15 tupleLockExtraInfo[]
static XLogRecPtr log_heap_update(Relation reln, Buffer oldbuf, Buffer newbuf, HeapTuple oldtup, HeapTuple newtup, HeapTuple old_key_tuple, bool all_visible_cleared, bool new_all_visible_cleared)
Definition heapam.c:8918
TransactionId HeapTupleGetUpdateXid(const HeapTupleHeaderData *tup)
Definition heapam.c:7659
TransactionId heap_index_delete_tuples(Relation rel, TM_IndexDeleteOp *delstate)
Definition heapam.c:8198
void heap_multi_insert(Relation relation, TupleTableSlot **slots, int ntuples, CommandId cid, int options, BulkInsertState bistate)
Definition heapam.c:2412
#define ConditionalLockTupleTuplock(rel, tup, mode, log)
Definition heapam.c:170
static void initscan(HeapScanDesc scan, ScanKey key, bool keep_startblock)
Definition heapam.c:356
static int bottomup_nblocksfavorable(IndexDeleteCounts *blockgroups, int nblockgroups, TM_IndexDelete *deltids)
Definition heapam.c:8639
static void heapgettup_pagemode(HeapScanDesc scan, ScanDirection dir, int nkeys, ScanKey key)
Definition heapam.c:1069
TM_Result heap_lock_tuple(Relation relation, HeapTuple tuple, CommandId cid, LockTupleMode mode, LockWaitPolicy wait_policy, bool follow_updates, Buffer *buffer, TM_FailureData *tmfd)
Definition heapam.c:4643
static void UpdateXmaxHintBits(HeapTupleHeader tuple, Buffer buffer, TransactionId xid)
Definition heapam.c:2052
static bool Do_MultiXactIdWait(MultiXactId multi, MultiXactStatus status, uint16 infomask, bool nowait, Relation rel, const ItemPointerData *ctid, XLTW_Oper oper, int *remaining, bool logLockFailure)
Definition heapam.c:7775
static int bottomup_sort_and_shrink_cmp(const void *arg1, const void *arg2)
Definition heapam.c:8682
void heap_get_latest_tid(TableScanDesc sscan, ItemPointer tid)
Definition heapam.c:1930
void heap_setscanlimits(TableScanDesc sscan, BlockNumber startBlk, BlockNumber numBlks)
Definition heapam.c:499
void HeapCheckForSerializableConflictOut(bool visible, Relation relation, HeapTuple tuple, Buffer buffer, Snapshot snapshot)
Definition heapam.c:9325
static Page heapgettup_start_page(HeapScanDesc scan, ScanDirection dir, int *linesleft, OffsetNumber *lineoff)
Definition heapam.c:798
static MultiXactStatus get_mxact_status_for_lock(LockTupleMode mode, bool is_update)
Definition heapam.c:4596
void heap_pre_freeze_checks(Buffer buffer, HeapTupleFreeze *tuples, int ntuples)
Definition heapam.c:7407
BulkInsertState GetBulkInsertState(void)
Definition heapam.c:2074
void FreeBulkInsertState(BulkInsertState bistate)
Definition heapam.c:2091
#define HEAP_INSERT_SPECULATIVE
Definition heapam.h:40
#define HEAP_FREEZE_CHECK_XMAX_ABORTED
Definition heapam.h:138
struct HeapScanDescData * HeapScanDesc
Definition heapam.h:102
HTSV_Result
Definition heapam.h:125
@ HEAPTUPLE_RECENTLY_DEAD
Definition heapam.h:128
@ HEAPTUPLE_INSERT_IN_PROGRESS
Definition heapam.h:129
@ HEAPTUPLE_LIVE
Definition heapam.h:127
@ HEAPTUPLE_DELETE_IN_PROGRESS
Definition heapam.h:130
@ HEAPTUPLE_DEAD
Definition heapam.h:126
struct BitmapHeapScanDescData * BitmapHeapScanDesc
Definition heapam.h:110
#define HEAP_INSERT_FROZEN
Definition heapam.h:38
static void heap_execute_freeze_tuple(HeapTupleHeader tuple, HeapTupleFreeze *frz)
Definition heapam.h:492
#define HEAP_FREEZE_CHECK_XMIN_COMMITTED
Definition heapam.h:137
#define HEAP_INSERT_NO_LOGICAL
Definition heapam.h:39
struct BulkInsertStateData * BulkInsertState
Definition heapam.h:46
const TableAmRoutine * GetHeapamTableAmRoutine(void)
void HeapTupleSetHintBits(HeapTupleHeader tuple, Buffer buffer, uint16 infomask, TransactionId xid)
bool HeapTupleSatisfiesVisibility(HeapTuple htup, Snapshot snapshot, Buffer buffer)
bool HeapTupleIsSurelyDead(HeapTuple htup, GlobalVisState *vistest)
HTSV_Result HeapTupleSatisfiesVacuum(HeapTuple htup, TransactionId OldestXmin, Buffer buffer)
int HeapTupleSatisfiesMVCCBatch(Snapshot snapshot, Buffer buffer, int ntups, BatchMVCCState *batchmvcc, OffsetNumber *vistuples_dense)
bool HeapTupleHeaderIsOnlyLocked(HeapTupleHeader tuple)
TM_Result HeapTupleSatisfiesUpdate(HeapTuple htup, CommandId curcid, Buffer buffer)
#define XLH_INSERT_ON_TOAST_RELATION
Definition heapam_xlog.h:76
#define SizeOfHeapMultiInsert
#define XLOG_HEAP2_MULTI_INSERT
Definition heapam_xlog.h:64
#define SizeOfHeapUpdate
#define XLH_INVALID_XVAC
#define XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED
Definition heapam_xlog.h:87
#define SizeOfHeapVisible
#define XLOG_HEAP_HOT_UPDATE
Definition heapam_xlog.h:37
#define XLOG_HEAP_DELETE
Definition heapam_xlog.h:34
#define XLH_INSERT_IS_SPECULATIVE
Definition heapam_xlog.h:74
#define XLH_LOCK_ALL_FROZEN_CLEARED
#define XLH_DELETE_CONTAINS_OLD_KEY
#define XLH_UPDATE_CONTAINS_NEW_TUPLE
Definition heapam_xlog.h:90
#define XLH_INSERT_LAST_IN_MULTI
Definition heapam_xlog.h:73
#define XLH_INSERT_ALL_FROZEN_SET
Definition heapam_xlog.h:79
#define XLH_FREEZE_XVAC
#define XLOG_HEAP_UPDATE
Definition heapam_xlog.h:35
#define XLHL_XMAX_KEYSHR_LOCK
#define XLH_DELETE_ALL_VISIBLE_CLEARED
#define XLH_UPDATE_CONTAINS_OLD_TUPLE
Definition heapam_xlog.h:88
#define SizeOfHeapNewCid
#define SizeOfHeapLockUpdated
#define XLHL_XMAX_IS_MULTI
#define XLH_INSERT_ALL_VISIBLE_CLEARED
Definition heapam_xlog.h:72
#define SizeOfHeapHeader
#define XLH_DELETE_IS_PARTITION_MOVE
#define MinSizeOfHeapInplace
#define XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED
Definition heapam_xlog.h:85
#define XLHL_XMAX_LOCK_ONLY
#define XLOG_HEAP_INPLACE
Definition heapam_xlog.h:40
#define XLOG_HEAP2_LOCK_UPDATED
Definition heapam_xlog.h:65
#define XLH_UPDATE_SUFFIX_FROM_OLD
Definition heapam_xlog.h:92
#define XLH_UPDATE_PREFIX_FROM_OLD
Definition heapam_xlog.h:91
#define SizeOfMultiInsertTuple
#define XLHL_XMAX_EXCL_LOCK
#define XLOG_HEAP2_NEW_CID
Definition heapam_xlog.h:66
#define XLH_DELETE_CONTAINS_OLD_TUPLE
#define XLOG_HEAP_LOCK
Definition heapam_xlog.h:39
#define XLOG_HEAP_INSERT
Definition heapam_xlog.h:33
#define SizeOfHeapInsert
#define SizeOfHeapDelete
#define XLH_DELETE_IS_SUPER
#define XLH_UPDATE_CONTAINS_OLD_KEY
Definition heapam_xlog.h:89
#define XLHL_KEYS_UPDATED
#define XLOG_HEAP2_VISIBLE
Definition heapam_xlog.h:63
#define XLH_INSERT_CONTAINS_NEW_TUPLE
Definition heapam_xlog.h:75
#define XLOG_HEAP_INIT_PAGE
Definition heapam_xlog.h:47
#define SizeOfHeapConfirm
#define SizeOfHeapLock
#define XLOG_HEAP_CONFIRM
Definition heapam_xlog.h:38
void heap_toast_delete(Relation rel, HeapTuple oldtup, bool is_speculative)
Definition heaptoast.c:43
HeapTuple heap_toast_insert_or_update(Relation rel, HeapTuple newtup, HeapTuple oldtup, int options)
Definition heaptoast.c:96
HeapTuple toast_flatten_tuple(HeapTuple tup, TupleDesc tupleDesc)
Definition heaptoast.c:350
#define TOAST_TUPLE_THRESHOLD
Definition heaptoast.h:48
HeapTuple heap_form_tuple(TupleDesc tupleDescriptor, const Datum *values, const bool *isnull)
Definition heaptuple.c:1117
void heap_deform_tuple(HeapTuple tuple, TupleDesc tupleDesc, Datum *values, bool *isnull)
Definition heaptuple.c:1346
void heap_freetuple(HeapTuple htup)
Definition heaptuple.c:1435
void RelationPutHeapTuple(Relation relation, Buffer buffer, HeapTuple tuple, bool token)
Definition hio.c:35
Buffer RelationGetBufferForTuple(Relation relation, Size len, Buffer otherBuffer, int options, BulkInsertState bistate, Buffer *vmbuffer, Buffer *vmbuffer_other, int num_pages)
Definition hio.c:500
HeapTupleHeaderData * HeapTupleHeader
Definition htup.h:23
#define HEAP_MOVED_OFF
#define HEAP_XMAX_SHR_LOCK
static bool HeapTupleIsHotUpdated(const HeapTupleData *tuple)
#define HEAP_XMIN_FROZEN
static Datum heap_getattr(HeapTuple tup, int attnum, TupleDesc tupleDesc, bool *isnull)
static bool HeapTupleHeaderXminFrozen(const HeapTupleHeaderData *tup)
#define HeapTupleHeaderGetNatts(tup)
static void HeapTupleHeaderSetXminFrozen(HeapTupleHeaderData *tup)
#define SizeofHeapTupleHeader
#define HEAP_KEYS_UPDATED
static bool HEAP_XMAX_IS_SHR_LOCKED(uint16 infomask)
static bool HEAP_XMAX_IS_LOCKED_ONLY(uint16 infomask)
static bool HeapTupleHeaderXminInvalid(const HeapTupleHeaderData *tup)
static void HeapTupleClearHotUpdated(const HeapTupleData *tuple)
static bool HeapTupleHasExternal(const HeapTupleData *tuple)
static TransactionId HeapTupleHeaderGetXvac(const HeapTupleHeaderData *tup)
#define HEAP2_XACT_MASK
static void HeapTupleHeaderSetCmax(HeapTupleHeaderData *tup, CommandId cid, bool iscombo)
#define HEAP_XMAX_LOCK_ONLY
static void HeapTupleHeaderClearHotUpdated(HeapTupleHeaderData *tup)
static void HeapTupleHeaderSetCmin(HeapTupleHeaderData *tup, CommandId cid)
#define HEAP_XMAX_BITS
#define HEAP_LOCK_MASK
static CommandId HeapTupleHeaderGetRawCommandId(const HeapTupleHeaderData *tup)
static TransactionId HeapTupleHeaderGetRawXmax(const HeapTupleHeaderData *tup)
static bool HeapTupleHeaderIsHeapOnly(const HeapTupleHeaderData *tup)
static bool HeapTupleIsHeapOnly(const HeapTupleData *tuple)
#define HEAP_MOVED
static void HeapTupleSetHeapOnly(const HeapTupleData *tuple)
#define HEAP_XMAX_IS_MULTI
static bool HEAP_XMAX_IS_KEYSHR_LOCKED(uint16 infomask)
#define HEAP_XMAX_COMMITTED
static TransactionId HeapTupleHeaderGetXmin(const HeapTupleHeaderData *tup)
#define HEAP_COMBOCID
#define HEAP_XACT_MASK
static bool HeapTupleHeaderIndicatesMovedPartitions(const HeapTupleHeaderData *tup)
static void HeapTupleSetHotUpdated(const HeapTupleData *tuple)
#define HEAP_XMAX_EXCL_LOCK
static bool HeapTupleHeaderIsHotUpdated(const HeapTupleHeaderData *tup)
#define HEAP_XMAX_INVALID
static TransactionId HeapTupleHeaderGetRawXmin(const HeapTupleHeaderData *tup)
static void * GETSTRUCT(const HeapTupleData *tuple)
static void HeapTupleClearHeapOnly(const HeapTupleData *tuple)
#define MaxHeapAttributeNumber
static bool HeapTupleHeaderIsSpeculative(const HeapTupleHeaderData *tup)
static TransactionId HeapTupleHeaderGetUpdateXid(const HeapTupleHeaderData *tup)
#define MaxHeapTuplesPerPage
static bool HEAP_XMAX_IS_EXCL_LOCKED(uint16 infomask)
static void HeapTupleHeaderSetXmin(HeapTupleHeaderData *tup, TransactionId xid)
static bool HEAP_LOCKED_UPGRADED(uint16 infomask)
#define HEAP_UPDATED
#define HEAP_XMAX_KEYSHR_LOCK
static void HeapTupleHeaderSetMovedPartitions(HeapTupleHeaderData *tup)
static void HeapTupleHeaderSetXmax(HeapTupleHeaderData *tup, TransactionId xid)
static bool HeapTupleHeaderXminCommitted(const HeapTupleHeaderData *tup)
#define IsParallelWorker()
Definition parallel.h:60
void index_close(Relation relation, LOCKMODE lockmode)
Definition indexam.c:177
Relation index_open(Oid relationId, LOCKMODE lockmode)
Definition indexam.c:133
int remaining
Definition informix.c:692
#define INJECTION_POINT(name, arg)
void AcceptInvalidationMessages(void)
Definition inval.c:930
int inplaceGetInvalidationMessages(SharedInvalidationMessage **msgs, bool *RelcacheInitFileInval)
Definition inval.c:1088
void PreInplace_Inval(void)
Definition inval.c:1250
void CacheInvalidateHeapTupleInplace(Relation relation, HeapTuple key_equivalent_tuple)
Definition inval.c:1593
void AtInplace_Inval(void)
Definition inval.c:1263
void ForgetInplace_Inval(void)
Definition inval.c:1286
void CacheInvalidateHeapTuple(Relation relation, HeapTuple tuple, HeapTuple newtuple)
Definition inval.c:1571
int b
Definition isn.c:74
int j
Definition isn.c:78
int i
Definition isn.c:77
#define ItemIdGetLength(itemId)
Definition itemid.h:59
#define ItemIdIsNormal(itemId)
Definition itemid.h:99
#define ItemIdGetRedirect(itemId)
Definition itemid.h:78
#define ItemIdIsUsed(itemId)
Definition itemid.h:92
#define ItemIdIsRedirected(itemId)
Definition itemid.h:106
#define ItemIdHasStorage(itemId)
Definition itemid.h:120
int32 ItemPointerCompare(const ItemPointerData *arg1, const ItemPointerData *arg2)
Definition itemptr.c:51
bool ItemPointerEquals(const ItemPointerData *pointer1, const ItemPointerData *pointer2)
Definition itemptr.c:35
static void ItemPointerSet(ItemPointerData *pointer, BlockNumber blockNumber, OffsetNumber offNum)
Definition itemptr.h:135
static void ItemPointerSetInvalid(ItemPointerData *pointer)
Definition itemptr.h:184
static void ItemPointerSetOffsetNumber(ItemPointerData *pointer, OffsetNumber offsetNumber)
Definition itemptr.h:158
static void ItemPointerSetBlockNumber(ItemPointerData *pointer, BlockNumber blockNumber)
Definition itemptr.h:147
static OffsetNumber ItemPointerGetOffsetNumber(const ItemPointerData *pointer)
Definition itemptr.h:124
static bool ItemPointerIndicatesMovedPartitions(const ItemPointerData *pointer)
Definition itemptr.h:197
static BlockNumber ItemPointerGetBlockNumber(const ItemPointerData *pointer)
Definition itemptr.h:103
static BlockNumber ItemPointerGetBlockNumberNoCheck(const ItemPointerData *pointer)
Definition itemptr.h:93
static void ItemPointerCopy(const ItemPointerData *fromPointer, ItemPointerData *toPointer)
Definition itemptr.h:172
static bool ItemPointerIsValid(const ItemPointerData *pointer)
Definition itemptr.h:83
void UnlockTuple(Relation relation, const ItemPointerData *tid, LOCKMODE lockmode)
Definition lmgr.c:601
bool ConditionalXactLockTableWait(TransactionId xid, bool logLockFailure)
Definition lmgr.c:739
void LockTuple(Relation relation, const ItemPointerData *tid, LOCKMODE lockmode)
Definition lmgr.c:562
void XactLockTableWait(TransactionId xid, Relation rel, const ItemPointerData *ctid, XLTW_Oper oper)
Definition lmgr.c:663
XLTW_Oper
Definition lmgr.h:25
@ XLTW_None
Definition lmgr.h:26
@ XLTW_Lock
Definition lmgr.h:29
@ XLTW_Delete
Definition lmgr.h:28
@ XLTW_LockUpdated
Definition lmgr.h:30
@ XLTW_Update
Definition lmgr.h:27
bool LockHeldByMe(const LOCKTAG *locktag, LOCKMODE lockmode, bool orstronger)
Definition lock.c:643
bool DoLockModesConflict(LOCKMODE mode1, LOCKMODE mode2)
Definition lock.c:623
bool log_lock_failures
Definition lock.c:54
#define SET_LOCKTAG_RELATION(locktag, dboid, reloid)
Definition lock.h:183
#define SET_LOCKTAG_TUPLE(locktag, dboid, reloid, blocknum, offnum)
Definition lock.h:219
int LOCKMODE
Definition lockdefs.h:26
#define ShareRowExclusiveLock
Definition lockdefs.h:41
#define AccessShareLock
Definition lockdefs.h:36
#define InplaceUpdateTupleLock
Definition lockdefs.h:48
#define ShareUpdateExclusiveLock
Definition lockdefs.h:39
LockWaitPolicy
Definition lockoptions.h:37
@ LockWaitSkip
Definition lockoptions.h:41
@ LockWaitBlock
Definition lockoptions.h:39
@ LockWaitError
Definition lockoptions.h:43
LockTupleMode
Definition lockoptions.h:50
@ LockTupleExclusive
Definition lockoptions.h:58
@ LockTupleNoKeyExclusive
Definition lockoptions.h:56
@ LockTupleShare
Definition lockoptions.h:54
@ LockTupleKeyShare
Definition lockoptions.h:52
void pfree(void *pointer)
Definition mcxt.c:1616
void * palloc(Size size)
Definition mcxt.c:1387
#define IsBootstrapProcessingMode()
Definition miscadmin.h:477
#define START_CRIT_SECTION()
Definition miscadmin.h:150
#define CHECK_FOR_INTERRUPTS()
Definition miscadmin.h:123
#define IsNormalProcessingMode()
Definition miscadmin.h:479
#define END_CRIT_SECTION()
Definition miscadmin.h:152
MultiXactId MultiXactIdExpand(MultiXactId multi, TransactionId xid, MultiXactStatus status)
Definition multixact.c:352
bool MultiXactIdPrecedes(MultiXactId multi1, MultiXactId multi2)
Definition multixact.c:2765
bool MultiXactIdPrecedesOrEquals(MultiXactId multi1, MultiXactId multi2)
Definition multixact.c:2779
bool MultiXactIdIsRunning(MultiXactId multi, bool isLockOnly)
Definition multixact.c:463
void MultiXactIdSetOldestMember(void)
Definition multixact.c:537
MultiXactId MultiXactIdCreateFromMembers(int nmembers, MultiXactMember *members)
Definition multixact.c:656
MultiXactId MultiXactIdCreate(TransactionId xid1, MultiXactStatus status1, TransactionId xid2, MultiXactStatus status2)
Definition multixact.c:299
int GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members, bool from_pgupgrade, bool isLockOnly)
Definition multixact.c:1113
#define MultiXactIdIsValid(multi)
Definition multixact.h:29
MultiXactStatus
Definition multixact.h:37
@ MultiXactStatusForShare
Definition multixact.h:39
@ MultiXactStatusForNoKeyUpdate
Definition multixact.h:40
@ MultiXactStatusNoKeyUpdate
Definition multixact.h:43
@ MultiXactStatusUpdate
Definition multixact.h:45
@ MultiXactStatusForUpdate
Definition multixact.h:41
@ MultiXactStatusForKeyShare
Definition multixact.h:38
#define ISUPDATE_from_mxstatus(status)
Definition multixact.h:51
#define InvalidMultiXactId
Definition multixact.h:25
#define MaxMultiXactStatus
Definition multixact.h:48
#define InvalidOffsetNumber
Definition off.h:26
#define OffsetNumberIsValid(offsetNumber)
Definition off.h:39
#define OffsetNumberNext(offsetNumber)
Definition off.h:52
uint16 OffsetNumber
Definition off.h:24
#define FirstOffsetNumber
Definition off.h:27
#define OffsetNumberPrev(offsetNumber)
Definition off.h:54
#define MaxOffsetNumber
Definition off.h:28
Datum lower(PG_FUNCTION_ARGS)
Datum upper(PG_FUNCTION_ARGS)
Operator oper(ParseState *pstate, List *opname, Oid ltypeId, Oid rtypeId, bool noError, int location)
Definition parse_oper.c:371
int16 attlen
void * arg
#define ERRCODE_DATA_CORRUPTED
static uint32 pg_nextpower2_32(uint32 num)
static PgChecksumMode mode
static const struct exclude_list_item skip[]
FormData_pg_class * Form_pg_class
Definition pg_class.h:156
FormData_pg_database * Form_pg_database
Definition pg_database.h:96
static char buf[DEFAULT_XLOG_SEG_SIZE]
#define pgstat_count_heap_getnext(rel)
Definition pgstat.h:695
#define pgstat_count_heap_scan(rel)
Definition pgstat.h:690
void pgstat_count_heap_update(Relation rel, bool hot, bool newpage)
void pgstat_count_heap_delete(Relation rel)
void pgstat_count_heap_insert(Relation rel, PgStat_Counter n)
#define qsort(a, b, c, d)
Definition port.h:495
static Oid DatumGetObjectId(Datum X)
Definition postgres.h:252
uint64_t Datum
Definition postgres.h:70
static Pointer DatumGetPointer(Datum X)
Definition postgres.h:342
#define InvalidOid
unsigned int Oid
void CheckForSerializableConflictIn(Relation relation, const ItemPointerData *tid, BlockNumber blkno)
Definition predicate.c:4334
void CheckForSerializableConflictOut(Relation relation, TransactionId xid, Snapshot snapshot)
Definition predicate.c:4021
void PredicateLockRelation(Relation relation, Snapshot snapshot)
Definition predicate.c:2574
void PredicateLockTID(Relation relation, const ItemPointerData *tid, Snapshot snapshot, TransactionId tuple_xid)
Definition predicate.c:2619
bool CheckForSerializableConflictOutNeeded(Relation relation, Snapshot snapshot)
Definition predicate.c:3989
#define DELAY_CHKPT_START
Definition proc.h:135
GlobalVisState * GlobalVisTestFor(Relation rel)
Definition procarray.c:4086
bool TransactionIdIsInProgress(TransactionId xid)
Definition procarray.c:1404
void heap_page_prune_opt(Relation relation, Buffer buffer)
Definition pruneheap.c:209
void read_stream_reset(ReadStream *stream)
Buffer read_stream_next_buffer(ReadStream *stream, void **per_buffer_data)
ReadStream * read_stream_begin_relation(int flags, BufferAccessStrategy strategy, Relation rel, ForkNumber forknum, ReadStreamBlockNumberCB callback, void *callback_private_data, size_t per_buffer_data_size)
void read_stream_end(ReadStream *stream)
#define READ_STREAM_USE_BATCHING
Definition read_stream.h:64
BlockNumber(* ReadStreamBlockNumberCB)(ReadStream *stream, void *callback_private_data, void *per_buffer_data)
Definition read_stream.h:77
#define READ_STREAM_DEFAULT
Definition read_stream.h:21
#define READ_STREAM_SEQUENTIAL
Definition read_stream.h:36
#define RelationGetRelid(relation)
Definition rel.h:514
#define RelationIsLogicallyLogged(relation)
Definition rel.h:710
#define RelationGetTargetPageFreeSpace(relation, defaultff)
Definition rel.h:389
#define RelationGetDescr(relation)
Definition rel.h:540
#define RelationGetNumberOfAttributes(relation)
Definition rel.h:520
#define RelationGetRelationName(relation)
Definition rel.h:548
#define RelationIsAccessibleInLogicalDecoding(relation)
Definition rel.h:693
#define RelationNeedsWAL(relation)
Definition rel.h:637
#define RelationUsesLocalBuffers(relation)
Definition rel.h:646
#define HEAP_DEFAULT_FILLFACTOR
Definition rel.h:360
void RelationDecrementReferenceCount(Relation rel)
Definition relcache.c:2195
Bitmapset * RelationGetIndexAttrBitmap(Relation relation, IndexAttrBitmapKind attrKind)
Definition relcache.c:5298
void RelationIncrementReferenceCount(Relation rel)
Definition relcache.c:2182
@ INDEX_ATTR_BITMAP_KEY
Definition relcache.h:69
@ INDEX_ATTR_BITMAP_HOT_BLOCKING
Definition relcache.h:72
@ INDEX_ATTR_BITMAP_SUMMARIZED
Definition relcache.h:73
@ INDEX_ATTR_BITMAP_IDENTITY_KEY
Definition relcache.h:71
ForkNumber
Definition relpath.h:56
@ MAIN_FORKNUM
Definition relpath.h:58
struct ParallelBlockTableScanDescData * ParallelBlockTableScanDesc
Definition relscan.h:103
#define ScanDirectionIsForward(direction)
Definition sdir.h:64
#define ScanDirectionIsBackward(direction)
Definition sdir.h:50
ScanDirection
Definition sdir.h:25
@ ForwardScanDirection
Definition sdir.h:28
TransactionId RecentXmin
Definition snapmgr.c:160
void UnregisterSnapshot(Snapshot snapshot)
Definition snapmgr.c:866
TransactionId TransactionXmin
Definition snapmgr.c:159
bool HaveRegisteredOrActiveSnapshot(void)
Definition snapmgr.c:1644
void InvalidateCatalogSnapshot(void)
Definition snapmgr.c:455
#define IsHistoricMVCCSnapshot(snapshot)
Definition snapmgr.h:59
#define SnapshotAny
Definition snapmgr.h:33
#define InitNonVacuumableSnapshot(snapshotdata, vistestp)
Definition snapmgr.h:50
#define IsMVCCSnapshot(snapshot)
Definition snapmgr.h:55
#define InvalidSnapshot
Definition snapshot.h:119
int get_tablespace_maintenance_io_concurrency(Oid spcid)
Definition spccache.c:229
#define init()
PGPROC * MyProc
Definition proc.c:67
BlockNumber last_free
Definition hio.h:49
BufferAccessStrategy strategy
Definition hio.h:31
uint32 already_extended_by
Definition hio.h:50
BlockNumber next_free
Definition hio.h:48
Buffer current_buf
Definition hio.h:32
MultiXactId NoFreezePageRelminMxid
Definition heapam.h:220
TransactionId FreezePageRelfrozenXid
Definition heapam.h:208
bool freeze_required
Definition heapam.h:182
MultiXactId FreezePageRelminMxid
Definition heapam.h:209
TransactionId NoFreezePageRelfrozenXid
Definition heapam.h:219
BufferAccessStrategy rs_strategy
Definition heapam.h:73
ScanDirection rs_dir
Definition heapam.h:88
uint32 rs_ntuples
Definition heapam.h:99
OffsetNumber rs_coffset
Definition heapam.h:68
Buffer rs_cbuf
Definition heapam.h:70
ParallelBlockTableScanWorkerData * rs_parallelworkerdata
Definition heapam.h:95
BlockNumber rs_startblock
Definition heapam.h:62
HeapTupleData rs_ctup
Definition heapam.h:75
OffsetNumber rs_vistuples[MaxHeapTuplesPerPage]
Definition heapam.h:100
BlockNumber rs_numblocks
Definition heapam.h:63
BlockNumber rs_nblocks
Definition heapam.h:61
ReadStream * rs_read_stream
Definition heapam.h:78
uint32 rs_cindex
Definition heapam.h:98
BlockNumber rs_prefetch_block
Definition heapam.h:89
BlockNumber rs_cblock
Definition heapam.h:69
TableScanDescData rs_base
Definition heapam.h:58
ItemPointerData t_self
Definition htup.h:65
uint32 t_len
Definition htup.h:64
HeapTupleHeader t_data
Definition htup.h:68
Oid t_tableOid
Definition htup.h:66
TransactionId t_xmin
union HeapTupleHeaderData::@49 t_choice
ItemPointerData t_ctid
HeapTupleFields t_heap
int16 npromisingtids
Definition heapam.c:197
LockRelId lockRelId
Definition rel.h:46
Oid relId
Definition rel.h:40
Oid dbId
Definition rel.h:41
TransactionId xid
Definition multixact.h:57
MultiXactStatus status
Definition multixact.h:58
int delayChkptFlags
Definition proc.h:263
LockInfoData rd_lockInfo
Definition rel.h:114
Form_pg_index rd_index
Definition rel.h:192
RelFileLocator rd_locator
Definition rel.h:57
Form_pg_class rd_rel
Definition rel.h:111
bool takenDuringRecovery
Definition snapshot.h:180
TransactionId xmax
Definition tableam.h:150
CommandId cmax
Definition tableam.h:151
ItemPointerData ctid
Definition tableam.h:149
ItemPointerData tid
Definition tableam.h:212
Relation rs_rd
Definition relscan.h:35
uint32 rs_flags
Definition relscan.h:63
struct ScanKeyData * rs_key
Definition relscan.h:38
struct SnapshotData * rs_snapshot
Definition relscan.h:36
struct ParallelTableScanDescData * rs_parallel
Definition relscan.h:65
TransactionId FreezeLimit
Definition vacuum.h:289
TransactionId OldestXmin
Definition vacuum.h:279
TransactionId relfrozenxid
Definition vacuum.h:263
MultiXactId relminmxid
Definition vacuum.h:264
MultiXactId MultiXactCutoff
Definition vacuum.h:290
MultiXactId OldestMxact
Definition vacuum.h:280
Definition c.h:706
OffsetNumber offnum
TransactionId SubTransGetTopmostTransaction(TransactionId xid)
Definition subtrans.c:162
void ss_report_location(Relation rel, BlockNumber location)
Definition syncscan.c:289
BlockNumber ss_get_location(Relation rel, BlockNumber relnblocks)
Definition syncscan.c:254
#define FirstLowInvalidHeapAttributeNumber
Definition sysattr.h:27
#define TableOidAttributeNumber
Definition sysattr.h:26
bool RelationSupportsSysCache(Oid relid)
Definition syscache.c:762
void table_block_parallelscan_startblock_init(Relation rel, ParallelBlockTableScanWorker pbscanwork, ParallelBlockTableScanDesc pbscan, BlockNumber startblock, BlockNumber numblocks)
Definition tableam.c:451
BlockNumber table_block_parallelscan_nextpage(Relation rel, ParallelBlockTableScanWorker pbscanwork, ParallelBlockTableScanDesc pbscan)
Definition tableam.c:546
bool synchronize_seqscans
Definition tableam.c:50
@ SO_ALLOW_STRAT
Definition tableam.h:58
@ SO_TYPE_TIDRANGESCAN
Definition tableam.h:53
@ SO_TEMP_SNAPSHOT
Definition tableam.h:65
@ SO_ALLOW_PAGEMODE
Definition tableam.h:62
@ SO_TYPE_SAMPLESCAN
Definition tableam.h:51
@ SO_ALLOW_SYNC
Definition tableam.h:60
@ SO_TYPE_SEQSCAN
Definition tableam.h:49
@ SO_TYPE_BITMAPSCAN
Definition tableam.h:50
TU_UpdateIndexes
Definition tableam.h:111
@ TU_Summarizing
Definition tableam.h:119
@ TU_All
Definition tableam.h:116
@ TU_None
Definition tableam.h:113
TM_Result
Definition tableam.h:73
@ TM_Ok
Definition tableam.h:78
@ TM_BeingModified
Definition tableam.h:100
@ TM_Deleted
Definition tableam.h:93
@ TM_WouldBlock
Definition tableam.h:103
@ TM_Updated
Definition tableam.h:90
@ TM_SelfModified
Definition tableam.h:84
@ TM_Invisible
Definition tableam.h:81
bool tbm_iterate(TBMIterator *iterator, TBMIterateResult *tbmres)
Definition tidbitmap.c:1614
bool TransactionIdDidCommit(TransactionId transactionId)
Definition transam.c:126
bool TransactionIdDidAbort(TransactionId transactionId)
Definition transam.c:188
static bool TransactionIdFollows(TransactionId id1, TransactionId id2)
Definition transam.h:297
#define InvalidTransactionId
Definition transam.h:31
static bool TransactionIdPrecedesOrEquals(TransactionId id1, TransactionId id2)
Definition transam.h:282
static bool TransactionIdFollowsOrEquals(TransactionId id1, TransactionId id2)
Definition transam.h:312
#define TransactionIdEquals(id1, id2)
Definition transam.h:43
#define TransactionIdIsValid(xid)
Definition transam.h:41
#define TransactionIdIsNormal(xid)
Definition transam.h:42
static bool TransactionIdPrecedes(TransactionId id1, TransactionId id2)
Definition transam.h:263
static CompactAttribute * TupleDescCompactAttr(TupleDesc tupdesc, int i)
Definition tupdesc.h:175
static TupleTableSlot * ExecClearTuple(TupleTableSlot *slot)
Definition tuptable.h:457
static bool HeapKeyTest(HeapTuple tuple, TupleDesc tupdesc, int nkeys, ScanKey keys)
Definition valid.h:28
static bool VARATT_IS_EXTERNAL(const void *PTR)
Definition varatt.h:354
bool visibilitymap_clear(Relation rel, BlockNumber heapBlk, Buffer vmbuf, uint8 flags)
void visibilitymap_pin(Relation rel, BlockNumber heapBlk, Buffer *vmbuf)
void visibilitymap_set_vmbits(BlockNumber heapBlk, Buffer vmBuf, uint8 flags, const RelFileLocator rlocator)
#define VISIBILITYMAP_VALID_BITS
#define VISIBILITYMAP_ALL_FROZEN
#define VISIBILITYMAP_XLOG_CATALOG_REL
#define VISIBILITYMAP_ALL_VISIBLE
TransactionId GetTopTransactionId(void)
Definition xact.c:427
TransactionId GetTopTransactionIdIfAny(void)
Definition xact.c:442
bool TransactionIdIsCurrentTransactionId(TransactionId xid)
Definition xact.c:942
bool IsInParallelMode(void)
Definition xact.c:1090
TransactionId GetCurrentTransactionId(void)
Definition xact.c:455
CommandId GetCurrentCommandId(bool used)
Definition xact.c:830
#define IsolationIsSerializable()
Definition xact.h:53
#define XLOG_INCLUDE_ORIGIN
Definition xlog.h:165
#define XLogHintBitIsNeeded()
Definition xlog.h:122
#define XLogStandbyInfoActive()
Definition xlog.h:125
uint64 XLogRecPtr
Definition xlogdefs.h:21
XLogRecPtr XLogInsert(RmgrId rmid, uint8 info)
Definition xloginsert.c:478
void XLogRegisterBufData(uint8 block_id, const void *data, uint32 len)
Definition xloginsert.c:409
bool XLogCheckBufferNeedsBackup(Buffer buffer)
void XLogRegisterData(const void *data, uint32 len)
Definition xloginsert.c:368
void XLogSetRecordFlags(uint8 flags)
Definition xloginsert.c:460
void XLogRegisterBlock(uint8 block_id, RelFileLocator *rlocator, ForkNumber forknum, BlockNumber blknum, const PageData *page, uint8 flags)
Definition xloginsert.c:313
void XLogRegisterBuffer(uint8 block_id, Buffer buffer, uint8 flags)
Definition xloginsert.c:245
void XLogBeginInsert(void)
Definition xloginsert.c:152
#define REGBUF_STANDARD
Definition xloginsert.h:35
#define REGBUF_NO_IMAGE
Definition xloginsert.h:33
#define REGBUF_KEEP_DATA
Definition xloginsert.h:36
#define REGBUF_WILL_INIT
Definition xloginsert.h:34

◆ FRM_INVALIDATE_XMAX

#define FRM_INVALIDATE_XMAX   0x0002

Definition at line 6732 of file heapam.c.

◆ FRM_MARK_COMMITTED

#define FRM_MARK_COMMITTED   0x0010

Definition at line 6735 of file heapam.c.

◆ FRM_NOOP

#define FRM_NOOP   0x0001

Definition at line 6731 of file heapam.c.

◆ FRM_RETURN_IS_MULTI

#define FRM_RETURN_IS_MULTI   0x0008

Definition at line 6734 of file heapam.c.

◆ FRM_RETURN_IS_XID

#define FRM_RETURN_IS_XID   0x0004

Definition at line 6733 of file heapam.c.

◆ LOCKMODE_from_mxstatus

#define LOCKMODE_from_mxstatus (   status)     (tupleLockExtraInfo[TUPLOCK_from_mxstatus((status))].hwlock)

Definition at line 158 of file heapam.c.

◆ LockTupleTuplock

#define LockTupleTuplock (   rel,
  tup,
  mode 
)     LockTuple((rel), (tup), tupleLockExtraInfo[mode].hwlock)

Definition at line 166 of file heapam.c.

◆ TUPLOCK_from_mxstatus

#define TUPLOCK_from_mxstatus (   status)     (MultiXactStatusLock[(status)])

Definition at line 217 of file heapam.c.

◆ UnlockTupleTuplock

#define UnlockTupleTuplock (   rel,
  tup,
  mode 
)     UnlockTuple((rel), (tup), tupleLockExtraInfo[mode].hwlock)

Definition at line 168 of file heapam.c.
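
The Tuplock wrappers above only translate a LockTupleMode into the corresponding heavyweight lock through tupleLockExtraInfo. A minimal sketch of how such a pair is typically used inside heapam.c, assuming a relation, a tuple TID, and a lock mode are already in hand (the wrapper function name is illustrative, not part of the file):

static void
sketch_tuplock_pair(Relation rel, ItemPointer tid, LockTupleMode mode)
{
    /* map the LockTupleMode to its heavyweight lock and acquire it */
    LockTupleTuplock(rel, tid, mode);

    /* ... examine or wait on the tuple's xmax while the lock is held ... */

    /* release the same heavyweight lock once the tuple-level work is done */
    UnlockTupleTuplock(rel, tid, mode);
}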

Typedef Documentation

◆ IndexDeleteCounts

Function Documentation

◆ AssertHasSnapshotForToast()

static void AssertHasSnapshotForToast ( Relation  rel)
inline static

Definition at line 224 of file heapam.c.

225{
226#ifdef USE_ASSERT_CHECKING
227
228 /* bootstrap mode in particular breaks this rule */
229 if (!IsNormalProcessingMode())
230 return;
231
232 /* if the relation doesn't have a TOAST table, we are good */
233 if (!OidIsValid(rel->rd_rel->reltoastrelid))
234 return;
235
236 Assert(HaveRegisteredOrActiveSnapshot());
237
238#endif /* USE_ASSERT_CHECKING */
239}

References Assert, HaveRegisteredOrActiveSnapshot(), IsNormalProcessingMode, OidIsValid, and RelationData::rd_rel.

Referenced by heap_delete(), heap_insert(), heap_multi_insert(), and heap_update().
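
The assertion encodes the rule that heap modifications which may need to toast or detoast values must run under a registered or active snapshot. A hedged sketch of a caller honoring that rule (the snapshot calls and heap_insert() are real APIs; the wrapper itself is illustrative):

static void
sketch_insert_with_snapshot(Relation rel, HeapTuple tup)
{
    /* make sure a snapshot is active before any TOAST access can happen */
    PushActiveSnapshot(GetTransactionSnapshot());

    /* heap_insert() will now pass the AssertHasSnapshotForToast() check */
    heap_insert(rel, tup, GetCurrentCommandId(true), 0, NULL);

    PopActiveSnapshot();
}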

◆ bitmapheap_stream_read_next()

static BlockNumber bitmapheap_stream_read_next ( ReadStream *  pgsr,
void *  private_data,
void *  per_buffer_data 
)
static

Definition at line 316 of file heapam.c.

318{
319 TBMIterateResult *tbmres = per_buffer_data;
320 BitmapHeapScanDesc bscan = (BitmapHeapScanDesc) private_data;
321 HeapScanDesc hscan = (HeapScanDesc) bscan;
322 TableScanDesc sscan = &hscan->rs_base;
323
324 for (;;)
325 {
326 CHECK_FOR_INTERRUPTS();
327
328 /* no more entries in the bitmap */
329 if (!tbm_iterate(&sscan->st.rs_tbmiterator, tbmres))
330 return InvalidBlockNumber;
331
332 /*
333 * Ignore any claimed entries past what we think is the end of the
334 * relation. It may have been extended after the start of our scan (we
335 * only hold an AccessShareLock, and it could be inserts from this
336 * backend). We don't take this optimization in SERIALIZABLE
337 * isolation though, as we need to examine all invisible tuples
338 * reachable by the index.
339 */
340 if (!IsolationIsSerializable() &&
341 tbmres->blockno >= hscan->rs_nblocks)
342 continue;
343
344 return tbmres->blockno;
345 }
346
347 /* not reachable */
348 Assert(false);
349}

References Assert, CHECK_FOR_INTERRUPTS, InvalidBlockNumber, IsolationIsSerializable, and tbm_iterate().

Referenced by heap_beginscan().
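
Callbacks of this shape are handed to the read-stream machinery when the scan is set up; heap_beginscan() passes bitmapheap_stream_read_next for bitmap scans. A hedged sketch of the general registration pattern, with an illustrative callback that simply walks the first nblocks of a relation (only the read_stream_* API, MAIN_FORKNUM, and InvalidBlockNumber are real names; the rest are assumptions):

typedef struct SketchStreamState
{
    BlockNumber next;
    BlockNumber nblocks;
} SketchStreamState;

static BlockNumber
sketch_read_next(ReadStream *stream, void *callback_private_data,
                 void *per_buffer_data)
{
    SketchStreamState *state = (SketchStreamState *) callback_private_data;

    if (state->next >= state->nblocks)
        return InvalidBlockNumber;      /* no more blocks: ends the stream */
    return state->next++;
}

static ReadStream *
sketch_begin_stream(Relation rel, SketchStreamState *state)
{
    return read_stream_begin_relation(READ_STREAM_DEFAULT,
                                      NULL,     /* no buffer access strategy */
                                      rel,
                                      MAIN_FORKNUM,
                                      sketch_read_next,
                                      state,    /* callback_private_data */
                                      0);       /* per_buffer_data_size */
}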

◆ bottomup_nblocksfavorable()

static int bottomup_nblocksfavorable ( IndexDeleteCounts *  blockgroups,
int  nblockgroups,
TM_IndexDelete *  deltids 
)
static

Definition at line 8639 of file heapam.c.

8641{
8642 int64 lastblock = -1;
8643 int nblocksfavorable = 0;
8644
8645 Assert(nblockgroups >= 1);
8646 Assert(nblockgroups <= BOTTOMUP_MAX_NBLOCKS);
8647
8648 /*
8649 * We tolerate heap blocks that will be accessed only slightly out of
8650 * physical order. Small blips occur when a pair of almost-contiguous
8651 * blocks happen to fall into different buckets (perhaps due only to a
8652 * small difference in npromisingtids that the bucketing scheme didn't
8653 * quite manage to ignore). We effectively ignore these blips by applying
8654 * a small tolerance. The precise tolerance we use is a little arbitrary,
8655 * but it works well enough in practice.
8656 */
8657 for (int b = 0; b < nblockgroups; b++)
8658 {
8659 IndexDeleteCounts *group = blockgroups + b;
8660 TM_IndexDelete *firstdtid = deltids + group->ifirsttid;
8661 BlockNumber block = ItemPointerGetBlockNumber(&firstdtid->tid);
8662
8663 if (lastblock != -1 &&
8664 ((int64) block < lastblock - BOTTOMUP_TOLERANCE_NBLOCKS ||
8665 (int64) block > lastblock + BOTTOMUP_TOLERANCE_NBLOCKS))
8666 break;
8667
8668 nblocksfavorable++;
8669 lastblock = block;
8670 }
8671
8672 /* Always indicate that there is at least 1 favorable block */
8674
8675 return nblocksfavorable;
8676}

References Assert, b, BOTTOMUP_MAX_NBLOCKS, BOTTOMUP_TOLERANCE_NBLOCKS, IndexDeleteCounts::ifirsttid, and ItemPointerGetBlockNumber().

Referenced by bottomup_sort_and_shrink().
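
The tolerance test described in the comment can be restated in isolation as below; BOTTOMUP_TOLERANCE_NBLOCKS is the real constant, while the helper name is only illustrative:

/* Sketch: does "block" continue the favorable (nearly sequential) streak? */
static bool
sketch_block_is_favorable(int64 lastblock, BlockNumber block)
{
    if (lastblock == -1)
        return true;            /* the first block group always qualifies */
    return ((int64) block >= lastblock - BOTTOMUP_TOLERANCE_NBLOCKS &&
            (int64) block <= lastblock + BOTTOMUP_TOLERANCE_NBLOCKS);
}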

◆ bottomup_sort_and_shrink()

static int bottomup_sort_and_shrink ( TM_IndexDeleteOp *  delstate)
static

Definition at line 8755 of file heapam.c.

8756{
8760 int nblockgroups = 0;
8761 int ncopied = 0;
8762 int nblocksfavorable = 0;
8763
8764 Assert(delstate->bottomup);
8765 Assert(delstate->ndeltids > 0);
8766
8767 /* Calculate per-heap-block count of TIDs */
8769 for (int i = 0; i < delstate->ndeltids; i++)
8770 {
8771 TM_IndexDelete *ideltid = &delstate->deltids[i];
8772 TM_IndexStatus *istatus = delstate->status + ideltid->id;
8773 ItemPointer htid = &ideltid->tid;
8774 bool promising = istatus->promising;
8775
8777 {
8778 /* New block group */
8779 nblockgroups++;
8780
8783
8785 blockgroups[nblockgroups - 1].ifirsttid = i;
8786 blockgroups[nblockgroups - 1].ntids = 1;
8787 blockgroups[nblockgroups - 1].npromisingtids = 0;
8788 }
8789 else
8790 {
8791 blockgroups[nblockgroups - 1].ntids++;
8792 }
8793
8794 if (promising)
8795 blockgroups[nblockgroups - 1].npromisingtids++;
8796 }
8797
8798 /*
8799 * We're about ready to sort block groups to determine the optimal order
8800 * for visiting heap blocks. But before we do, round the number of
8801 * promising tuples for each block group up to the next power-of-two,
8802 * unless it is very low (less than 4), in which case we round up to 4.
8803 * npromisingtids is far too noisy to trust when choosing between a pair
8804 * of block groups that both have very low values.
8805 *
8806 * This scheme divides heap blocks/block groups into buckets. Each bucket
8807 * contains blocks that have _approximately_ the same number of promising
8808 * TIDs as each other. The goal is to ignore relatively small differences
8809 * in the total number of promising entries, so that the whole process can
8810 * give a little weight to heapam factors (like heap block locality)
8811 * instead. This isn't a trade-off, really -- we have nothing to lose. It
8812 * would be foolish to interpret small differences in npromisingtids
8813 * values as anything more than noise.
8814 *
8815 * We tiebreak on nhtids when sorting block group subsets that have the
8816 * same npromisingtids, but this has the same issues as npromisingtids,
8817 * and so nhtids is subject to the same power-of-two bucketing scheme. The
8818 * only reason that we don't fix nhtids in the same way here too is that
8819 * we'll need accurate nhtids values after the sort. We handle nhtids
8820 * bucketization dynamically instead (in the sort comparator).
8821 *
8822 * See bottomup_nblocksfavorable() for a full explanation of when and how
8823 * heap locality/favorable blocks can significantly influence when and how
8824 * heap blocks are accessed.
8825 */
8826 for (int b = 0; b < nblockgroups; b++)
8827 {
8828 IndexDeleteCounts *group = blockgroups + b;
8829
8830 /* Better off falling back on nhtids with low npromisingtids */
8831 if (group->npromisingtids <= 4)
8832 group->npromisingtids = 4;
8833 else
8834 group->npromisingtids =
8836 }
8837
8838 /* Sort groups and rearrange caller's deltids array */
8841 reordereddeltids = palloc(delstate->ndeltids * sizeof(TM_IndexDelete));
8842
8844 /* Determine number of favorable blocks at the start of final deltids */
8846 delstate->deltids);
8847
8848 for (int b = 0; b < nblockgroups; b++)
8849 {
8850 IndexDeleteCounts *group = blockgroups + b;
8851 TM_IndexDelete *firstdtid = delstate->deltids + group->ifirsttid;
8852
8854 sizeof(TM_IndexDelete) * group->ntids);
8855 ncopied += group->ntids;
8856 }
8857
8858 /* Copy final grouped and sorted TIDs back into start of caller's array */
8860 sizeof(TM_IndexDelete) * ncopied);
8861 delstate->ndeltids = ncopied;
8862
8865
8866 return nblocksfavorable;
8867}

References Assert, b, BlockNumberIsValid(), BOTTOMUP_MAX_NBLOCKS, bottomup_nblocksfavorable(), bottomup_sort_and_shrink_cmp(), i, IndexDeleteCounts::ifirsttid, InvalidBlockNumber, ItemPointerGetBlockNumber(), Min, IndexDeleteCounts::npromisingtids, IndexDeleteCounts::ntids, palloc(), palloc_array, pfree(), pg_nextpower2_32(), and qsort.

Referenced by heap_index_delete_tuples().
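
The power-of-two bucketing that the long comment describes boils down to a couple of lines; this hedged restatement (the helper name is an assumption) shows the rule applied to one block group's npromisingtids count:

/* Sketch: collapse noisy npromisingtids counts into coarse buckets. */
static int16
sketch_bucketize_npromising(int16 npromisingtids)
{
    if (npromisingtids <= 4)
        return 4;               /* very low counts are pure noise; lump them */
    return (int16) pg_nextpower2_32((uint32) npromisingtids);
}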

◆ bottomup_sort_and_shrink_cmp()

static int bottomup_sort_and_shrink_cmp ( const void *  arg1,
const void *  arg2 
)
static

Definition at line 8682 of file heapam.c.

8683{
8684 const IndexDeleteCounts *group1 = (const IndexDeleteCounts *) arg1;
8685 const IndexDeleteCounts *group2 = (const IndexDeleteCounts *) arg2;
8686
8687 /*
8688 * Most significant field is npromisingtids (which we invert the order of
8689 * so as to sort in desc order).
8690 *
8691 * Caller should have already normalized npromisingtids fields into
8692 * power-of-two values (buckets).
8693 */
8694 if (group1->npromisingtids > group2->npromisingtids)
8695 return -1;
8696 if (group1->npromisingtids < group2->npromisingtids)
8697 return 1;
8698
8699 /*
8700 * Tiebreak: desc ntids sort order.
8701 *
8702 * We cannot expect power-of-two values for ntids fields. We should
8703 * behave as if they were already rounded up for us instead.
8704 */
8705 if (group1->ntids != group2->ntids)
8706 {
8707 uint32 ntids1 = pg_nextpower2_32((uint32) group1->ntids);
8708 uint32 ntids2 = pg_nextpower2_32((uint32) group2->ntids);
8709
8710 if (ntids1 > ntids2)
8711 return -1;
8712 if (ntids1 < ntids2)
8713 return 1;
8714 }
8715
8716 /*
8717 * Tiebreak: asc offset-into-deltids-for-block (offset to first TID for
8718 * block in deltids array) order.
8719 *
8720 * This is equivalent to sorting in ascending heap block number order
8721 * (among otherwise equal subsets of the array). This approach allows us
8722 * to avoid accessing the out-of-line TID. (We rely on the assumption
8723 * that the deltids array was sorted in ascending heap TID order when
8724 * these offsets to the first TID from each heap block group were formed.)
8725 */
8726 if (group1->ifirsttid > group2->ifirsttid)
8727 return 1;
8728 if (group1->ifirsttid < group2->ifirsttid)
8729 return -1;
8730
8731 pg_unreachable();
8732
8733 return 0;
8734}

References pg_nextpower2_32(), and pg_unreachable.

Referenced by bottomup_sort_and_shrink().
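
The comparator is only used to order the per-block groups before the deltids array is rearranged; a hedged sketch of that single call-site shape (the array and count names follow the caller shown above and are illustrative here):

/* Sketch: sort block groups by descending promising-TID buckets, then ntids. */
qsort(blockgroups, nblockgroups, sizeof(IndexDeleteCounts),
      bottomup_sort_and_shrink_cmp);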

◆ compute_infobits()

◆ compute_new_xmax_infomask()

static void compute_new_xmax_infomask ( TransactionId  xmax,
uint16  old_infomask,
uint16  old_infomask2,
TransactionId  add_to_xmax,
LockTupleMode  mode,
bool  is_update,
TransactionId *  result_xmax,
uint16 *  result_infomask,
uint16 *  result_infomask2 
)
static

Definition at line 5394 of file heapam.c.

5399{
5400 TransactionId new_xmax;
5403
5405
5406l5:
5407 new_infomask = 0;
5408 new_infomask2 = 0;
5410 {
5411 /*
5412 * No previous locker; we just insert our own TransactionId.
5413 *
5414 * Note that it's critical that this case be the first one checked,
5415 * because there are several blocks below that come back to this one
5416 * to implement certain optimizations; old_infomask might contain
5417 * other dirty bits in those cases, but we don't really care.
5418 */
5419 if (is_update)
5420 {
5421 new_xmax = add_to_xmax;
5422 if (mode == LockTupleExclusive)
5424 }
5425 else
5426 {
5428 switch (mode)
5429 {
5430 case LockTupleKeyShare:
5431 new_xmax = add_to_xmax;
5433 break;
5434 case LockTupleShare:
5435 new_xmax = add_to_xmax;
5437 break;
5439 new_xmax = add_to_xmax;
5441 break;
5442 case LockTupleExclusive:
5443 new_xmax = add_to_xmax;
5446 break;
5447 default:
5448 new_xmax = InvalidTransactionId; /* silence compiler */
5449 elog(ERROR, "invalid lock mode");
5450 }
5451 }
5452 }
5454 {
5456
5457 /*
5458 * Currently we don't allow XMAX_COMMITTED to be set for multis, so
5459 * cross-check.
5460 */
5462
5463 /*
5464 * A multixact together with LOCK_ONLY set but neither lock bit set
5465 * (i.e. a pg_upgraded share locked tuple) cannot possibly be running
5466 * anymore. This check is critical for databases upgraded by
5467 * pg_upgrade; both MultiXactIdIsRunning and MultiXactIdExpand assume
5468 * that such multis are never passed.
5469 */
5471 {
5474 goto l5;
5475 }
5476
5477 /*
5478 * If the XMAX is already a MultiXactId, then we need to expand it to
5479 * include add_to_xmax; but if all the members were lockers and are
5480 * all gone, we can do away with the IS_MULTI bit and just set
5481 * add_to_xmax as the only locker/updater. If all lockers are gone
5482 * and we have an updater that aborted, we can also do without a
5483 * multi.
5484 *
5485 * The cost of doing GetMultiXactIdMembers would be paid by
5486 * MultiXactIdExpand if we weren't to do this, so this check is not
5487 * incurring extra work anyhow.
5488 */
5490 {
5493 old_infomask)))
5494 {
5495 /*
5496 * Reset these bits and restart; otherwise fall through to
5497 * create a new multi below.
5498 */
5501 goto l5;
5502 }
5503 }
5504
5506
5507 new_xmax = MultiXactIdExpand((MultiXactId) xmax, add_to_xmax,
5508 new_status);
5510 }
5512 {
5513 /*
5514 * It's a committed update, so we need to preserve him as updater of
5515 * the tuple.
5516 */
5517 MultiXactStatus status;
5519
5521 status = MultiXactStatusUpdate;
5522 else
5524
5526
5527 /*
5528 * since it's not running, it's obviously impossible for the old
5529 * updater to be identical to the current one, so we need not check
5530 * for that case as we do in the block above.
5531 */
5532 new_xmax = MultiXactIdCreate(xmax, status, add_to_xmax, new_status);
5534 }
5535 else if (TransactionIdIsInProgress(xmax))
5536 {
5537 /*
5538 * If the XMAX is a valid, in-progress TransactionId, then we need to
5539 * create a new MultiXactId that includes both the old locker or
5540 * updater and our own TransactionId.
5541 */
5545
5547 {
5553 {
5556 else
5558 }
5559 else
5560 {
5561 /*
5562 * LOCK_ONLY can be present alone only when a page has been
5563 * upgraded by pg_upgrade. But in that case,
5564 * TransactionIdIsInProgress() should have returned false. We
5565 * assume it's no longer locked in this case.
5566 */
5567 elog(WARNING, "LOCK_ONLY found for Xid in progress %u", xmax);
5570 goto l5;
5571 }
5572 }
5573 else
5574 {
5575 /* it's an update, but which kind? */
5578 else
5580 }
5581
5583
5584 /*
5585 * If the lock to be acquired is for the same TransactionId as the
5586 * existing lock, there's an optimization possible: consider only the
5587 * strongest of both locks as the only one present, and restart.
5588 */
5589 if (xmax == add_to_xmax)
5590 {
5591 /*
5592 * Note that it's not possible for the original tuple to be
5593 * updated: we wouldn't be here because the tuple would have been
5594 * invisible and we wouldn't try to update it. As a subtlety,
5595 * this code can also run when traversing an update chain to lock
5596 * future versions of a tuple. But we wouldn't be here either,
5597 * because the add_to_xmax would be different from the original
5598 * updater.
5599 */
5601
5602 /* acquire the strongest of both */
5603 if (mode < old_mode)
5604 mode = old_mode;
5605 /* mustn't touch is_update */
5606
5608 goto l5;
5609 }
5610
5611 /* otherwise, just fall back to creating a new multixact */
5613 new_xmax = MultiXactIdCreate(xmax, old_status,
5616 }
5619 {
5620 /*
5621 * It's a committed update, so we gotta preserve him as updater of the
5622 * tuple.
5623 */
5624 MultiXactStatus status;
5626
5628 status = MultiXactStatusUpdate;
5629 else
5631
5633
5634 /*
5635 * since it's not running, it's obviously impossible for the old
5636 * updater to be identical to the current one, so we need not check
5637 * for that case as we do in the block above.
5638 */
5639 new_xmax = MultiXactIdCreate(xmax, status, add_to_xmax, new_status);
5641 }
5642 else
5643 {
5644 /*
5645 * Can get here iff the locking/updating transaction was running when
5646 * the infomask was extracted from the tuple, but finished before
5647 * TransactionIdIsInProgress got to run. Deal with it as if there was
5648 * no locker at all in the first place.
5649 */
5651 goto l5;
5652 }
5653
5656 *result_xmax = new_xmax;
5657}

References Assert, elog, ERROR, get_mxact_status_for_lock(), GetMultiXactIdHintBits(), HEAP_KEYS_UPDATED, HEAP_LOCKED_UPGRADED(), HEAP_XMAX_COMMITTED, HEAP_XMAX_EXCL_LOCK, HEAP_XMAX_INVALID, HEAP_XMAX_IS_EXCL_LOCKED(), HEAP_XMAX_IS_KEYSHR_LOCKED(), HEAP_XMAX_IS_LOCKED_ONLY(), HEAP_XMAX_IS_MULTI, HEAP_XMAX_IS_SHR_LOCKED(), HEAP_XMAX_KEYSHR_LOCK, HEAP_XMAX_LOCK_ONLY, HEAP_XMAX_SHR_LOCK, InvalidTransactionId, LockTupleExclusive, LockTupleKeyShare, LockTupleNoKeyExclusive, LockTupleShare, mode, MultiXactIdCreate(), MultiXactIdExpand(), MultiXactIdGetUpdateXid(), MultiXactIdIsRunning(), MultiXactStatusForKeyShare, MultiXactStatusForNoKeyUpdate, MultiXactStatusForShare, MultiXactStatusForUpdate, MultiXactStatusNoKeyUpdate, MultiXactStatusUpdate, TransactionIdDidCommit(), TransactionIdIsCurrentTransactionId(), TransactionIdIsInProgress(), TUPLOCK_from_mxstatus, and WARNING.

Referenced by heap_delete(), heap_lock_tuple(), heap_lock_updated_tuple_rec(), and heap_update().
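
Callers always feed in the tuple's current raw xmax and infomask bits and receive the replacement values through the three out parameters. A hedged sketch of the lock-only calling pattern (the wrapper function is illustrative; the individual calls are real APIs):

static void
sketch_compute_lock_xmax(HeapTupleHeader htup, LockTupleMode mode)
{
    TransactionId new_xmax;
    uint16      new_infomask;
    uint16      new_infomask2;

    compute_new_xmax_infomask(HeapTupleHeaderGetRawXmax(htup),
                              htup->t_infomask, htup->t_infomask2,
                              GetCurrentTransactionId(), mode,
                              false,    /* is_update: just locking */
                              &new_xmax, &new_infomask, &new_infomask2);

    /* the caller would install new_xmax/new_infomask* inside a critical section */
}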

◆ ConditionalMultiXactIdWait()

static bool ConditionalMultiXactIdWait ( MultiXactId  multi,
MultiXactStatus  status,
uint16  infomask,
Relation  rel,
int *  remaining,
bool  logLockFailure 
)
static

Definition at line 7875 of file heapam.c.

7878{
7879 return Do_MultiXactIdWait(multi, status, infomask, true,
7880 rel, NULL, XLTW_None, remaining, logLockFailure);
7881}

References Do_MultiXactIdWait(), remaining, and XLTW_None.

Referenced by heap_lock_tuple().

◆ Do_MultiXactIdWait()

static bool Do_MultiXactIdWait ( MultiXactId  multi,
MultiXactStatus  status,
uint16  infomask,
bool  nowait,
Relation  rel,
const ItemPointerData *  ctid,
XLTW_Oper  oper,
int *  remaining,
bool  logLockFailure 
)
static

Definition at line 7775 of file heapam.c.

7779{
7780 bool result = true;
7781 MultiXactMember *members;
7782 int nmembers;
7783 int remain = 0;
7784
7785 /* for pre-pg_upgrade tuples, no need to sleep at all */
7786 nmembers = HEAP_LOCKED_UPGRADED(infomask) ? -1 :
7787 GetMultiXactIdMembers(multi, &members, false,
7789
7790 if (nmembers >= 0)
7791 {
7792 int i;
7793
7794 for (i = 0; i < nmembers; i++)
7795 {
7796 TransactionId memxid = members[i].xid;
7797 MultiXactStatus memstatus = members[i].status;
7798
7800 {
7801 remain++;
7802 continue;
7803 }
7804
7806 LOCKMODE_from_mxstatus(status)))
7807 {
7809 remain++;
7810 continue;
7811 }
7812
7813 /*
7814 * This member conflicts with our multi, so we have to sleep (or
7815 * return failure, if asked to avoid waiting.)
7816 *
7817 * Note that we don't set up an error context callback ourselves,
7818 * but instead we pass the info down to XactLockTableWait. This
7819 * might seem a bit wasteful because the context is set up and
7820 * tore down for each member of the multixact, but in reality it
7821 * should be barely noticeable, and it avoids duplicate code.
7822 */
7823 if (nowait)
7824 {
7826 if (!result)
7827 break;
7828 }
7829 else
7830 XactLockTableWait(memxid, rel, ctid, oper);
7831 }
7832
7833 pfree(members);
7834 }
7835
7836 if (remaining)
7837 *remaining = remain;
7838
7839 return result;
7840}

References ConditionalXactLockTableWait(), DoLockModesConflict(), GetMultiXactIdMembers(), HEAP_LOCKED_UPGRADED(), HEAP_XMAX_IS_LOCKED_ONLY(), i, LOCKMODE_from_mxstatus, oper(), pfree(), remaining, MultiXactMember::status, TransactionIdIsCurrentTransactionId(), TransactionIdIsInProgress(), XactLockTableWait(), and MultiXactMember::xid.

Referenced by ConditionalMultiXactIdWait(), and MultiXactIdWait().

◆ DoesMultiXactIdConflict()

static bool DoesMultiXactIdConflict ( MultiXactId  multi,
uint16  infomask,
LockTupleMode  lockmode,
bool *  current_is_member 
)
static

Definition at line 7675 of file heapam.c.

7677{
7678 int nmembers;
7679 MultiXactMember *members;
7680 bool result = false;
7681 LOCKMODE wanted = tupleLockExtraInfo[lockmode].hwlock;
7682
7684 return false;
7685
7686 nmembers = GetMultiXactIdMembers(multi, &members, false,
7688 if (nmembers >= 0)
7689 {
7690 int i;
7691
7692 for (i = 0; i < nmembers; i++)
7693 {
7696
7697 if (result && (current_is_member == NULL || *current_is_member))
7698 break;
7699
7700 memlockmode = LOCKMODE_from_mxstatus(members[i].status);
7701
7702 /* ignore members from current xact (but track their presence) */
7703 memxid = members[i].xid;
7705 {
7706 if (current_is_member != NULL)
7707 *current_is_member = true;
7708 continue;
7709 }
7710 else if (result)
7711 continue;
7712
7713 /* ignore members that don't conflict with the lock we want */
7715 continue;
7716
7717 if (ISUPDATE_from_mxstatus(members[i].status))
7718 {
7719 /* ignore aborted updaters */
7721 continue;
7722 }
7723 else
7724 {
7725 /* ignore lockers-only that are no longer in progress */
7727 continue;
7728 }
7729
7730 /*
7731 * Whatever remains are either live lockers that conflict with our
7732 * wanted lock, and updaters that are not aborted. Those conflict
7733 * with what we want. Set up to return true, but keep going to
7734 * look for the current transaction among the multixact members,
7735 * if needed.
7736 */
7737 result = true;
7738 }
7739 pfree(members);
7740 }
7741
7742 return result;
7743}

References DoLockModesConflict(), GetMultiXactIdMembers(), HEAP_LOCKED_UPGRADED(), HEAP_XMAX_IS_LOCKED_ONLY(), i, ISUPDATE_from_mxstatus, LOCKMODE_from_mxstatus, pfree(), TransactionIdDidAbort(), TransactionIdIsCurrentTransactionId(), TransactionIdIsInProgress(), tupleLockExtraInfo, and MultiXactMember::xid.

Referenced by heap_delete(), heap_inplace_lock(), heap_lock_tuple(), and heap_update().
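
heap_delete() and heap_update() use this function as a cheap pre-check so they only sleep when a genuinely conflicting member exists. A hedged sketch of that check-then-wait shape (simplified; the surrounding buffer-lock handling is omitted and the wrapper name is illustrative):

static void
sketch_wait_if_multixact_conflicts(MultiXactId multi, uint16 infomask,
                                   Relation rel, const ItemPointerData *tid)
{
    bool        current_is_member = false;

    if (DoesMultiXactIdConflict(multi, infomask, LockTupleExclusive,
                                &current_is_member))
    {
        /* somebody else's member conflicts: wait for all of them to finish */
        MultiXactIdWait(multi, MultiXactStatusUpdate, infomask,
                        rel, tid, XLTW_Delete, NULL);
    }
}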

◆ ExtractReplicaIdentity()

static HeapTuple ExtractReplicaIdentity ( Relation  relation,
HeapTuple  tp,
bool  key_required,
bool *  copy 
)
static

Definition at line 9221 of file heapam.c.

9223{
9224 TupleDesc desc = RelationGetDescr(relation);
9225 char replident = relation->rd_rel->relreplident;
9228 bool nulls[MaxHeapAttributeNumber];
9230
9231 *copy = false;
9232
9233 if (!RelationIsLogicallyLogged(relation))
9234 return NULL;
9235
9236 if (replident == REPLICA_IDENTITY_NOTHING)
9237 return NULL;
9238
9239 if (replident == REPLICA_IDENTITY_FULL)
9240 {
9241 /*
9242 * When logging the entire old tuple, it very well could contain
9243 * toasted columns. If so, force them to be inlined.
9244 */
9245 if (HeapTupleHasExternal(tp))
9246 {
9247 *copy = true;
9248 tp = toast_flatten_tuple(tp, desc);
9249 }
9250 return tp;
9251 }
9252
9253 /* if the key isn't required and we're only logging the key, we're done */
9254 if (!key_required)
9255 return NULL;
9256
9257 /* find out the replica identity columns */
9260
9261 /*
9262 * If there's no defined replica identity columns, treat as !key_required.
9263 * (This case should not be reachable from heap_update, since that should
9264 * calculate key_required accurately. But heap_delete just passes
9265 * constant true for key_required, so we can hit this case in deletes.)
9266 */
9267 if (bms_is_empty(idattrs))
9268 return NULL;
9269
9270 /*
9271 * Construct a new tuple containing only the replica identity columns,
9272 * with nulls elsewhere. While we're at it, assert that the replica
9273 * identity columns aren't null.
9274 */
9275 heap_deform_tuple(tp, desc, values, nulls);
9276
9277 for (int i = 0; i < desc->natts; i++)
9278 {
9280 idattrs))
9281 Assert(!nulls[i]);
9282 else
9283 nulls[i] = true;
9284 }
9285
9286 key_tuple = heap_form_tuple(desc, values, nulls);
9287 *copy = true;
9288
9290
9291 /*
9292 * If the tuple, which by here only contains indexed columns, still has
9293 * toasted columns, force them to be inlined. This is somewhat unlikely
9294 * since there's limits on the size of indexed columns, so we don't
9295 * duplicate toast_flatten_tuple()s functionality in the above loop over
9296 * the indexed columns, even if it would be more efficient.
9297 */
9299 {
9301
9304 }
9305
9306 return key_tuple;
9307}

References Assert, bms_free(), bms_is_empty, bms_is_member(), FirstLowInvalidHeapAttributeNumber, heap_deform_tuple(), heap_form_tuple(), heap_freetuple(), HeapTupleHasExternal(), i, INDEX_ATTR_BITMAP_IDENTITY_KEY, MaxHeapAttributeNumber, TupleDescData::natts, RelationData::rd_rel, RelationGetDescr, RelationGetIndexAttrBitmap(), RelationIsLogicallyLogged, toast_flatten_tuple(), and values.

Referenced by heap_delete(), and heap_update().
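
heap_delete() and heap_update() call this to build the old-key tuple that gets attached to the WAL record for logical decoding. A hedged sketch of that call-and-cleanup pattern (variable and wrapper names are illustrative):

static void
sketch_log_old_key(Relation relation, HeapTuple oldtup)
{
    bool        copy;
    HeapTuple   old_key_tuple;

    /* key_required = true: log at least the replica identity columns */
    old_key_tuple = ExtractReplicaIdentity(relation, oldtup, true, &copy);

    /* ... register old_key_tuple with the WAL record being assembled ... */

    if (old_key_tuple != NULL && copy)
        heap_freetuple(old_key_tuple);  /* only free if we got a fresh copy */
}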

◆ FreeBulkInsertState()

◆ FreezeMultiXactId()

static TransactionId FreezeMultiXactId ( MultiXactId  multi,
uint16  t_infomask,
const struct VacuumCutoffs *  cutoffs,
uint16 *  flags,
HeapPageFreeze *  pagefrz 
)
static

Definition at line 6784 of file heapam.c.

6787{
6789 MultiXactMember *members;
6790 int nmembers;
6791 bool need_replace;
6792 int nnewmembers;
6794 bool has_lockers;
6796 bool update_committed;
6797 TransactionId FreezePageRelfrozenXid;
6798
6799 *flags = 0;
6800
6801 /* We should only be called in Multis */
6802 Assert(t_infomask & HEAP_XMAX_IS_MULTI);
6803
6804 if (!MultiXactIdIsValid(multi) ||
6805 HEAP_LOCKED_UPGRADED(t_infomask))
6806 {
6807 *flags |= FRM_INVALIDATE_XMAX;
6808 pagefrz->freeze_required = true;
6809 return InvalidTransactionId;
6810 }
6811 else if (MultiXactIdPrecedes(multi, cutoffs->relminmxid))
6812 ereport(ERROR,
6814 errmsg_internal("found multixact %u from before relminmxid %u",
6815 multi, cutoffs->relminmxid)));
6816 else if (MultiXactIdPrecedes(multi, cutoffs->OldestMxact))
6817 {
6819
6820 /*
6821 * This old multi cannot possibly have members still running, but
6822 * verify just in case. If it was a locker only, it can be removed
6823 * without any further consideration; but if it contained an update,
6824 * we might need to preserve it.
6825 */
6826 if (MultiXactIdIsRunning(multi,
6827 HEAP_XMAX_IS_LOCKED_ONLY(t_infomask)))
6828 ereport(ERROR,
6830 errmsg_internal("multixact %u from before multi freeze cutoff %u found to be still running",
6831 multi, cutoffs->OldestMxact)));
6832
6833 if (HEAP_XMAX_IS_LOCKED_ONLY(t_infomask))
6834 {
6835 *flags |= FRM_INVALIDATE_XMAX;
6836 pagefrz->freeze_required = true;
6837 return InvalidTransactionId;
6838 }
6839
6840 /* replace multi with single XID for its updater? */
6841 update_xact = MultiXactIdGetUpdateXid(multi, t_infomask);
6843 ereport(ERROR,
6845 errmsg_internal("multixact %u contains update XID %u from before relfrozenxid %u",
6846 multi, update_xact,
6847 cutoffs->relfrozenxid)));
6848 else if (TransactionIdPrecedes(update_xact, cutoffs->OldestXmin))
6849 {
6850 /*
6851 * Updater XID has to have aborted (otherwise the tuple would have
6852 * been pruned away instead, since updater XID is < OldestXmin).
6853 * Just remove xmax.
6854 */
6856 ereport(ERROR,
6858 errmsg_internal("multixact %u contains committed update XID %u from before removable cutoff %u",
6859 multi, update_xact,
6860 cutoffs->OldestXmin)));
6861 *flags |= FRM_INVALIDATE_XMAX;
6862 pagefrz->freeze_required = true;
6863 return InvalidTransactionId;
6864 }
6865
6866 /* Have to keep updater XID as new xmax */
6867 *flags |= FRM_RETURN_IS_XID;
6868 pagefrz->freeze_required = true;
6869 return update_xact;
6870 }
6871
6872 /*
6873 * Some member(s) of this Multi may be below FreezeLimit xid cutoff, so we
6874 * need to walk the whole members array to figure out what to do, if
6875 * anything.
6876 */
6877 nmembers =
6878 GetMultiXactIdMembers(multi, &members, false,
6879 HEAP_XMAX_IS_LOCKED_ONLY(t_infomask));
6880 if (nmembers <= 0)
6881 {
6882 /* Nothing worth keeping */
6883 *flags |= FRM_INVALIDATE_XMAX;
6884 pagefrz->freeze_required = true;
6885 return InvalidTransactionId;
6886 }
6887
6888 /*
6889 * The FRM_NOOP case is the only case where we might need to ratchet back
6890 * FreezePageRelfrozenXid or FreezePageRelminMxid. It is also the only
6891 * case where our caller might ratchet back its NoFreezePageRelfrozenXid
6892 * or NoFreezePageRelminMxid "no freeze" trackers to deal with a multi.
6893 * FRM_NOOP handling should result in the NewRelfrozenXid/NewRelminMxid
 6894 * trackers managed by VACUUM being ratcheted back by xmax to the degree
6895 * required to make it safe to leave xmax undisturbed, independent of
6896 * whether or not page freezing is triggered somewhere else.
6897 *
6898 * Our policy is to force freezing in every case other than FRM_NOOP,
6899 * which obviates the need to maintain either set of trackers, anywhere.
6900 * Every other case will reliably execute a freeze plan for xmax that
6901 * either replaces xmax with an XID/MXID >= OldestXmin/OldestMxact, or
6902 * sets xmax to an InvalidTransactionId XID, rendering xmax fully frozen.
6903 * (VACUUM's NewRelfrozenXid/NewRelminMxid trackers are initialized with
6904 * OldestXmin/OldestMxact, so later values never need to be tracked here.)
6905 */
6906 need_replace = false;
6907 FreezePageRelfrozenXid = pagefrz->FreezePageRelfrozenXid;
6908 for (int i = 0; i < nmembers; i++)
6909 {
6910 TransactionId xid = members[i].xid;
6911
6912 Assert(!TransactionIdPrecedes(xid, cutoffs->relfrozenxid));
6913
6914 if (TransactionIdPrecedes(xid, cutoffs->FreezeLimit))
6915 {
6916 /* Can't violate the FreezeLimit postcondition */
6917 need_replace = true;
6918 break;
6919 }
6920 if (TransactionIdPrecedes(xid, FreezePageRelfrozenXid))
6921 FreezePageRelfrozenXid = xid;
6922 }
6923
6924 /* Can't violate the MultiXactCutoff postcondition, either */
6925 if (!need_replace)
6927
6928 if (!need_replace)
6929 {
6930 /*
6931 * vacuumlazy.c might ratchet back NewRelminMxid, NewRelfrozenXid, or
6932 * both together to make it safe to retain this particular multi after
6933 * freezing its page
6934 */
6935 *flags |= FRM_NOOP;
6936 pagefrz->FreezePageRelfrozenXid = FreezePageRelfrozenXid;
6937 if (MultiXactIdPrecedes(multi, pagefrz->FreezePageRelminMxid))
6938 pagefrz->FreezePageRelminMxid = multi;
6939 pfree(members);
6940 return multi;
6941 }
6942
6943 /*
6944 * Do a more thorough second pass over the multi to figure out which
6945 * member XIDs actually need to be kept. Checking the precise status of
6946 * individual members might even show that we don't need to keep anything.
6947 * That is quite possible even though the Multi must be >= OldestMxact,
6948 * since our second pass only keeps member XIDs when it's truly necessary;
6949 * even member XIDs >= OldestXmin often won't be kept by second pass.
6950 */
6951 nnewmembers = 0;
6953 has_lockers = false;
6955 update_committed = false;
6956
6957 /*
6958 * Determine whether to keep each member xid, or to ignore it instead
6959 */
6960 for (int i = 0; i < nmembers; i++)
6961 {
6962 TransactionId xid = members[i].xid;
6963 MultiXactStatus mstatus = members[i].status;
6964
6965 Assert(!TransactionIdPrecedes(xid, cutoffs->relfrozenxid));
6966
6967 if (!ISUPDATE_from_mxstatus(mstatus))
6968 {
6969 /*
6970 * Locker XID (not updater XID). We only keep lockers that are
6971 * still running.
6972 */
6975 {
6976 if (TransactionIdPrecedes(xid, cutoffs->OldestXmin))
6977 ereport(ERROR,
6979 errmsg_internal("multixact %u contains running locker XID %u from before removable cutoff %u",
6980 multi, xid,
6981 cutoffs->OldestXmin)));
6982 newmembers[nnewmembers++] = members[i];
6983 has_lockers = true;
6984 }
6985
6986 continue;
6987 }
6988
6989 /*
6990 * Updater XID (not locker XID). Should we keep it?
6991 *
6992 * Since the tuple wasn't totally removed when vacuum pruned, the
6993 * update Xid cannot possibly be older than OldestXmin cutoff unless
6994 * the updater XID aborted. If the updater transaction is known
6995 * aborted or crashed then it's okay to ignore it, otherwise not.
6996 *
6997 * In any case the Multi should never contain two updaters, whatever
6998 * their individual commit status. Check for that first, in passing.
6999 */
7001 ereport(ERROR,
7003 errmsg_internal("multixact %u has two or more updating members",
7004 multi),
7005 errdetail_internal("First updater XID=%u second updater XID=%u.",
7006 update_xid, xid)));
7007
7008 /*
7009 * As with all tuple visibility routines, it's critical to test
7010 * TransactionIdIsInProgress before TransactionIdDidCommit, because of
7011 * race conditions explained in detail in heapam_visibility.c.
7012 */
7015 update_xid = xid;
7016 else if (TransactionIdDidCommit(xid))
7017 {
7018 /*
7019 * The transaction committed, so we can tell caller to set
7020 * HEAP_XMAX_COMMITTED. (We can only do this because we know the
7021 * transaction is not running.)
7022 */
7023 update_committed = true;
7024 update_xid = xid;
7025 }
7026 else
7027 {
7028 /*
7029 * Not in progress, not committed -- must be aborted or crashed;
7030 * we can ignore it.
7031 */
7032 continue;
7033 }
7034
7035 /*
7036 * We determined that updater must be kept -- add it to pending new
7037 * members list
7038 */
7039 if (TransactionIdPrecedes(xid, cutoffs->OldestXmin))
7040 ereport(ERROR,
7042 errmsg_internal("multixact %u contains committed update XID %u from before removable cutoff %u",
7043 multi, xid, cutoffs->OldestXmin)));
7044 newmembers[nnewmembers++] = members[i];
7045 }
7046
7047 pfree(members);
7048
7049 /*
7050 * Determine what to do with caller's multi based on information gathered
7051 * during our second pass
7052 */
7053 if (nnewmembers == 0)
7054 {
7055 /* Nothing worth keeping */
7056 *flags |= FRM_INVALIDATE_XMAX;
7058 }
7060 {
7061 /*
7062 * If there's a single member and it's an update, pass it back alone
7063 * without creating a new Multi. (XXX we could do this when there's a
7064 * single remaining locker, too, but that would complicate the API too
7065 * much; moreover, the case with the single updater is more
7066 * interesting, because those are longer-lived.)
7067 */
7068 Assert(nnewmembers == 1);
7069 *flags |= FRM_RETURN_IS_XID;
7070 if (update_committed)
7071 *flags |= FRM_MARK_COMMITTED;
7073 }
7074 else
7075 {
7076 /*
7077 * Create a new multixact with the surviving members of the previous
7078 * one, to set as new Xmax in the tuple
7079 */
7081 *flags |= FRM_RETURN_IS_MULTI;
7082 }
7083
7085
7086 pagefrz->freeze_required = true;
7087 return newxmax;
7088}

References Assert, ereport, errcode(), ERRCODE_DATA_CORRUPTED, errdetail_internal(), errmsg_internal(), ERROR, fb(), HeapPageFreeze::freeze_required, VacuumCutoffs::FreezeLimit, HeapPageFreeze::FreezePageRelfrozenXid, HeapPageFreeze::FreezePageRelminMxid, FRM_INVALIDATE_XMAX, FRM_MARK_COMMITTED, FRM_NOOP, FRM_RETURN_IS_MULTI, FRM_RETURN_IS_XID, GetMultiXactIdMembers(), HEAP_LOCKED_UPGRADED(), HEAP_XMAX_IS_LOCKED_ONLY(), HEAP_XMAX_IS_MULTI, i, InvalidTransactionId, ISUPDATE_from_mxstatus, VacuumCutoffs::MultiXactCutoff, MultiXactIdCreateFromMembers(), MultiXactIdGetUpdateXid(), MultiXactIdIsRunning(), MultiXactIdIsValid, MultiXactIdPrecedes(), VacuumCutoffs::OldestMxact, VacuumCutoffs::OldestXmin, palloc_array, pfree(), VacuumCutoffs::relfrozenxid, VacuumCutoffs::relminmxid, MultiXactMember::status, TransactionIdDidCommit(), TransactionIdIsCurrentTransactionId(), TransactionIdIsInProgress(), TransactionIdIsValid, TransactionIdPrecedes(), and MultiXactMember::xid.

Referenced by heap_prepare_freeze_tuple().
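
Example: the FRM_* flags returned through *flags tell the caller how to treat the value this function returns. The following is only a hedged sketch of how those flag combinations could be read, assuming multi, tuple, cutoffs, and pagefrz are in scope as they are in heap_prepare_freeze_tuple(); the real caller records the decision in a HeapTupleFreeze plan rather than acting on the tuple directly.

/* Hedged sketch: interpreting FreezeMultiXactId()'s output flags. */
uint16        flags;
TransactionId newxmax;

newxmax = FreezeMultiXactId(multi, tuple->t_infomask, cutoffs, &flags, pagefrz);

if (flags & FRM_NOOP)
{
    /* keep the existing multi; page trackers were ratcheted back inside */
}
else if (flags & FRM_INVALIDATE_XMAX)
{
    /* xmax can be thrown away entirely (becomes InvalidTransactionId) */
}
else if (flags & FRM_RETURN_IS_XID)
{
    /*
     * Replace the multi with the single surviving updater XID in newxmax;
     * FRM_MARK_COMMITTED additionally permits setting HEAP_XMAX_COMMITTED.
     */
}
else if (flags & FRM_RETURN_IS_MULTI)
{
    /* newxmax is a freshly created multi holding only the needed members */
}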

◆ get_mxact_status_for_lock()

static MultiXactStatus get_mxact_status_for_lock ( LockTupleMode  mode,
bool  is_update 
)
static

Definition at line 4596 of file heapam.c.

4597{
4598 int retval;
4599
4600 if (is_update)
4601 retval = tupleLockExtraInfo[mode].updstatus;
4602 else
4603 retval = tupleLockExtraInfo[mode].lockstatus;
4604
4605 if (retval == -1)
4606 elog(ERROR, "invalid lock tuple mode %d/%s", mode,
4607 is_update ? "true" : "false");
4608
4609 return (MultiXactStatus) retval;
4610}

References elog, ERROR, fb(), mode, and tupleLockExtraInfo.

Referenced by compute_new_xmax_infomask(), heap_lock_tuple(), and test_lockmode_for_conflict().

◆ GetMultiXactIdHintBits()

static void GetMultiXactIdHintBits ( MultiXactId  multi,
uint16 new_infomask,
uint16 new_infomask2 
)
static

Definition at line 7526 of file heapam.c.

7528{
7529 int nmembers;
7530 MultiXactMember *members;
7531 int i;
7533 uint16 bits2 = 0;
7534 bool has_update = false;
7536
7537 /*
7538 * We only use this in multis we just created, so they cannot be values
7539 * pre-pg_upgrade.
7540 */
7541 nmembers = GetMultiXactIdMembers(multi, &members, false, false);
7542
7543 for (i = 0; i < nmembers; i++)
7544 {
7546
7547 /*
7548 * Remember the strongest lock mode held by any member of the
7549 * multixact.
7550 */
7551 mode = TUPLOCK_from_mxstatus(members[i].status);
7552 if (mode > strongest)
7553 strongest = mode;
7554
7555 /* See what other bits we need */
7556 switch (members[i].status)
7557 {
7561 break;
7562
7565 break;
7566
7568 has_update = true;
7569 break;
7570
7573 has_update = true;
7574 break;
7575 }
7576 }
7577
7580 bits |= HEAP_XMAX_EXCL_LOCK;
7581 else if (strongest == LockTupleShare)
7582 bits |= HEAP_XMAX_SHR_LOCK;
7583 else if (strongest == LockTupleKeyShare)
7584 bits |= HEAP_XMAX_KEYSHR_LOCK;
7585
7586 if (!has_update)
7587 bits |= HEAP_XMAX_LOCK_ONLY;
7588
7589 if (nmembers > 0)
7590 pfree(members);
7591
7592 *new_infomask = bits;
7594}

References fb(), GetMultiXactIdMembers(), HEAP_KEYS_UPDATED, HEAP_XMAX_EXCL_LOCK, HEAP_XMAX_IS_MULTI, HEAP_XMAX_KEYSHR_LOCK, HEAP_XMAX_LOCK_ONLY, HEAP_XMAX_SHR_LOCK, i, LockTupleExclusive, LockTupleKeyShare, LockTupleNoKeyExclusive, LockTupleShare, mode, MultiXactStatusForKeyShare, MultiXactStatusForNoKeyUpdate, MultiXactStatusForShare, MultiXactStatusForUpdate, MultiXactStatusNoKeyUpdate, MultiXactStatusUpdate, pfree(), and TUPLOCK_from_mxstatus.

Referenced by compute_new_xmax_infomask(), heap_prepare_freeze_tuple(), and heap_update().

◆ heap_abort_speculative()

void heap_abort_speculative ( Relation  relation,
const ItemPointerData tid 
)

Definition at line 6254 of file heapam.c.

6255{
6257 ItemId lp;
6258 HeapTupleData tp;
6259 Page page;
6260 BlockNumber block;
6261 Buffer buffer;
6262
6264
6265 block = ItemPointerGetBlockNumber(tid);
6266 buffer = ReadBuffer(relation, block);
6267 page = BufferGetPage(buffer);
6268
6270
6271 /*
6272 * Page can't be all visible, we just inserted into it, and are still
6273 * running.
6274 */
6275 Assert(!PageIsAllVisible(page));
6276
6279
6280 tp.t_tableOid = RelationGetRelid(relation);
6281 tp.t_data = (HeapTupleHeader) PageGetItem(page, lp);
6282 tp.t_len = ItemIdGetLength(lp);
6283 tp.t_self = *tid;
6284
6285 /*
6286 * Sanity check that the tuple really is a speculatively inserted tuple,
6287 * inserted by us.
6288 */
6289 if (tp.t_data->t_choice.t_heap.t_xmin != xid)
6290 elog(ERROR, "attempted to kill a tuple inserted by another transaction");
6291 if (!(IsToastRelation(relation) || HeapTupleHeaderIsSpeculative(tp.t_data)))
6292 elog(ERROR, "attempted to kill a non-speculative tuple");
6294
6295 /*
6296 * No need to check for serializable conflicts here. There is never a
6297 * need for a combo CID, either. No need to extract replica identity, or
6298 * do anything special with infomask bits.
6299 */
6300
6302
6303 /*
6304 * The tuple will become DEAD immediately. Flag that this page is a
6305 * candidate for pruning by setting xmin to TransactionXmin. While not
6306 * immediately prunable, it is the oldest xid we can cheaply determine
6307 * that's safe against wraparound / being older than the table's
6308 * relfrozenxid. To defend against the unlikely case of a new relation
6309 * having a newer relfrozenxid than our TransactionXmin, use relfrozenxid
6310 * if so (vacuum can't subsequently move relfrozenxid to beyond
6311 * TransactionXmin, so there's no race here).
6312 */
6314 {
6315 TransactionId relfrozenxid = relation->rd_rel->relfrozenxid;
6317
6318 if (TransactionIdPrecedes(TransactionXmin, relfrozenxid))
6319 prune_xid = relfrozenxid;
6320 else
6323 }
6324
6325 /* store transaction information of xact deleting the tuple */
6328
6329 /*
6330 * Set the tuple header xmin to InvalidTransactionId. This makes the
 6331 * tuple immediately invisible to everyone. (In particular, to any
6332 * transactions waiting on the speculative token, woken up later.)
6333 */
6335
6336 /* Clear the speculative insertion token too */
6337 tp.t_data->t_ctid = tp.t_self;
6338
6339 MarkBufferDirty(buffer);
6340
6341 /*
6342 * XLOG stuff
6343 *
6344 * The WAL records generated here match heap_delete(). The same recovery
6345 * routines are used.
6346 */
6347 if (RelationNeedsWAL(relation))
6348 {
6351
6353 xlrec.infobits_set = compute_infobits(tp.t_data->t_infomask,
6354 tp.t_data->t_infomask2);
6356 xlrec.xmax = xid;
6357
6361
6362 /* No replica identity & replication origin logged */
6363
6365
6366 PageSetLSN(page, recptr);
6367 }
6368
6370
6372
6373 if (HeapTupleHasExternal(&tp))
6374 {
6375 Assert(!IsToastRelation(relation));
6376 heap_toast_delete(relation, &tp, true);
6377 }
6378
6379 /*
6380 * Never need to mark tuple for invalidation, since catalogs don't support
6381 * speculative insertion
6382 */
6383
6384 /* Now we can release the buffer */
6385 ReleaseBuffer(buffer);
6386
6387 /* count deletion, as we counted the insertion too */
6388 pgstat_count_heap_delete(relation);
6389}

References Assert, BUFFER_LOCK_EXCLUSIVE, BUFFER_LOCK_UNLOCK, BufferGetPage(), compute_infobits(), elog, END_CRIT_SECTION, ERROR, fb(), xl_heap_delete::flags, GetCurrentTransactionId(), HEAP_MOVED, heap_toast_delete(), HEAP_XMAX_BITS, HeapTupleHasExternal(), HeapTupleHeaderIsHeapOnly(), HeapTupleHeaderIsSpeculative(), HeapTupleHeaderSetXmin(), InvalidTransactionId, IsToastRelation(), ItemIdGetLength, ItemIdIsNormal, ItemPointerGetBlockNumber(), ItemPointerGetOffsetNumber(), ItemPointerIsValid(), LockBuffer(), MarkBufferDirty(), PageGetItem(), PageGetItemId(), PageIsAllVisible(), PageSetLSN(), PageSetPrunable, pgstat_count_heap_delete(), RelationData::rd_rel, ReadBuffer(), REGBUF_STANDARD, RelationGetRelid, RelationNeedsWAL, ReleaseBuffer(), SizeOfHeapDelete, START_CRIT_SECTION, HeapTupleHeaderData::t_choice, HeapTupleHeaderData::t_ctid, HeapTupleData::t_data, HeapTupleHeaderData::t_heap, HeapTupleHeaderData::t_infomask, HeapTupleHeaderData::t_infomask2, HeapTupleData::t_len, HeapTupleData::t_self, HeapTupleData::t_tableOid, HeapTupleFields::t_xmin, TransactionIdIsValid, TransactionIdPrecedes(), TransactionXmin, XLH_DELETE_IS_SUPER, XLOG_HEAP_DELETE, XLogBeginInsert(), XLogInsert(), XLogRegisterBuffer(), and XLogRegisterData().

Referenced by heapam_tuple_complete_speculative(), and toast_delete_datum().

◆ heap_acquire_tuplock()

static bool heap_acquire_tuplock ( Relation  relation,
const ItemPointerData tid,
LockTupleMode  mode,
LockWaitPolicy  wait_policy,
bool have_tuple_lock 
)
static

Definition at line 5345 of file heapam.c.

5347{
5348 if (*have_tuple_lock)
5349 return true;
5350
5351 switch (wait_policy)
5352 {
5353 case LockWaitBlock:
5354 LockTupleTuplock(relation, tid, mode);
5355 break;
5356
5357 case LockWaitSkip:
5358 if (!ConditionalLockTupleTuplock(relation, tid, mode, false))
5359 return false;
5360 break;
5361
5362 case LockWaitError:
5364 ereport(ERROR,
5366 errmsg("could not obtain lock on row in relation \"%s\"",
5367 RelationGetRelationName(relation))));
5368 break;
5369 }
5370 *have_tuple_lock = true;
5371
5372 return true;
5373}

References ConditionalLockTupleTuplock, ereport, errcode(), errmsg(), ERROR, fb(), LockTupleTuplock, LockWaitBlock, LockWaitError, LockWaitSkip, log_lock_failures, mode, and RelationGetRelationName.

Referenced by heap_delete(), heap_lock_tuple(), and heap_update().
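
Usage note: a hedged sketch of the calling pattern visible in heap_delete() below — take the heavyweight tuple lock once before sleeping on a concurrent transaction, remember that in have_tuple_lock, and release the lock with UnlockTupleTuplock() only after the operation completes. Here xwait stands for the conflicting transaction's XID and is illustrative, not an actual variable of this function.

/* Hedged caller-side sketch, not a verbatim excerpt from heapam.c. */
bool        have_tuple_lock = false;

/* get in line for the tuple before waiting on the concurrent xact */
heap_acquire_tuplock(relation, &tp.t_self, LockTupleExclusive,
                     LockWaitBlock, &have_tuple_lock);
XactLockTableWait(xwait, relation, &tp.t_self, XLTW_Delete);

/* ... recheck tuple state, possibly looping back ... */

if (have_tuple_lock)
    UnlockTupleTuplock(relation, &tp.t_self, LockTupleExclusive);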

◆ heap_attr_equals()

static bool heap_attr_equals ( TupleDesc  tupdesc,
int  attrnum,
Datum  value1,
Datum  value2,
bool  isnull1,
bool  isnull2 
)
static

Definition at line 4414 of file heapam.c.

4416{
4417 /*
4418 * If one value is NULL and other is not, then they are certainly not
4419 * equal
4420 */
4421 if (isnull1 != isnull2)
4422 return false;
4423
4424 /*
4425 * If both are NULL, they can be considered equal.
4426 */
4427 if (isnull1)
4428 return true;
4429
4430 /*
4431 * We do simple binary comparison of the two datums. This may be overly
4432 * strict because there can be multiple binary representations for the
4433 * same logical value. But we should be OK as long as there are no false
4434 * positives. Using a type-specific equality operator is messy because
4435 * there could be multiple notions of equality in different operator
4436 * classes; furthermore, we cannot safely invoke user-defined functions
4437 * while holding exclusive buffer lock.
4438 */
4439 if (attrnum <= 0)
4440 {
4441 /* The only allowed system columns are OIDs, so do this */
4443 }
4444 else
4445 {
4447
4449 att = TupleDescCompactAttr(tupdesc, attrnum - 1);
4450 return datumIsEqual(value1, value2, att->attbyval, att->attlen);
4451 }
4452}

References Assert, DatumGetObjectId(), datumIsEqual(), fb(), and TupleDescCompactAttr().

Referenced by HeapDetermineColumnsInfo().
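
Since the comparison is purely binary, datumIsEqual() only needs the attribute's by-value flag and length. A minimal, hypothetical illustration outside any heap code path:

/* Binary datum comparison as used by heap_attr_equals(). */
Datum    a = Int32GetDatum(42);
Datum    b = Int32GetDatum(42);
bool     same;

/* int4 is pass-by-value with a fixed length of 4 bytes */
same = datumIsEqual(a, b, true, sizeof(int32));
Assert(same);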

◆ heap_beginscan()

TableScanDesc heap_beginscan ( Relation  relation,
Snapshot  snapshot,
int  nkeys,
ScanKey  key,
ParallelTableScanDesc  parallel_scan,
uint32  flags 
)

Definition at line 1163 of file heapam.c.

1167{
1168 HeapScanDesc scan;
1169
1170 /*
1171 * increment relation ref count while scanning relation
1172 *
1173 * This is just to make really sure the relcache entry won't go away while
1174 * the scan has a pointer to it. Caller should be holding the rel open
1175 * anyway, so this is redundant in all normal scenarios...
1176 */
1178
1179 /*
1180 * allocate and initialize scan descriptor
1181 */
1182 if (flags & SO_TYPE_BITMAPSCAN)
1183 {
1185
1186 /*
1187 * Bitmap Heap scans do not have any fields that a normal Heap Scan
1188 * does not have, so no special initializations required here.
1189 */
1190 scan = (HeapScanDesc) bscan;
1191 }
1192 else
1194
1195 scan->rs_base.rs_rd = relation;
1196 scan->rs_base.rs_snapshot = snapshot;
1197 scan->rs_base.rs_nkeys = nkeys;
1198 scan->rs_base.rs_flags = flags;
1199 scan->rs_base.rs_parallel = parallel_scan;
1200 scan->rs_strategy = NULL; /* set in initscan */
1201 scan->rs_cbuf = InvalidBuffer;
1202
1203 /*
1204 * Disable page-at-a-time mode if it's not a MVCC-safe snapshot.
1205 */
1206 if (!(snapshot && IsMVCCSnapshot(snapshot)))
1208
1209 /* Check that a historic snapshot is not used for non-catalog tables */
1210 if (snapshot &&
1211 IsHistoricMVCCSnapshot(snapshot) &&
1213 {
1214 ereport(ERROR,
1216 errmsg("cannot query non-catalog table \"%s\" during logical decoding",
1217 RelationGetRelationName(relation))));
1218 }
1219
1220 /*
1221 * For seqscan and sample scans in a serializable transaction, acquire a
1222 * predicate lock on the entire relation. This is required not only to
1223 * lock all the matching tuples, but also to conflict with new insertions
1224 * into the table. In an indexscan, we take page locks on the index pages
1225 * covering the range specified in the scan qual, but in a heap scan there
1226 * is nothing more fine-grained to lock. A bitmap scan is a different
1227 * story, there we have already scanned the index and locked the index
1228 * pages covering the predicate. But in that case we still have to lock
1229 * any matching heap tuples. For sample scan we could optimize the locking
1230 * to be at least page-level granularity, but we'd need to add per-tuple
1231 * locking for that.
1232 */
1234 {
1235 /*
1236 * Ensure a missing snapshot is noticed reliably, even if the
1237 * isolation mode means predicate locking isn't performed (and
1238 * therefore the snapshot isn't used here).
1239 */
1240 Assert(snapshot);
1241 PredicateLockRelation(relation, snapshot);
1242 }
1243
1244 /* we only need to set this up once */
1245 scan->rs_ctup.t_tableOid = RelationGetRelid(relation);
1246
1247 /*
1248 * Allocate memory to keep track of page allocation for parallel workers
1249 * when doing a parallel scan.
1250 */
1251 if (parallel_scan != NULL)
1253 else
1255
1256 /*
1257 * we do this here instead of in initscan() because heap_rescan also calls
1258 * initscan() and we don't want to allocate memory again
1259 */
1260 if (nkeys > 0)
1261 scan->rs_base.rs_key = palloc_array(ScanKeyData, nkeys);
1262 else
1263 scan->rs_base.rs_key = NULL;
1264
1265 initscan(scan, key, false);
1266
1267 scan->rs_read_stream = NULL;
1268
1269 /*
1270 * Set up a read stream for sequential scans and TID range scans. This
1271 * should be done after initscan() because initscan() allocates the
1272 * BufferAccessStrategy object passed to the read stream API.
1273 */
1274 if (scan->rs_base.rs_flags & SO_TYPE_SEQSCAN ||
1276 {
1278
1279 if (scan->rs_base.rs_parallel)
1281 else
1283
1284 /* ---
1285 * It is safe to use batchmode as the only locks taken by `cb`
1286 * are never taken while waiting for IO:
1287 * - SyncScanLock is used in the non-parallel case
1288 * - in the parallel case, only spinlocks and atomics are used
1289 * ---
1290 */
1293 scan->rs_strategy,
1294 scan->rs_base.rs_rd,
1296 cb,
1297 scan,
1298 0);
1299 }
1300 else if (scan->rs_base.rs_flags & SO_TYPE_BITMAPSCAN)
1301 {
1304 scan->rs_strategy,
1305 scan->rs_base.rs_rd,
1308 scan,
1309 sizeof(TBMIterateResult));
1310 }
1311
1312
1313 return (TableScanDesc) scan;
1314}

References Assert, bitmapheap_stream_read_next(), ereport, errcode(), errmsg(), ERROR, fb(), heap_scan_stream_read_next_parallel(), heap_scan_stream_read_next_serial(), initscan(), InvalidBuffer, IsHistoricMVCCSnapshot, IsMVCCSnapshot, MAIN_FORKNUM, palloc_array, palloc_object, PredicateLockRelation(), read_stream_begin_relation(), READ_STREAM_DEFAULT, READ_STREAM_SEQUENTIAL, READ_STREAM_USE_BATCHING, RelationGetRelationName, RelationGetRelid, RelationIncrementReferenceCount(), RelationIsAccessibleInLogicalDecoding, HeapScanDescData::rs_base, HeapScanDescData::rs_cbuf, HeapScanDescData::rs_ctup, TableScanDescData::rs_flags, TableScanDescData::rs_key, TableScanDescData::rs_nkeys, TableScanDescData::rs_parallel, HeapScanDescData::rs_parallelworkerdata, TableScanDescData::rs_rd, HeapScanDescData::rs_read_stream, TableScanDescData::rs_snapshot, HeapScanDescData::rs_strategy, SO_TYPE_BITMAPSCAN, SO_TYPE_SAMPLESCAN, SO_TYPE_SEQSCAN, SO_TYPE_TIDRANGESCAN, and HeapTupleData::t_tableOid.
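
For reference, a hedged sketch of the classic scan loop built on this function (most in-core callers actually go through table_beginscan(), which supplies the SO_TYPE_SEQSCAN | SO_ALLOW_PAGEMODE flags shown here). relid stands for the target table's OID; snapshot management and error handling are elided.

/* Hedged usage sketch of a plain sequential scan. */
Relation      rel = table_open(relid, AccessShareLock);
TableScanDesc scan;
HeapTuple     tuple;

scan = heap_beginscan(rel, GetActiveSnapshot(), 0, NULL, NULL,
                      SO_TYPE_SEQSCAN | SO_ALLOW_PAGEMODE);

while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
{
    /* tuple points into the pinned scan buffer; process it here */
}

heap_endscan(scan);
table_close(rel, AccessShareLock);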

◆ heap_delete()

TM_Result heap_delete ( Relation  relation,
const ItemPointerData tid,
CommandId  cid,
Snapshot  crosscheck,
bool  wait,
TM_FailureData tmfd,
bool  changingPart 
)

Definition at line 2842 of file heapam.c.

2845{
2846 TM_Result result;
2848 ItemId lp;
2849 HeapTupleData tp;
2850 Page page;
2851 BlockNumber block;
2852 Buffer buffer;
2853 Buffer vmbuffer = InvalidBuffer;
2854 TransactionId new_xmax;
2857 bool have_tuple_lock = false;
2858 bool iscombo;
2859 bool all_visible_cleared = false;
2860 HeapTuple old_key_tuple = NULL; /* replica identity of the tuple */
2861 bool old_key_copied = false;
2862
2864
2865 AssertHasSnapshotForToast(relation);
2866
2867 /*
2868 * Forbid this during a parallel operation, lest it allocate a combo CID.
2869 * Other workers might need that combo CID for visibility checks, and we
2870 * have no provision for broadcasting it to them.
2871 */
2872 if (IsInParallelMode())
2873 ereport(ERROR,
2875 errmsg("cannot delete tuples during a parallel operation")));
2876
2877 block = ItemPointerGetBlockNumber(tid);
2878 buffer = ReadBuffer(relation, block);
2879 page = BufferGetPage(buffer);
2880
2881 /*
2882 * Before locking the buffer, pin the visibility map page if it appears to
2883 * be necessary. Since we haven't got the lock yet, someone else might be
2884 * in the middle of changing this, so we'll need to recheck after we have
2885 * the lock.
2886 */
2887 if (PageIsAllVisible(page))
2888 visibilitymap_pin(relation, block, &vmbuffer);
2889
2891
2894
2895 tp.t_tableOid = RelationGetRelid(relation);
2896 tp.t_data = (HeapTupleHeader) PageGetItem(page, lp);
2897 tp.t_len = ItemIdGetLength(lp);
2898 tp.t_self = *tid;
2899
2900l1:
2901
2902 /*
2903 * If we didn't pin the visibility map page and the page has become all
2904 * visible while we were busy locking the buffer, we'll have to unlock and
2905 * re-lock, to avoid holding the buffer lock across an I/O. That's a bit
2906 * unfortunate, but hopefully shouldn't happen often.
2907 */
2908 if (vmbuffer == InvalidBuffer && PageIsAllVisible(page))
2909 {
2911 visibilitymap_pin(relation, block, &vmbuffer);
2913 }
2914
2915 result = HeapTupleSatisfiesUpdate(&tp, cid, buffer);
2916
2917 if (result == TM_Invisible)
2918 {
2919 UnlockReleaseBuffer(buffer);
2920 ereport(ERROR,
2922 errmsg("attempted to delete invisible tuple")));
2923 }
2924 else if (result == TM_BeingModified && wait)
2925 {
2928
2929 /* must copy state data before unlocking buffer */
2932
2933 /*
2934 * Sleep until concurrent transaction ends -- except when there's a
2935 * single locker and it's our own transaction. Note we don't care
2936 * which lock mode the locker has, because we need the strongest one.
2937 *
2938 * Before sleeping, we need to acquire tuple lock to establish our
2939 * priority for the tuple (see heap_lock_tuple). LockTuple will
2940 * release us when we are next-in-line for the tuple.
2941 *
2942 * If we are forced to "start over" below, we keep the tuple lock;
2943 * this arranges that we stay at the head of the line while rechecking
2944 * tuple state.
2945 */
2947 {
2948 bool current_is_member = false;
2949
2952 {
2954
2955 /*
2956 * Acquire the lock, if necessary (but skip it when we're
2957 * requesting a lock and already have one; avoids deadlock).
2958 */
2959 if (!current_is_member)
2962
2963 /* wait for multixact */
2965 relation, &(tp.t_self), XLTW_Delete,
2966 NULL);
2968
2969 /*
2970 * If xwait had just locked the tuple then some other xact
2971 * could update this tuple before we get to this point. Check
2972 * for xmax change, and start over if so.
2973 *
2974 * We also must start over if we didn't pin the VM page, and
2975 * the page has become all visible.
2976 */
2977 if ((vmbuffer == InvalidBuffer && PageIsAllVisible(page)) ||
2980 xwait))
2981 goto l1;
2982 }
2983
2984 /*
2985 * You might think the multixact is necessarily done here, but not
2986 * so: it could have surviving members, namely our own xact or
2987 * other subxacts of this backend. It is legal for us to delete
2988 * the tuple in either case, however (the latter case is
2989 * essentially a situation of upgrading our former shared lock to
2990 * exclusive). We don't bother changing the on-disk hint bits
2991 * since we are about to overwrite the xmax altogether.
2992 */
2993 }
2995 {
2996 /*
2997 * Wait for regular transaction to end; but first, acquire tuple
2998 * lock.
2999 */
3003 XactLockTableWait(xwait, relation, &(tp.t_self), XLTW_Delete);
3005
3006 /*
3007 * xwait is done, but if xwait had just locked the tuple then some
3008 * other xact could update this tuple before we get to this point.
3009 * Check for xmax change, and start over if so.
3010 *
3011 * We also must start over if we didn't pin the VM page, and the
3012 * page has become all visible.
3013 */
3014 if ((vmbuffer == InvalidBuffer && PageIsAllVisible(page)) ||
3017 xwait))
3018 goto l1;
3019
3020 /* Otherwise check if it committed or aborted */
3021 UpdateXmaxHintBits(tp.t_data, buffer, xwait);
3022 }
3023
3024 /*
3025 * We may overwrite if previous xmax aborted, or if it committed but
3026 * only locked the tuple without updating it.
3027 */
3028 if ((tp.t_data->t_infomask & HEAP_XMAX_INVALID) ||
3031 result = TM_Ok;
3032 else if (!ItemPointerEquals(&tp.t_self, &tp.t_data->t_ctid))
3033 result = TM_Updated;
3034 else
3035 result = TM_Deleted;
3036 }
3037
 3038 /* sanity check the result of HeapTupleSatisfiesUpdate() and the logic above */
3039 if (result != TM_Ok)
3040 {
3041 Assert(result == TM_SelfModified ||
3042 result == TM_Updated ||
3043 result == TM_Deleted ||
3044 result == TM_BeingModified);
3046 Assert(result != TM_Updated ||
3048 }
3049
3050 if (crosscheck != InvalidSnapshot && result == TM_Ok)
3051 {
3052 /* Perform additional check for transaction-snapshot mode RI updates */
3053 if (!HeapTupleSatisfiesVisibility(&tp, crosscheck, buffer))
3054 result = TM_Updated;
3055 }
3056
3057 if (result != TM_Ok)
3058 {
3059 tmfd->ctid = tp.t_data->t_ctid;
3061 if (result == TM_SelfModified)
3063 else
3064 tmfd->cmax = InvalidCommandId;
3065 UnlockReleaseBuffer(buffer);
3066 if (have_tuple_lock)
3068 if (vmbuffer != InvalidBuffer)
3069 ReleaseBuffer(vmbuffer);
3070 return result;
3071 }
3072
3073 /*
3074 * We're about to do the actual delete -- check for conflict first, to
3075 * avoid possibly having to roll back work we've just done.
3076 *
3077 * This is safe without a recheck as long as there is no possibility of
3078 * another process scanning the page between this check and the delete
3079 * being visible to the scan (i.e., an exclusive buffer content lock is
3080 * continuously held from this point until the tuple delete is visible).
3081 */
3083
3084 /* replace cid with a combo CID if necessary */
3086
3087 /*
3088 * Compute replica identity tuple before entering the critical section so
3089 * we don't PANIC upon a memory allocation failure.
3090 */
3091 old_key_tuple = ExtractReplicaIdentity(relation, &tp, true, &old_key_copied);
3092
3093 /*
3094 * If this is the first possibly-multixact-able operation in the current
3095 * transaction, set my per-backend OldestMemberMXactId setting. We can be
3096 * certain that the transaction will never become a member of any older
3097 * MultiXactIds than that. (We have to do this even if we end up just
3098 * using our own TransactionId below, since some other backend could
3099 * incorporate our XID into a MultiXact immediately afterwards.)
3100 */
3102
3105 xid, LockTupleExclusive, true,
3106 &new_xmax, &new_infomask, &new_infomask2);
3107
3109
3110 /*
3111 * If this transaction commits, the tuple will become DEAD sooner or
3112 * later. Set flag that this page is a candidate for pruning once our xid
3113 * falls below the OldestXmin horizon. If the transaction finally aborts,
3114 * the subsequent page pruning will be a no-op and the hint will be
3115 * cleared.
3116 */
3117 PageSetPrunable(page, xid);
3118
3119 if (PageIsAllVisible(page))
3120 {
3121 all_visible_cleared = true;
3122 PageClearAllVisible(page);
3123 visibilitymap_clear(relation, BufferGetBlockNumber(buffer),
3124 vmbuffer, VISIBILITYMAP_VALID_BITS);
3125 }
3126
3127 /* store transaction information of xact deleting the tuple */
3133 HeapTupleHeaderSetXmax(tp.t_data, new_xmax);
3135 /* Make sure there is no forward chain link in t_ctid */
3136 tp.t_data->t_ctid = tp.t_self;
3137
3138 /* Signal that this is actually a move into another partition */
3139 if (changingPart)
3141
3142 MarkBufferDirty(buffer);
3143
3144 /*
3145 * XLOG stuff
3146 *
3147 * NB: heap_abort_speculative() uses the same xlog record and replay
3148 * routines.
3149 */
3150 if (RelationNeedsWAL(relation))
3151 {
3155
3156 /*
 3157 * For logical decoding we need combo CIDs to properly decode the
3158 * catalog
3159 */
3161 log_heap_new_cid(relation, &tp);
3162
3163 xlrec.flags = 0;
3166 if (changingPart)
3168 xlrec.infobits_set = compute_infobits(tp.t_data->t_infomask,
3169 tp.t_data->t_infomask2);
3171 xlrec.xmax = new_xmax;
3172
3173 if (old_key_tuple != NULL)
3174 {
3175 if (relation->rd_rel->relreplident == REPLICA_IDENTITY_FULL)
3177 else
3179 }
3180
3183
3185
3186 /*
3187 * Log replica identity of the deleted tuple if there is one
3188 */
3189 if (old_key_tuple != NULL)
3190 {
3191 xlhdr.t_infomask2 = old_key_tuple->t_data->t_infomask2;
3192 xlhdr.t_infomask = old_key_tuple->t_data->t_infomask;
3193 xlhdr.t_hoff = old_key_tuple->t_data->t_hoff;
3194
3196 XLogRegisterData((char *) old_key_tuple->t_data
3198 old_key_tuple->t_len
3200 }
3201
3202 /* filtering by origin on a row level is much more efficient */
3204
3206
3207 PageSetLSN(page, recptr);
3208 }
3209
3211
3213
3214 if (vmbuffer != InvalidBuffer)
3215 ReleaseBuffer(vmbuffer);
3216
3217 /*
3218 * If the tuple has toasted out-of-line attributes, we need to delete
3219 * those items too. We have to do this before releasing the buffer
3220 * because we need to look at the contents of the tuple, but it's OK to
3221 * release the content lock on the buffer first.
3222 */
3223 if (relation->rd_rel->relkind != RELKIND_RELATION &&
3224 relation->rd_rel->relkind != RELKIND_MATVIEW)
3225 {
3226 /* toast table entries should never be recursively toasted */
3228 }
3229 else if (HeapTupleHasExternal(&tp))
3230 heap_toast_delete(relation, &tp, false);
3231
3232 /*
3233 * Mark tuple for invalidation from system caches at next command
3234 * boundary. We have to do this before releasing the buffer because we
3235 * need to look at the contents of the tuple.
3236 */
3237 CacheInvalidateHeapTuple(relation, &tp, NULL);
3238
3239 /* Now we can release the buffer */
3240 ReleaseBuffer(buffer);
3241
3242 /*
3243 * Release the lmgr tuple lock, if we had it.
3244 */
3245 if (have_tuple_lock)
3247
3248 pgstat_count_heap_delete(relation);
3249
3252
3253 return TM_Ok;
3254}

References Assert, AssertHasSnapshotForToast(), BUFFER_LOCK_EXCLUSIVE, BUFFER_LOCK_UNLOCK, BufferGetBlockNumber(), BufferGetPage(), CacheInvalidateHeapTuple(), CheckForSerializableConflictIn(), TM_FailureData::cmax, compute_infobits(), compute_new_xmax_infomask(), TM_FailureData::ctid, DoesMultiXactIdConflict(), END_CRIT_SECTION, ereport, errcode(), errmsg(), ERROR, ExtractReplicaIdentity(), fb(), GetCurrentTransactionId(), heap_acquire_tuplock(), heap_freetuple(), HEAP_MOVED, heap_toast_delete(), HEAP_XMAX_BITS, HEAP_XMAX_INVALID, HEAP_XMAX_IS_LOCKED_ONLY(), HEAP_XMAX_IS_MULTI, HeapTupleHasExternal(), HeapTupleHeaderAdjustCmax(), HeapTupleHeaderClearHotUpdated(), HeapTupleHeaderGetCmax(), HeapTupleHeaderGetRawXmax(), HeapTupleHeaderGetUpdateXid(), HeapTupleHeaderIsOnlyLocked(), HeapTupleHeaderSetCmax(), HeapTupleHeaderSetMovedPartitions(), HeapTupleHeaderSetXmax(), HeapTupleSatisfiesUpdate(), HeapTupleSatisfiesVisibility(), InvalidBuffer, InvalidCommandId, InvalidSnapshot, IsInParallelMode(), ItemIdGetLength, ItemIdIsNormal, ItemPointerEquals(), ItemPointerGetBlockNumber(), ItemPointerGetOffsetNumber(), ItemPointerIsValid(), LockBuffer(), LockTupleExclusive, LockWaitBlock, log_heap_new_cid(), MarkBufferDirty(), MultiXactIdSetOldestMember(), MultiXactIdWait(), MultiXactStatusUpdate, PageClearAllVisible(), PageGetItem(), PageGetItemId(), PageIsAllVisible(), PageSetLSN(), PageSetPrunable, pgstat_count_heap_delete(), RelationData::rd_rel, ReadBuffer(), REGBUF_STANDARD, RelationGetRelid, RelationIsAccessibleInLogicalDecoding, RelationNeedsWAL, ReleaseBuffer(), SizeOfHeapDelete, SizeOfHeapHeader, SizeofHeapTupleHeader, START_CRIT_SECTION, HeapTupleHeaderData::t_ctid, HeapTupleData::t_data, HeapTupleHeaderData::t_infomask, HeapTupleHeaderData::t_infomask2, HeapTupleData::t_len, HeapTupleData::t_self, HeapTupleData::t_tableOid, TM_BeingModified, TM_Deleted, TM_Invisible, TM_Ok, TM_SelfModified, TM_Updated, TransactionIdEquals, TransactionIdIsCurrentTransactionId(), UnlockReleaseBuffer(), UnlockTupleTuplock, UpdateXmaxHintBits(), visibilitymap_clear(), visibilitymap_pin(), VISIBILITYMAP_VALID_BITS, XactLockTableWait(), XLH_DELETE_ALL_VISIBLE_CLEARED, XLH_DELETE_CONTAINS_OLD_KEY, XLH_DELETE_CONTAINS_OLD_TUPLE, XLH_DELETE_IS_PARTITION_MOVE, XLOG_HEAP_DELETE, XLOG_INCLUDE_ORIGIN, XLogBeginInsert(), XLogInsert(), XLogRegisterBuffer(), XLogRegisterData(), XLogSetRecordFlags(), XLTW_Delete, TM_FailureData::xmax, and xmax_infomask_changed().

Referenced by heapam_tuple_delete(), and simple_heap_delete().
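
A hedged sketch of a direct call, loosely modeled on simple_heap_delete(): pass the current command ID, no crosscheck snapshot, wait for any conflicting transaction, and branch on the TM_Result (tmfd only carries meaningful data in the failure cases). tid is assumed to hold the target tuple's ItemPointerData.

/* Hedged sketch; see simple_heap_delete() for the real convenience wrapper. */
TM_Result      result;
TM_FailureData tmfd;

result = heap_delete(relation, &tid,
                     GetCurrentCommandId(true),
                     InvalidSnapshot,
                     true,      /* wait for concurrent updaters */
                     &tmfd,
                     false);    /* not moving to another partition */

switch (result)
{
    case TM_Ok:
        break;
    case TM_SelfModified:
        /* already deleted by the current command; tmfd.cmax identifies it */
        break;
    case TM_Updated:
    case TM_Deleted:
        /* concurrently modified; tmfd.ctid and tmfd.xmax describe the change */
        break;
    default:
        elog(ERROR, "unexpected heap_delete result: %d", result);
}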

◆ heap_endscan()

void heap_endscan ( TableScanDesc  sscan)

Definition at line 1370 of file heapam.c.

1371{
1373
1374 /* Note: no locking manipulations needed */
1375
1376 /*
1377 * unpin scan buffers
1378 */
1379 if (BufferIsValid(scan->rs_cbuf))
1380 ReleaseBuffer(scan->rs_cbuf);
1381
1382 /*
1383 * Must free the read stream before freeing the BufferAccessStrategy.
1384 */
1385 if (scan->rs_read_stream)
1387
1388 /*
1389 * decrement relation reference count and free scan descriptor storage
1390 */
1392
1393 if (scan->rs_base.rs_key)
1394 pfree(scan->rs_base.rs_key);
1395
1396 if (scan->rs_strategy != NULL)
1398
1399 if (scan->rs_parallelworkerdata != NULL)
1401
1402 if (scan->rs_base.rs_flags & SO_TEMP_SNAPSHOT)
1404
1405 pfree(scan);
1406}

References BufferIsValid(), fb(), FreeAccessStrategy(), pfree(), read_stream_end(), RelationDecrementReferenceCount(), ReleaseBuffer(), HeapScanDescData::rs_base, HeapScanDescData::rs_cbuf, TableScanDescData::rs_flags, TableScanDescData::rs_key, HeapScanDescData::rs_parallelworkerdata, TableScanDescData::rs_rd, HeapScanDescData::rs_read_stream, TableScanDescData::rs_snapshot, HeapScanDescData::rs_strategy, SO_TEMP_SNAPSHOT, and UnregisterSnapshot().

◆ heap_fetch()

bool heap_fetch ( Relation  relation,
Snapshot  snapshot,
HeapTuple  tuple,
Buffer userbuf,
bool  keep_buf 
)

Definition at line 1658 of file heapam.c.

1663{
1664 ItemPointer tid = &(tuple->t_self);
1665 ItemId lp;
1666 Buffer buffer;
1667 Page page;
1668 OffsetNumber offnum;
1669 bool valid;
1670
1671 /*
1672 * Fetch and pin the appropriate page of the relation.
1673 */
1674 buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid));
1675
1676 /*
1677 * Need share lock on buffer to examine tuple commit status.
1678 */
1680 page = BufferGetPage(buffer);
1681
1682 /*
1683 * We'd better check for out-of-range offnum in case of VACUUM since the
1684 * TID was obtained.
1685 */
1686 offnum = ItemPointerGetOffsetNumber(tid);
1688 {
1690 ReleaseBuffer(buffer);
1692 tuple->t_data = NULL;
1693 return false;
1694 }
1695
1696 /*
1697 * get the item line pointer corresponding to the requested tid
1698 */
1699 lp = PageGetItemId(page, offnum);
1700
1701 /*
1702 * Must check for deleted tuple.
1703 */
1704 if (!ItemIdIsNormal(lp))
1705 {
1707 ReleaseBuffer(buffer);
1709 tuple->t_data = NULL;
1710 return false;
1711 }
1712
1713 /*
1714 * fill in *tuple fields
1715 */
1716 tuple->t_data = (HeapTupleHeader) PageGetItem(page, lp);
1717 tuple->t_len = ItemIdGetLength(lp);
1718 tuple->t_tableOid = RelationGetRelid(relation);
1719
1720 /*
1721 * check tuple visibility, then release lock
1722 */
1723 valid = HeapTupleSatisfiesVisibility(tuple, snapshot, buffer);
1724
1725 if (valid)
1726 PredicateLockTID(relation, &(tuple->t_self), snapshot,
1728
1729 HeapCheckForSerializableConflictOut(valid, relation, tuple, buffer, snapshot);
1730
1732
1733 if (valid)
1734 {
1735 /*
1736 * All checks passed, so return the tuple as valid. Caller is now
1737 * responsible for releasing the buffer.
1738 */
1739 *userbuf = buffer;
1740
1741 return true;
1742 }
1743
1744 /* Tuple failed time qual, but maybe caller wants to see it anyway. */
1745 if (keep_buf)
1746 *userbuf = buffer;
1747 else
1748 {
1749 ReleaseBuffer(buffer);
1751 tuple->t_data = NULL;
1752 }
1753
1754 return false;
1755}

References BUFFER_LOCK_SHARE, BUFFER_LOCK_UNLOCK, BufferGetPage(), fb(), HeapCheckForSerializableConflictOut(), HeapTupleHeaderGetXmin(), HeapTupleSatisfiesVisibility(), InvalidBuffer, ItemIdGetLength, ItemIdIsNormal, ItemPointerGetBlockNumber(), ItemPointerGetOffsetNumber(), LockBuffer(), PageGetItem(), PageGetItemId(), PageGetMaxOffsetNumber(), PredicateLockTID(), ReadBuffer(), RelationGetRelid, ReleaseBuffer(), HeapTupleData::t_data, HeapTupleData::t_len, HeapTupleData::t_self, and HeapTupleData::t_tableOid.

Referenced by heap_lock_updated_tuple_rec(), heapam_fetch_row_version(), and heapam_tuple_lock().
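
A hedged sketch of fetching one row version by TID, roughly the way heapam_fetch_row_version() drives this function; when true is returned, the caller owns the buffer pin.

/* Hedged usage sketch. */
HeapTupleData tuple;
Buffer        buffer;

tuple.t_self = *tid;            /* TID to fetch, e.g. from an index entry */
if (heap_fetch(relation, snapshot, &tuple, &buffer, false))
{
    /* tuple.t_data points into the pinned buffer; use it, then unpin */
    ReleaseBuffer(buffer);
}
else
{
    /*
     * Not visible under snapshot, or the TID no longer exists; with
     * keep_buf = false no buffer is returned in this case.
     */
}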

◆ heap_fetch_next_buffer()

static void heap_fetch_next_buffer ( HeapScanDesc  scan,
ScanDirection  dir 
)
inlinestatic

Definition at line 706 of file heapam.c.

707{
708 Assert(scan->rs_read_stream);
709
710 /* release previous scan buffer, if any */
711 if (BufferIsValid(scan->rs_cbuf))
712 {
713 ReleaseBuffer(scan->rs_cbuf);
714 scan->rs_cbuf = InvalidBuffer;
715 }
716
717 /*
718 * Be sure to check for interrupts at least once per page. Checks at
719 * higher code levels won't be able to stop a seqscan that encounters many
720 * pages' worth of consecutive dead tuples.
721 */
723
724 /*
725 * If the scan direction is changing, reset the prefetch block to the
726 * current block. Otherwise, we will incorrectly prefetch the blocks
727 * between the prefetch block and the current block again before
728 * prefetching blocks in the new, correct scan direction.
729 */
730 if (unlikely(scan->rs_dir != dir))
731 {
732 scan->rs_prefetch_block = scan->rs_cblock;
734 }
735
736 scan->rs_dir = dir;
737
739 if (BufferIsValid(scan->rs_cbuf))
741}

References Assert, BufferGetBlockNumber(), BufferIsValid(), CHECK_FOR_INTERRUPTS, fb(), InvalidBuffer, read_stream_next_buffer(), read_stream_reset(), ReleaseBuffer(), HeapScanDescData::rs_cblock, HeapScanDescData::rs_cbuf, HeapScanDescData::rs_dir, HeapScanDescData::rs_prefetch_block, HeapScanDescData::rs_read_stream, and unlikely.

Referenced by heapgettup(), and heapgettup_pagemode().

◆ heap_finish_speculative()

void heap_finish_speculative ( Relation  relation,
const ItemPointerData tid 
)

Definition at line 6167 of file heapam.c.

6168{
6169 Buffer buffer;
6170 Page page;
6171 OffsetNumber offnum;
6172 ItemId lp;
6173 HeapTupleHeader htup;
6174
6175 buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid));
6177 page = BufferGetPage(buffer);
6178
6179 offnum = ItemPointerGetOffsetNumber(tid);
6181 elog(ERROR, "offnum out of range");
6182 lp = PageGetItemId(page, offnum);
6183 if (!ItemIdIsNormal(lp))
6184 elog(ERROR, "invalid lp");
6185
6186 htup = (HeapTupleHeader) PageGetItem(page, lp);
6187
6188 /* NO EREPORT(ERROR) from here till changes are logged */
6190
6192
6193 MarkBufferDirty(buffer);
6194
6195 /*
6196 * Replace the speculative insertion token with a real t_ctid, pointing to
6197 * itself like it does on regular tuples.
6198 */
6199 htup->t_ctid = *tid;
6200
6201 /* XLOG stuff */
6202 if (RelationNeedsWAL(relation))
6203 {
6206
6208
6210
6211 /* We want the same filtering on this as on a plain insert */
6213
6216
6218
6219 PageSetLSN(page, recptr);
6220 }
6221
6223
6224 UnlockReleaseBuffer(buffer);
6225}

References Assert, BUFFER_LOCK_EXCLUSIVE, BufferGetPage(), elog, END_CRIT_SECTION, ERROR, fb(), HeapTupleHeaderIsSpeculative(), ItemIdIsNormal, ItemPointerGetBlockNumber(), ItemPointerGetOffsetNumber(), LockBuffer(), MarkBufferDirty(), xl_heap_confirm::offnum, PageGetItem(), PageGetItemId(), PageGetMaxOffsetNumber(), PageSetLSN(), ReadBuffer(), REGBUF_STANDARD, RelationNeedsWAL, SizeOfHeapConfirm, START_CRIT_SECTION, HeapTupleHeaderData::t_ctid, UnlockReleaseBuffer(), XLOG_HEAP_CONFIRM, XLOG_INCLUDE_ORIGIN, XLogBeginInsert(), XLogInsert(), XLogRegisterBuffer(), XLogRegisterData(), and XLogSetRecordFlags().

Referenced by heapam_tuple_complete_speculative().
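
Together with heap_abort_speculative() above, this completes the two-phase speculative insertion used by INSERT ... ON CONFLICT. A hedged sketch of the overall flow, simplified from heapam_tuple_complete_speculative(); the speculative-token bookkeeping and slot plumbing are omitted, and succeeded is a hypothetical flag meaning "no conflicting row was found".

/* Hedged sketch of the speculative insertion protocol. */
heap_insert(relation, tuple, GetCurrentCommandId(true),
            HEAP_INSERT_SPECULATIVE, NULL);

/* ... look for a conflicting row while other backends wait on the token ... */

if (succeeded)
    heap_finish_speculative(relation, &tuple->t_self);  /* make it permanent */
else
    heap_abort_speculative(relation, &tuple->t_self);   /* kill it outright */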

◆ heap_freeze_prepared_tuples()

void heap_freeze_prepared_tuples ( Buffer  buffer,
HeapTupleFreeze tuples,
int  ntuples 
)

Definition at line 7460 of file heapam.c.

7461{
7462 Page page = BufferGetPage(buffer);
7463
7464 for (int i = 0; i < ntuples; i++)
7465 {
7466 HeapTupleFreeze *frz = tuples + i;
7467 ItemId itemid = PageGetItemId(page, frz->offset);
7468 HeapTupleHeader htup;
7469
7470 htup = (HeapTupleHeader) PageGetItem(page, itemid);
7472 }
7473}

References BufferGetPage(), fb(), heap_execute_freeze_tuple(), i, PageGetItem(), and PageGetItemId().

Referenced by heap_page_prune_and_freeze().

◆ heap_freeze_tuple()

bool heap_freeze_tuple ( HeapTupleHeader  tuple,
TransactionId  relfrozenxid,
TransactionId  relminmxid,
TransactionId  FreezeLimit,
TransactionId  MultiXactCutoff 
)

Definition at line 7482 of file heapam.c.

7485{
7487 bool do_freeze;
7488 bool totally_frozen;
7489 struct VacuumCutoffs cutoffs;
7490 HeapPageFreeze pagefrz;
7491
7492 cutoffs.relfrozenxid = relfrozenxid;
7493 cutoffs.relminmxid = relminmxid;
7494 cutoffs.OldestXmin = FreezeLimit;
7495 cutoffs.OldestMxact = MultiXactCutoff;
7496 cutoffs.FreezeLimit = FreezeLimit;
7497 cutoffs.MultiXactCutoff = MultiXactCutoff;
7498
7499 pagefrz.freeze_required = true;
7500 pagefrz.FreezePageRelfrozenXid = FreezeLimit;
7501 pagefrz.FreezePageRelminMxid = MultiXactCutoff;
7502 pagefrz.NoFreezePageRelfrozenXid = FreezeLimit;
7503 pagefrz.NoFreezePageRelminMxid = MultiXactCutoff;
7504
7505 do_freeze = heap_prepare_freeze_tuple(tuple, &cutoffs,
7506 &pagefrz, &frz, &totally_frozen);
7507
7508 /*
7509 * Note that because this is not a WAL-logged operation, we don't need to
7510 * fill in the offset in the freeze record.
7511 */
7512
7513 if (do_freeze)
7515 return do_freeze;
7516}

References fb(), VacuumCutoffs::FreezeLimit, heap_execute_freeze_tuple(), heap_prepare_freeze_tuple(), VacuumCutoffs::MultiXactCutoff, VacuumCutoffs::OldestMxact, VacuumCutoffs::OldestXmin, VacuumCutoffs::relfrozenxid, and VacuumCutoffs::relminmxid.

Referenced by rewrite_heap_tuple().
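
A hedged sketch of the call as a CLUSTER-style rewrite might make it; the variable names are illustrative rather than the actual rewriteheap.c fields, and the cutoffs are the ones computed for the old heap.

/* Hedged sketch; the real caller is rewrite_heap_tuple(). */
bool    frozen;

frozen = heap_freeze_tuple(newtuple->t_data,
                           relfrozenxid,    /* old heap's relfrozenxid */
                           relminmxid,      /* old heap's relminmxid */
                           freeze_xid,      /* FreezeLimit-style cutoff */
                           cutoff_multi);   /* MultiXactCutoff-style cutoff */
if (frozen)
{
    /* xmin/xmax were rewritten in place; this path emits no WAL */
}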

◆ heap_get_latest_tid()

void heap_get_latest_tid ( TableScanDesc  sscan,
ItemPointer  tid 
)

Definition at line 1930 of file heapam.c.

1932{
1933 Relation relation = sscan->rs_rd;
1934 Snapshot snapshot = sscan->rs_snapshot;
1935 ItemPointerData ctid;
1937
1938 /*
1939 * table_tuple_get_latest_tid() verified that the passed in tid is valid.
1940 * Assume that t_ctid links are valid however - there shouldn't be invalid
1941 * ones in the table.
1942 */
1944
1945 /*
1946 * Loop to chase down t_ctid links. At top of loop, ctid is the tuple we
1947 * need to examine, and *tid is the TID we will return if ctid turns out
1948 * to be bogus.
1949 *
1950 * Note that we will loop until we reach the end of the t_ctid chain.
1951 * Depending on the snapshot passed, there might be at most one visible
1952 * version of the row, but we don't try to optimize for that.
1953 */
1954 ctid = *tid;
1955 priorXmax = InvalidTransactionId; /* cannot check first XMIN */
1956 for (;;)
1957 {
1958 Buffer buffer;
1959 Page page;
1960 OffsetNumber offnum;
1961 ItemId lp;
1962 HeapTupleData tp;
1963 bool valid;
1964
1965 /*
1966 * Read, pin, and lock the page.
1967 */
1968 buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(&ctid));
1970 page = BufferGetPage(buffer);
1971
1972 /*
1973 * Check for bogus item number. This is not treated as an error
1974 * condition because it can happen while following a t_ctid link. We
1975 * just assume that the prior tid is OK and return it unchanged.
1976 */
1977 offnum = ItemPointerGetOffsetNumber(&ctid);
1979 {
1980 UnlockReleaseBuffer(buffer);
1981 break;
1982 }
1983 lp = PageGetItemId(page, offnum);
1984 if (!ItemIdIsNormal(lp))
1985 {
1986 UnlockReleaseBuffer(buffer);
1987 break;
1988 }
1989
1990 /* OK to access the tuple */
1991 tp.t_self = ctid;
1992 tp.t_data = (HeapTupleHeader) PageGetItem(page, lp);
1993 tp.t_len = ItemIdGetLength(lp);
1994 tp.t_tableOid = RelationGetRelid(relation);
1995
1996 /*
1997 * After following a t_ctid link, we might arrive at an unrelated
1998 * tuple. Check for XMIN match.
1999 */
2002 {
2003 UnlockReleaseBuffer(buffer);
2004 break;
2005 }
2006
2007 /*
2008 * Check tuple visibility; if visible, set it as the new result
2009 * candidate.
2010 */
2011 valid = HeapTupleSatisfiesVisibility(&tp, snapshot, buffer);
2012 HeapCheckForSerializableConflictOut(valid, relation, &tp, buffer, snapshot);
2013 if (valid)
2014 *tid = ctid;
2015
2016 /*
2017 * If there's a valid t_ctid link, follow it, else we're done.
2018 */
2019 if ((tp.t_data->t_infomask & HEAP_XMAX_INVALID) ||
2023 {
2024 UnlockReleaseBuffer(buffer);
2025 break;
2026 }
2027
2028 ctid = tp.t_data->t_ctid;
2030 UnlockReleaseBuffer(buffer);
2031 } /* end of loop */
2032}

References Assert, BUFFER_LOCK_SHARE, BufferGetPage(), fb(), HEAP_XMAX_INVALID, HeapCheckForSerializableConflictOut(), HeapTupleHeaderGetUpdateXid(), HeapTupleHeaderGetXmin(), HeapTupleHeaderIndicatesMovedPartitions(), HeapTupleHeaderIsOnlyLocked(), HeapTupleSatisfiesVisibility(), InvalidTransactionId, ItemIdGetLength, ItemIdIsNormal, ItemPointerEquals(), ItemPointerGetBlockNumber(), ItemPointerGetOffsetNumber(), ItemPointerIsValid(), LockBuffer(), PageGetItem(), PageGetItemId(), PageGetMaxOffsetNumber(), ReadBuffer(), RelationGetRelid, HeapTupleHeaderData::t_ctid, HeapTupleData::t_data, HeapTupleHeaderData::t_infomask, HeapTupleData::t_len, HeapTupleData::t_self, HeapTupleData::t_tableOid, TransactionIdEquals, TransactionIdIsValid, and UnlockReleaseBuffer().

◆ heap_getnext()

HeapTuple heap_getnext ( TableScanDesc  sscan,
ScanDirection  direction 
)

Definition at line 1409 of file heapam.c.

1410{
1412
1413 /*
1414 * This is still widely used directly, without going through table AM, so
1415 * add a safety check. It's possible we should, at a later point,
1416 * downgrade this to an assert. The reason for checking the AM routine,
1417 * rather than the AM oid, is that this allows to write regression tests
1418 * that create another AM reusing the heap handler.
1419 */
1420 if (unlikely(sscan->rs_rd->rd_tableam != GetHeapamTableAmRoutine()))
1421 ereport(ERROR,
1423 errmsg_internal("only heap AM is supported")));
1424
1425 /* Note: no locking manipulations needed */
1426
1428 heapgettup_pagemode(scan, direction,
1429 scan->rs_base.rs_nkeys, scan->rs_base.rs_key);
1430 else
1431 heapgettup(scan, direction,
1432 scan->rs_base.rs_nkeys, scan->rs_base.rs_key);
1433
1434 if (scan->rs_ctup.t_data == NULL)
1435 return NULL;
1436
1437 /*
1438 * if we get here it means we have a new current scan tuple, so point to
1439 * the proper return buffer and return the tuple.
1440 */
1441
1443
1444 return &scan->rs_ctup;
1445}

References ereport, errcode(), errmsg_internal(), ERROR, fb(), GetHeapamTableAmRoutine(), heapgettup(), heapgettup_pagemode(), pgstat_count_heap_getnext, HeapScanDescData::rs_base, HeapScanDescData::rs_ctup, TableScanDescData::rs_flags, TableScanDescData::rs_key, TableScanDescData::rs_nkeys, TableScanDescData::rs_rd, SO_ALLOW_PAGEMODE, HeapTupleData::t_data, and unlikely.

Referenced by AlterTableMoveAll(), AlterTableSpaceOptions(), check_db_file_conflict(), CreateDatabaseUsingFileCopy(), do_autovacuum(), DropSetting(), DropTableSpace(), find_typed_table_dependencies(), get_all_vacuum_rels(), get_database_list(), get_subscription_list(), get_tables_to_cluster(), get_tablespace_name(), get_tablespace_oid(), GetAllPublicationRelations(), getRelationsInNamespace(), GetSchemaPublicationRelations(), heapam_index_build_range_scan(), heapam_index_validate_scan(), objectsInSchemaToOids(), pgrowlocks(), pgstat_heap(), populate_typ_list(), ReindexMultipleTables(), remove_dbtablespaces(), RemoveSubscriptionRel(), RenameTableSpace(), ThereIsAtLeastOneRole(), and vac_truncate_clog().

◆ heap_getnextslot()

bool heap_getnextslot ( TableScanDesc  sscan,
ScanDirection  direction,
TupleTableSlot slot 
)

Definition at line 1448 of file heapam.c.

1449{
1451
1452 /* Note: no locking manipulations needed */
1453
1454 if (sscan->rs_flags & SO_ALLOW_PAGEMODE)
1455 heapgettup_pagemode(scan, direction, sscan->rs_nkeys, sscan->rs_key);
1456 else
1457 heapgettup(scan, direction, sscan->rs_nkeys, sscan->rs_key);
1458
1459 if (scan->rs_ctup.t_data == NULL)
1460 {
1461 ExecClearTuple(slot);
1462 return false;
1463 }
1464
1465 /*
1466 * if we get here it means we have a new current scan tuple, so point to
1467 * the proper return buffer and return the tuple.
1468 */
1469
1471
1472 ExecStoreBufferHeapTuple(&scan->rs_ctup, slot,
1473 scan->rs_cbuf);
1474 return true;
1475}

References ExecClearTuple(), ExecStoreBufferHeapTuple(), fb(), heapgettup(), heapgettup_pagemode(), pgstat_count_heap_getnext, HeapScanDescData::rs_base, HeapScanDescData::rs_cbuf, HeapScanDescData::rs_ctup, TableScanDescData::rs_rd, SO_ALLOW_PAGEMODE, and HeapTupleData::t_data.
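
This is the slot-based variant that the table AM layer calls. A hedged sketch of a scan loop that stores each row into a slot created with table_slot_create(); as in the earlier sequential-scan sketch, snapshot handling is elided.

/* Hedged usage sketch of the slot-based scan loop. */
TupleTableSlot *slot = table_slot_create(relation, NULL);
TableScanDesc   scan = heap_beginscan(relation, GetActiveSnapshot(), 0, NULL,
                                      NULL, SO_TYPE_SEQSCAN | SO_ALLOW_PAGEMODE);

while (heap_getnextslot(scan, ForwardScanDirection, slot))
{
    /* slot holds a buffer heap tuple; it stays valid until the next call */
}

heap_endscan(scan);
ExecDropSingleTupleTableSlot(slot);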

◆ heap_getnextslot_tidrange()

bool heap_getnextslot_tidrange ( TableScanDesc  sscan,
ScanDirection  direction,
TupleTableSlot slot 
)

Definition at line 1551 of file heapam.c.

1553{
1555 ItemPointer mintid = &sscan->st.tidrange.rs_mintid;
1556 ItemPointer maxtid = &sscan->st.tidrange.rs_maxtid;
1557
1558 /* Note: no locking manipulations needed */
1559 for (;;)
1560 {
1561 if (sscan->rs_flags & SO_ALLOW_PAGEMODE)
1562 heapgettup_pagemode(scan, direction, sscan->rs_nkeys, sscan->rs_key);
1563 else
1564 heapgettup(scan, direction, sscan->rs_nkeys, sscan->rs_key);
1565
1566 if (scan->rs_ctup.t_data == NULL)
1567 {
1568 ExecClearTuple(slot);
1569 return false;
1570 }
1571
1572 /*
1573 * heap_set_tidrange will have used heap_setscanlimits to limit the
1574 * range of pages we scan to only ones that can contain the TID range
1575 * we're scanning for. Here we must filter out any tuples from these
1576 * pages that are outside of that range.
1577 */
1578 if (ItemPointerCompare(&scan->rs_ctup.t_self, mintid) < 0)
1579 {
1580 ExecClearTuple(slot);
1581
1582 /*
1583 * When scanning backwards, the TIDs will be in descending order.
1584 * Future tuples in this direction will be lower still, so we can
1585 * just return false to indicate there will be no more tuples.
1586 */
1587 if (ScanDirectionIsBackward(direction))
1588 return false;
1589
1590 continue;
1591 }
1592
1593 /*
1594 * Likewise for the final page, we must filter out TIDs greater than
1595 * maxtid.
1596 */
1597 if (ItemPointerCompare(&scan->rs_ctup.t_self, maxtid) > 0)
1598 {
1599 ExecClearTuple(slot);
1600
1601 /*
1602 * When scanning forward, the TIDs will be in ascending order.
1603 * Future tuples in this direction will be higher still, so we can
1604 * just return false to indicate there will be no more tuples.
1605 */
1606 if (ScanDirectionIsForward(direction))
1607 return false;
1608 continue;
1609 }
1610
1611 break;
1612 }
1613
1614 /*
1615 * if we get here it means we have a new current scan tuple, so point to
1616 * the proper return buffer and return the tuple.
1617 */
1619
1620 ExecStoreBufferHeapTuple(&scan->rs_ctup, slot, scan->rs_cbuf);
1621 return true;
1622}

References ExecClearTuple(), ExecStoreBufferHeapTuple(), fb(), heapgettup(), heapgettup_pagemode(), ItemPointerCompare(), pgstat_count_heap_getnext, HeapScanDescData::rs_base, HeapScanDescData::rs_cbuf, HeapScanDescData::rs_ctup, TableScanDescData::rs_rd, ScanDirectionIsBackward, ScanDirectionIsForward, SO_ALLOW_PAGEMODE, HeapTupleData::t_data, and HeapTupleData::t_self.

◆ heap_hot_search_buffer()

bool heap_hot_search_buffer ( ItemPointer  tid,
Relation  relation,
Buffer  buffer,
Snapshot  snapshot,
HeapTuple  heapTuple,
bool all_dead,
bool  first_call 
)

Definition at line 1778 of file heapam.c.

1781{
1782 Page page = BufferGetPage(buffer);
1784 BlockNumber blkno;
1785 OffsetNumber offnum;
1786 bool at_chain_start;
1787 bool valid;
1788 bool skip;
1789 GlobalVisState *vistest = NULL;
1790
1791 /* If this is not the first call, previous call returned a (live!) tuple */
1792 if (all_dead)
1794
1795 blkno = ItemPointerGetBlockNumber(tid);
1796 offnum = ItemPointerGetOffsetNumber(tid);
1798 skip = !first_call;
1799
1800 /* XXX: we should assert that a snapshot is pushed or registered */
1802 Assert(BufferGetBlockNumber(buffer) == blkno);
1803
1804 /* Scan through possible multiple members of HOT-chain */
1805 for (;;)
1806 {
1807 ItemId lp;
1808
1809 /* check for bogus TID */
1811 break;
1812
1813 lp = PageGetItemId(page, offnum);
1814
1815 /* check for unused, dead, or redirected items */
1816 if (!ItemIdIsNormal(lp))
1817 {
1818 /* We should only see a redirect at start of chain */
1820 {
1821 /* Follow the redirect */
1822 offnum = ItemIdGetRedirect(lp);
1823 at_chain_start = false;
1824 continue;
1825 }
1826 /* else must be end of chain */
1827 break;
1828 }
1829
1830 /*
1831 * Update heapTuple to point to the element of the HOT chain we're
1832 * currently investigating. Having t_self set correctly is important
1833 * because the SSI checks and the *Satisfies routine for historical
1834 * MVCC snapshots need the correct tid to decide about the visibility.
1835 */
1836 heapTuple->t_data = (HeapTupleHeader) PageGetItem(page, lp);
1837 heapTuple->t_len = ItemIdGetLength(lp);
1838 heapTuple->t_tableOid = RelationGetRelid(relation);
1839 ItemPointerSet(&heapTuple->t_self, blkno, offnum);
1840
1841 /*
1842 * Shouldn't see a HEAP_ONLY tuple at chain start.
1843 */
1844 if (at_chain_start && HeapTupleIsHeapOnly(heapTuple))
1845 break;
1846
1847 /*
1848 * The xmin should match the previous xmax value, else chain is
1849 * broken.
1850 */
1851 if (TransactionIdIsValid(prev_xmax) &&
1852 !TransactionIdEquals(prev_xmax,
1853 HeapTupleHeaderGetXmin(heapTuple->t_data)))
1854 break;
1855
1856 /*
1857 * When first_call is true (and thus, skip is initially false) we'll
1858 * return the first tuple we find. But on later passes, heapTuple
1859 * will initially be pointing to the tuple we returned last time.
1860 * Returning it again would be incorrect (and would loop forever), so
1861 * we skip it and return the next match we find.
1862 */
1863 if (!skip)
1864 {
1865 /* If it's visible per the snapshot, we must return it */
1866 valid = HeapTupleSatisfiesVisibility(heapTuple, snapshot, buffer);
1867 HeapCheckForSerializableConflictOut(valid, relation, heapTuple,
1868 buffer, snapshot);
1869
1870 if (valid)
1871 {
1872 ItemPointerSetOffsetNumber(tid, offnum);
1873 PredicateLockTID(relation, &heapTuple->t_self, snapshot,
1874 HeapTupleHeaderGetXmin(heapTuple->t_data));
1875 if (all_dead)
1876 *all_dead = false;
1877 return true;
1878 }
1879 }
1880 skip = false;
1881
1882 /*
1883 * If we can't see it, maybe no one else can either. At caller
1884 * request, check whether all chain members are dead to all
1885 * transactions.
1886 *
1887 * Note: if you change the criterion here for what is "dead", fix the
1888 * planner's get_actual_variable_range() function to match.
1889 */
1890 if (all_dead && *all_dead)
1891 {
1892 if (!vistest)
1893 vistest = GlobalVisTestFor(relation);
1894
1895 if (!HeapTupleIsSurelyDead(heapTuple, vistest))
1896 *all_dead = false;
1897 }
1898
1899 /*
1900 * Check to see if HOT chain continues past this tuple; if so fetch
1901 * the next offnum and loop around.
1902 */
1903 if (HeapTupleIsHotUpdated(heapTuple))
1904 {
1905 Assert(ItemPointerGetBlockNumber(&heapTuple->t_data->t_ctid) ==
1906 blkno);
1907 offnum = ItemPointerGetOffsetNumber(&heapTuple->t_data->t_ctid);
1908 at_chain_start = false;
1909 prev_xmax = HeapTupleHeaderGetUpdateXid(heapTuple->t_data);
1910 }
1911 else
1912 break; /* end of chain */
1913 }
1914
1915 return false;
1916}

References Assert, BufferGetBlockNumber(), BufferGetPage(), fb(), GlobalVisTestFor(), HeapCheckForSerializableConflictOut(), HeapTupleHeaderGetUpdateXid(), HeapTupleHeaderGetXmin(), HeapTupleIsHeapOnly(), HeapTupleIsHotUpdated(), HeapTupleIsSurelyDead(), HeapTupleSatisfiesVisibility(), InvalidTransactionId, ItemIdGetLength, ItemIdGetRedirect, ItemIdIsNormal, ItemIdIsRedirected, ItemPointerGetBlockNumber(), ItemPointerGetOffsetNumber(), ItemPointerSet(), ItemPointerSetOffsetNumber(), PageGetItem(), PageGetItemId(), PageGetMaxOffsetNumber(), PredicateLockTID(), RecentXmin, RelationGetRelid, skip, TransactionIdEquals, and TransactionIdIsValid.

Referenced by BitmapHeapScanNextBlock(), heap_index_delete_tuples(), and heapam_index_fetch_tuple().
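
As the callers above suggest, heap_hot_search_buffer() expects the caller to have pinned the buffer containing the chain root and to hold at least a share lock while the chain is walked; on success it also updates the offset part of *tid to the member it returned. A hedged usage sketch (hypothetical wrapper and names; roughly what heapam_index_fetch_tuple() does, minus its buffer caching across calls):

#include "postgres.h"
#include "access/heapam.h"
#include "storage/bufmgr.h"

/*
 * Hypothetical wrapper: with "buf" already pinned on the TID's block, return
 * the HOT-chain member visible to "snapshot", if any.  tuple->t_data points
 * into buf, so the caller keeps its pin while using the result.
 */
static bool
fetch_hot_member(Relation rel, ItemPointer tid, Buffer buf,
                 Snapshot snapshot, HeapTupleData *tuple, bool *all_dead)
{
    bool        found;

    LockBuffer(buf, BUFFER_LOCK_SHARE);
    /* first_call = true: start from the chain root named by the index TID */
    found = heap_hot_search_buffer(tid, rel, buf, snapshot,
                                   tuple, all_dead, true);
    LockBuffer(buf, BUFFER_LOCK_UNLOCK);

    return found;               /* on success, *tid now names the visible member */
}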

◆ heap_index_delete_tuples()

TransactionId heap_index_delete_tuples ( Relation  rel,
TM_IndexDeleteOp *  delstate 
)

Definition at line 8198 of file heapam.c.

8199{
8200 /* Initial assumption is that earlier pruning took care of conflict */
8201 TransactionId snapshotConflictHorizon = InvalidTransactionId;
8204 Page page = NULL;
8207#ifdef USE_PREFETCH
8210#endif
8212 int finalndeltids = 0,
8213 nblocksaccessed = 0;
8214
8215 /* State that's only used in bottom-up index deletion case */
8216 int nblocksfavorable = 0;
8217 int curtargetfreespace = delstate->bottomupfreespace,
8218 lastfreespace = 0,
8219 actualfreespace = 0;
8220 bool bottomup_final_block = false;
8221
8223
8224 /* Sort caller's deltids array by TID for further processing */
8226
8227 /*
8228 * Bottom-up case: resort deltids array in an order attuned to where the
8229 * greatest number of promising TIDs are to be found, and determine how
8230 * many blocks from the start of sorted array should be considered
8231 * favorable. This will also shrink the deltids array in order to
8232 * eliminate completely unfavorable blocks up front.
8233 */
8234 if (delstate->bottomup)
8236
8237#ifdef USE_PREFETCH
8238 /* Initialize prefetch state. */
8240 prefetch_state.next_item = 0;
8241 prefetch_state.ndeltids = delstate->ndeltids;
8242 prefetch_state.deltids = delstate->deltids;
8243
8244 /*
8245 * Determine the prefetch distance that we will attempt to maintain.
8246 *
8247 * Since the caller holds a buffer lock somewhere in rel, we'd better make
8248 * sure that isn't a catalog relation before we call code that does
8249 * syscache lookups, to avoid risk of deadlock.
8250 */
8251 if (IsCatalogRelation(rel))
8253 else
8256
8257 /* Cap initial prefetch distance for bottom-up deletion caller */
8258 if (delstate->bottomup)
8259 {
8263 }
8264
8265 /* Start prefetching. */
8267#endif
8268
8269 /* Iterate over deltids, determine which to delete, check their horizon */
8270 Assert(delstate->ndeltids > 0);
8271 for (int i = 0; i < delstate->ndeltids; i++)
8272 {
8273 TM_IndexDelete *ideltid = &delstate->deltids[i];
8274 TM_IndexStatus *istatus = delstate->status + ideltid->id;
8275 ItemPointer htid = &ideltid->tid;
8276 OffsetNumber offnum;
8277
8278 /*
8279 * Read buffer, and perform required extra steps each time a new block
8280 * is encountered. Avoid refetching if it's the same block as the one
8281 * from the last htid.
8282 */
8283 if (blkno == InvalidBlockNumber ||
8285 {
8286 /*
8287 * Consider giving up early for bottom-up index deletion caller
8288 * first. (Only prefetch next-next block afterwards, when it
8289 * becomes clear that we're at least going to access the next
8290 * block in line.)
8291 *
8292 * Sometimes the first block frees so much space for bottom-up
8293 * caller that the deletion process can end without accessing any
8294 * more blocks. It is usually necessary to access 2 or 3 blocks
8295 * per bottom-up deletion operation, though.
8296 */
8297 if (delstate->bottomup)
8298 {
8299 /*
8300 * We often allow caller to delete a few additional items
8301 * whose entries we reached after the point that space target
8302 * from caller was satisfied. The cost of accessing the page
8303 * was already paid at that point, so it made sense to finish
8304 * it off. When that happened, we finalize everything here
8305 * (by finishing off the whole bottom-up deletion operation
8306 * without needlessly paying the cost of accessing any more
8307 * blocks).
8308 */
8310 break;
8311
8312 /*
8313 * Give up when we didn't enable our caller to free any
8314 * additional space as a result of processing the page that we
8315 * just finished up with. This rule is the main way in which
8316 * we keep the cost of bottom-up deletion under control.
8317 */
8319 break;
8320 lastfreespace = actualfreespace; /* for next time */
8321
8322 /*
8323 * Deletion operation (which is bottom-up) will definitely
8324 * access the next block in line. Prepare for that now.
8325 *
8326 * Decay target free space so that we don't hang on for too
8327 * long with a marginal case. (Space target is only truly
8328 * helpful when it allows us to recognize that we don't need
8329 * to access more than 1 or 2 blocks to satisfy caller due to
8330 * agreeable workload characteristics.)
8331 *
8332 * We are a bit more patient when we encounter contiguous
8333 * blocks, though: these are treated as favorable blocks. The
8334 * decay process is only applied when the next block in line
8335 * is not a favorable/contiguous block. This is not an
8336 * exception to the general rule; we still insist on finding
8337 * at least one deletable item per block accessed. See
8338 * bottomup_nblocksfavorable() for full details of the theory
8339 * behind favorable blocks and heap block locality in general.
8340 *
8341 * Note: The first block in line is always treated as a
8342 * favorable block, so the earliest possible point that the
8343 * decay can be applied is just before we access the second
8344 * block in line. The Assert() verifies this for us.
8345 */
8347 if (nblocksfavorable > 0)
8349 else
8350 curtargetfreespace /= 2;
8351 }
8352
8353 /* release old buffer */
8354 if (BufferIsValid(buf))
8356
8358 buf = ReadBuffer(rel, blkno);
8360 Assert(!delstate->bottomup ||
8362
8363#ifdef USE_PREFETCH
8364
8365 /*
8366 * To maintain the prefetch distance, prefetch one more page for
8367 * each page we read.
8368 */
8370#endif
8371
8373
8374 page = BufferGetPage(buf);
8375 maxoff = PageGetMaxOffsetNumber(page);
8376 }
8377
8378 /*
8379 * In passing, detect index corruption involving an index page with a
8380 * TID that points to a location in the heap that couldn't possibly be
8381 * correct. We only do this with actual TIDs from caller's index page
8382 * (not items reached by traversing through a HOT chain).
8383 */
8385
8386 if (istatus->knowndeletable)
8387 Assert(!delstate->bottomup && !istatus->promising);
8388 else
8389 {
8390 ItemPointerData tmp = *htid;
8391 HeapTupleData heapTuple;
8392
8393 /* Are any tuples from this HOT chain non-vacuumable? */
8394 if (!heap_hot_search_buffer(&tmp, rel, buf, &SnapshotNonVacuumable,
8395 &heapTuple, NULL, true))
8396 continue; /* can't delete entry */
8397
8398 /* Caller will delete, since whole HOT chain is vacuumable */
8399 istatus->knowndeletable = true;
8400
8401 /* Maintain index free space info for bottom-up deletion case */
8402 if (delstate->bottomup)
8403 {
8404 Assert(istatus->freespace > 0);
8405 actualfreespace += istatus->freespace;
8407 bottomup_final_block = true;
8408 }
8409 }
8410
8411 /*
8412 * Maintain snapshotConflictHorizon value for deletion operation as a
8413 * whole by advancing current value using heap tuple headers. This is
8414 * loosely based on the logic for pruning a HOT chain.
8415 */
8417 priorXmax = InvalidTransactionId; /* cannot check first XMIN */
8418 for (;;)
8419 {
8420 ItemId lp;
8421 HeapTupleHeader htup;
8422
8423 /* Sanity check (pure paranoia) */
8424 if (offnum < FirstOffsetNumber)
8425 break;
8426
8427 /*
8428 * An offset past the end of page's line pointer array is possible
8429 * when the array was truncated
8430 */
8431 if (offnum > maxoff)
8432 break;
8433
8434 lp = PageGetItemId(page, offnum);
8435 if (ItemIdIsRedirected(lp))
8436 {
8437 offnum = ItemIdGetRedirect(lp);
8438 continue;
8439 }
8440
8441 /*
8442 * We'll often encounter LP_DEAD line pointers (especially with an
8443 * entry marked knowndeletable by our caller up front). No heap
8444 * tuple headers get examined for an htid that leads us to an
8445 * LP_DEAD item. This is okay because the earlier pruning
8446 * operation that made the line pointer LP_DEAD in the first place
8447 * must have considered the original tuple header as part of
8448 * generating its own snapshotConflictHorizon value.
8449 *
8450 * Relying on XLOG_HEAP2_PRUNE_VACUUM_SCAN records like this is
8451 * the same strategy that index vacuuming uses in all cases. Index
8452 * VACUUM WAL records don't even have a snapshotConflictHorizon
8453 * field of their own for this reason.
8454 */
8455 if (!ItemIdIsNormal(lp))
8456 break;
8457
8458 htup = (HeapTupleHeader) PageGetItem(page, lp);
8459
8460 /*
8461 * Check the tuple XMIN against prior XMAX, if any
8462 */
8463 if (TransactionIdIsValid(priorXmax) &&
8464 !TransactionIdEquals(HeapTupleHeaderGetXmin(htup), priorXmax))
8465 break;
8466
8467 HeapTupleHeaderAdvanceConflictHorizon(htup,
8468 &snapshotConflictHorizon);
8469
8470 /*
8471 * If the tuple is not HOT-updated, then we are at the end of this
8472 * HOT-chain. No need to visit later tuples from the same update
8473 * chain (they get their own index entries) -- just move on to
8474 * next htid from index AM caller.
8475 */
8476 if (!HeapTupleHeaderIsHotUpdated(htup))
8477 break;
8478
8479 /* Advance to next HOT chain member */
8480 Assert(ItemPointerGetBlockNumber(&htup->t_ctid) == blkno);
8481 offnum = ItemPointerGetOffsetNumber(&htup->t_ctid);
8482 priorXmax = HeapTupleHeaderGetUpdateXid(htup);
8483 }
8484
8485 /* Enable further/final shrinking of deltids for caller */
8486 finalndeltids = i + 1;
8487 }
8488
8490
8491 /*
8492 * Shrink deltids array to exclude non-deletable entries at the end. This
8493 * is not just a minor optimization. Final deltids array size might be
8494 * zero for a bottom-up caller. Index AM is explicitly allowed to rely on
8495 * ndeltids being zero in all cases with zero total deletable entries.
8496 */
8497 Assert(finalndeltids > 0 || delstate->bottomup);
8498 delstate->ndeltids = finalndeltids;
8499
8500 return snapshotConflictHorizon;
8501}

References Assert, BOTTOMUP_MAX_NBLOCKS, bottomup_sort_and_shrink(), buf, BUFFER_LOCK_SHARE, BufferGetPage(), BufferIsValid(), fb(), FirstOffsetNumber, get_tablespace_maintenance_io_concurrency(), GlobalVisTestFor(), heap_hot_search_buffer(), HeapTupleHeaderAdvanceConflictHorizon(), HeapTupleHeaderGetUpdateXid(), HeapTupleHeaderGetXmin(), HeapTupleHeaderIsHotUpdated(), i, index_delete_check_htid(), index_delete_sort(), InitNonVacuumableSnapshot, InvalidBlockNumber, InvalidBuffer, InvalidOffsetNumber, InvalidTransactionId, IsCatalogRelation(), ItemIdGetRedirect, ItemIdIsNormal, ItemIdIsRedirected, ItemPointerGetBlockNumber(), ItemPointerGetOffsetNumber(), LockBuffer(), maintenance_io_concurrency, Min, PageGetItem(), PageGetItemId(), PageGetMaxOffsetNumber(), RelationData::rd_rel, ReadBuffer(), HeapTupleHeaderData::t_ctid, TransactionIdEquals, TransactionIdIsValid, and UnlockReleaseBuffer().
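
The bottom-up branch above boils down to three rules: stop once the caller's free-space target was satisfied while finishing off an earlier page, stop if the page just processed freed nothing new, and otherwise decay the space target unless the next block in line is a favorable (contiguous) one. A simplified, hedged restatement of that heuristic (hypothetical helper, not the code actually used):

#include "postgres.h"

/*
 * Hypothetical sketch of the bottom-up give-up rule described above.
 * Returns true when the deletion pass should stop before reading the
 * next heap block.
 */
static bool
bottomup_should_stop(bool target_satisfied, int actualfreespace,
                     int *lastfreespace, int *curtargetfreespace,
                     int *nblocksfavorable)
{
    if (target_satisfied)
        return true;            /* caller's space target already met */
    if (actualfreespace == *lastfreespace)
        return true;            /* last block freed nothing new */
    *lastfreespace = actualfreespace;   /* for next time */

    /* Decay the target, unless the next block is favorable/contiguous */
    if (*nblocksfavorable > 0)
        (*nblocksfavorable)--;
    else
        *curtargetfreespace /= 2;

    return false;
}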

◆ heap_inplace_lock()

bool heap_inplace_lock ( Relation  relation,
HeapTuple  oldtup_ptr,
Buffer  buffer,
void(*)(void *)  release_callback,
void *  arg 
)

Definition at line 6436 of file heapam.c.

6439{
6440 HeapTupleData oldtup = *oldtup_ptr; /* minimize diff vs. heap_update() */
6441 TM_Result result;
6442 bool ret;
6443
6444#ifdef USE_ASSERT_CHECKING
6445 if (RelationGetRelid(relation) == RelationRelationId)
6447#endif
6448
6449 Assert(BufferIsValid(buffer));
6450
6451 /*
6452 * Register shared cache invals if necessary. Other sessions may finish
6453 * inplace updates of this tuple between this step and LockTuple(). Since
6454 * inplace updates don't change cache keys, that's harmless.
6455 *
6456 * While it's tempting to register invals only after confirming we can
6457 * return true, the following obstacle precludes reordering steps that
6458 * way. Registering invals might reach a CatalogCacheInitializeCache()
6459 * that locks "buffer". That would hang indefinitely if running after our
6460 * own LockBuffer(). Hence, we must register invals before LockBuffer().
6461 */
6463
6464 LockTuple(relation, &oldtup.t_self, InplaceUpdateTupleLock);
6466
6467 /*----------
6468 * Interpret HeapTupleSatisfiesUpdate() like heap_update() does, except:
6469 *
6470 * - wait unconditionally
6471 * - already locked tuple above, since inplace needs that unconditionally
6472 * - don't recheck header after wait: simpler to defer to next iteration
6473 * - don't try to continue even if the updater aborts: likewise
6474 * - no crosscheck
6475 */
6477 buffer);
6478
6479 if (result == TM_Invisible)
6480 {
6481 /* no known way this can happen */
6482 ereport(ERROR,
6484 errmsg_internal("attempted to overwrite invisible tuple")));
6485 }
6486 else if (result == TM_SelfModified)
6487 {
6488 /*
6489 * CREATE INDEX might reach this if an expression is silly enough to
6490 * call e.g. SELECT ... FROM pg_class FOR SHARE. C code of other SQL
6491 * statements might get here after a heap_update() of the same row, in
6492 * the absence of an intervening CommandCounterIncrement().
6493 */
6494 ereport(ERROR,
6496 errmsg("tuple to be updated was already modified by an operation triggered by the current command")));
6497 }
6498 else if (result == TM_BeingModified)
6499 {
6502
6504 infomask = oldtup.t_data->t_infomask;
6505
6507 {
6510 int remain;
6511
6513 lockmode, NULL))
6514 {
6517 ret = false;
6519 relation, &oldtup.t_self, XLTW_Update,
6520 &remain);
6521 }
6522 else
6523 ret = true;
6524 }
6526 ret = true;
6528 ret = true;
6529 else
6530 {
6533 ret = false;
6534 XactLockTableWait(xwait, relation, &oldtup.t_self,
6535 XLTW_Update);
6536 }
6537 }
6538 else
6539 {
6540 ret = (result == TM_Ok);
6541 if (!ret)
6542 {
6545 }
6546 }
6547
6548 /*
6549 * GetCatalogSnapshot() relies on invalidation messages to know when to
6550 * take a new snapshot. COMMIT of xwait is responsible for sending the
6551 * invalidation. We're not acquiring heavyweight locks sufficient to
6552 * block if not yet sent, so we must take a new snapshot to ensure a later
6553 * attempt has a fair chance. While we don't need this if xwait aborted,
6554 * don't bother optimizing that.
6555 */
6556 if (!ret)
6557 {
6558 UnlockTuple(relation, &oldtup.t_self, InplaceUpdateTupleLock);
6561 }
6562 return ret;
6563}

References arg, Assert, BUFFER_LOCK_EXCLUSIVE, BUFFER_LOCK_UNLOCK, BufferIsValid(), CacheInvalidateHeapTupleInplace(), DoesMultiXactIdConflict(), ereport, errcode(), errmsg(), errmsg_internal(), ERROR, fb(), ForgetInplace_Inval(), GetCurrentCommandId(), HEAP_XMAX_IS_KEYSHR_LOCKED(), HEAP_XMAX_IS_MULTI, HeapTupleHeaderGetRawXmax(), HeapTupleSatisfiesUpdate(), InplaceUpdateTupleLock, InvalidateCatalogSnapshot(), LockBuffer(), LockTuple(), LockTupleNoKeyExclusive, MultiXactIdWait(), MultiXactStatusNoKeyUpdate, RelationGetRelid, TM_BeingModified, TM_Invisible, TM_Ok, TM_SelfModified, TransactionIdIsCurrentTransactionId(), UnlockTuple(), XactLockTableWait(), and XLTW_Update.

Referenced by systable_inplace_update_begin().
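
Putting the pieces together, a hedged sketch of the caller-side pattern (hypothetical function; real callers go through systable_inplace_update_begin()/_finish(), which supply the retry loop that a false return from heap_inplace_lock() calls for):

#include "postgres.h"
#include "access/heapam.h"

/*
 * Hypothetical single attempt at an inplace update.  On a false return,
 * heap_inplace_lock() has already invoked release_callback(arg) and dropped
 * its own locks, so the caller is expected to refetch the row and retry.
 */
static bool
inplace_update_once(Relation rel, HeapTuple oldtup, HeapTuple newtup,
                    Buffer buffer, void (*release_callback) (void *),
                    void *arg, bool dirty)
{
    if (!heap_inplace_lock(rel, oldtup, buffer, release_callback, arg))
        return false;           /* concurrent updater; retry from scratch */

    if (dirty)
        heap_inplace_update_and_unlock(rel, oldtup, newtup, buffer);
    else
        heap_inplace_unlock(rel, oldtup, buffer);   /* nothing to change */

    return true;
}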

◆ heap_inplace_unlock()

void heap_inplace_unlock ( Relation  relation,
HeapTuple  oldtup,
Buffer  buffer 
)

◆ heap_inplace_update_and_unlock()

void heap_inplace_update_and_unlock ( Relation  relation,
HeapTuple  oldtup,
HeapTuple  tuple,
Buffer  buffer 
)

Definition at line 6574 of file heapam.c.

6577{
6578 HeapTupleHeader htup = oldtup->t_data;
6579 uint32 oldlen;
6580 uint32 newlen;
6581 char *dst;
6582 char *src;
6583 int nmsgs = 0;
6585 bool RelcacheInitFileInval = false;
6586
6587 Assert(ItemPointerEquals(&oldtup->t_self, &tuple->t_self));
6588 oldlen = oldtup->t_len - htup->t_hoff;
6589 newlen = tuple->t_len - tuple->t_data->t_hoff;
6590 if (oldlen != newlen || htup->t_hoff != tuple->t_data->t_hoff)
6591 elog(ERROR, "wrong tuple length");
6592
6593 dst = (char *) htup + htup->t_hoff;
6594 src = (char *) tuple->t_data + tuple->t_data->t_hoff;
6595
6596 /* Like RecordTransactionCommit(), log only if needed */
6599 &RelcacheInitFileInval);
6600
6601 /*
6602 * Unlink relcache init files as needed. If unlinking, acquire
6603 * RelCacheInitLock until after associated invalidations. By doing this
6604 * in advance, if we checkpoint and then crash between inplace
6605 * XLogInsert() and inval, we don't rely on StartupXLOG() ->
6606 * RelationCacheInitFileRemove(). That uses elevel==LOG, so replay would
6607 * neglect to PANIC on EIO.
6608 */
6610
6611 /*----------
6612 * NO EREPORT(ERROR) from here till changes are complete
6613 *
6614 * Our buffer lock won't stop a reader having already pinned and checked
6615 * visibility for this tuple. Hence, we write WAL first, then mutate the
6616 * buffer. Like in MarkBufferDirtyHint() or RecordTransactionCommit(),
6617 * checkpoint delay makes that acceptable. With the usual order of
6618 * changes, a crash after memcpy() and before XLogInsert() could allow
6619 * datfrozenxid to overtake relfrozenxid:
6620 *
6621 * ["D" is a VACUUM (ONLY_DATABASE_STATS)]
6622 * ["R" is a VACUUM tbl]
6623 * D: vac_update_datfrozenxid() -> systable_beginscan(pg_class)
6624 * D: systable_getnext() returns pg_class tuple of tbl
6625 * R: memcpy() into pg_class tuple of tbl
6626 * D: raise pg_database.datfrozenxid, XLogInsert(), finish
6627 * [crash]
6628 * [recovery restores datfrozenxid w/o relfrozenxid]
6629 *
6630 * Mimic MarkBufferDirtyHint() subroutine XLogSaveBufferForHint().
6631 * Specifically, use DELAY_CHKPT_START, and copy the buffer to the stack.
6632 * The stack copy facilitates a FPI of the post-mutation block before we
6633 * accept other sessions seeing it. DELAY_CHKPT_START allows us to
6634 * XLogInsert() before MarkBufferDirty(). Since XLogSaveBufferForHint()
6635 * can operate under BUFFER_LOCK_SHARED, it can't avoid DELAY_CHKPT_START.
6636 * This function, however, likely could avoid it with the following order
6637 * of operations: MarkBufferDirty(), XLogInsert(), memcpy(). Opt to use
6638 * DELAY_CHKPT_START here, too, as a way to have fewer distinct code
6639 * patterns to analyze. Inplace update isn't so frequent that it should
6640 * pursue the small optimization of skipping DELAY_CHKPT_START.
6641 */
6645
6646 /* XLOG stuff */
6647 if (RelationNeedsWAL(relation))
6648 {
6651 char *origdata = (char *) BufferGetBlock(buffer);
6652 Page page = BufferGetPage(buffer);
6653 uint16 lower = ((PageHeader) page)->pd_lower;
6654 uint16 upper = ((PageHeader) page)->pd_upper;
6656 RelFileLocator rlocator;
6657 ForkNumber forkno;
6658 BlockNumber blkno;
6660
6661 xlrec.offnum = ItemPointerGetOffsetNumber(&tuple->t_self);
6662 xlrec.dbId = MyDatabaseId;
6664 xlrec.relcacheInitFileInval = RelcacheInitFileInval;
6665 xlrec.nmsgs = nmsgs;
6666
6669 if (nmsgs != 0)
6671 nmsgs * sizeof(SharedInvalidationMessage));
6672
6673 /* register block matching what buffer will look like after changes */
6678 BufferGetTag(buffer, &rlocator, &forkno, &blkno);
6679 Assert(forkno == MAIN_FORKNUM);
6680 XLogRegisterBlock(0, &rlocator, forkno, blkno, copied_buffer.data,
6682 XLogRegisterBufData(0, src, newlen);
6683
6684 /* inplace updates aren't decoded atm, don't log the origin */
6685
6687
6688 PageSetLSN(page, recptr);
6689 }
6690
6691 memcpy(dst, src, newlen);
6692
6693 MarkBufferDirty(buffer);
6694
6696
6697 /*
6698 * Send invalidations to shared queue. SearchSysCacheLocked1() assumes we
6699 * do this before UnlockTuple().
6700 */
6702
6705 UnlockTuple(relation, &tuple->t_self, InplaceUpdateTupleLock);
6706
6707 AcceptInvalidationMessages(); /* local processing of just-sent inval */
6708
6709 /*
6710 * Queue a transactional inval, for logical decoding and for third-party
6711 * code that might have been relying on it since long before inplace
6712 * update adopted immediate invalidation. See README.tuplock section
6713 * "Reading inplace-updated columns" for logical decoding details.
6714 */
6716 CacheInvalidateHeapTuple(relation, tuple, NULL);
6717}

References AcceptInvalidationMessages(), Assert, AtInplace_Inval(), BUFFER_LOCK_UNLOCK, BufferGetBlock(), BufferGetPage(), BufferGetTag(), CacheInvalidateHeapTuple(), DELAY_CHKPT_START, PGPROC::delayChkptFlags, elog, END_CRIT_SECTION, ERROR, fb(), inplaceGetInvalidationMessages(), InplaceUpdateTupleLock, IsBootstrapProcessingMode, ItemPointerEquals(), ItemPointerGetOffsetNumber(), LockBuffer(), lower(), MAIN_FORKNUM, MarkBufferDirty(), MinSizeOfHeapInplace, MyDatabaseId, MyDatabaseTableSpace, MyProc, PageSetLSN(), PreInplace_Inval(), REGBUF_STANDARD, RelationNeedsWAL, START_CRIT_SECTION, HeapTupleData::t_data, HeapTupleHeaderData::t_hoff, HeapTupleData::t_len, HeapTupleData::t_self, UnlockTuple(), upper(), XLOG_HEAP_INPLACE, XLogBeginInsert(), XLogInsert(), XLogRegisterBlock(), XLogRegisterBufData(), XLogRegisterData(), and XLogStandbyInfoActive.

Referenced by systable_inplace_update_finish().

◆ heap_insert()

void heap_insert ( Relation  relation,
HeapTuple  tup,
CommandId  cid,
int  options,
BulkInsertState  bistate 
)

Definition at line 2141 of file heapam.c.

2143{
2146 Buffer buffer;
2147 Buffer vmbuffer = InvalidBuffer;
2148 bool all_visible_cleared = false;
2149
2150 /* Cheap, simplistic check that the tuple matches the rel's rowtype. */
2153
2154 AssertHasSnapshotForToast(relation);
2155
2156 /*
2157 * Fill in tuple header fields and toast the tuple if necessary.
2158 *
2159 * Note: below this point, heaptup is the data we actually intend to store
2160 * into the relation; tup is the caller's original untoasted data.
2161 */
2162 heaptup = heap_prepare_insert(relation, tup, xid, cid, options);
2163
2164 /*
2165 * Find buffer to insert this tuple into. If the page is all visible,
2166 * this will also pin the requisite visibility map page.
2167 */
2168 buffer = RelationGetBufferForTuple(relation, heaptup->t_len,
2169 InvalidBuffer, options, bistate,
2170 &vmbuffer, NULL,
2171 0);
2172
2173 /*
2174 * We're about to do the actual insert -- but check for conflict first, to
2175 * avoid possibly having to roll back work we've just done.
2176 *
2177 * This is safe without a recheck as long as there is no possibility of
2178 * another process scanning the page between this check and the insert
2179 * being visible to the scan (i.e., an exclusive buffer content lock is
2180 * continuously held from this point until the tuple insert is visible).
2181 *
2182 * For a heap insert, we only need to check for table-level SSI locks. Our
2183 * new tuple can't possibly conflict with existing tuple locks, and heap
2184 * page locks are only consolidated versions of tuple locks; they do not
2185 * lock "gaps" as index page locks do. So we don't need to specify a
2186 * buffer when making the call, which makes for a faster check.
2187 */
2189
2190 /* NO EREPORT(ERROR) from here till changes are logged */
2192
2193 RelationPutHeapTuple(relation, buffer, heaptup,
2195
2196 if (PageIsAllVisible(BufferGetPage(buffer)))
2197 {
2198 all_visible_cleared = true;
2200 visibilitymap_clear(relation,
2202 vmbuffer, VISIBILITYMAP_VALID_BITS);
2203 }
2204
2205 /*
2206 * XXX Should we set PageSetPrunable on this page ?
2207 *
2208 * The inserting transaction may eventually abort thus making this tuple
2209 * DEAD and hence available for pruning. Though we don't want to optimize
2210 * for aborts, if no other tuple in this page is UPDATEd/DELETEd, the
2211 * aborted tuple will never be pruned until next vacuum is triggered.
2212 *
2213 * If you do add PageSetPrunable here, add it in heap_xlog_insert too.
2214 */
2215
2216 MarkBufferDirty(buffer);
2217
2218 /* XLOG stuff */
2219 if (RelationNeedsWAL(relation))
2220 {
2224 Page page = BufferGetPage(buffer);
2225 uint8 info = XLOG_HEAP_INSERT;
2226 int bufflags = 0;
2227
2228 /*
2229 * If this is a catalog, we need to transmit combo CIDs to properly
2230 * decode, so log that as well.
2231 */
2233 log_heap_new_cid(relation, heaptup);
2234
2235 /*
2236 * If this is the single and first tuple on page, we can reinit the
2237 * page instead of restoring the whole thing. Set flag, and hide
2238 * buffer references from XLogInsert.
2239 */
2242 {
2243 info |= XLOG_HEAP_INIT_PAGE;
2245 }
2246
2247 xlrec.offnum = ItemPointerGetOffsetNumber(&heaptup->t_self);
2248 xlrec.flags = 0;
2254
2255 /*
2256 * For logical decoding, we need the tuple even if we're doing a full
2257 * page write, so make sure it's included even if we take a full-page
2258 * image. (XXX We could alternatively store a pointer into the FPW).
2259 */
2260 if (RelationIsLogicallyLogged(relation) &&
2262 {
2265
2266 if (IsToastRelation(relation))
2268 }
2269
2272
2273 xlhdr.t_infomask2 = heaptup->t_data->t_infomask2;
2274 xlhdr.t_infomask = heaptup->t_data->t_infomask;
2275 xlhdr.t_hoff = heaptup->t_data->t_hoff;
2276
2277 /*
2278 * note we mark xlhdr as belonging to buffer; if XLogInsert decides to
2279 * write the whole page to the xlog, we don't need to store
2280 * xl_heap_header in the xlog.
2281 */
2284 /* PG73FORMAT: write bitmap [+ padding] [+ oid] + data */
2286 (char *) heaptup->t_data + SizeofHeapTupleHeader,
2288
2289 /* filtering by origin on a row level is much more efficient */
2291
2292 recptr = XLogInsert(RM_HEAP_ID, info);
2293
2294 PageSetLSN(page, recptr);
2295 }
2296
2298
2299 UnlockReleaseBuffer(buffer);
2300 if (vmbuffer != InvalidBuffer)
2301 ReleaseBuffer(vmbuffer);
2302
2303 /*
2304 * If tuple is cacheable, mark it for invalidation from the caches in case
2305 * we abort. Note it is OK to do this after releasing the buffer, because
2306 * the heaptup data structure is all in local memory, not in the shared
2307 * buffer.
2308 */
2310
2311 /* Note: speculative insertions are counted too, even if aborted later */
2312 pgstat_count_heap_insert(relation, 1);
2313
2314 /*
2315 * If heaptup is a private copy, release it. Don't forget to copy t_self
2316 * back to the caller's image, too.
2317 */
2318 if (heaptup != tup)
2319 {
2320 tup->t_self = heaptup->t_self;
2322 }
2323}

References Assert, AssertHasSnapshotForToast(), BufferGetBlockNumber(), BufferGetPage(), CacheInvalidateHeapTuple(), CheckForSerializableConflictIn(), END_CRIT_SECTION, fb(), FirstOffsetNumber, GetCurrentTransactionId(), heap_freetuple(), HEAP_INSERT_NO_LOGICAL, HEAP_INSERT_SPECULATIVE, heap_prepare_insert(), HeapTupleHeaderGetNatts, InvalidBlockNumber, InvalidBuffer, IsToastRelation(), ItemPointerGetBlockNumber(), ItemPointerGetOffsetNumber(), log_heap_new_cid(), MarkBufferDirty(), PageClearAllVisible(), PageGetMaxOffsetNumber(), PageIsAllVisible(), PageSetLSN(), pgstat_count_heap_insert(), REGBUF_KEEP_DATA, REGBUF_STANDARD, REGBUF_WILL_INIT, RelationGetBufferForTuple(), RelationGetNumberOfAttributes, RelationIsAccessibleInLogicalDecoding, RelationIsLogicallyLogged, RelationNeedsWAL, RelationPutHeapTuple(), ReleaseBuffer(), SizeOfHeapHeader, SizeOfHeapInsert, SizeofHeapTupleHeader, START_CRIT_SECTION, UnlockReleaseBuffer(), visibilitymap_clear(), VISIBILITYMAP_VALID_BITS, XLH_INSERT_ALL_VISIBLE_CLEARED, XLH_INSERT_CONTAINS_NEW_TUPLE, XLH_INSERT_IS_SPECULATIVE, XLH_INSERT_ON_TOAST_RELATION, XLOG_HEAP_INIT_PAGE, XLOG_HEAP_INSERT, XLOG_INCLUDE_ORIGIN, XLogBeginInsert(), XLogInsert(), XLogRegisterBufData(), XLogRegisterBuffer(), XLogRegisterData(), and XLogSetRecordFlags().

Referenced by heapam_tuple_insert(), heapam_tuple_insert_speculative(), simple_heap_insert(), and toast_save_datum().
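
A minimal usage sketch (hypothetical helper; essentially what simple_heap_insert() does: current command ID, default options, no bulk-insert state):

#include "postgres.h"
#include "access/heapam.h"
#include "access/xact.h"

/* Hypothetical helper: insert one tuple and report the TID it was assigned. */
static ItemPointerData
insert_one_tuple(Relation rel, HeapTuple tup)
{
    heap_insert(rel, tup, GetCurrentCommandId(true), 0, NULL);

    /* heap_insert() copies the assigned TID back into the caller's tuple */
    return tup->t_self;
}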

◆ heap_lock_tuple()

TM_Result heap_lock_tuple ( Relation  relation,
HeapTuple  tuple,
CommandId  cid,
LockTupleMode  mode,
LockWaitPolicy  wait_policy,
bool  follow_updates,
Buffer *  buffer,
TM_FailureData *  tmfd 
)

Definition at line 4643 of file heapam.c.

4647{
4648 TM_Result result;
4649 ItemPointer tid = &(tuple->t_self);
4650 ItemId lp;
4651 Page page;
4652 Buffer vmbuffer = InvalidBuffer;
4653 BlockNumber block;
4654 TransactionId xid,
4655 xmax;
4659 bool first_time = true;
4660 bool skip_tuple_lock = false;
4661 bool have_tuple_lock = false;
4662 bool cleared_all_frozen = false;
4663
4664 *buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid));
4665 block = ItemPointerGetBlockNumber(tid);
4666
4667 /*
4668 * Before locking the buffer, pin the visibility map page if it appears to
4669 * be necessary. Since we haven't got the lock yet, someone else might be
4670 * in the middle of changing this, so we'll need to recheck after we have
4671 * the lock.
4672 */
4673 if (PageIsAllVisible(BufferGetPage(*buffer)))
4674 visibilitymap_pin(relation, block, &vmbuffer);
4675
4677
4678 page = BufferGetPage(*buffer);
4681
4682 tuple->t_data = (HeapTupleHeader) PageGetItem(page, lp);
4683 tuple->t_len = ItemIdGetLength(lp);
4684 tuple->t_tableOid = RelationGetRelid(relation);
4685
4686l3:
4687 result = HeapTupleSatisfiesUpdate(tuple, cid, *buffer);
4688
4689 if (result == TM_Invisible)
4690 {
4691 /*
4692 * This is possible, but only when locking a tuple for ON CONFLICT
4693 * UPDATE. We return this value here rather than throwing an error in
4694 * order to give that case the opportunity to throw a more specific
4695 * error.
4696 */
4697 result = TM_Invisible;
4698 goto out_locked;
4699 }
4700 else if (result == TM_BeingModified ||
4701 result == TM_Updated ||
4702 result == TM_Deleted)
4703 {
4707 bool require_sleep;
4708 ItemPointerData t_ctid;
4709
4710 /* must copy state data before unlocking buffer */
4712 infomask = tuple->t_data->t_infomask;
4713 infomask2 = tuple->t_data->t_infomask2;
4714 ItemPointerCopy(&tuple->t_data->t_ctid, &t_ctid);
4715
4717
4718 /*
4719 * If any subtransaction of the current top transaction already holds
4720 * a lock as strong as or stronger than what we're requesting, we
4721 * effectively hold the desired lock already. We *must* succeed
4722 * without trying to take the tuple lock, else we will deadlock
4723 * against anyone wanting to acquire a stronger lock.
4724 *
4725 * Note we only do this the first time we loop on the HTSU result;
4726 * there is no point in testing in subsequent passes, because
4727 * evidently our own transaction cannot have acquired a new lock after
4728 * the first time we checked.
4729 */
4730 if (first_time)
4731 {
4732 first_time = false;
4733
4735 {
4736 int i;
4737 int nmembers;
4738 MultiXactMember *members;
4739
4740 /*
4741 * We don't need to allow old multixacts here; if that had
4742 * been the case, HeapTupleSatisfiesUpdate would have returned
4743 * MayBeUpdated and we wouldn't be here.
4744 */
4745 nmembers =
4746 GetMultiXactIdMembers(xwait, &members, false,
4748
4749 for (i = 0; i < nmembers; i++)
4750 {
4751 /* only consider members of our own transaction */
4752 if (!TransactionIdIsCurrentTransactionId(members[i].xid))
4753 continue;
4754
4755 if (TUPLOCK_from_mxstatus(members[i].status) >= mode)
4756 {
4757 pfree(members);
4758 result = TM_Ok;
4759 goto out_unlocked;
4760 }
4761 else
4762 {
4763 /*
4764 * Disable acquisition of the heavyweight tuple lock.
4765 * Otherwise, when promoting a weaker lock, we might
4766 * deadlock with another locker that has acquired the
4767 * heavyweight tuple lock and is waiting for our
4768 * transaction to finish.
4769 *
4770 * Note that in this case we still need to wait for
4771 * the multixact if required, to avoid acquiring
4772 * conflicting locks.
4773 */
4774 skip_tuple_lock = true;
4775 }
4776 }
4777
4778 if (members)
4779 pfree(members);
4780 }
4782 {
4783 switch (mode)
4784 {
4785 case LockTupleKeyShare:
4789 result = TM_Ok;
4790 goto out_unlocked;
4791 case LockTupleShare:
4794 {
4795 result = TM_Ok;
4796 goto out_unlocked;
4797 }
4798 break;
4801 {
4802 result = TM_Ok;
4803 goto out_unlocked;
4804 }
4805 break;
4806 case LockTupleExclusive:
4809 {
4810 result = TM_Ok;
4811 goto out_unlocked;
4812 }
4813 break;
4814 }
4815 }
4816 }
4817
4818 /*
4819 * Initially assume that we will have to wait for the locking
4820 * transaction(s) to finish. We check various cases below in which
4821 * this can be turned off.
4822 */
4823 require_sleep = true;
4824 if (mode == LockTupleKeyShare)
4825 {
4826 /*
4827 * If we're requesting KeyShare, and there's no update present, we
4828 * don't need to wait. Even if there is an update, we can still
4829 * continue if the key hasn't been modified.
4830 *
4831 * However, if there are updates, we need to walk the update chain
4832 * to mark future versions of the row as locked, too. That way,
4833 * if somebody deletes that future version, we're protected
4834 * against the key going away. This locking of future versions
4835 * could block momentarily, if a concurrent transaction is
4836 * deleting a key; or it could return a value to the effect that
4837 * the transaction deleting the key has already committed. So we
4838 * do this before re-locking the buffer; otherwise this would be
4839 * prone to deadlocks.
4840 *
4841 * Note that the TID we're locking was grabbed before we unlocked
4842 * the buffer. For it to change while we're not looking, the
4843 * other properties we're testing for below after re-locking the
4844 * buffer would also change, in which case we would restart this
4845 * loop above.
4846 */
4848 {
4849 bool updated;
4850
4852
4853 /*
4854 * If there are updates, follow the update chain; bail out if
4855 * that cannot be done.
4856 */
4857 if (follow_updates && updated &&
4858 !ItemPointerEquals(&tuple->t_self, &t_ctid))
4859 {
4860 TM_Result res;
4861
4862 res = heap_lock_updated_tuple(relation,
4863 infomask, xwait, &t_ctid,
4865 mode);
4866 if (res != TM_Ok)
4867 {
4868 result = res;
4869 /* recovery code expects to have buffer lock held */
4871 goto failed;
4872 }
4873 }
4874
4876
4877 /*
4878 * Make sure it's still an appropriate lock, else start over.
4879 * Also, if it wasn't updated before we released the lock, but
4880 * is updated now, we start over too; the reason is that we
4881 * now need to follow the update chain to lock the new
4882 * versions.
4883 */
4884 if (!HeapTupleHeaderIsOnlyLocked(tuple->t_data) &&
4885 ((tuple->t_data->t_infomask2 & HEAP_KEYS_UPDATED) ||
4886 !updated))
4887 goto l3;
4888
4889 /* Things look okay, so we can skip sleeping */
4890 require_sleep = false;
4891
4892 /*
4893 * Note we allow Xmax to change here; other updaters/lockers
4894 * could have modified it before we grabbed the buffer lock.
4895 * However, this is not a problem, because with the recheck we
4896 * just did we ensure that they still don't conflict with the
4897 * lock we want.
4898 */
4899 }
4900 }
4901 else if (mode == LockTupleShare)
4902 {
4903 /*
4904 * If we're requesting Share, we can similarly avoid sleeping if
4905 * there's no update and no exclusive lock present.
4906 */
4909 {
4911
4912 /*
4913 * Make sure it's still an appropriate lock, else start over.
4914 * See above about allowing xmax to change.
4915 */
4918 goto l3;
4919 require_sleep = false;
4920 }
4921 }
4922 else if (mode == LockTupleNoKeyExclusive)
4923 {
4924 /*
4925 * If we're requesting NoKeyExclusive, we might also be able to
4926 * avoid sleeping; just ensure that there no conflicting lock
4927 * already acquired.
4928 */
4930 {
4932 mode, NULL))
4933 {
4934 /*
4935 * No conflict, but if the xmax changed under us in the
4936 * meantime, start over.
4937 */
4941 xwait))
4942 goto l3;
4943
4944 /* otherwise, we're good */
4945 require_sleep = false;
4946 }
4947 }
4949 {
4951
4952 /* if the xmax changed in the meantime, start over */
4955 xwait))
4956 goto l3;
4957 /* otherwise, we're good */
4958 require_sleep = false;
4959 }
4960 }
4961
4962 /*
4963 * As a check independent from those above, we can also avoid sleeping
4964 * if the current transaction is the sole locker of the tuple. Note
4965 * that the strength of the lock already held is irrelevant; this is
4966 * not about recording the lock in Xmax (which will be done regardless
4967 * of this optimization, below). Also, note that the cases where we
4968 * hold a lock stronger than we are requesting are already handled
4969 * above by not doing anything.
4970 *
4971 * Note we only deal with the non-multixact case here; MultiXactIdWait
4972 * is well equipped to deal with this situation on its own.
4973 */
4976 {
4977 /* ... but if the xmax changed in the meantime, start over */
4981 xwait))
4982 goto l3;
4984 require_sleep = false;
4985 }
4986
4987 /*
4988 * Time to sleep on the other transaction/multixact, if necessary.
4989 *
4990 * If the other transaction is an update/delete that's already
4991 * committed, then sleeping cannot possibly do any good: if we're
4992 * required to sleep, get out to raise an error instead.
4993 *
4994 * By here, we either have already acquired the buffer exclusive lock,
4995 * or we must wait for the locking transaction or multixact; so below
4996 * we ensure that we grab buffer lock after the sleep.
4997 */
4998 if (require_sleep && (result == TM_Updated || result == TM_Deleted))
4999 {
5001 goto failed;
5002 }
5003 else if (require_sleep)
5004 {
5005 /*
5006 * Acquire tuple lock to establish our priority for the tuple, or
5007 * die trying. LockTuple will release us when we are next-in-line
5008 * for the tuple. We must do this even if we are share-locking,
5009 * but not if we already have a weaker lock on the tuple.
5010 *
5011 * If we are forced to "start over" below, we keep the tuple lock;
5012 * this arranges that we stay at the head of the line while
5013 * rechecking tuple state.
5014 */
5015 if (!skip_tuple_lock &&
5016 !heap_acquire_tuplock(relation, tid, mode, wait_policy,
5018 {
5019 /*
5020 * This can only happen if wait_policy is Skip and the lock
5021 * couldn't be obtained.
5022 */
5023 result = TM_WouldBlock;
5024 /* recovery code expects to have buffer lock held */
5026 goto failed;
5027 }
5028
5030 {
5032
5033 /* We only ever lock tuples, never update them */
5034 if (status >= MultiXactStatusNoKeyUpdate)
5035 elog(ERROR, "invalid lock mode in heap_lock_tuple");
5036
5037 /* wait for multixact to end, or die trying */
5038 switch (wait_policy)
5039 {
5040 case LockWaitBlock:
5042 relation, &tuple->t_self, XLTW_Lock, NULL);
5043 break;
5044 case LockWaitSkip:
5046 status, infomask, relation,
5047 NULL, false))
5048 {
5049 result = TM_WouldBlock;
5050 /* recovery code expects to have buffer lock held */
5052 goto failed;
5053 }
5054 break;
5055 case LockWaitError:
5057 status, infomask, relation,
5059 ereport(ERROR,
5061 errmsg("could not obtain lock on row in relation \"%s\"",
5062 RelationGetRelationName(relation))));
5063
5064 break;
5065 }
5066
5067 /*
5068 * Of course, the multixact might not be done here: if we're
5069 * requesting a light lock mode, other transactions with light
5070 * locks could still be alive, as well as locks owned by our
5071 * own xact or other subxacts of this backend. We need to
5072 * preserve the surviving MultiXact members. Note that it
5073 * isn't absolutely necessary in the latter case, but doing so
5074 * is simpler.
5075 */
5076 }
5077 else
5078 {
5079 /* wait for regular transaction to end, or die trying */
5080 switch (wait_policy)
5081 {
5082 case LockWaitBlock:
5083 XactLockTableWait(xwait, relation, &tuple->t_self,
5084 XLTW_Lock);
5085 break;
5086 case LockWaitSkip:
5088 {
5089 result = TM_WouldBlock;
5090 /* recovery code expects to have buffer lock held */
5092 goto failed;
5093 }
5094 break;
5095 case LockWaitError:
5097 ereport(ERROR,
5099 errmsg("could not obtain lock on row in relation \"%s\"",
5100 RelationGetRelationName(relation))));
5101 break;
5102 }
5103 }
5104
5105 /* if there are updates, follow the update chain */
5107 !ItemPointerEquals(&tuple->t_self, &t_ctid))
5108 {
5109 TM_Result res;
5110
5111 res = heap_lock_updated_tuple(relation,
5112 infomask, xwait, &t_ctid,
5114 mode);
5115 if (res != TM_Ok)
5116 {
5117 result = res;
5118 /* recovery code expects to have buffer lock held */
5120 goto failed;
5121 }
5122 }
5123
5125
5126 /*
5127 * xwait is done, but if xwait had just locked the tuple then some
5128 * other xact could update this tuple before we get to this point.
5129 * Check for xmax change, and start over if so.
5130 */
5133 xwait))
5134 goto l3;
5135
5137 {
5138 /*
5139 * Otherwise check if it committed or aborted. Note we cannot
5140 * be here if the tuple was only locked by somebody who didn't
5141 * conflict with us; that would have been handled above. So
5142 * that transaction must necessarily be gone by now. But
5143 * don't check for this in the multixact case, because some
5144 * locker transactions might still be running.
5145 */
5146 UpdateXmaxHintBits(tuple->t_data, *buffer, xwait);
5147 }
5148 }
5149
5150 /* By here, we're certain that we hold buffer exclusive lock again */
5151
5152 /*
5153 * We may lock if previous xmax aborted, or if it committed but only
5154 * locked the tuple without updating it; or if we didn't have to wait
5155 * at all for whatever reason.
5156 */
5157 if (!require_sleep ||
5158 (tuple->t_data->t_infomask & HEAP_XMAX_INVALID) ||
5161 result = TM_Ok;
5162 else if (!ItemPointerEquals(&tuple->t_self, &tuple->t_data->t_ctid))
5163 result = TM_Updated;
5164 else
5165 result = TM_Deleted;
5166 }
5167
5168failed:
5169 if (result != TM_Ok)
5170 {
5171 Assert(result == TM_SelfModified || result == TM_Updated ||
5172 result == TM_Deleted || result == TM_WouldBlock);
5173
5174 /*
5175 * When locking a tuple under LockWaitSkip semantics and we fail with
5176 * TM_WouldBlock above, it's possible for concurrent transactions to
5177 * release the lock and set HEAP_XMAX_INVALID in the meantime. So
5178 * this assert is slightly different from the equivalent one in
5179 * heap_delete and heap_update.
5180 */
5181 Assert((result == TM_WouldBlock) ||
5182 !(tuple->t_data->t_infomask & HEAP_XMAX_INVALID));
5183 Assert(result != TM_Updated ||
5184 !ItemPointerEquals(&tuple->t_self, &tuple->t_data->t_ctid));
5185 tmfd->ctid = tuple->t_data->t_ctid;
5186 tmfd->xmax = HeapTupleHeaderGetUpdateXid(tuple->t_data);
5187 if (result == TM_SelfModified)
5188 tmfd->cmax = HeapTupleHeaderGetCmax(tuple->t_data);
5189 else
5190 tmfd->cmax = InvalidCommandId;
5191 goto out_locked;
5192 }
5193
5194 /*
5195 * If we didn't pin the visibility map page and the page has become all
5196 * visible while we were busy locking the buffer, or during some
5197 * subsequent window during which we had it unlocked, we'll have to unlock
5198 * and re-lock, to avoid holding the buffer lock across I/O. That's a bit
5199 * unfortunate, especially since we'll now have to recheck whether the
5200 * tuple has been locked or updated under us, but hopefully it won't
5201 * happen very often.
5202 */
5203 if (vmbuffer == InvalidBuffer && PageIsAllVisible(page))
5204 {
5206 visibilitymap_pin(relation, block, &vmbuffer);
5208 goto l3;
5209 }
5210
5211 xmax = HeapTupleHeaderGetRawXmax(tuple->t_data);
5212 old_infomask = tuple->t_data->t_infomask;
5213
5214 /*
5215 * If this is the first possibly-multixact-able operation in the current
5216 * transaction, set my per-backend OldestMemberMXactId setting. We can be
5217 * certain that the transaction will never become a member of any older
5218 * MultiXactIds than that. (We have to do this even if we end up just
5219 * using our own TransactionId below, since some other backend could
5220 * incorporate our XID into a MultiXact immediately afterwards.)
5221 */
5223
5224 /*
5225 * Compute the new xmax and infomask to store into the tuple. Note we do
5226 * not modify the tuple just yet, because that would leave it in the wrong
5227 * state if multixact.c elogs.
5228 */
5230 GetCurrentTransactionId(), mode, false,
5231 &xid, &new_infomask, &new_infomask2);
5232
5234
5235 /*
5236 * Store transaction information of xact locking the tuple.
5237 *
5238 * Note: Cmax is meaningless in this context, so don't set it; this avoids
5239 * possibly generating a useless combo CID. Moreover, if we're locking a
5240 * previously updated tuple, it's important to preserve the Cmax.
5241 *
5242 * Also reset the HOT UPDATE bit, but only if there's no update; otherwise
5243 * we would break the HOT chain.
5244 */
5247 tuple->t_data->t_infomask |= new_infomask;
5248 tuple->t_data->t_infomask2 |= new_infomask2;
5251 HeapTupleHeaderSetXmax(tuple->t_data, xid);
5252
5253 /*
5254 * Make sure there is no forward chain link in t_ctid. Note that in the
5255 * cases where the tuple has been updated, we must not overwrite t_ctid,
5256 * because it was set by the updater. Moreover, if the tuple has been
5257 * updated, we need to follow the update chain to lock the new versions of
5258 * the tuple as well.
5259 */
5261 tuple->t_data->t_ctid = *tid;
5262
5263 /* Clear only the all-frozen bit on visibility map if needed */
5264 if (PageIsAllVisible(page) &&
5265 visibilitymap_clear(relation, block, vmbuffer,
5267 cleared_all_frozen = true;
5268
5269
5270 MarkBufferDirty(*buffer);
5271
5272 /*
5273 * XLOG stuff. You might think that we don't need an XLOG record because
5274 * there is no state change worth restoring after a crash. You would be
5275 * wrong however: we have just written either a TransactionId or a
5276 * MultiXactId that may never have been seen on disk before, and we need
5277 * to make sure that there are XLOG entries covering those ID numbers.
5278 * Else the same IDs might be re-used after a crash, which would be
5279 * disastrous if this page made it to disk before the crash. Essentially
5280 * we have to enforce the WAL log-before-data rule even in this case.
5281 * (Also, in a PITR log-shipping or 2PC environment, we have to have XLOG
5282 * entries for everything anyway.)
5283 */
5284 if (RelationNeedsWAL(relation))
5285 {
5288
5291
5292 xlrec.offnum = ItemPointerGetOffsetNumber(&tuple->t_self);
5293 xlrec.xmax = xid;
5294 xlrec.infobits_set = compute_infobits(new_infomask,
5295 tuple->t_data->t_infomask2);
5298
5299 /* we don't decode row locks atm, so no need to log the origin */
5300
5302
5303 PageSetLSN(page, recptr);
5304 }
5305
5307
5308 result = TM_Ok;
5309
5312
5314 if (BufferIsValid(vmbuffer))
5315 ReleaseBuffer(vmbuffer);
5316
5317 /*
5318 * Don't update the visibility map here. Locking a tuple doesn't change
5319 * visibility info.
5320 */
5321
5322 /*
5323 * Now that we have successfully marked the tuple as locked, we can
5324 * release the lmgr tuple lock, if we had it.
5325 */
5326 if (have_tuple_lock)
5327 UnlockTupleTuplock(relation, tid, mode);
5328
5329 return result;
5330}

References Assert, BUFFER_LOCK_EXCLUSIVE, BUFFER_LOCK_UNLOCK, BufferGetPage(), BufferIsValid(), TM_FailureData::cmax, compute_infobits(), compute_new_xmax_infomask(), ConditionalMultiXactIdWait(), ConditionalXactLockTableWait(), TM_FailureData::ctid, DoesMultiXactIdConflict(), elog, END_CRIT_SECTION, ereport, errcode(), errmsg(), ERROR, fb(), get_mxact_status_for_lock(), GetCurrentTransactionId(), GetMultiXactIdMembers(), heap_acquire_tuplock(), HEAP_KEYS_UPDATED, heap_lock_updated_tuple(), HEAP_XMAX_INVALID, HEAP_XMAX_IS_EXCL_LOCKED(), HEAP_XMAX_IS_KEYSHR_LOCKED(), HEAP_XMAX_IS_LOCKED_ONLY(), HEAP_XMAX_IS_MULTI, HEAP_XMAX_IS_SHR_LOCKED(), HeapTupleHeaderClearHotUpdated(), HeapTupleHeaderGetCmax(), HeapTupleHeaderGetRawXmax(), HeapTupleHeaderGetUpdateXid(), HeapTupleHeaderIsOnlyLocked(), HeapTupleHeaderSetXmax(), HeapTupleSatisfiesUpdate(), i, InvalidBuffer, InvalidCommandId, ItemIdGetLength, ItemIdIsNormal, ItemPointerCopy(), ItemPointerEquals(), ItemPointerGetBlockNumber(), ItemPointerGetOffsetNumber(), LockBuffer(), LockTupleExclusive, LockTupleKeyShare, LockTupleNoKeyExclusive, LockTupleShare, LockWaitBlock, LockWaitError, LockWaitSkip, log_lock_failures, MarkBufferDirty(), mode, MultiXactIdSetOldestMember(), MultiXactIdWait(), MultiXactStatusNoKeyUpdate, PageGetItem(), PageGetItemId(), PageIsAllVisible(), PageSetLSN(), pfree(), ReadBuffer(), REGBUF_STANDARD, RelationGetRelationName, RelationGetRelid, RelationNeedsWAL, ReleaseBuffer(), SizeOfHeapLock, START_CRIT_SECTION, HeapTupleHeaderData::t_ctid, HeapTupleData::t_data, HeapTupleHeaderData::t_infomask, HeapTupleHeaderData::t_infomask2, HeapTupleData::t_len, HeapTupleData::t_self, HeapTupleData::t_tableOid, TM_BeingModified, TM_Deleted, TM_Invisible, TM_Ok, TM_SelfModified, TM_Updated, TM_WouldBlock, TransactionIdEquals, TransactionIdIsCurrentTransactionId(), TUPLOCK_from_mxstatus, UnlockTupleTuplock, UpdateXmaxHintBits(), VISIBILITYMAP_ALL_FROZEN, visibilitymap_clear(), visibilitymap_pin(), XactLockTableWait(), XLH_LOCK_ALL_FROZEN_CLEARED, XLOG_HEAP_LOCK, XLogBeginInsert(), XLogInsert(), XLogRegisterBuffer(), XLogRegisterData(), XLTW_Lock, TM_FailureData::xmax, and xmax_infomask_changed().

Referenced by heapam_tuple_lock().
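
A hedged sketch of driving heap_lock_tuple() directly (hypothetical wrapper; heapam_tuple_lock() additionally follows the update chain when TM_Updated is returned, which is omitted here). Note that the function returns with *buffer pinned but not locked, so the caller must release it:

#include "postgres.h"
#include "access/heapam.h"
#include "access/tableam.h"
#include "storage/bufmgr.h"

/* Hypothetical wrapper: lock the row at *tid FOR UPDATE, waiting if needed. */
static TM_Result
lock_row_for_update(Relation rel, ItemPointer tid, CommandId cid,
                    TM_FailureData *tmfd)
{
    HeapTupleData tuple;
    Buffer      buffer;
    TM_Result   result;

    tuple.t_self = *tid;
    result = heap_lock_tuple(rel, &tuple, cid,
                             LockTupleExclusive, LockWaitBlock,
                             true,      /* follow_updates */
                             &buffer, tmfd);
    ReleaseBuffer(buffer);      /* pinned, not locked, on return */

    return result;
}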

◆ heap_lock_updated_tuple()

static TM_Result heap_lock_updated_tuple ( Relation  rel,
uint16  prior_infomask,
TransactionId  prior_raw_xmax,
const ItemPointerData *  prior_ctid,
TransactionId  xid,
LockTupleMode  mode 
)
static

Definition at line 6114 of file heapam.c.

6119{
6120 INJECTION_POINT("heap_lock_updated_tuple", NULL);
6121
6122 /*
6123 * If the tuple has moved into another partition (effectively a delete)
6124 * stop here.
6125 */
6127 {
6129
6130 /*
6131 * If this is the first possibly-multixact-able operation in the
6132 * current transaction, set my per-backend OldestMemberMXactId
6133 * setting. We can be certain that the transaction will never become a
6134 * member of any older MultiXactIds than that. (We have to do this
6135 * even if we end up just using our own TransactionId below, since
6136 * some other backend could incorporate our XID into a MultiXact
6137 * immediately afterwards.)
6138 */
6140
6144 }
6145
6146 /* nothing to lock */
6147 return TM_Ok;
6148}

References fb(), heap_lock_updated_tuple_rec(), HEAP_XMAX_IS_MULTI, INJECTION_POINT, ItemPointerIndicatesMovedPartitions(), mode, MultiXactIdGetUpdateXid(), MultiXactIdSetOldestMember(), and TM_Ok.

Referenced by heap_lock_tuple().
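
In outline, and with some lines elided in the listing above, the dispatch works as follows; a hedged sketch as it would appear inside heapam.c (the helpers are file-static, so this is illustrative only):

/* Hedged outline of heap_lock_updated_tuple()'s dispatch, not the actual code. */
static TM_Result
lock_updated_tuple_outline(Relation rel, uint16 prior_infomask,
                           TransactionId prior_raw_xmax,
                           const ItemPointerData *prior_ctid,
                           TransactionId xid, LockTupleMode mode)
{
    if (ItemPointerIndicatesMovedPartitions(prior_ctid))
        return TM_Ok;           /* effectively a delete: nothing to lock */

    /* Record our oldest-member MultiXact point before doing anything else */
    MultiXactIdSetOldestMember();

    /* A MultiXact xmax hides the real updater XID; dig it out first */
    if (prior_infomask & HEAP_XMAX_IS_MULTI)
        prior_raw_xmax = MultiXactIdGetUpdateXid(prior_raw_xmax, prior_infomask);

    /* Walk and lock the remainder of the update chain */
    return heap_lock_updated_tuple_rec(rel, prior_raw_xmax, prior_ctid,
                                       xid, mode);
}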

◆ heap_lock_updated_tuple_rec()

static TM_Result heap_lock_updated_tuple_rec ( Relation  rel,
TransactionId  priorXmax,
const ItemPointerData *  tid,
TransactionId  xid,
LockTupleMode  mode 
)
static

Definition at line 5766 of file heapam.c.

5769{
5770 TM_Result result;
5773 Buffer buf;
5778 TransactionId xmax,
5779 new_xmax;
5780 bool cleared_all_frozen = false;
5782 Buffer vmbuffer = InvalidBuffer;
5783 BlockNumber block;
5784
5785 ItemPointerCopy(tid, &tupid);
5786
5787 for (;;)
5788 {
5789 new_infomask = 0;
5790 new_xmax = InvalidTransactionId;
5792 ItemPointerCopy(&tupid, &(mytup.t_self));
5793
5794 if (!heap_fetch(rel, SnapshotAny, &mytup, &buf, false))
5795 {
5796 /*
5797 * if we fail to find the updated version of the tuple, it's
5798 * because it was vacuumed/pruned away after its creator
5799 * transaction aborted. So behave as if we got to the end of the
5800 * chain, and there's no further tuple to lock: return success to
5801 * caller.
5802 */
5803 result = TM_Ok;
5804 goto out_unlocked;
5805 }
5806
5807l4:
5809
5810 /*
5811 * Before locking the buffer, pin the visibility map page if it
5812 * appears to be necessary. Since we haven't got the lock yet,
5813 * someone else might be in the middle of changing this, so we'll need
5814 * to recheck after we have the lock.
5815 */
5817 {
5818 visibilitymap_pin(rel, block, &vmbuffer);
5819 pinned_desired_page = true;
5820 }
5821 else
5822 pinned_desired_page = false;
5823
5825
5826 /*
5827 * If we didn't pin the visibility map page and the page has become
5828 * all visible while we were busy locking the buffer, we'll have to
5829 * unlock and re-lock, to avoid holding the buffer lock across I/O.
5830 * That's a bit unfortunate, but hopefully shouldn't happen often.
5831 *
5832 * Note: in some paths through this function, we will reach here
5833 * holding a pin on a vm page that may or may not be the one matching
5834 * this page. If this page isn't all-visible, we won't use the vm
5835 * page, but we hold onto such a pin till the end of the function.
5836 */
5838 {
5840 visibilitymap_pin(rel, block, &vmbuffer);
5842 }
5843
5844 /*
5845 * Check the tuple XMIN against prior XMAX, if any. If we reached the
5846 * end of the chain, we're done, so return success.
5847 */
5850 priorXmax))
5851 {
5852 result = TM_Ok;
5853 goto out_locked;
5854 }
5855
5856 /*
5857 * Also check Xmin: if this tuple was created by an aborted
5858 * (sub)transaction, then we already locked the last live one in the
5859 * chain, thus we're done, so return success.
5860 */
5862 {
5863 result = TM_Ok;
5864 goto out_locked;
5865 }
5866
5867 old_infomask = mytup.t_data->t_infomask;
5868 old_infomask2 = mytup.t_data->t_infomask2;
5869 xmax = HeapTupleHeaderGetRawXmax(mytup.t_data);
5870
5871 /*
5872 * If this tuple version has been updated or locked by some concurrent
5873 * transaction(s), what we do depends on whether our lock mode
5874 * conflicts with what those other transactions hold, and also on the
5875 * status of them.
5876 */
5878 {
5880 bool needwait;
5881
5884 {
5885 int nmembers;
5886 int i;
5887 MultiXactMember *members;
5888
5889 /*
5890 * We don't need a test for pg_upgrade'd tuples: this is only
5891 * applied to tuples after the first in an update chain. Said
5892 * first tuple in the chain may well be locked-in-9.2-and-
5893 * pg_upgraded, but that one was already locked by our caller,
5894 * not us; and any subsequent ones cannot be because our
5895 * caller must necessarily have obtained a snapshot later than
5896 * the pg_upgrade itself.
5897 */
5898 Assert(!HEAP_LOCKED_UPGRADED(mytup.t_data->t_infomask));
5899
5900 nmembers = GetMultiXactIdMembers(rawxmax, &members, false,
5902 for (i = 0; i < nmembers; i++)
5903 {
5904 result = test_lockmode_for_conflict(members[i].status,
5905 members[i].xid,
5906 mode,
5907 &mytup,
5908 &needwait);
5909
5910 /*
5911 * If the tuple was already locked by ourselves in a
5912 * previous iteration of this (say heap_lock_tuple was
5913 * forced to restart the locking loop because of a change
5914 * in xmax), then we hold the lock already on this tuple
5915 * version and we don't need to do anything; and this is
5916 * not an error condition either. We just need to skip
5917 * this tuple and continue locking the next version in the
5918 * update chain.
5919 */
5920 if (result == TM_SelfModified)
5921 {
5922 pfree(members);
5923 goto next;
5924 }
5925
5926 if (needwait)
5927 {
5929 XactLockTableWait(members[i].xid, rel,
5930 &mytup.t_self,
5932 pfree(members);
5933 goto l4;
5934 }
5935 if (result != TM_Ok)
5936 {
5937 pfree(members);
5938 goto out_locked;
5939 }
5940 }
5941 if (members)
5942 pfree(members);
5943 }
5944 else
5945 {
5946 MultiXactStatus status;
5947
5948 /*
5949 * For a non-multi Xmax, we first need to compute the
5950 * corresponding MultiXactStatus by using the infomask bits.
5951 */
5953 {
5957 status = MultiXactStatusForShare;
5959 {
5961 status = MultiXactStatusForUpdate;
5962 else
5964 }
5965 else
5966 {
5967 /*
5968 * LOCK_ONLY present alone (a pg_upgraded tuple marked
5969 * as share-locked in the old cluster) shouldn't be
5970 * seen in the middle of an update chain.
5971 */
5972 elog(ERROR, "invalid lock status in tuple");
5973 }
5974 }
5975 else
5976 {
5977 /* it's an update, but which kind? */
5979 status = MultiXactStatusUpdate;
5980 else
5982 }
5983
5984 result = test_lockmode_for_conflict(status, rawxmax, mode,
5985 &mytup, &needwait);
5986
5987 /*
5988 * If the tuple was already locked by ourselves in a previous
5989 * iteration of this (say heap_lock_tuple was forced to
5990 * restart the locking loop because of a change in xmax), then
5991 * we hold the lock already on this tuple version and we don't
5992 * need to do anything; and this is not an error condition
5993 * either. We just need to skip this tuple and continue
5994 * locking the next version in the update chain.
5995 */
5996 if (result == TM_SelfModified)
5997 goto next;
5998
5999 if (needwait)
6000 {
6002 XactLockTableWait(rawxmax, rel, &mytup.t_self,
6004 goto l4;
6005 }
6006 if (result != TM_Ok)
6007 {
6008 goto out_locked;
6009 }
6010 }
6011 }
6012
6013 /* compute the new Xmax and infomask values for the tuple ... */
6014 compute_new_xmax_infomask(xmax, old_infomask, mytup.t_data->t_infomask2,
6015 xid, mode, false,
6016 &new_xmax, &new_infomask, &new_infomask2);
6017
6019 visibilitymap_clear(rel, block, vmbuffer,
6021 cleared_all_frozen = true;
6022
6024
6025 /* ... and set them */
6026 HeapTupleHeaderSetXmax(mytup.t_data, new_xmax);
6027 mytup.t_data->t_infomask &= ~HEAP_XMAX_BITS;
6028 mytup.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
6029 mytup.t_data->t_infomask |= new_infomask;
6030 mytup.t_data->t_infomask2 |= new_infomask2;
6031
6033
6034 /* XLOG stuff */
6035 if (RelationNeedsWAL(rel))
6036 {
6039 Page page = BufferGetPage(buf);
6040
6043
6044 xlrec.offnum = ItemPointerGetOffsetNumber(&mytup.t_self);
6045 xlrec.xmax = new_xmax;
6047 xlrec.flags =
6049
6051
6053
6054 PageSetLSN(page, recptr);
6055 }
6056
6058
6059next:
 6060 /* if we find the end of the update chain, we're done. */
6061 if (mytup.t_data->t_infomask & HEAP_XMAX_INVALID ||
6063 ItemPointerEquals(&mytup.t_self, &mytup.t_data->t_ctid) ||
6065 {
6066 result = TM_Ok;
6067 goto out_locked;
6068 }
6069
6070 /* tail recursion */
6072 ItemPointerCopy(&(mytup.t_data->t_ctid), &tupid);
6074 }
6075
6076 result = TM_Ok;
6077
6080
6082 if (vmbuffer != InvalidBuffer)
6083 ReleaseBuffer(vmbuffer);
6084
6085 return result;
6086}

References Assert, buf, BUFFER_LOCK_EXCLUSIVE, BUFFER_LOCK_UNLOCK, BufferGetPage(), CHECK_FOR_INTERRUPTS, compute_infobits(), compute_new_xmax_infomask(), elog, END_CRIT_SECTION, ERROR, fb(), GetMultiXactIdMembers(), heap_fetch(), HEAP_KEYS_UPDATED, HEAP_LOCKED_UPGRADED(), HEAP_XMAX_INVALID, HEAP_XMAX_IS_EXCL_LOCKED(), HEAP_XMAX_IS_KEYSHR_LOCKED(), HEAP_XMAX_IS_LOCKED_ONLY(), HEAP_XMAX_IS_MULTI, HEAP_XMAX_IS_SHR_LOCKED(), HeapTupleHeaderGetRawXmax(), HeapTupleHeaderGetUpdateXid(), HeapTupleHeaderGetXmin(), HeapTupleHeaderIndicatesMovedPartitions(), HeapTupleHeaderIsOnlyLocked(), HeapTupleHeaderSetXmax(), i, InvalidBuffer, InvalidTransactionId, ItemPointerCopy(), ItemPointerEquals(), ItemPointerGetBlockNumber(), ItemPointerGetOffsetNumber(), LockBuffer(), MarkBufferDirty(), mode, MultiXactStatusForKeyShare, MultiXactStatusForNoKeyUpdate, MultiXactStatusForShare, MultiXactStatusForUpdate, MultiXactStatusNoKeyUpdate, MultiXactStatusUpdate, next, PageIsAllVisible(), PageSetLSN(), pfree(), REGBUF_STANDARD, RelationNeedsWAL, ReleaseBuffer(), SizeOfHeapLockUpdated, SnapshotAny, START_CRIT_SECTION, test_lockmode_for_conflict(), TM_Ok, TM_SelfModified, TransactionIdDidAbort(), TransactionIdEquals, TransactionIdIsValid, UnlockReleaseBuffer(), VISIBILITYMAP_ALL_FROZEN, visibilitymap_clear(), visibilitymap_pin(), XactLockTableWait(), XLH_LOCK_ALL_FROZEN_CLEARED, XLOG_HEAP2_LOCK_UPDATED, XLogBeginInsert(), XLogInsert(), XLogRegisterBuffer(), XLogRegisterData(), and XLTW_LockUpdated.

Referenced by heap_lock_updated_tuple().
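
The routine above follows an update chain: it fetches the tuple named by the previous version's t_ctid, locks it, and repeats until the chain ends (invalid xmax, a t_ctid that points back at the tuple itself, or a moved-partition marker). The standalone sketch below models only that termination logic with invented types; TupleVersion and its fields are illustrative stand-ins, not PostgreSQL structures.

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical, simplified stand-ins for heap tuple versions. */
typedef struct TupleVersion
{
    int  self;          /* index of this version ("t_self") */
    int  ctid;          /* index of the next version ("t_ctid") */
    bool xmax_invalid;  /* no updater: end of chain ("HEAP_XMAX_INVALID") */
} TupleVersion;

/* Follow the chain starting at 'start', returning the last version reached. */
static int
walk_update_chain(const TupleVersion *versions, int start)
{
    int cur = start;

    for (;;)
    {
        const TupleVersion *v = &versions[cur];

        /* Same termination tests as the real code: invalid xmax or self-link. */
        if (v->xmax_invalid || v->ctid == v->self)
            return cur;
        cur = v->ctid;      /* "tail recursion": move to the next version */
    }
}

int
main(void)
{
    /* v0 was updated to v1, which was updated to v2 (live, end of chain). */
    TupleVersion chain[] = {
        {0, 1, false},
        {1, 2, false},
        {2, 2, true},
    };

    printf("chain ends at version %d\n", walk_update_chain(chain, 0));
    return 0;
}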

◆ heap_multi_insert()

void heap_multi_insert ( Relation  relation,
TupleTableSlot **  slots,
int  ntuples,
CommandId  cid,
int  options,
BulkInsertState  bistate 
)

Definition at line 2412 of file heapam.c.

2414{
2417 int i;
2418 int ndone;
2420 Page page;
2421 Buffer vmbuffer = InvalidBuffer;
2422 bool needwal;
2426 bool starting_with_empty_page = false;
2427 int npages = 0;
2428 int npages_used = 0;
2429
2430 /* currently not needed (thus unsupported) for heap_multi_insert() */
2432
2433 AssertHasSnapshotForToast(relation);
2434
2435 needwal = RelationNeedsWAL(relation);
2438
2439 /* Toast and set header data in all the slots */
2440 heaptuples = palloc(ntuples * sizeof(HeapTuple));
2441 for (i = 0; i < ntuples; i++)
2442 {
2443 HeapTuple tuple;
2444
2445 tuple = ExecFetchSlotHeapTuple(slots[i], true, NULL);
2446 slots[i]->tts_tableOid = RelationGetRelid(relation);
2447 tuple->t_tableOid = slots[i]->tts_tableOid;
2448 heaptuples[i] = heap_prepare_insert(relation, tuple, xid, cid,
2449 options);
2450 }
2451
2452 /*
2453 * We're about to do the actual inserts -- but check for conflict first,
2454 * to minimize the possibility of having to roll back work we've just
2455 * done.
2456 *
2457 * A check here does not definitively prevent a serialization anomaly;
2458 * that check MUST be done at least past the point of acquiring an
2459 * exclusive buffer content lock on every buffer that will be affected,
2460 * and MAY be done after all inserts are reflected in the buffers and
2461 * those locks are released; otherwise there is a race condition. Since
2462 * multiple buffers can be locked and unlocked in the loop below, and it
2463 * would not be feasible to identify and lock all of those buffers before
2464 * the loop, we must do a final check at the end.
2465 *
2466 * The check here could be omitted with no loss of correctness; it is
2467 * present strictly as an optimization.
2468 *
2469 * For heap inserts, we only need to check for table-level SSI locks. Our
2470 * new tuples can't possibly conflict with existing tuple locks, and heap
2471 * page locks are only consolidated versions of tuple locks; they do not
2472 * lock "gaps" as index page locks do. So we don't need to specify a
2473 * buffer when making the call, which makes for a faster check.
2474 */
2476
2477 ndone = 0;
2478 while (ndone < ntuples)
2479 {
2480 Buffer buffer;
2481 bool all_visible_cleared = false;
2482 bool all_frozen_set = false;
2483 int nthispage;
2484
2486
2487 /*
2488 * Compute number of pages needed to fit the to-be-inserted tuples in
2489 * the worst case. This will be used to determine how much to extend
2490 * the relation by in RelationGetBufferForTuple(), if needed. If we
2491 * filled a prior page from scratch, we can just update our last
2492 * computation, but if we started with a partially filled page,
 2493 * recompute from scratch, because the number of potentially required
 2494 * pages can vary due to tuples needing to fit onto the page, page
 2495 * headers, etc.
2496 */
2497 if (ndone == 0 || !starting_with_empty_page)
2498 {
2499 npages = heap_multi_insert_pages(heaptuples, ndone, ntuples,
2501 npages_used = 0;
2502 }
2503 else
2504 npages_used++;
2505
2506 /*
2507 * Find buffer where at least the next tuple will fit. If the page is
2508 * all-visible, this will also pin the requisite visibility map page.
2509 *
2510 * Also pin visibility map page if COPY FREEZE inserts tuples into an
2511 * empty page. See all_frozen_set below.
2512 */
2513 buffer = RelationGetBufferForTuple(relation, heaptuples[ndone]->t_len,
2514 InvalidBuffer, options, bistate,
2515 &vmbuffer, NULL,
2516 npages - npages_used);
2517 page = BufferGetPage(buffer);
2518
2520
2522 {
2523 all_frozen_set = true;
2524 /* Lock the vmbuffer before entering the critical section */
2526 }
2527
2528 /* NO EREPORT(ERROR) from here till changes are logged */
2530
2531 /*
2532 * RelationGetBufferForTuple has ensured that the first tuple fits.
2533 * Put that on the page, and then as many other tuples as fit.
2534 */
2535 RelationPutHeapTuple(relation, buffer, heaptuples[ndone], false);
2536
2537 /*
2538 * For logical decoding we need combo CIDs to properly decode the
2539 * catalog.
2540 */
2541 if (needwal && need_cids)
2542 log_heap_new_cid(relation, heaptuples[ndone]);
2543
2544 for (nthispage = 1; ndone + nthispage < ntuples; nthispage++)
2545 {
2547
2548 if (PageGetHeapFreeSpace(page) < MAXALIGN(heaptup->t_len) + saveFreeSpace)
2549 break;
2550
2551 RelationPutHeapTuple(relation, buffer, heaptup, false);
2552
2553 /*
2554 * For logical decoding we need combo CIDs to properly decode the
2555 * catalog.
2556 */
2557 if (needwal && need_cids)
2558 log_heap_new_cid(relation, heaptup);
2559 }
2560
2561 /*
 2562 * If the page is all visible, we need to clear that, unless we're only
2563 * going to add further frozen rows to it.
2564 *
2565 * If we're only adding already frozen rows to a previously empty
2566 * page, mark it as all-frozen and update the visibility map. We're
2567 * already holding a pin on the vmbuffer.
2568 */
2570 {
2571 all_visible_cleared = true;
2572 PageClearAllVisible(page);
2573 visibilitymap_clear(relation,
2574 BufferGetBlockNumber(buffer),
2575 vmbuffer, VISIBILITYMAP_VALID_BITS);
2576 }
2577 else if (all_frozen_set)
2578 {
2579 PageSetAllVisible(page);
2581 vmbuffer,
2584 relation->rd_locator);
2585 }
2586
2587 /*
2588 * XXX Should we set PageSetPrunable on this page ? See heap_insert()
2589 */
2590
2591 MarkBufferDirty(buffer);
2592
2593 /* XLOG stuff */
2594 if (needwal)
2595 {
2599 char *tupledata;
2600 int totaldatalen;
2601 char *scratchptr = scratch.data;
2602 bool init;
2603 int bufflags = 0;
2604
2605 /*
2606 * If the page was previously empty, we can reinit the page
2607 * instead of restoring the whole thing.
2608 */
2610
2611 /* allocate xl_heap_multi_insert struct from the scratch area */
2614
2615 /*
 2616 * Allocate the offsets array, unless we're reinitializing the page;
 2617 * in that case the tuples are stored in order starting at
 2618 * FirstOffsetNumber, so we don't need to store the offsets
 2619 * explicitly.
2620 */
2621 if (!init)
2622 scratchptr += nthispage * sizeof(OffsetNumber);
2623
2624 /* the rest of the scratch space is used for tuple data */
2625 tupledata = scratchptr;
2626
2627 /* check that the mutually exclusive flags are not both set */
2629
2630 xlrec->flags = 0;
2633
2634 /*
2635 * We don't have to worry about including a conflict xid in the
2636 * WAL record, as HEAP_INSERT_FROZEN intentionally violates
2637 * visibility rules.
2638 */
2639 if (all_frozen_set)
2641
2642 xlrec->ntuples = nthispage;
2643
2644 /*
2645 * Write out an xl_multi_insert_tuple and the tuple data itself
2646 * for each tuple.
2647 */
2648 for (i = 0; i < nthispage; i++)
2649 {
2651 xl_multi_insert_tuple *tuphdr;
2652 int datalen;
2653
2654 if (!init)
2655 xlrec->offsets[i] = ItemPointerGetOffsetNumber(&heaptup->t_self);
2656 /* xl_multi_insert_tuple needs two-byte alignment. */
2658 scratchptr = ((char *) tuphdr) + SizeOfMultiInsertTuple;
2659
2660 tuphdr->t_infomask2 = heaptup->t_data->t_infomask2;
2661 tuphdr->t_infomask = heaptup->t_data->t_infomask;
2662 tuphdr->t_hoff = heaptup->t_data->t_hoff;
2663
2664 /* write bitmap [+ padding] [+ oid] + data */
2665 datalen = heaptup->t_len - SizeofHeapTupleHeader;
2667 (char *) heaptup->t_data + SizeofHeapTupleHeader,
2668 datalen);
2669 tuphdr->datalen = datalen;
2670 scratchptr += datalen;
2671 }
2672 totaldatalen = scratchptr - tupledata;
2673 Assert((scratchptr - scratch.data) < BLCKSZ);
2674
2675 if (need_tuple_data)
2677
2678 /*
2679 * Signal that this is the last xl_heap_multi_insert record
2680 * emitted by this call to heap_multi_insert(). Needed for logical
2681 * decoding so it knows when to cleanup temporary data.
2682 */
2683 if (ndone + nthispage == ntuples)
2685
2686 if (init)
2687 {
2688 info |= XLOG_HEAP_INIT_PAGE;
2690 }
2691
2692 /*
2693 * If we're doing logical decoding, include the new tuple data
2694 * even if we take a full-page image of the page.
2695 */
2696 if (need_tuple_data)
2698
2700 XLogRegisterData(xlrec, tupledata - scratch.data);
2702 if (all_frozen_set)
2703 XLogRegisterBuffer(1, vmbuffer, 0);
2704
2705 XLogRegisterBufData(0, tupledata, totaldatalen);
2706
2707 /* filtering by origin on a row level is much more efficient */
2709
2710 recptr = XLogInsert(RM_HEAP2_ID, info);
2711
2712 PageSetLSN(page, recptr);
2713 if (all_frozen_set)
2714 {
2715 Assert(BufferIsDirty(vmbuffer));
2716 PageSetLSN(BufferGetPage(vmbuffer), recptr);
2717 }
2718 }
2719
2721
2722 if (all_frozen_set)
2723 LockBuffer(vmbuffer, BUFFER_LOCK_UNLOCK);
2724
2725 UnlockReleaseBuffer(buffer);
2726 ndone += nthispage;
2727
2728 /*
 2729 * NB: Only release vmbuffer after inserting all tuples - it's fairly
 2730 * likely that subsequent heap pages we insert into will use the same
 2731 * vm page.
2732 */
2733 }
2734
2735 /* We're done with inserting all tuples, so release the last vmbuffer. */
2736 if (vmbuffer != InvalidBuffer)
2737 ReleaseBuffer(vmbuffer);
2738
2739 /*
2740 * We're done with the actual inserts. Check for conflicts again, to
2741 * ensure that all rw-conflicts in to these inserts are detected. Without
2742 * this final check, a sequential scan of the heap may have locked the
2743 * table after the "before" check, missing one opportunity to detect the
2744 * conflict, and then scanned the table before the new tuples were there,
2745 * missing the other chance to detect the conflict.
2746 *
2747 * For heap inserts, we only need to check for table-level SSI locks. Our
2748 * new tuples can't possibly conflict with existing tuple locks, and heap
2749 * page locks are only consolidated versions of tuple locks; they do not
2750 * lock "gaps" as index page locks do. So we don't need to specify a
2751 * buffer when making the call.
2752 */
2754
2755 /*
2756 * If tuples are cacheable, mark them for invalidation from the caches in
2757 * case we abort. Note it is OK to do this after releasing the buffer,
2758 * because the heaptuples data structure is all in local memory, not in
2759 * the shared buffer.
2760 */
2761 if (IsCatalogRelation(relation))
2762 {
2763 for (i = 0; i < ntuples; i++)
2765 }
2766
2767 /* copy t_self fields back to the caller's slots */
2768 for (i = 0; i < ntuples; i++)
2769 slots[i]->tts_tid = heaptuples[i]->t_self;
2770
2771 pgstat_count_heap_insert(relation, ntuples);
2772}

References Assert, AssertHasSnapshotForToast(), BUFFER_LOCK_EXCLUSIVE, BUFFER_LOCK_UNLOCK, BufferGetBlockNumber(), BufferGetPage(), BufferIsDirty(), CacheInvalidateHeapTuple(), CHECK_FOR_INTERRUPTS, CheckForSerializableConflictIn(), xl_multi_insert_tuple::datalen, END_CRIT_SECTION, ExecFetchSlotHeapTuple(), fb(), GetCurrentTransactionId(), HEAP_DEFAULT_FILLFACTOR, HEAP_INSERT_FROZEN, HEAP_INSERT_NO_LOGICAL, heap_multi_insert_pages(), heap_prepare_insert(), i, init, InvalidBlockNumber, InvalidBuffer, IsCatalogRelation(), ItemPointerGetOffsetNumber(), LockBuffer(), log_heap_new_cid(), MarkBufferDirty(), MAXALIGN, PageClearAllVisible(), PageGetHeapFreeSpace(), PageGetMaxOffsetNumber(), PageIsAllVisible(), PageSetAllVisible(), PageSetLSN(), palloc(), pgstat_count_heap_insert(), RelationData::rd_locator, REGBUF_KEEP_DATA, REGBUF_STANDARD, REGBUF_WILL_INIT, RelationGetBufferForTuple(), RelationGetRelid, RelationGetTargetPageFreeSpace, RelationIsAccessibleInLogicalDecoding, RelationIsLogicallyLogged, RelationNeedsWAL, RelationPutHeapTuple(), ReleaseBuffer(), SHORTALIGN, SizeOfHeapMultiInsert, SizeofHeapTupleHeader, SizeOfMultiInsertTuple, START_CRIT_SECTION, xl_multi_insert_tuple::t_hoff, xl_multi_insert_tuple::t_infomask, xl_multi_insert_tuple::t_infomask2, HeapTupleData::t_tableOid, TupleTableSlot::tts_tableOid, UnlockReleaseBuffer(), VISIBILITYMAP_ALL_FROZEN, VISIBILITYMAP_ALL_VISIBLE, visibilitymap_clear(), visibilitymap_set_vmbits(), VISIBILITYMAP_VALID_BITS, XLH_INSERT_ALL_FROZEN_SET, XLH_INSERT_ALL_VISIBLE_CLEARED, XLH_INSERT_CONTAINS_NEW_TUPLE, XLH_INSERT_LAST_IN_MULTI, XLOG_HEAP2_MULTI_INSERT, XLOG_HEAP_INIT_PAGE, XLOG_INCLUDE_ORIGIN, XLogBeginInsert(), XLogInsert(), XLogRegisterBufData(), XLogRegisterBuffer(), XLogRegisterData(), and XLogSetRecordFlags().

Referenced by CatalogTuplesMultiInsertWithInfo().
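
A hedged sketch of how a caller might drive heap_multi_insert() for a batch of rows, loosely following the pattern used by COPY and by the catalog multi-insert path referenced above. Everything outside the call itself is an assumption for illustration: the relation is taken to have a single int4 column, index maintenance, triggers, and error handling are ignored, and the helper name multi_insert_ints is invented.

#include "postgres.h"

#include "access/heapam.h"
#include "access/tableam.h"
#include "access/xact.h"
#include "executor/tuptable.h"
#include "utils/rel.h"

/* Insert 'nrows' single-int4 rows into 'rel' in one batch (sketch only). */
static void
multi_insert_ints(Relation rel, int nrows)
{
    TupleTableSlot **slots = palloc(nrows * sizeof(TupleTableSlot *));
    BulkInsertState bistate = GetBulkInsertState();
    CommandId   cid = GetCurrentCommandId(true);

    for (int i = 0; i < nrows; i++)
    {
        slots[i] = table_slot_create(rel, NULL);
        slots[i]->tts_values[0] = Int32GetDatum(i);
        slots[i]->tts_isnull[0] = false;
        ExecStoreVirtualTuple(slots[i]);
    }

    /* One call covers the whole batch; WAL is emitted per filled page. */
    heap_multi_insert(rel, slots, nrows, cid, 0 /* options */, bistate);

    FreeBulkInsertState(bistate);
    for (int i = 0; i < nrows; i++)
        ExecDropSingleTupleTableSlot(slots[i]);
    pfree(slots);
}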

◆ heap_multi_insert_pages()

static int heap_multi_insert_pages ( HeapTuple heaptuples,
int  done,
int  ntuples,
Size  saveFreeSpace 
)
static

Definition at line 2380 of file heapam.c.

2381{
 2382 size_t page_avail = BLCKSZ - SizeOfPageHeaderData - saveFreeSpace;
 2383 int npages = 1;
2384
2385 for (int i = done; i < ntuples; i++)
2386 {
2387 size_t tup_sz = sizeof(ItemIdData) + MAXALIGN(heaptuples[i]->t_len);
2388
2389 if (page_avail < tup_sz)
2390 {
 2391 npages++;
 2392 page_avail = BLCKSZ - SizeOfPageHeaderData - saveFreeSpace;
 2393 }
2394 page_avail -= tup_sz;
2395 }
2396
2397 return npages;
2398}

References fb(), i, MAXALIGN, and SizeOfPageHeaderData.

Referenced by heap_multi_insert().
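
The estimate above is plain arithmetic: start from the usable space of an empty page, charge each remaining tuple one line pointer plus its MAXALIGNed length, and open a new page whenever the current one cannot take the next tuple. The standalone model below reproduces that arithmetic with assumed, typical constants (8192-byte blocks, a 24-byte page header, 4-byte line pointers, 8-byte MAXALIGN); the real values come from the PostgreSQL headers and may differ by build.

#include <stdio.h>
#include <stddef.h>

/* Assumed, typical values; the real ones come from PostgreSQL headers. */
#define MODEL_BLCKSZ        8192
#define MODEL_PAGE_HEADER   24      /* SizeOfPageHeaderData */
#define MODEL_ITEMID        4       /* sizeof(ItemIdData) */
#define MODEL_MAXALIGN(x)   (((x) + 7) & ~((size_t) 7))

/* Worst-case page count for tuples[done..ntuples-1], mirroring the code above. */
static int
estimate_pages(const size_t *tuple_lens, int done, int ntuples,
               size_t save_free_space)
{
    size_t page_avail = MODEL_BLCKSZ - MODEL_PAGE_HEADER - save_free_space;
    int    npages = 1;

    for (int i = done; i < ntuples; i++)
    {
        size_t tup_sz = MODEL_ITEMID + MODEL_MAXALIGN(tuple_lens[i]);

        if (page_avail < tup_sz)
        {
            npages++;
            page_avail = MODEL_BLCKSZ - MODEL_PAGE_HEADER - save_free_space;
        }
        page_avail -= tup_sz;
    }
    return npages;
}

int
main(void)
{
    size_t lens[] = {2000, 2000, 2000, 2000, 2000};    /* five ~2 kB tuples */

    /* 2004 bytes each: four fit on an 8168-byte page, the fifth needs a second. */
    printf("pages needed: %d\n", estimate_pages(lens, 0, 5, 0));
    return 0;
}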

◆ heap_pre_freeze_checks()

void heap_pre_freeze_checks ( Buffer  buffer,
HeapTupleFreeze tuples,
int  ntuples 
)

Definition at line 7407 of file heapam.c.

7409{
7410 Page page = BufferGetPage(buffer);
7411
7412 for (int i = 0; i < ntuples; i++)
7413 {
7414 HeapTupleFreeze *frz = tuples + i;
7415 ItemId itemid = PageGetItemId(page, frz->offset);
7416 HeapTupleHeader htup;
7417
7418 htup = (HeapTupleHeader) PageGetItem(page, itemid);
7419
7420 /* Deliberately avoid relying on tuple hint bits here */
7421 if (frz->checkflags & HEAP_FREEZE_CHECK_XMIN_COMMITTED)
7422 {
7424
7426 if (unlikely(!TransactionIdDidCommit(xmin)))
7427 ereport(ERROR,
7429 errmsg_internal("uncommitted xmin %u needs to be frozen",
7430 xmin)));
7431 }
7432
7433 /*
7434 * TransactionIdDidAbort won't work reliably in the presence of XIDs
7435 * left behind by transactions that were in progress during a crash,
7436 * so we can only check that xmax didn't commit
7437 */
7438 if (frz->checkflags & HEAP_FREEZE_CHECK_XMAX_ABORTED)
7439 {
7441
7444 ereport(ERROR,
7446 errmsg_internal("cannot freeze committed xmax %u",
7447 xmax)));
7448 }
7449 }
7450}

References Assert, BufferGetPage(), ereport, errcode(), ERRCODE_DATA_CORRUPTED, errmsg_internal(), ERROR, fb(), HEAP_FREEZE_CHECK_XMAX_ABORTED, HEAP_FREEZE_CHECK_XMIN_COMMITTED, HeapTupleHeaderGetRawXmax(), HeapTupleHeaderGetRawXmin(), HeapTupleHeaderXminFrozen(), i, PageGetItem(), PageGetItemId(), TransactionIdDidCommit(), TransactionIdIsNormal, and unlikely.

Referenced by heap_page_will_freeze().

◆ heap_prepare_freeze_tuple()

bool heap_prepare_freeze_tuple ( HeapTupleHeader  tuple,
const struct VacuumCutoffs cutoffs,
HeapPageFreeze pagefrz,
HeapTupleFreeze frz,
bool totally_frozen 
)

Definition at line 7134 of file heapam.c.

7138{
7139 bool xmin_already_frozen = false,
7140 xmax_already_frozen = false;
7141 bool freeze_xmin = false,
7142 replace_xvac = false,
7143 replace_xmax = false,
7144 freeze_xmax = false;
7145 TransactionId xid;
7146
7147 frz->xmax = HeapTupleHeaderGetRawXmax(tuple);
7148 frz->t_infomask2 = tuple->t_infomask2;
7149 frz->t_infomask = tuple->t_infomask;
7150 frz->frzflags = 0;
7151 frz->checkflags = 0;
7152
7153 /*
7154 * Process xmin, while keeping track of whether it's already frozen, or
7155 * will become frozen iff our freeze plan is executed by caller (could be
7156 * neither).
7157 */
7158 xid = HeapTupleHeaderGetXmin(tuple);
7159 if (!TransactionIdIsNormal(xid))
7160 xmin_already_frozen = true;
7161 else
7162 {
7163 if (TransactionIdPrecedes(xid, cutoffs->relfrozenxid))
7164 ereport(ERROR,
7166 errmsg_internal("found xmin %u from before relfrozenxid %u",
7167 xid, cutoffs->relfrozenxid)));
7168
7169 /* Will set freeze_xmin flags in freeze plan below */
7171
7172 /* Verify that xmin committed if and when freeze plan is executed */
7173 if (freeze_xmin)
7175 }
7176
7177 /*
7178 * Old-style VACUUM FULL is gone, but we have to process xvac for as long
7179 * as we support having MOVED_OFF/MOVED_IN tuples in the database
7180 */
7181 xid = HeapTupleHeaderGetXvac(tuple);
7182 if (TransactionIdIsNormal(xid))
7183 {
7185 Assert(TransactionIdPrecedes(xid, cutoffs->OldestXmin));
7186
7187 /*
7188 * For Xvac, we always freeze proactively. This allows totally_frozen
7189 * tracking to ignore xvac.
7190 */
7191 replace_xvac = pagefrz->freeze_required = true;
7192
7193 /* Will set replace_xvac flags in freeze plan below */
7194 }
7195
7196 /* Now process xmax */
7197 xid = frz->xmax;
7198 if (tuple->t_infomask & HEAP_XMAX_IS_MULTI)
7199 {
7200 /* Raw xmax is a MultiXactId */
7202 uint16 flags;
7203
7204 /*
7205 * We will either remove xmax completely (in the "freeze_xmax" path),
7206 * process xmax by replacing it (in the "replace_xmax" path), or
7207 * perform no-op xmax processing. The only constraint is that the
7208 * FreezeLimit/MultiXactCutoff postcondition must never be violated.
7209 */
7210 newxmax = FreezeMultiXactId(xid, tuple->t_infomask, cutoffs,
7211 &flags, pagefrz);
7212
7213 if (flags & FRM_NOOP)
7214 {
7215 /*
7216 * xmax is a MultiXactId, and nothing about it changes for now.
7217 * This is the only case where 'freeze_required' won't have been
7218 * set for us by FreezeMultiXactId, as well as the only case where
7219 * neither freeze_xmax nor replace_xmax are set (given a multi).
7220 *
7221 * This is a no-op, but the call to FreezeMultiXactId might have
7222 * ratcheted back NewRelfrozenXid and/or NewRelminMxid trackers
7223 * for us (the "freeze page" variants, specifically). That'll
7224 * make it safe for our caller to freeze the page later on, while
7225 * leaving this particular xmax undisturbed.
7226 *
7227 * FreezeMultiXactId is _not_ responsible for the "no freeze"
7228 * NewRelfrozenXid/NewRelminMxid trackers, though -- that's our
7229 * job. A call to heap_tuple_should_freeze for this same tuple
7230 * will take place below if 'freeze_required' isn't set already.
7231 * (This repeats work from FreezeMultiXactId, but allows "no
7232 * freeze" tracker maintenance to happen in only one place.)
7233 */
7236 }
7237 else if (flags & FRM_RETURN_IS_XID)
7238 {
7239 /*
7240 * xmax will become an updater Xid (original MultiXact's updater
7241 * member Xid will be carried forward as a simple Xid in Xmax).
7242 */
7244
7245 /*
7246 * NB -- some of these transformations are only valid because we
7247 * know the return Xid is a tuple updater (i.e. not merely a
7248 * locker.) Also note that the only reason we don't explicitly
7249 * worry about HEAP_KEYS_UPDATED is because it lives in
7250 * t_infomask2 rather than t_infomask.
7251 */
7252 frz->t_infomask &= ~HEAP_XMAX_BITS;
7253 frz->xmax = newxmax;
7254 if (flags & FRM_MARK_COMMITTED)
7255 frz->t_infomask |= HEAP_XMAX_COMMITTED;
7256 replace_xmax = true;
7257 }
7258 else if (flags & FRM_RETURN_IS_MULTI)
7259 {
7262
7263 /*
7264 * xmax is an old MultiXactId that we have to replace with a new
7265 * MultiXactId, to carry forward two or more original member XIDs.
7266 */
7268
7269 /*
7270 * We can't use GetMultiXactIdHintBits directly on the new multi
7271 * here; that routine initializes the masks to all zeroes, which
7272 * would lose other bits we need. Doing it this way ensures all
7273 * unrelated bits remain untouched.
7274 */
7275 frz->t_infomask &= ~HEAP_XMAX_BITS;
7276 frz->t_infomask2 &= ~HEAP_KEYS_UPDATED;
7278 frz->t_infomask |= newbits;
7279 frz->t_infomask2 |= newbits2;
7280 frz->xmax = newxmax;
7281 replace_xmax = true;
7282 }
7283 else
7284 {
7285 /*
7286 * Freeze plan for tuple "freezes xmax" in the strictest sense:
7287 * it'll leave nothing in xmax (neither an Xid nor a MultiXactId).
7288 */
7289 Assert(flags & FRM_INVALIDATE_XMAX);
7291
7292 /* Will set freeze_xmax flags in freeze plan below */
7293 freeze_xmax = true;
7294 }
7295
7296 /* MultiXactId processing forces freezing (barring FRM_NOOP case) */
7297 Assert(pagefrz->freeze_required || (!freeze_xmax && !replace_xmax));
7298 }
7299 else if (TransactionIdIsNormal(xid))
7300 {
7301 /* Raw xmax is normal XID */
7302 if (TransactionIdPrecedes(xid, cutoffs->relfrozenxid))
7303 ereport(ERROR,
7305 errmsg_internal("found xmax %u from before relfrozenxid %u",
7306 xid, cutoffs->relfrozenxid)));
7307
7308 /* Will set freeze_xmax flags in freeze plan below */
7310
7311 /*
7312 * Verify that xmax aborted if and when freeze plan is executed,
7313 * provided it's from an update. (A lock-only xmax can be removed
7314 * independent of this, since the lock is released at xact end.)
7315 */
7317 frz->checkflags |= HEAP_FREEZE_CHECK_XMAX_ABORTED;
7318 }
7319 else if (!TransactionIdIsValid(xid))
7320 {
7321 /* Raw xmax is InvalidTransactionId XID */
7322 Assert((tuple->t_infomask & HEAP_XMAX_IS_MULTI) == 0);
7323 xmax_already_frozen = true;
7324 }
7325 else
7326 ereport(ERROR,
7328 errmsg_internal("found raw xmax %u (infomask 0x%04x) not invalid and not multi",
7329 xid, tuple->t_infomask)));
7330
7331 if (freeze_xmin)
7332 {
7334
7335 frz->t_infomask |= HEAP_XMIN_FROZEN;
7336 }
7337 if (replace_xvac)
7338 {
7339 /*
7340 * If a MOVED_OFF tuple is not dead, the xvac transaction must have
7341 * failed; whereas a non-dead MOVED_IN tuple must mean the xvac
7342 * transaction succeeded.
7343 */
7344 Assert(pagefrz->freeze_required);
7345 if (tuple->t_infomask & HEAP_MOVED_OFF)
7346 frz->frzflags |= XLH_INVALID_XVAC;
7347 else
7348 frz->frzflags |= XLH_FREEZE_XVAC;
7349 }
7350 if (replace_xmax)
7351 {
7353 Assert(pagefrz->freeze_required);
7354
7355 /* Already set replace_xmax flags in freeze plan earlier */
7356 }
7357 if (freeze_xmax)
7358 {
7360
7361 frz->xmax = InvalidTransactionId;
7362
7363 /*
7364 * The tuple might be marked either XMAX_INVALID or XMAX_COMMITTED +
7365 * LOCKED. Normalize to INVALID just to be sure no one gets confused.
7366 * Also get rid of the HEAP_KEYS_UPDATED bit.
7367 */
7368 frz->t_infomask &= ~HEAP_XMAX_BITS;
7369 frz->t_infomask |= HEAP_XMAX_INVALID;
7370 frz->t_infomask2 &= ~HEAP_HOT_UPDATED;
7371 frz->t_infomask2 &= ~HEAP_KEYS_UPDATED;
7372 }
7373
7374 /*
7375 * Determine if this tuple is already totally frozen, or will become
7376 * totally frozen (provided caller executes freeze plans for the page)
7377 */
7380
7381 if (!pagefrz->freeze_required && !(xmin_already_frozen &&
7383 {
7384 /*
7385 * So far no previous tuple from the page made freezing mandatory.
7386 * Does this tuple force caller to freeze the entire page?
7387 */
7388 pagefrz->freeze_required =
7389 heap_tuple_should_freeze(tuple, cutoffs,
7390 &pagefrz->NoFreezePageRelfrozenXid,
7391 &pagefrz->NoFreezePageRelminMxid);
7392 }
7393
7394 /* Tell caller if this tuple has a usable freeze plan set in *frz */
7396}

References Assert, ereport, errcode(), ERRCODE_DATA_CORRUPTED, errmsg_internal(), ERROR, fb(), HeapPageFreeze::freeze_required, FreezeMultiXactId(), FRM_INVALIDATE_XMAX, FRM_MARK_COMMITTED, FRM_NOOP, FRM_RETURN_IS_MULTI, FRM_RETURN_IS_XID, GetMultiXactIdHintBits(), HEAP_FREEZE_CHECK_XMAX_ABORTED, HEAP_FREEZE_CHECK_XMIN_COMMITTED, HEAP_MOVED_OFF, heap_tuple_should_freeze(), HEAP_XMAX_COMMITTED, HEAP_XMAX_INVALID, HEAP_XMAX_IS_LOCKED_ONLY(), HEAP_XMAX_IS_MULTI, HEAP_XMIN_FROZEN, HeapTupleHeaderGetRawXmax(), HeapTupleHeaderGetXmin(), HeapTupleHeaderGetXvac(), InvalidTransactionId, VacuumCutoffs::MultiXactCutoff, MultiXactIdIsValid, MultiXactIdPrecedes(), HeapPageFreeze::NoFreezePageRelfrozenXid, HeapPageFreeze::NoFreezePageRelminMxid, VacuumCutoffs::OldestMxact, VacuumCutoffs::OldestXmin, VacuumCutoffs::relfrozenxid, HeapTupleHeaderData::t_infomask, HeapTupleHeaderData::t_infomask2, TransactionIdIsNormal, TransactionIdIsValid, TransactionIdPrecedes(), TransactionIdPrecedesOrEquals(), XLH_FREEZE_XVAC, and XLH_INVALID_XVAC.

Referenced by heap_freeze_tuple(), and heap_prune_record_unchanged_lp_normal().
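
heap_prepare_freeze_tuple() only builds a freeze plan; a VACUUM-like caller collects one plan per tuple on the page and, if any plan is usable, sanity-checks and then applies them under a critical section. The sketch below shows that caller pattern using only functions declared on this page; it is not a verbatim excerpt. In particular, zero-initializing HeapPageFreeze is a simplification (real callers seed the trackers from their current relfrozenxid/relminmxid candidates), and the final apply-and-WAL-log step is left as a comment because its entry point differs across releases.

#include "postgres.h"

#include "access/heapam.h"
#include "access/htup_details.h"
#include "commands/vacuum.h"
#include "storage/bufmgr.h"
#include "storage/bufpage.h"

/* Sketch of a VACUUM-like caller loop; error handling and WAL omitted. */
static void
freeze_page_sketch(Buffer buf, const struct VacuumCutoffs *cutoffs)
{
    Page            page = BufferGetPage(buf);
    OffsetNumber    maxoff = PageGetMaxOffsetNumber(page);
    HeapTupleFreeze frozen[MaxHeapTuplesPerPage];
    HeapPageFreeze  pagefrz;
    int             nfrozen = 0;

    /* Simplification: real callers seed these trackers, not just zero them. */
    memset(&pagefrz, 0, sizeof(pagefrz));

    for (OffsetNumber off = FirstOffsetNumber; off <= maxoff; off++)
    {
        ItemId          itemid = PageGetItemId(page, off);
        HeapTupleHeader htup;
        bool            totally_frozen;

        if (!ItemIdIsNormal(itemid))
            continue;
        htup = (HeapTupleHeader) PageGetItem(page, itemid);

        if (heap_prepare_freeze_tuple(htup, cutoffs, &pagefrz,
                                      &frozen[nfrozen], &totally_frozen))
            frozen[nfrozen++].offset = off;
    }

    if (nfrozen > 0)
    {
        /* Sanity-check the plans before modifying anything. */
        heap_pre_freeze_checks(buf, frozen, nfrozen);

        /*
         * A real caller now enters a critical section, applies each plan to
         * its tuple, marks the buffer dirty, and WAL-logs the freeze.
         */
    }
}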

◆ heap_prepare_insert()

static HeapTuple heap_prepare_insert ( Relation  relation,
HeapTuple  tup,
TransactionId  xid,
CommandId  cid,
int  options 
)
static

Definition at line 2332 of file heapam.c.

2334{
2335 /*
2336 * To allow parallel inserts, we need to ensure that they are safe to be
2337 * performed in workers. We have the infrastructure to allow parallel
2338 * inserts in general except for the cases where inserts generate a new
 2339 * CommandId (e.g., inserts into a table having a foreign key column).
2340 */
2341 if (IsParallelWorker())
2342 ereport(ERROR,
2344 errmsg("cannot insert tuples in a parallel worker")));
2345
2346 tup->t_data->t_infomask &= ~(HEAP_XACT_MASK);
2347 tup->t_data->t_infomask2 &= ~(HEAP2_XACT_MASK);
2348 tup->t_data->t_infomask |= HEAP_XMAX_INVALID;
2349 HeapTupleHeaderSetXmin(tup->t_data, xid);
2352
2353 HeapTupleHeaderSetCmin(tup->t_data, cid);
2354 HeapTupleHeaderSetXmax(tup->t_data, 0); /* for cleanliness */
2355 tup->t_tableOid = RelationGetRelid(relation);
2356
2357 /*
2358 * If the new tuple is too big for storage or contains already toasted
2359 * out-of-line attributes from some other relation, invoke the toaster.
2360 */
2361 if (relation->rd_rel->relkind != RELKIND_RELATION &&
2362 relation->rd_rel->relkind != RELKIND_MATVIEW)
2363 {
2364 /* toast table entries should never be recursively toasted */
2366 return tup;
2367 }
2368 else if (HeapTupleHasExternal(tup) || tup->t_len > TOAST_TUPLE_THRESHOLD)
2369 return heap_toast_insert_or_update(relation, tup, NULL, options);
2370 else
2371 return tup;
2372}

References Assert, ereport, errcode(), errmsg(), ERROR, fb(), HEAP2_XACT_MASK, HEAP_INSERT_FROZEN, heap_toast_insert_or_update(), HEAP_XACT_MASK, HEAP_XMAX_INVALID, HeapTupleHasExternal(), HeapTupleHeaderSetCmin(), HeapTupleHeaderSetXmax(), HeapTupleHeaderSetXmin(), HeapTupleHeaderSetXminFrozen(), IsParallelWorker, RelationData::rd_rel, RelationGetRelid, and TOAST_TUPLE_THRESHOLD.

Referenced by heap_insert(), and heap_multi_insert().
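
The tail of heap_prepare_insert() decides whether to invoke the toaster: never for non-table relkinds (e.g. toast tables must not be recursively toasted), otherwise whenever the tuple already carries external attributes or exceeds TOAST_TUPLE_THRESHOLD. A tiny standalone model of that decision follows; the 2032-byte threshold and the ModelRelkind enum are assumptions standing in for the real macros.

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

#define MODEL_TOAST_THRESHOLD 2032   /* stand-in for TOAST_TUPLE_THRESHOLD */

typedef enum { MODEL_RELKIND_TABLE, MODEL_RELKIND_TOAST } ModelRelkind;

/* Mirror of the decision at the end of heap_prepare_insert(). */
static bool
needs_toaster(ModelRelkind relkind, bool has_external, size_t t_len)
{
    if (relkind != MODEL_RELKIND_TABLE)
        return false;               /* e.g. toast tables: never re-toast */
    return has_external || t_len > MODEL_TOAST_THRESHOLD;
}

int
main(void)
{
    printf("%d\n", needs_toaster(MODEL_RELKIND_TABLE, false, 8000));  /* 1: too big */
    printf("%d\n", needs_toaster(MODEL_RELKIND_TABLE, true, 100));    /* 1: external */
    printf("%d\n", needs_toaster(MODEL_RELKIND_TOAST, false, 8000));  /* 0 */
    return 0;
}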

◆ heap_prepare_pagescan()

void heap_prepare_pagescan ( TableScanDesc  sscan)

Definition at line 615 of file heapam.c.

616{
618 Buffer buffer = scan->rs_cbuf;
619 BlockNumber block = scan->rs_cblock;
620 Snapshot snapshot;
621 Page page;
622 int lines;
623 bool all_visible;
625
626 Assert(BufferGetBlockNumber(buffer) == block);
627
628 /* ensure we're not accidentally being used when not in pagemode */
630 snapshot = scan->rs_base.rs_snapshot;
631
632 /*
633 * Prune and repair fragmentation for the whole page, if possible.
634 */
635 heap_page_prune_opt(scan->rs_base.rs_rd, buffer);
636
637 /*
638 * We must hold share lock on the buffer content while examining tuple
639 * visibility. Afterwards, however, the tuples we have found to be
640 * visible are guaranteed good as long as we hold the buffer pin.
641 */
643
644 page = BufferGetPage(buffer);
645 lines = PageGetMaxOffsetNumber(page);
646
647 /*
648 * If the all-visible flag indicates that all tuples on the page are
649 * visible to everyone, we can skip the per-tuple visibility tests.
650 *
651 * Note: In hot standby, a tuple that's already visible to all
652 * transactions on the primary might still be invisible to a read-only
653 * transaction in the standby. We partly handle this problem by tracking
654 * the minimum xmin of visible tuples as the cut-off XID while marking a
655 * page all-visible on the primary and WAL log that along with the
656 * visibility map SET operation. In hot standby, we wait for (or abort)
 657 * all transactions that potentially may not see one or more tuples on
658 * the page. That's how index-only scans work fine in hot standby. A
659 * crucial difference between index-only scans and heap scans is that the
 660 * index-only scan completely relies on the visibility map, whereas a heap
 661 * scan looks at the page-level PD_ALL_VISIBLE flag. We are not sure if
662 * the page-level flag can be trusted in the same way, because it might
663 * get propagated somehow without being explicitly WAL-logged, e.g. via a
664 * full page write. Until we can prove that beyond doubt, let's check each
665 * tuple for visibility the hard way.
666 */
667 all_visible = PageIsAllVisible(page) && !snapshot->takenDuringRecovery;
670
671 /*
672 * We call page_collect_tuples() with constant arguments, to get the
673 * compiler to constant fold the constant arguments. Separate calls with
674 * constant arguments, rather than variables, are needed on several
675 * compilers to actually perform constant folding.
676 */
677 if (likely(all_visible))
678 {
680 scan->rs_ntuples = page_collect_tuples(scan, snapshot, page, buffer,
681 block, lines, true, false);
682 else
683 scan->rs_ntuples = page_collect_tuples(scan, snapshot, page, buffer,
684 block, lines, true, true);
685 }
686 else
687 {
689 scan->rs_ntuples = page_collect_tuples(scan, snapshot, page, buffer,
690 block, lines, false, false);
691 else
692 scan->rs_ntuples = page_collect_tuples(scan, snapshot, page, buffer,
693 block, lines, false, true);
694 }
695
697}

References Assert, BUFFER_LOCK_SHARE, BUFFER_LOCK_UNLOCK, BufferGetBlockNumber(), BufferGetPage(), CheckForSerializableConflictOutNeeded(), fb(), heap_page_prune_opt(), likely, LockBuffer(), page_collect_tuples(), PageGetMaxOffsetNumber(), PageIsAllVisible(), HeapScanDescData::rs_base, HeapScanDescData::rs_cblock, HeapScanDescData::rs_cbuf, TableScanDescData::rs_flags, HeapScanDescData::rs_ntuples, TableScanDescData::rs_rd, TableScanDescData::rs_snapshot, SO_ALLOW_PAGEMODE, and SnapshotData::takenDuringRecovery.

Referenced by heapam_scan_sample_next_block(), and heapgettup_pagemode().
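
The comment about page_collect_tuples() describes a common C specialization trick: implement one always-inlined helper that takes boolean "mode" parameters, then call it from an if/else ladder with literal constants so the compiler can fold the tests away and emit a specialized loop for each combination. A minimal standalone illustration follows; the helper, its per-item work, and the GCC/Clang always_inline attribute are assumptions for the example, not code from heapam.c.

#include <stdbool.h>
#include <stdio.h>

/* One generic implementation, forcibly inlined into each call site. */
static inline __attribute__((always_inline)) int
collect(const int *vals, int n, bool skip_visibility_check, bool check_serializable)
{
    int kept = 0;

    for (int i = 0; i < n; i++)
    {
        /* With literal arguments these tests fold to constants per call site. */
        if (!skip_visibility_check && vals[i] % 2 == 0)
            continue;
        if (check_serializable)
            ;   /* extra per-item work would go here */
        kept++;
    }
    return kept;
}

/* Dispatcher mirroring heap_prepare_pagescan(): constants, not variables. */
static int
collect_dispatch(const int *vals, int n, bool all_visible, bool serializable)
{
    if (all_visible)
        return serializable ? collect(vals, n, true, true)
                            : collect(vals, n, true, false);
    return serializable ? collect(vals, n, false, true)
                        : collect(vals, n, false, false);
}

int
main(void)
{
    int v[] = {1, 2, 3, 4, 5};

    printf("%d\n", collect_dispatch(v, 5, false, false));   /* 3 odd values kept */
    printf("%d\n", collect_dispatch(v, 5, true, false));    /* all 5 kept */
    return 0;
}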

◆ heap_rescan()

void heap_rescan ( TableScanDesc  sscan,
ScanKey  key,
bool  set_params,
bool  allow_strat,
bool  allow_sync,
bool  allow_pagemode 
)

Definition at line 1317 of file heapam.c.

1319{
1321
1322 if (set_params)
1323 {
1324 if (allow_strat)
1326 else
1328
1329 if (allow_sync)
1331 else
1333
1334 if (allow_pagemode && scan->rs_base.rs_snapshot &&
1337 else
1339 }
1340
1341 /*
1342 * unpin scan buffers
1343 */
1344 if (BufferIsValid(scan->rs_cbuf))
1345 {
1346 ReleaseBuffer(scan->rs_cbuf);
1347 scan->rs_cbuf = InvalidBuffer;
1348 }
1349
1350 /*
1351 * SO_TYPE_BITMAPSCAN would be cleaned up here, but it does not hold any
1352 * additional data vs a normal HeapScan
1353 */
1354
1355 /*
1356 * The read stream is reset on rescan. This must be done before
1357 * initscan(), as some state referred to by read_stream_reset() is reset
1358 * in initscan().
1359 */
1360 if (scan->rs_read_stream)
1362
1363 /*
1364 * reinitialize scan descriptor
1365 */
1366 initscan(scan, key, true);
1367}

References BufferIsValid(), fb(), initscan(), InvalidBuffer, IsMVCCSnapshot, read_stream_reset(), ReleaseBuffer(), HeapScanDescData::rs_base, HeapScanDescData::rs_cbuf, TableScanDescData::rs_flags, HeapScanDescData::rs_read_stream, TableScanDescData::rs_snapshot, SO_ALLOW_PAGEMODE, SO_ALLOW_STRAT, and SO_ALLOW_SYNC.

◆ heap_scan_stream_read_next_parallel()

static BlockNumber heap_scan_stream_read_next_parallel ( ReadStream stream,
void callback_private_data,
void per_buffer_data 
)
static

◆ heap_scan_stream_read_next_serial()

static BlockNumber heap_scan_stream_read_next_serial ( ReadStream stream,
void callback_private_data,
void per_buffer_data 
)
static

Definition at line 291 of file heapam.c.

294{
295 HeapScanDesc scan = (HeapScanDesc) callback_private_data;
296
297 if (unlikely(!scan->rs_inited))
298 {
300 scan->rs_inited = true;
301 }
302 else
304 scan->rs_prefetch_block,
305 scan->rs_dir);
306
307 return scan->rs_prefetch_block;
308}

References heapgettup_advance_block(), heapgettup_initial_block(), HeapScanDescData::rs_dir, HeapScanDescData::rs_inited, HeapScanDescData::rs_prefetch_block, and unlikely.

Referenced by heap_beginscan().
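
Read-stream callbacks such as the one above have a simple contract: each call returns the next block number to read, or InvalidBlockNumber once the stream is exhausted, with per-stream state passed through callback_private_data. Below is a hedged sketch of a minimal sequential callback; MyStreamState and my_stream_next_block are invented names, and the ReadStream pointer is unused here just as in the serial callback above.

#include "postgres.h"

#include "storage/block.h"
#include "storage/read_stream.h"

/* Hypothetical private state handed to the read stream at creation time. */
typedef struct MyStreamState
{
    BlockNumber next_block;
    BlockNumber nblocks;
} MyStreamState;

/* Return blocks 0 .. nblocks-1 in order, then end the stream. */
static BlockNumber
my_stream_next_block(ReadStream *stream,
                     void *callback_private_data,
                     void *per_buffer_data)
{
    MyStreamState *state = (MyStreamState *) callback_private_data;

    if (state->next_block >= state->nblocks)
        return InvalidBlockNumber;      /* no more blocks to read */

    return state->next_block++;
}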

◆ heap_set_tidrange()

void heap_set_tidrange ( TableScanDesc  sscan,
ItemPointer  mintid,
ItemPointer  maxtid 
)

Definition at line 1478 of file heapam.c.

1480{
1486
1487 /*
1488 * For relations without any pages, we can simply leave the TID range
1489 * unset. There will be no tuples to scan, therefore no tuples outside
1490 * the given TID range.
1491 */
1492 if (scan->rs_nblocks == 0)
1493 return;
1494
1495 /*
1496 * Set up some ItemPointers which point to the first and last possible
1497 * tuples in the heap.
1498 */
1501
1502 /*
1503 * If the given maximum TID is below the highest possible TID in the
1504 * relation, then restrict the range to that, otherwise we scan to the end
1505 * of the relation.
1506 */
1509
1510 /*
1511 * If the given minimum TID is above the lowest possible TID in the
1512 * relation, then restrict the range to only scan for TIDs above that.
1513 */
1516
1517 /*
 1518 * Check for an empty range and protect against would-be negative results
1519 * from the numBlks calculation below.
1520 */
1522 {
1523 /* Set an empty range of blocks to scan */
1525 return;
1526 }
1527
1528 /*
1529 * Calculate the first block and the number of blocks we must scan. We
1530 * could be more aggressive here and perform some more validation to try
1531 * and further narrow the scope of blocks to scan by checking if the
1532 * lowestItem has an offset above MaxOffsetNumber. In this case, we could
1533 * advance startBlk by one. Likewise, if highestItem has an offset of 0
1534 * we could scan one fewer blocks. However, such an optimization does not
1535 * seem worth troubling over, currently.
1536 */
1538
1541
1542 /* Set the start block and number of blocks to scan */
1544
1545 /* Finally, set the TID range in sscan */
1546 ItemPointerCopy(&lowestItem, &sscan->st.tidrange.rs_mintid);
1547 ItemPointerCopy(&highestItem, &sscan->st.tidrange.rs_maxtid);
1548}

References fb(), FirstOffsetNumber, heap_setscanlimits(), ItemPointerCompare(), ItemPointerCopy(), ItemPointerGetBlockNumberNoCheck(), ItemPointerSet(), MaxOffsetNumber, and HeapScanDescData::rs_nblocks.
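
After clamping the requested TIDs to the first and last possible tuples, the function narrows the scan to whole blocks: it starts at the block of the lowest TID and covers every block up to and including the block of the highest TID. The standalone model below shows only that block arithmetic; ModelTid and the sample numbers are invented for illustration.

#include <stdio.h>
#include <stdint.h>

typedef struct ModelTid
{
    uint32_t block;
    uint16_t offset;
} ModelTid;

/* Compute the block range [start, start+nblocks) covering lo..hi inclusive. */
static void
tidrange_to_blocks(ModelTid lo, ModelTid hi,
                   uint32_t *start_block, uint32_t *num_blocks)
{
    *start_block = lo.block;
    /* hi has already been clamped, so hi.block >= lo.block holds. */
    *num_blocks = hi.block - lo.block + 1;
}

int
main(void)
{
    ModelTid lo = {.block = 10, .offset = 1};
    ModelTid hi = {.block = 14, .offset = 27};
    uint32_t start, nblocks;

    tidrange_to_blocks(lo, hi, &start, &nblocks);
    printf("scan %u blocks starting at block %u\n", nblocks, start);  /* 5 @ 10 */
    return 0;
}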

◆ heap_setscanlimits()

void heap_setscanlimits ( TableScanDesc  sscan,
BlockNumber  startBlk,
BlockNumber  numBlks 
)

Definition at line 499 of file heapam.c.

500{
502
503 Assert(!scan->rs_inited); /* else too late to change */
504 /* else rs_startblock is significant */
506
507 /* Check startBlk is valid (but allow case of zero blocks...) */
508 Assert(startBlk == 0 || startBlk < scan->rs_nblocks);
509
510 scan->rs_startblock = startBlk;
511 scan->rs_numblocks = numBlks;
512}

References Assert, fb(), HeapScanDescData::rs_base, TableScanDescData::rs_flags, HeapScanDescData::rs_inited, HeapScanDescData::rs_numblocks, HeapScanDescData::rs_startblock, and SO_ALLOW_SYNC.

Referenced by heap_set_tidrange(), and heapam_index_build_range_scan().

◆ heap_tuple_needs_eventual_freeze()

bool heap_tuple_needs_eventual_freeze ( HeapTupleHeader  tuple)

Definition at line 7890 of file heapam.c.

7891{
7892 TransactionId xid;
7893
7894 /*
7895 * If xmin is a normal transaction ID, this tuple is definitely not
7896 * frozen.
7897 */
7898 xid = HeapTupleHeaderGetXmin(tuple);
7899 if (TransactionIdIsNormal(xid))
7900 return true;
7901
7902 /*
7903 * If xmax is a valid xact or multixact, this tuple is also not frozen.
7904 */
7905 if (tuple->t_infomask & HEAP_XMAX_IS_MULTI)
7906 {
7907 MultiXactId multi;
7908
7909 multi = HeapTupleHeaderGetRawXmax(tuple);
7910 if (MultiXactIdIsValid(multi))
7911 return true;
7912 }
7913 else
7914 {
7915 xid = HeapTupleHeaderGetRawXmax(tuple);
7916 if (TransactionIdIsNormal(xid))
7917 return true;
7918 }
7919
7920 if (tuple->t_infomask & HEAP_MOVED)
7921 {
7922 xid = HeapTupleHeaderGetXvac(tuple);
7923 if (TransactionIdIsNormal(xid))
7924 return true;
7925 }
7926
7927 return false;
7928}

References HEAP_MOVED, HEAP_XMAX_IS_MULTI, HeapTupleHeaderGetRawXmax(), HeapTupleHeaderGetXmin(), HeapTupleHeaderGetXvac(), MultiXactIdIsValid, HeapTupleHeaderData::t_infomask, and TransactionIdIsNormal.

Referenced by collect_corrupt_items(), and heap_page_would_be_all_visible().

◆ heap_tuple_should_freeze()

bool heap_tuple_should_freeze ( HeapTupleHeader  tuple,
const struct VacuumCutoffs cutoffs,
TransactionId NoFreezePageRelfrozenXid,
MultiXactId NoFreezePageRelminMxid 
)

Definition at line 7945 of file heapam.c.

7949{
7950 TransactionId xid;
7951 MultiXactId multi;
7952 bool freeze = false;
7953
7954 /* First deal with xmin */
7955 xid = HeapTupleHeaderGetXmin(tuple);
7956 if (TransactionIdIsNormal(xid))
7957 {
7959 if (TransactionIdPrecedes(xid, *NoFreezePageRelfrozenXid))
7960 *NoFreezePageRelfrozenXid = xid;
7961 if (TransactionIdPrecedes(xid, cutoffs->FreezeLimit))
7962 freeze = true;
7963 }
7964
7965 /* Now deal with xmax */
7967 multi = InvalidMultiXactId;
7968 if (tuple->t_infomask & HEAP_XMAX_IS_MULTI)
7969 multi = HeapTupleHeaderGetRawXmax(tuple);
7970 else
7971 xid = HeapTupleHeaderGetRawXmax(tuple);
7972
7973 if (TransactionIdIsNormal(xid))
7974 {
7976 /* xmax is a non-permanent XID */
7977 if (TransactionIdPrecedes(xid, *NoFreezePageRelfrozenXid))
7978 *NoFreezePageRelfrozenXid = xid;
7979 if (TransactionIdPrecedes(xid, cutoffs->FreezeLimit))
7980 freeze = true;
7981 }
7982 else if (!MultiXactIdIsValid(multi))
7983 {
7984 /* xmax is a permanent XID or invalid MultiXactId/XID */
7985 }
7986 else if (HEAP_LOCKED_UPGRADED(tuple->t_infomask))
7987 {
7988 /* xmax is a pg_upgrade'd MultiXact, which can't have updater XID */
7989 if (MultiXactIdPrecedes(multi, *NoFreezePageRelminMxid))
7990 *NoFreezePageRelminMxid = multi;
7991 /* heap_prepare_freeze_tuple always freezes pg_upgrade'd xmax */
7992 freeze = true;
7993 }
7994 else
7995 {
7996 /* xmax is a MultiXactId that may have an updater XID */
7997 MultiXactMember *members;
7998 int nmembers;
7999
8001 if (MultiXactIdPrecedes(multi, *NoFreezePageRelminMxid))
8002 *NoFreezePageRelminMxid = multi;
8003 if (MultiXactIdPrecedes(multi, cutoffs->MultiXactCutoff))
8004 freeze = true;
8005
8006 /* need to check whether any member of the mxact is old */
8007 nmembers = GetMultiXactIdMembers(multi, &members, false,
8009
8010 for (int i = 0; i < nmembers; i++)
8011 {
8012 xid = members[i].xid;
8014 if (TransactionIdPrecedes(xid, *NoFreezePageRelfrozenXid))
8015 *NoFreezePageRelfrozenXid = xid;
8016 if (TransactionIdPrecedes(xid, cutoffs->FreezeLimit))
8017 freeze = true;
8018 }
8019 if (nmembers > 0)
8020 pfree(members);
8021 }
8022
8023 if (tuple->t_infomask & HEAP_MOVED)
8024 {
8025 xid = HeapTupleHeaderGetXvac(tuple);
8026 if (TransactionIdIsNormal(xid))
8027 {
8029 if (TransactionIdPrecedes(xid, *NoFreezePageRelfrozenXid))
8030 *NoFreezePageRelfrozenXid = xid;
8031 /* heap_prepare_freeze_tuple forces xvac freezing */
8032 freeze = true;
8033 }
8034 }
8035
8036 return freeze;
8037}

References Assert, VacuumCutoffs::FreezeLimit, GetMultiXactIdMembers(), HEAP_LOCKED_UPGRADED(), HEAP_MOVED, HEAP_XMAX_IS_LOCKED_ONLY(), HEAP_XMAX_IS_MULTI, HeapTupleHeaderGetRawXmax(), HeapTupleHeaderGetXmin(), HeapTupleHeaderGetXvac(), i, InvalidMultiXactId, InvalidTransactionId, VacuumCutoffs::MultiXactCutoff, MultiXactIdIsValid, MultiXactIdPrecedes(), MultiXactIdPrecedesOrEquals(), pfree(), VacuumCutoffs::relfrozenxid, VacuumCutoffs::relminmxid, HeapTupleHeaderData::t_infomask, TransactionIdIsNormal, TransactionIdPrecedes(), TransactionIdPrecedesOrEquals(), and MultiXactMember::xid.

Referenced by heap_prepare_freeze_tuple(), and lazy_scan_noprune().
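
Every cutoff test above relies on circular 32-bit XID comparison: TransactionIdPrecedes() treats the difference modulo 2^32 as a signed number, so "older than" keeps working across wraparound. The standalone model below shows that comparison plus the way the function ratchets a per-page oldest-remaining-XID tracker while deciding whether the FreezeLimit forces freezing; the constants and helper names are invented, though normal XIDs starting at 3 matches PostgreSQL.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define MODEL_FIRST_NORMAL_XID 3u

/* Circular comparison, same idea as TransactionIdPrecedes(). */
static bool
xid_precedes(uint32_t a, uint32_t b)
{
    return (int32_t) (a - b) < 0;
}

/*
 * Would a tuple with this xmin need freezing, and what does it do to the
 * page's "no freeze" relfrozenxid tracker?  Mirrors the xmin branch above.
 */
static bool
xmin_should_freeze(uint32_t xmin, uint32_t freeze_limit, uint32_t *tracker)
{
    if (xmin < MODEL_FIRST_NORMAL_XID)
        return false;                       /* already frozen / permanent */
    if (xid_precedes(xmin, *tracker))
        *tracker = xmin;                    /* ratchet the tracker backwards */
    return xid_precedes(xmin, freeze_limit);
}

int
main(void)
{
    uint32_t tracker = 5000;                /* candidate new relfrozenxid */

    printf("%d\n", xmin_should_freeze(1000, 2000, &tracker));  /* 1: old xmin */
    printf("%d\n", xmin_should_freeze(4000, 2000, &tracker));  /* 0: recent   */
    printf("tracker is now %u\n", tracker);                    /* 1000 */

    /* Wraparound: 4,294,967,000 precedes 100 in circular XID space. */
    printf("%d\n", xid_precedes(4294967000u, 100));             /* 1 */
    return 0;
}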

◆ heap_update()

TM_Result heap_update ( Relation  relation,
const ItemPointerData otid,
HeapTuple  newtup,
CommandId  cid,
Snapshot  crosscheck,
bool  wait,
TM_FailureData tmfd,
LockTupleMode lockmode,
TU_UpdateIndexes update_indexes 
)

Definition at line 3311 of file heapam.c.

3315{
3316 TM_Result result;
3324 ItemId lp;
3328 bool old_key_copied = false;
3329 Page page;
3330 BlockNumber block;
3332 Buffer buffer,
3333 newbuf,
3334 vmbuffer = InvalidBuffer,
3336 bool need_toast;
3338 pagefree;
3339 bool have_tuple_lock = false;
3340 bool iscombo;
3341 bool use_hot_update = false;
3342 bool summarized_update = false;
3343 bool key_intact;
3344 bool all_visible_cleared = false;
3345 bool all_visible_cleared_new = false;
3346 bool checked_lockers;
3347 bool locker_remains;
3348 bool id_has_external = false;
3355
3357
3358 /* Cheap, simplistic check that the tuple matches the rel's rowtype. */
3361
3362 AssertHasSnapshotForToast(relation);
3363
3364 /*
3365 * Forbid this during a parallel operation, lest it allocate a combo CID.
3366 * Other workers might need that combo CID for visibility checks, and we
3367 * have no provision for broadcasting it to them.
3368 */
3369 if (IsInParallelMode())
3370 ereport(ERROR,
3372 errmsg("cannot update tuples during a parallel operation")));
3373
3374#ifdef USE_ASSERT_CHECKING
3376#endif
3377
3378 /*
3379 * Fetch the list of attributes to be checked for various operations.
3380 *
3381 * For HOT considerations, this is wasted effort if we fail to update or
3382 * have to put the new tuple on a different page. But we must compute the
3383 * list before obtaining buffer lock --- in the worst case, if we are
3384 * doing an update on one of the relevant system catalogs, we could
3385 * deadlock if we try to fetch the list later. In any case, the relcache
3386 * caches the data so this is usually pretty cheap.
3387 *
3388 * We also need columns used by the replica identity and columns that are
3389 * considered the "key" of rows in the table.
3390 *
3391 * Note that we get copies of each bitmap, so we need not worry about
3392 * relcache flush happening midway through.
3393 */
3406
3408 INJECTION_POINT("heap_update-before-pin", NULL);
3409 buffer = ReadBuffer(relation, block);
3410 page = BufferGetPage(buffer);
3411
3412 /*
3413 * Before locking the buffer, pin the visibility map page if it appears to
3414 * be necessary. Since we haven't got the lock yet, someone else might be
3415 * in the middle of changing this, so we'll need to recheck after we have
3416 * the lock.
3417 */
3418 if (PageIsAllVisible(page))
3419 visibilitymap_pin(relation, block, &vmbuffer);
3420
3422
3424
3425 /*
3426 * Usually, a buffer pin and/or snapshot blocks pruning of otid, ensuring
3427 * we see LP_NORMAL here. When the otid origin is a syscache, we may have
3428 * neither a pin nor a snapshot. Hence, we may see other LP_ states, each
3429 * of which indicates concurrent pruning.
3430 *
3431 * Failing with TM_Updated would be most accurate. However, unlike other
3432 * TM_Updated scenarios, we don't know the successor ctid in LP_UNUSED and
3433 * LP_DEAD cases. While the distinction between TM_Updated and TM_Deleted
3434 * does matter to SQL statements UPDATE and MERGE, those SQL statements
3435 * hold a snapshot that ensures LP_NORMAL. Hence, the choice between
3436 * TM_Updated and TM_Deleted affects only the wording of error messages.
3437 * Settle on TM_Deleted, for two reasons. First, it avoids complicating
3438 * the specification of when tmfd->ctid is valid. Second, it creates
3439 * error log evidence that we took this branch.
3440 *
3441 * Since it's possible to see LP_UNUSED at otid, it's also possible to see
3442 * LP_NORMAL for a tuple that replaced LP_UNUSED. If it's a tuple for an
3443 * unrelated row, we'll fail with "duplicate key value violates unique".
3444 * XXX if otid is the live, newer version of the newtup row, we'll discard
3445 * changes originating in versions of this catalog row after the version
3446 * the caller got from syscache. See syscache-update-pruned.spec.
3447 */
3448 if (!ItemIdIsNormal(lp))
3449 {
3451
3452 UnlockReleaseBuffer(buffer);
3454 if (vmbuffer != InvalidBuffer)
3455 ReleaseBuffer(vmbuffer);
3456 tmfd->ctid = *otid;
3457 tmfd->xmax = InvalidTransactionId;
3458 tmfd->cmax = InvalidCommandId;
3460
3465 /* modified_attrs not yet initialized */
3467 return TM_Deleted;
3468 }
3469
3470 /*
3471 * Fill in enough data in oldtup for HeapDetermineColumnsInfo to work
3472 * properly.
3473 */
3474 oldtup.t_tableOid = RelationGetRelid(relation);
3475 oldtup.t_data = (HeapTupleHeader) PageGetItem(page, lp);
3476 oldtup.t_len = ItemIdGetLength(lp);
3477 oldtup.t_self = *otid;
3478
3479 /* the new tuple is ready, except for this: */
3480 newtup->t_tableOid = RelationGetRelid(relation);
3481
3482 /*
3483 * Determine columns modified by the update. Additionally, identify
3484 * whether any of the unmodified replica identity key attributes in the
3485 * old tuple is externally stored or not. This is required because for
3486 * such attributes the flattened value won't be WAL logged as part of the
3487 * new tuple so we must include it as part of the old_key_tuple. See
3488 * ExtractReplicaIdentity.
3489 */
3491 id_attrs, &oldtup,
3493
3494 /*
3495 * If we're not updating any "key" column, we can grab a weaker lock type.
3496 * This allows for more concurrency when we are running simultaneously
3497 * with foreign key checks.
3498 *
3499 * Note that if a column gets detoasted while executing the update, but
3500 * the value ends up being the same, this test will fail and we will use
3501 * the stronger lock. This is acceptable; the important case to optimize
3502 * is updates that don't manipulate key columns, not those that
3503 * serendipitously arrive at the same key values.
3504 */
3506 {
3507 *lockmode = LockTupleNoKeyExclusive;
3509 key_intact = true;
3510
3511 /*
3512 * If this is the first possibly-multixact-able operation in the
3513 * current transaction, set my per-backend OldestMemberMXactId
3514 * setting. We can be certain that the transaction will never become a
3515 * member of any older MultiXactIds than that. (We have to do this
3516 * even if we end up just using our own TransactionId below, since
3517 * some other backend could incorporate our XID into a MultiXact
3518 * immediately afterwards.)
3519 */
3521 }
3522 else
3523 {
3524 *lockmode = LockTupleExclusive;
3526 key_intact = false;
3527 }
3528
3529 /*
3530 * Note: beyond this point, use oldtup not otid to refer to old tuple.
3531 * otid may very well point at newtup->t_self, which we will overwrite
3532 * with the new tuple's location, so there's great risk of confusion if we
3533 * use otid anymore.
3534 */
3535
3536l2:
3537 checked_lockers = false;
3538 locker_remains = false;
3539 result = HeapTupleSatisfiesUpdate(&oldtup, cid, buffer);
3540
3541 /* see below about the "no wait" case */
3542 Assert(result != TM_BeingModified || wait);
3543
3544 if (result == TM_Invisible)
3545 {
3546 UnlockReleaseBuffer(buffer);
3547 ereport(ERROR,
3549 errmsg("attempted to update invisible tuple")));
3550 }
3551 else if (result == TM_BeingModified && wait)
3552 {
3555 bool can_continue = false;
3556
3557 /*
3558 * XXX note that we don't consider the "no wait" case here. This
3559 * isn't a problem currently because no caller uses that case, but it
3560 * should be fixed if such a caller is introduced. It wasn't a
3561 * problem previously because this code would always wait, but now
3562 * that some tuple locks do not conflict with one of the lock modes we
3563 * use, it is possible that this case is interesting to handle
3564 * specially.
3565 *
3566 * This may cause failures with third-party code that calls
3567 * heap_update directly.
3568 */
3569
3570 /* must copy state data before unlocking buffer */
3572 infomask = oldtup.t_data->t_infomask;
3573
3574 /*
3575 * Now we have to do something about the existing locker. If it's a
3576 * multi, sleep on it; we might be awakened before it is completely
3577 * gone (or even not sleep at all in some cases); we need to preserve
3578 * it as locker, unless it is gone completely.
3579 *
3580 * If it's not a multi, we need to check for sleeping conditions
3581 * before actually going to sleep. If the update doesn't conflict
3582 * with the locks, we just continue without sleeping (but making sure
3583 * it is preserved).
3584 *
3585 * Before sleeping, we need to acquire tuple lock to establish our
3586 * priority for the tuple (see heap_lock_tuple). LockTuple will
3587 * release us when we are next-in-line for the tuple. Note we must
3588 * not acquire the tuple lock until we're sure we're going to sleep;
3589 * otherwise we're open for race conditions with other transactions
3590 * holding the tuple lock which sleep on us.
3591 *
3592 * If we are forced to "start over" below, we keep the tuple lock;
3593 * this arranges that we stay at the head of the line while rechecking
3594 * tuple state.
3595 */
3597 {
3599 int remain;
3600 bool current_is_member = false;
3601
3603 *lockmode, &current_is_member))
3604 {
3606
3607 /*
3608 * Acquire the lock, if necessary (but skip it when we're
3609 * requesting a lock and already have one; avoids deadlock).
3610 */
3611 if (!current_is_member)
3612 heap_acquire_tuplock(relation, &(oldtup.t_self), *lockmode,
3614
3615 /* wait for multixact */
3617 relation, &oldtup.t_self, XLTW_Update,
3618 &remain);
3619 checked_lockers = true;
3620 locker_remains = remain != 0;
3622
3623 /*
3624 * If xwait had just locked the tuple then some other xact
3625 * could update this tuple before we get to this point. Check
3626 * for xmax change, and start over if so.
3627 */
3628 if (xmax_infomask_changed(oldtup.t_data->t_infomask,
3629 infomask) ||
3631 xwait))
3632 goto l2;
3633 }
3634
3635 /*
3636 * Note that the multixact may not be done by now. It could have
3637 * surviving members; our own xact or other subxacts of this
3638 * backend, and also any other concurrent transaction that locked
3639 * the tuple with LockTupleKeyShare if we only got
3640 * LockTupleNoKeyExclusive. If this is the case, we have to be
3641 * careful to mark the updated tuple with the surviving members in
3642 * Xmax.
3643 *
3644 * Note that there could have been another update in the
3645 * MultiXact. In that case, we need to check whether it committed
3646 * or aborted. If it aborted we are safe to update it again;
3647 * otherwise there is an update conflict, and we have to return
3648 * TableTuple{Deleted, Updated} below.
3649 *
3650 * In the LockTupleExclusive case, we still need to preserve the
3651 * surviving members: those would include the tuple locks we had
3652 * before this one, which are important to keep in case this
3653 * subxact aborts.
3654 */
3655 if (!HEAP_XMAX_IS_LOCKED_ONLY(oldtup.t_data->t_infomask))
3657 else
3659
3660 /*
3661 * There was no UPDATE in the MultiXact; or it aborted. No
3662 * TransactionIdIsInProgress() call needed here, since we called
3663 * MultiXactIdWait() above.
3664 */
3667 can_continue = true;
3668 }
3670 {
3671 /*
3672 * The only locker is ourselves; we can avoid grabbing the tuple
3673 * lock here, but must preserve our locking information.
3674 */
3675 checked_lockers = true;
3676 locker_remains = true;
3677 can_continue = true;
3678 }
3680 {
3681 /*
3682 * If it's just a key-share locker, and we're not changing the key
3683 * columns, we don't need to wait for it to end; but we need to
3684 * preserve it as locker.
3685 */
3686 checked_lockers = true;
3687 locker_remains = true;
3688 can_continue = true;
3689 }
3690 else
3691 {
3692 /*
3693 * Wait for regular transaction to end; but first, acquire tuple
3694 * lock.
3695 */
3697 heap_acquire_tuplock(relation, &(oldtup.t_self), *lockmode,
3699 XactLockTableWait(xwait, relation, &oldtup.t_self,
3700 XLTW_Update);
3701 checked_lockers = true;
3703
3704 /*
3705 * xwait is done, but if xwait had just locked the tuple then some
3706 * other xact could update this tuple before we get to this point.
3707 * Check for xmax change, and start over if so.
3708 */
3709 if (xmax_infomask_changed(oldtup.t_data->t_infomask, infomask) ||
3712 goto l2;
3713
3714 /* Otherwise check if it committed or aborted */
3715 UpdateXmaxHintBits(oldtup.t_data, buffer, xwait);
3716 if (oldtup.t_data->t_infomask & HEAP_XMAX_INVALID)
3717 can_continue = true;
3718 }
3719
3720 if (can_continue)
3721 result = TM_Ok;
3722 else if (!ItemPointerEquals(&oldtup.t_self, &oldtup.t_data->t_ctid))
3723 result = TM_Updated;
3724 else
3725 result = TM_Deleted;
3726 }
3727
3728 /* Sanity check the result HeapTupleSatisfiesUpdate() and the logic above */
3729 if (result != TM_Ok)
3730 {
3731 Assert(result == TM_SelfModified ||
3732 result == TM_Updated ||
3733 result == TM_Deleted ||
3734 result == TM_BeingModified);
3735 Assert(!(oldtup.t_data->t_infomask & HEAP_XMAX_INVALID));
3736 Assert(result != TM_Updated ||
3737 !ItemPointerEquals(&oldtup.t_self, &oldtup.t_data->t_ctid));
3738 }
3739
3740 if (crosscheck != InvalidSnapshot && result == TM_Ok)
3741 {
3742 /* Perform additional check for transaction-snapshot mode RI updates */
3744 result = TM_Updated;
3745 }
3746
3747 if (result != TM_Ok)
3748 {
3749 tmfd->ctid = oldtup.t_data->t_ctid;
3750 tmfd->xmax = HeapTupleHeaderGetUpdateXid(oldtup.t_data);
3751 if (result == TM_SelfModified)
3752 tmfd->cmax = HeapTupleHeaderGetCmax(oldtup.t_data);
3753 else
3754 tmfd->cmax = InvalidCommandId;
3755 UnlockReleaseBuffer(buffer);
3756 if (have_tuple_lock)
3757 UnlockTupleTuplock(relation, &(oldtup.t_self), *lockmode);
3758 if (vmbuffer != InvalidBuffer)
3759 ReleaseBuffer(vmbuffer);
3761
3768 return result;
3769 }
3770
3771 /*
3772 * If we didn't pin the visibility map page and the page has become all
3773 * visible while we were busy locking the buffer, or during some
3774 * subsequent window during which we had it unlocked, we'll have to unlock
3775 * and re-lock, to avoid holding the buffer lock across an I/O. That's a
3776 * bit unfortunate, especially since we'll now have to recheck whether the
3777 * tuple has been locked or updated under us, but hopefully it won't
3778 * happen very often.
3779 */
3780 if (vmbuffer == InvalidBuffer && PageIsAllVisible(page))
3781 {
3783 visibilitymap_pin(relation, block, &vmbuffer);
3785 goto l2;
3786 }
3787
3788 /* Fill in transaction status data */
3789
3790 /*
3791 * If the tuple we're updating is locked, we need to preserve the locking
3792 * info in the old tuple's Xmax. Prepare a new Xmax value for this.
3793 */
3795 oldtup.t_data->t_infomask,
3796 oldtup.t_data->t_infomask2,
3797 xid, *lockmode, true,
3800
3801 /*
3802 * And also prepare an Xmax value for the new copy of the tuple. If there
3803 * was no xmax previously, or there was one but all lockers are now gone,
3804 * then use InvalidTransactionId; otherwise, get the xmax from the old
3805 * tuple. (In rare cases that might also be InvalidTransactionId and yet
3806 * not have the HEAP_XMAX_INVALID bit set; that's fine.)
3807 */
3808 if ((oldtup.t_data->t_infomask & HEAP_XMAX_INVALID) ||
3809 HEAP_LOCKED_UPGRADED(oldtup.t_data->t_infomask) ||
3812 else
3814
3816 {
3819 }
3820 else
3821 {
3822 /*
3823 * If we found a valid Xmax for the new tuple, then the infomask bits
3824 * to use on the new tuple depend on what was there on the old one.
3825 * Note that since we're doing an update, the only possibility is that
3826 * the lockers had FOR KEY SHARE lock.
3827 */
3828 if (oldtup.t_data->t_infomask & HEAP_XMAX_IS_MULTI)
3829 {
3832 }
3833 else
3834 {
3837 }
3838 }
3839
3840 /*
3841 * Prepare the new tuple with the appropriate initial values of Xmin and
3842 * Xmax, as well as initial infomask bits as computed above.
3843 */
3844 newtup->t_data->t_infomask &= ~(HEAP_XACT_MASK);
3845 newtup->t_data->t_infomask2 &= ~(HEAP2_XACT_MASK);
3846 HeapTupleHeaderSetXmin(newtup->t_data, xid);
3848 newtup->t_data->t_infomask |= HEAP_UPDATED | infomask_new_tuple;
3849 newtup->t_data->t_infomask2 |= infomask2_new_tuple;
3851
3852 /*
3853 * Replace cid with a combo CID if necessary. Note that we already put
3854 * the plain cid into the new tuple.
3855 */
3857
3858 /*
3859 * If the toaster needs to be activated, OR if the new tuple will not fit
3860 * on the same page as the old, then we need to release the content lock
3861 * (but not the pin!) on the old tuple's buffer while we are off doing
3862 * TOAST and/or table-file-extension work. We must mark the old tuple to
3863 * show that it's locked, else other processes may try to update it
3864 * themselves.
3865 *
3866 * We need to invoke the toaster if there are already any out-of-line
3867 * toasted values present, or if the new tuple is over-threshold.
3868 */
3869 if (relation->rd_rel->relkind != RELKIND_RELATION &&
3870 relation->rd_rel->relkind != RELKIND_MATVIEW)
3871 {
3872 /* toast table entries should never be recursively toasted */
3875 need_toast = false;
3876 }
3877 else
3880 newtup->t_len > TOAST_TUPLE_THRESHOLD);
3881
3883
3884 newtupsize = MAXALIGN(newtup->t_len);
3885
3887 {
3891 bool cleared_all_frozen = false;
3892
3893 /*
3894 * To prevent concurrent sessions from updating the tuple, we have to
3895 * temporarily mark it locked, while we release the page-level lock.
3896 *
3897 * To satisfy the rule that any xid potentially appearing in a buffer
3898 * written out to disk must first be WAL-logged, we unfortunately have
3899 * to WAL log this temporary modification. We can reuse xl_heap_lock for this
3900 * purpose. If we crash/error before following through with the
3901 * actual update, xmax will be of an aborted transaction, allowing
3902 * other sessions to proceed.
3903 */
3904
3905 /*
3906 * Compute xmax / infomask appropriate for locking the tuple. This has
3907 * to be done separately from the combo that's going to be used for
3908 * updating, because the potentially created multixact would otherwise
3909 * be wrong.
3910 */
3912 oldtup.t_data->t_infomask,
3913 oldtup.t_data->t_infomask2,
3914 xid, *lockmode, false,
3917
3919
3921
3922 /* Clear obsolete visibility flags ... */
3923 oldtup.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
3924 oldtup.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
3926 /* ... and store info about transaction updating this tuple */
3929 oldtup.t_data->t_infomask |= infomask_lock_old_tuple;
3930 oldtup.t_data->t_infomask2 |= infomask2_lock_old_tuple;
3932
3933 /* temporarily make it look not-updated, but locked */
3934 oldtup.t_data->t_ctid = oldtup.t_self;
3935
3936 /*
3937 * Clear all-frozen bit on visibility map if needed. We could
3938 * immediately reset ALL_VISIBLE, but given that the WAL logging
3939 * overhead would be unchanged, that doesn't necessarily seem
3940 * worthwhile.
3941 */
3942 if (PageIsAllVisible(page) &&
3943 visibilitymap_clear(relation, block, vmbuffer,
3945 cleared_all_frozen = true;
3946
3947 MarkBufferDirty(buffer);
3948
3949 if (RelationNeedsWAL(relation))
3950 {
3953
3956
3957 xlrec.offnum = ItemPointerGetOffsetNumber(&oldtup.t_self);
3959 xlrec.infobits_set = compute_infobits(oldtup.t_data->t_infomask,
3960 oldtup.t_data->t_infomask2);
3961 xlrec.flags =
3965 PageSetLSN(page, recptr);
3966 }
3967
3969
3971
3972 /*
3973 * Let the toaster do its thing, if needed.
3974 *
3975 * Note: below this point, heaptup is the data we actually intend to
3976 * store into the relation; newtup is the caller's original untoasted
3977 * data.
3978 */
3979 if (need_toast)
3980 {
3981 /* Note we always use WAL and FSM during updates */
3983 newtupsize = MAXALIGN(heaptup->t_len);
3984 }
3985 else
3986 heaptup = newtup;
3987
3988 /*
3989 * Now, do we need a new page for the tuple, or not? This is a bit
3990 * tricky since someone else could have added tuples to the page while
3991 * we weren't looking. We have to recheck the available space after
3992 * reacquiring the buffer lock. But don't bother to do that if the
3993 * former amount of free space is still not enough; it's unlikely
3994 * there's more free now than before.
3995 *
3996 * What's more, if we need to get a new page, we will need to acquire
3997 * buffer locks on both old and new pages. To avoid deadlock against
3998 * some other backend trying to get the same two locks in the other
3999 * order, we must be consistent about the order we get the locks in.
4000 * We use the rule "lock the lower-numbered page of the relation
4001 * first". To implement this, we must do RelationGetBufferForTuple
4002 * while not holding the lock on the old page, and we must rely on it
4003 * to get the locks on both pages in the correct order.
4004 *
4005 * Another consideration is that we need visibility map page pin(s) if
4006 * we will have to clear the all-visible flag on either page. If we
4007 * call RelationGetBufferForTuple, we rely on it to acquire any such
4008 * pins; but if we don't, we have to handle that here. Hence we need
4009 * a loop.
4010 */
4011 for (;;)
4012 {
4013 if (newtupsize > pagefree)
4014 {
4015 /* It doesn't fit, must use RelationGetBufferForTuple. */
4016 newbuf = RelationGetBufferForTuple(relation, heaptup->t_len,
4017 buffer, 0, NULL,
4018 &vmbuffer_new, &vmbuffer,
4019 0);
4020 /* We're all done. */
4021 break;
4022 }
4023 /* Acquire VM page pin if needed and we don't have it. */
4024 if (vmbuffer == InvalidBuffer && PageIsAllVisible(page))
4025 visibilitymap_pin(relation, block, &vmbuffer);
4026 /* Re-acquire the lock on the old tuple's page. */
4028 /* Re-check using the up-to-date free space */
4030 if (newtupsize > pagefree ||
4031 (vmbuffer == InvalidBuffer && PageIsAllVisible(page)))
4032 {
4033 /*
4034 * Rats, it doesn't fit anymore, or somebody just now set the
4035 * all-visible flag. We must now unlock and loop to avoid
4036 * deadlock. Fortunately, this path should seldom be taken.
4037 */
4039 }
4040 else
4041 {
4042 /* We're all done. */
4043 newbuf = buffer;
4044 break;
4045 }
4046 }
4047 }
4048 else
4049 {
4050 /* No TOAST work needed, and it'll fit on same page */
4051 newbuf = buffer;
4052 heaptup = newtup;
4053 }
4054
4055 /*
4056 * We're about to do the actual update -- check for conflict first, to
4057 * avoid possibly having to roll back work we've just done.
4058 *
4059 * This is safe without a recheck as long as there is no possibility of
4060 * another process scanning the pages between this check and the update
4061 * being visible to the scan (i.e., exclusive buffer content lock(s) are
4062 * continuously held from this point until the tuple update is visible).
4063 *
4064 * For the new tuple the only check needed is at the relation level, but
4065 * since both tuples are in the same relation and the check for oldtup
4066 * will include checking the relation level, there is no benefit to a
4067 * separate check for the new tuple.
4068 */
4069 CheckForSerializableConflictIn(relation, &oldtup.t_self,
4070 BufferGetBlockNumber(buffer));
4071
4072 /*
4073 * At this point newbuf and buffer are both pinned and locked, and newbuf
4074 * has enough space for the new tuple. If they are the same buffer, only
4075 * one pin is held.
4076 */
4077
4078 if (newbuf == buffer)
4079 {
4080 /*
4081 * Since the new tuple is going into the same page, we might be able
4082 * to do a HOT update. Check if any of the index columns have been
4083 * changed.
4084 */
4086 {
4087 use_hot_update = true;
4088
4089 /*
4090 * If none of the columns that are used in hot-blocking indexes
4091 * were updated, we can apply HOT, but we do still need to check
4092 * if we need to update the summarizing indexes, and update those
4093 * indexes if the columns were updated, or we may fail to detect
4094 * e.g. value bound changes in BRIN minmax indexes.
4095 */
4097 summarized_update = true;
4098 }
4099 }
4100 else
4101 {
4102 /* Set a hint that the old page could use prune/defrag */
4103 PageSetFull(page);
4104 }
4105
4106 /*
4107 * Compute replica identity tuple before entering the critical section so
4108 * we don't PANIC upon a memory allocation failure.
4109 * ExtractReplicaIdentity() will return NULL if nothing needs to be
4110 * logged. Pass old key required as true only if the replica identity key
4111 * columns are modified or it has external data.
4112 */
4117
4118 /* NO EREPORT(ERROR) from here till changes are logged */
4120
4121 /*
4122 * If this transaction commits, the old tuple will become DEAD sooner or
4123 * later. Set flag that this page is a candidate for pruning once our xid
4124 * falls below the OldestXmin horizon. If the transaction finally aborts,
4125 * the subsequent page pruning will be a no-op and the hint will be
4126 * cleared.
4127 *
4128 * XXX Should we set hint on newbuf as well? If the transaction aborts,
4129 * there would be a prunable tuple in the newbuf; but for now we choose
4130 * not to optimize for aborts. Note that heap_xlog_update must be kept in
4131 * sync if this decision changes.
4132 */
4133 PageSetPrunable(page, xid);
4134
4135 if (use_hot_update)
4136 {
4137 /* Mark the old tuple as HOT-updated */
4139 /* And mark the new tuple as heap-only */
4141 /* Mark the caller's copy too, in case different from heaptup */
4143 }
4144 else
4145 {
4146 /* Make sure tuples are correctly marked as not-HOT */
4150 }
4151
4152 RelationPutHeapTuple(relation, newbuf, heaptup, false); /* insert new tuple */
4153
4154
4155 /* Clear obsolete visibility flags, possibly set by ourselves above... */
4156 oldtup.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED);
4157 oldtup.t_data->t_infomask2 &= ~HEAP_KEYS_UPDATED;
4158 /* ... and store info about transaction updating this tuple */
4161 oldtup.t_data->t_infomask |= infomask_old_tuple;
4162 oldtup.t_data->t_infomask2 |= infomask2_old_tuple;
4164
4165 /* record address of new tuple in t_ctid of old one */
4166 oldtup.t_data->t_ctid = heaptup->t_self;
4167
4168 /* clear PD_ALL_VISIBLE flags, reset all visibilitymap bits */
4169 if (PageIsAllVisible(BufferGetPage(buffer)))
4170 {
4171 all_visible_cleared = true;
4173 visibilitymap_clear(relation, BufferGetBlockNumber(buffer),
4174 vmbuffer, VISIBILITYMAP_VALID_BITS);
4175 }
4176 if (newbuf != buffer && PageIsAllVisible(BufferGetPage(newbuf)))
4177 {
4182 }
4183
4184 if (newbuf != buffer)
4186 MarkBufferDirty(buffer);
4187
4188 /* XLOG stuff */
4189 if (RelationNeedsWAL(relation))
4190 {
4192
4193 /*
4194 * For logical decoding we need combo CIDs to properly decode the
4195 * catalog.
4196 */
4198 {
4199 log_heap_new_cid(relation, &oldtup);
4200 log_heap_new_cid(relation, heaptup);
4201 }
4202
4203 recptr = log_heap_update(relation, buffer,
4208 if (newbuf != buffer)
4209 {
4211 }
4213 }
4214
4216
4217 if (newbuf != buffer)
4220
4221 /*
4222 * Mark old tuple for invalidation from system caches at next command
4223 * boundary, and mark the new tuple for invalidation in case we abort. We
4224 * have to do this before releasing the buffer because oldtup is in the
4225 * buffer. (heaptup is all in local memory, but it's necessary to process
4226 * both tuple versions in one call to inval.c so we can avoid redundant
4227 * sinval messages.)
4228 */
4230
4231 /* Now we can release the buffer(s) */
4232 if (newbuf != buffer)
4234 ReleaseBuffer(buffer);
4237 if (BufferIsValid(vmbuffer))
4238 ReleaseBuffer(vmbuffer);
4239
4240 /*
4241 * Release the lmgr tuple lock, if we had it.
4242 */
4243 if (have_tuple_lock)
4244 UnlockTupleTuplock(relation, &(oldtup.t_self), *lockmode);
4245
4246 pgstat_count_heap_update(relation, use_hot_update, newbuf != buffer);
4247
4248 /*
4249 * If heaptup is a private copy, release it. Don't forget to copy t_self
4250 * back to the caller's image, too.
4251 */
4252 if (heaptup != newtup)
4253 {
4254 newtup->t_self = heaptup->t_self;
4256 }
4257
4258 /*
4259 * If it is a HOT update, the update may still need to update summarized
4260 * indexes, lest we fail to update those summaries and get incorrect
4261 * results (for example, minmax bounds of the block may change with this
4262 * update).
4263 */
4264 if (use_hot_update)
4265 {
4268 else
4270 }
4271 else
4273
4276
4283
4284 return TM_Ok;
4285}

References Assert, AssertHasSnapshotForToast(), bms_add_members(), bms_free(), bms_overlap(), BUFFER_LOCK_EXCLUSIVE, BUFFER_LOCK_UNLOCK, BufferGetBlockNumber(), BufferGetPage(), BufferIsValid(), CacheInvalidateHeapTuple(), CheckForSerializableConflictIn(), TM_FailureData::cmax, compute_infobits(), compute_new_xmax_infomask(), TM_FailureData::ctid, DoesMultiXactIdConflict(), END_CRIT_SECTION, ereport, errcode(), errmsg(), ERROR, ExtractReplicaIdentity(), fb(), GetCurrentTransactionId(), GetMultiXactIdHintBits(), HEAP2_XACT_MASK, heap_acquire_tuplock(), heap_freetuple(), HEAP_LOCKED_UPGRADED(), HEAP_MOVED, heap_toast_insert_or_update(), HEAP_UPDATED, HEAP_XACT_MASK, HEAP_XMAX_BITS, HEAP_XMAX_INVALID, HEAP_XMAX_IS_KEYSHR_LOCKED(), HEAP_XMAX_IS_LOCKED_ONLY(), HEAP_XMAX_IS_MULTI, HEAP_XMAX_KEYSHR_LOCK, HEAP_XMAX_LOCK_ONLY, HeapDetermineColumnsInfo(), HeapTupleClearHeapOnly(), HeapTupleClearHotUpdated(), HeapTupleGetUpdateXid(), HeapTupleHasExternal(), HeapTupleHeaderAdjustCmax(), HeapTupleHeaderGetCmax(), HeapTupleHeaderGetNatts, HeapTupleHeaderGetRawXmax(), HeapTupleHeaderGetUpdateXid(), HeapTupleHeaderSetCmax(), HeapTupleHeaderSetCmin(), HeapTupleHeaderSetXmax(), HeapTupleHeaderSetXmin(), HeapTupleSatisfiesUpdate(), HeapTupleSatisfiesVisibility(), HeapTupleSetHeapOnly(), HeapTupleSetHotUpdated(), INDEX_ATTR_BITMAP_HOT_BLOCKING, INDEX_ATTR_BITMAP_IDENTITY_KEY, INDEX_ATTR_BITMAP_KEY, INDEX_ATTR_BITMAP_SUMMARIZED, INJECTION_POINT, InvalidBuffer, InvalidCommandId, InvalidSnapshot, InvalidTransactionId, IsInParallelMode(), ItemIdGetLength, ItemIdIsNormal, ItemPointerEquals(), ItemPointerGetBlockNumber(), ItemPointerGetOffsetNumber(), ItemPointerIsValid(), LockBuffer(), LockTupleExclusive, LockTupleNoKeyExclusive, LockWaitBlock, log_heap_new_cid(), log_heap_update(), MarkBufferDirty(), MAXALIGN, MultiXactIdSetOldestMember(), MultiXactIdWait(), MultiXactStatusNoKeyUpdate, MultiXactStatusUpdate, PageClearAllVisible(), PageGetHeapFreeSpace(), PageGetItem(), PageGetItemId(), PageIsAllVisible(), PageSetFull(), PageSetLSN(), PageSetPrunable, pgstat_count_heap_update(), RelationData::rd_rel, ReadBuffer(), REGBUF_STANDARD, RelationGetBufferForTuple(), RelationGetIndexAttrBitmap(), RelationGetNumberOfAttributes, RelationGetRelid, RelationIsAccessibleInLogicalDecoding, RelationNeedsWAL, RelationPutHeapTuple(), RelationSupportsSysCache(), ReleaseBuffer(), SizeOfHeapLock, START_CRIT_SECTION, TM_BeingModified, TM_Deleted, TM_Invisible, TM_Ok, TM_SelfModified, TM_Updated, TOAST_TUPLE_THRESHOLD, TransactionIdDidAbort(), TransactionIdEquals, TransactionIdIsCurrentTransactionId(), TransactionIdIsValid, TU_All, TU_None, TU_Summarizing, UnlockReleaseBuffer(), UnlockTupleTuplock, UpdateXmaxHintBits(), VISIBILITYMAP_ALL_FROZEN, visibilitymap_clear(), visibilitymap_pin(), VISIBILITYMAP_VALID_BITS, XactLockTableWait(), XLH_LOCK_ALL_FROZEN_CLEARED, XLOG_HEAP_LOCK, XLogBeginInsert(), XLogInsert(), XLogRegisterBuffer(), XLogRegisterData(), XLTW_Update, TM_FailureData::xmax, and xmax_infomask_changed().

Referenced by heapam_tuple_update(), and simple_heap_update().
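
The lock-ordering comment in heap_update() above ("lock the lower-numbered page of the relation first") is the classic deadlock-avoidance rule for taking two locks at once. The standalone sketch below illustrates just that rule; the page_locks array, lock_two_pages(), and the use of pthread mutexes are stand-ins invented for this example and are not PostgreSQL code (the real implementation delegates the ordering to RelationGetBufferForTuple()).

#include <stdio.h>
#include <pthread.h>

/*
 * Standalone illustration of the "lock the lower-numbered page first"
 * rule heap_update() relies on when it must hold two page locks at once.
 * The page array and mutexes are hypothetical stand-ins.
 */
#define NPAGES 16

static pthread_mutex_t page_locks[NPAGES];

static void
lock_two_pages(int blk1, int blk2)
{
    /* Always acquire in ascending block-number order to avoid deadlock. */
    int lo = (blk1 < blk2) ? blk1 : blk2;
    int hi = (blk1 < blk2) ? blk2 : blk1;

    pthread_mutex_lock(&page_locks[lo]);
    if (hi != lo)
        pthread_mutex_lock(&page_locks[hi]);
}

static void
unlock_two_pages(int blk1, int blk2)
{
    int lo = (blk1 < blk2) ? blk1 : blk2;
    int hi = (blk1 < blk2) ? blk2 : blk1;

    if (hi != lo)
        pthread_mutex_unlock(&page_locks[hi]);
    pthread_mutex_unlock(&page_locks[lo]);
}

int
main(void)
{
    for (int i = 0; i < NPAGES; i++)
        pthread_mutex_init(&page_locks[i], NULL);

    /* Both argument orders acquire the locks in the same (ascending) order. */
    lock_two_pages(7, 3);
    unlock_two_pages(7, 3);
    lock_two_pages(3, 7);
    unlock_two_pages(3, 7);

    printf("acquired and released page locks in consistent order\n");
    return 0;
}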

◆ HeapCheckForSerializableConflictOut()

void HeapCheckForSerializableConflictOut ( bool  visible,
Relation  relation,
HeapTuple  tuple,
Buffer  buffer,
Snapshot  snapshot 
)

Definition at line 9325 of file heapam.c.

9328{
9329 TransactionId xid;
9331
9332 if (!CheckForSerializableConflictOutNeeded(relation, snapshot))
9333 return;
9334
9335 /*
9336 * Check to see whether the tuple has been written to by a concurrent
9337 * transaction, either to create it so it is not visible to us, or to delete it
9338 * while it is visible to us. The "visible" bool indicates whether the
9339 * tuple is visible to us, while HeapTupleSatisfiesVacuum checks what else
9340 * is going on with it.
9341 *
9342 * In the event of a concurrently inserted tuple that also happens to have
9343 * been concurrently updated (by a separate transaction), the xmin of the
9344 * tuple will be used -- not the updater's xid.
9345 */
9347 switch (htsvResult)
9348 {
9349 case HEAPTUPLE_LIVE:
9350 if (visible)
9351 return;
9352 xid = HeapTupleHeaderGetXmin(tuple->t_data);
9353 break;
9356 if (visible)
9357 xid = HeapTupleHeaderGetUpdateXid(tuple->t_data);
9358 else
9359 xid = HeapTupleHeaderGetXmin(tuple->t_data);
9360
9362 {
9363 /* This is like the HEAPTUPLE_DEAD case */
9364 Assert(!visible);
9365 return;
9366 }
9367 break;
9369 xid = HeapTupleHeaderGetXmin(tuple->t_data);
9370 break;
9371 case HEAPTUPLE_DEAD:
9372 Assert(!visible);
9373 return;
9374 default:
9375
9376 /*
9377 * The only way to get to this default clause is if a new value is
9378 * added to the enum type without adding it to this switch
9379 * statement. That's a bug, so elog.
9380 */
9381 elog(ERROR, "unrecognized return value from HeapTupleSatisfiesVacuum: %u", htsvResult);
9382
9383 /*
9384 * In spite of having all enum values covered and calling elog on
9385 * this default, some compilers think this is a code path which
9386 * allows xid to be used below without initialization. Silence
9387 * that warning.
9388 */
9390 }
9391
9394
9395 /*
9396 * Find top level xid. Bail out if xid is too early to be a conflict, or
9397 * if it's our own xid.
9398 */
9400 return;
9403 return;
9404
9405 CheckForSerializableConflictOut(relation, xid, snapshot);
9406}

References Assert, CheckForSerializableConflictOut(), CheckForSerializableConflictOutNeeded(), elog, ERROR, fb(), GetTopTransactionIdIfAny(), HEAPTUPLE_DEAD, HEAPTUPLE_DELETE_IN_PROGRESS, HEAPTUPLE_INSERT_IN_PROGRESS, HEAPTUPLE_LIVE, HEAPTUPLE_RECENTLY_DEAD, HeapTupleHeaderGetUpdateXid(), HeapTupleHeaderGetXmin(), HeapTupleSatisfiesVacuum(), InvalidTransactionId, SubTransGetTopmostTransaction(), HeapTupleData::t_data, TransactionIdEquals, TransactionIdFollowsOrEquals(), TransactionIdIsValid, TransactionIdPrecedes(), and TransactionXmin.

Referenced by BitmapHeapScanNextBlock(), heap_fetch(), heap_get_latest_tid(), heap_hot_search_buffer(), heapam_scan_sample_next_tuple(), heapgettup(), and page_collect_tuples().

◆ HeapDetermineColumnsInfo()

static Bitmapset * HeapDetermineColumnsInfo ( Relation  relation,
Bitmapset interesting_cols,
Bitmapset external_cols,
HeapTuple  oldtup,
HeapTuple  newtup,
bool has_external 
)
static

Definition at line 4465 of file heapam.c.

4470{
4471 int attidx;
4473 TupleDesc tupdesc = RelationGetDescr(relation);
4474
4475 attidx = -1;
4476 while ((attidx = bms_next_member(interesting_cols, attidx)) >= 0)
4477 {
4478 /* attidx is zero-based, attrnum is the normal attribute number */
4480 Datum value1,
4481 value2;
4482 bool isnull1,
4483 isnull2;
4484
4485 /*
4486 * If it's a whole-tuple reference, say "not equal". It's not really
4487 * worth supporting this case, since it could only succeed after a
4488 * no-op update, which is hardly a case worth optimizing for.
4489 */
4490 if (attrnum == 0)
4491 {
4492 modified = bms_add_member(modified, attidx);
4493 continue;
4494 }
4495
4496 /*
4497 * Likewise, automatically say "not equal" for any system attribute
4498 * other than tableOID; we cannot expect these to be consistent in a
4499 * HOT chain, or even to be set correctly yet in the new tuple.
4500 */
4501 if (attrnum < 0)
4502 {
4503 if (attrnum != TableOidAttributeNumber)
4504 {
4505 modified = bms_add_member(modified, attidx);
4506 continue;
4507 }
4508 }
4509
4510 /*
4511 * Extract the corresponding values. XXX this is pretty inefficient
4512 * if there are many indexed columns. Should we do a single
4513 * heap_deform_tuple call on each tuple, instead? But that doesn't
4514 * work for system columns ...
4515 */
4516 value1 = heap_getattr(oldtup, attrnum, tupdesc, &isnull1);
4517 value2 = heap_getattr(newtup, attrnum, tupdesc, &isnull2);
4518
4519 if (!heap_attr_equals(tupdesc, attrnum, value1,
4520 value2, isnull1, isnull2))
4521 {
4522 modified = bms_add_member(modified, attidx);
4523 continue;
4524 }
4525
4526 /*
4527 * No need to check attributes that can't be stored externally. Note
4528 * that system attributes can't be stored externally.
4529 */
4530 if (attrnum < 0 || isnull1 ||
4531 TupleDescCompactAttr(tupdesc, attrnum - 1)->attlen != -1)
4532 continue;
4533
4534 /*
4535 * Check if the old tuple's attribute is stored externally and is a
4536 * member of external_cols.
4537 */
4540 *has_external = true;
4541 }
4542
4543 return modified;
4544}

References attlen, bms_add_member(), bms_is_member(), bms_next_member(), DatumGetPointer(), fb(), FirstLowInvalidHeapAttributeNumber, heap_attr_equals(), heap_getattr(), RelationGetDescr, TableOidAttributeNumber, TupleDescCompactAttr(), and VARATT_IS_EXTERNAL().

Referenced by heap_update().
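
As a hedged illustration of the comparison loop above, the following standalone sketch walks a set of "interesting" column indexes, compares old and new values (treating NULLness explicitly), and collects the changed columns into a bitmask. The Cell struct, the uint32_t mask, and determine_modified_columns() are simplifications invented for this example; the real function operates on HeapTuples, Bitmapsets, and datum-level comparisons.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/*
 * Simplified stand-in for HeapDetermineColumnsInfo(): given old and new
 * row images (arrays of ints with a NULL flag), return a bitmask of the
 * "interesting" columns whose values changed.
 */
typedef struct
{
    bool isnull;
    int  value;
} Cell;

static uint32_t
determine_modified_columns(const Cell *oldrow, const Cell *newrow,
                           uint32_t interesting, int ncols)
{
    uint32_t modified = 0;

    for (int col = 0; col < ncols; col++)
    {
        if ((interesting & (1u << col)) == 0)
            continue;           /* caller doesn't care about this column */

        if (oldrow[col].isnull != newrow[col].isnull)
            modified |= 1u << col;      /* NULL-ness changed */
        else if (!oldrow[col].isnull &&
                 oldrow[col].value != newrow[col].value)
            modified |= 1u << col;      /* non-NULL values differ */
    }
    return modified;
}

int
main(void)
{
    Cell oldrow[3] = {{false, 1}, {false, 2}, {true, 0}};
    Cell newrow[3] = {{false, 1}, {false, 9}, {false, 5}};

    /* Only columns 1 and 2 are interesting; both changed, so mask is 0x6. */
    printf("modified mask = 0x%x\n",
           determine_modified_columns(oldrow, newrow, 0x6, 3));
    return 0;
}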

◆ heapgettup()

static void heapgettup ( HeapScanDesc  scan,
ScanDirection  dir,
int  nkeys,
ScanKey  key 
)
static

Definition at line 959 of file heapam.c.

963{
964 HeapTuple tuple = &(scan->rs_ctup);
965 Page page;
967 int linesleft;
968
969 if (likely(scan->rs_inited))
970 {
971 /* continue from previously returned page/tuple */
973 page = heapgettup_continue_page(scan, dir, &linesleft, &lineoff);
974 goto continue_page;
975 }
976
977 /*
978 * advance the scan until we find a qualifying tuple or run out of stuff
979 * to scan
980 */
981 while (true)
982 {
983 heap_fetch_next_buffer(scan, dir);
984
985 /* did we run out of blocks to scan? */
986 if (!BufferIsValid(scan->rs_cbuf))
987 break;
988
990
992 page = heapgettup_start_page(scan, dir, &linesleft, &lineoff);
994
995 /*
996 * Only continue scanning the page while we have lines left.
997 *
998 * Note that this protects us from accessing line pointers past
999 * PageGetMaxOffsetNumber(); both for forward scans when we resume the
1000 * table scan, and for when we start scanning a new page.
1001 */
1002 for (; linesleft > 0; linesleft--, lineoff += dir)
1003 {
1004 bool visible;
1006
1007 if (!ItemIdIsNormal(lpp))
1008 continue;
1009
1010 tuple->t_data = (HeapTupleHeader) PageGetItem(page, lpp);
1011 tuple->t_len = ItemIdGetLength(lpp);
1012 ItemPointerSet(&(tuple->t_self), scan->rs_cblock, lineoff);
1013
1014 visible = HeapTupleSatisfiesVisibility(tuple,
1015 scan->rs_base.rs_snapshot,
1016 scan->rs_cbuf);
1017
1019 tuple, scan->rs_cbuf,
1020 scan->rs_base.rs_snapshot);
1021
1022 /* skip tuples not visible to this snapshot */
1023 if (!visible)
1024 continue;
1025
1026 /* skip any tuples that don't match the scan key */
1027 if (key != NULL &&
1029 nkeys, key))
1030 continue;
1031
1033 scan->rs_coffset = lineoff;
1034 return;
1035 }
1036
1037 /*
1038 * if we get here, it means we've exhausted the items on this page and
1039 * it's time to move to the next.
1040 */
1042 }
1043
1044 /* end of scan */
1045 if (BufferIsValid(scan->rs_cbuf))
1046 ReleaseBuffer(scan->rs_cbuf);
1047
1048 scan->rs_cbuf = InvalidBuffer;
1051 tuple->t_data = NULL;
1052 scan->rs_inited = false;
1053}

References Assert, BUFFER_LOCK_SHARE, BUFFER_LOCK_UNLOCK, BufferGetBlockNumber(), BufferIsValid(), fb(), heap_fetch_next_buffer(), HeapCheckForSerializableConflictOut(), heapgettup_continue_page(), heapgettup_start_page(), HeapKeyTest(), HeapTupleSatisfiesVisibility(), InvalidBlockNumber, InvalidBuffer, ItemIdGetLength, ItemIdIsNormal, ItemPointerSet(), likely, LockBuffer(), PageGetItem(), PageGetItemId(), RelationGetDescr, ReleaseBuffer(), HeapScanDescData::rs_base, HeapScanDescData::rs_cblock, HeapScanDescData::rs_cbuf, HeapScanDescData::rs_coffset, HeapScanDescData::rs_ctup, HeapScanDescData::rs_inited, HeapScanDescData::rs_prefetch_block, TableScanDescData::rs_rd, TableScanDescData::rs_snapshot, HeapTupleData::t_data, HeapTupleData::t_len, and HeapTupleData::t_self.

Referenced by heap_getnext(), heap_getnextslot(), and heap_getnextslot_tidrange().
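
The control flow above — save the current (block, line) position, resume from the next line on the following call, and move on to the next block once a page is exhausted — can be shown with a small standalone iterator. Everything in this sketch (ScanState, the fixed 2-D array, the even-number "visibility" test) is a made-up stand-in for illustration, not the heapam scan machinery.

#include <stdbool.h>
#include <stdio.h>

#define NBLOCKS 3
#define NLINES  4

typedef struct
{
    int  block;
    int  line;
    bool inited;
} ScanState;

/* pretend only even values are "visible" to our snapshot */
static bool
visible(int value)
{
    return (value % 2) == 0;
}

/*
 * Return the next visible value, resuming from the position saved in
 * 'scan', the way heapgettup() resumes from rs_cblock/rs_coffset.
 */
static bool
scan_getnext(ScanState *scan, const int heap[NBLOCKS][NLINES], int *out)
{
    int block = scan->inited ? scan->block : 0;
    int line = scan->inited ? scan->line + 1 : 0;

    for (; block < NBLOCKS; block++, line = 0)
    {
        for (; line < NLINES; line++)
        {
            if (!visible(heap[block][line]))
                continue;       /* skip tuples not visible to us */

            scan->block = block;
            scan->line = line;
            scan->inited = true;
            *out = heap[block][line];
            return true;
        }
    }
    scan->inited = false;       /* end of scan */
    return false;
}

int
main(void)
{
    const int heap[NBLOCKS][NLINES] = {{1, 2, 3, 4}, {5, 6, 7, 8}, {9, 10, 11, 12}};
    ScanState scan = {0};
    int value;

    while (scan_getnext(&scan, heap, &value))
        printf("returned %d\n", value);         /* prints 2 4 6 8 10 12 */
    return 0;
}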

◆ heapgettup_advance_block()

static BlockNumber heapgettup_advance_block ( HeapScanDesc  scan,
BlockNumber  block,
ScanDirection  dir 
)
inlinestatic

Definition at line 875 of file heapam.c.

876{
877 Assert(scan->rs_base.rs_parallel == NULL);
878
880 {
881 block++;
882
883 /* wrap back to the start of the heap */
884 if (block >= scan->rs_nblocks)
885 block = 0;
886
887 /*
888 * Report our new scan position for synchronization purposes. We don't
889 * do that when moving backwards, however. That would just mess up any
890 * other forward-moving scanners.
891 *
892 * Note: we do this before checking for end of scan so that the final
893 * state of the position hint is back at the start of the rel. That's
894 * not strictly necessary, but otherwise when you run the same query
895 * multiple times the starting position would shift a little bit
896 * backwards on every invocation, which is confusing. We don't
897 * guarantee any specific ordering in general, though.
898 */
899 if (scan->rs_base.rs_flags & SO_ALLOW_SYNC)
900 ss_report_location(scan->rs_base.rs_rd, block);
901
902 /* we're done if we're back at where we started */
903 if (block == scan->rs_startblock)
904 return InvalidBlockNumber;
905
906 /* check if the limit imposed by heap_setscanlimits() is met */
907 if (scan->rs_numblocks != InvalidBlockNumber)
908 {
909 if (--scan->rs_numblocks == 0)
910 return InvalidBlockNumber;
911 }
912
913 return block;
914 }
915 else
916 {
917 /* we're done if the last block is the start position */
918 if (block == scan->rs_startblock)
919 return InvalidBlockNumber;
920
921 /* check if the limit imposed by heap_setscanlimits() is met */
922 if (scan->rs_numblocks != InvalidBlockNumber)
923 {
924 if (--scan->rs_numblocks == 0)
925 return InvalidBlockNumber;
926 }
927
928 /* wrap to the end of the heap when the last page was page 0 */
929 if (block == 0)
930 block = scan->rs_nblocks;
931
932 block--;
933
934 return block;
935 }
936}

References Assert, fb(), InvalidBlockNumber, likely, HeapScanDescData::rs_base, TableScanDescData::rs_flags, HeapScanDescData::rs_nblocks, HeapScanDescData::rs_numblocks, TableScanDescData::rs_parallel, TableScanDescData::rs_rd, HeapScanDescData::rs_startblock, ScanDirectionIsForward, SO_ALLOW_SYNC, and ss_report_location().

Referenced by heap_scan_stream_read_next_serial().
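
A minimal standalone sketch of the forward-direction logic above: because a synchronized scan can begin at any block, we advance with wraparound and stop once we arrive back at the starting block. INVALID_BLOCK and advance_block() are stand-ins for InvalidBlockNumber and the real function; syncscan position reporting and the heap_setscanlimits() block limit are omitted.

#include <stdint.h>
#include <stdio.h>

/* Stand-in for PostgreSQL's InvalidBlockNumber; not the real definition. */
#define INVALID_BLOCK UINT32_MAX

static uint32_t
advance_block(uint32_t block, uint32_t nblocks, uint32_t startblock)
{
    block++;
    if (block >= nblocks)
        block = 0;              /* wrap back to the start of the heap */

    if (block == startblock)
        return INVALID_BLOCK;   /* back where we started: scan is done */

    return block;
}

int
main(void)
{
    uint32_t nblocks = 5, start = 3, block = start;

    /* The initial block (3) is scanned first; then 4, 0, 1, 2, then done. */
    while ((block = advance_block(block, nblocks, start)) != INVALID_BLOCK)
        printf("scan block %u\n", block);
    return 0;
}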

◆ heapgettup_continue_page()

static Page heapgettup_continue_page ( HeapScanDesc  scan,
ScanDirection  dir,
int linesleft,
OffsetNumber lineoff 
)
inlinestatic

Definition at line 829 of file heapam.c.

831{
832 Page page;
833
834 Assert(scan->rs_inited);
836
837 /* Caller is responsible for ensuring buffer is locked if needed */
838 page = BufferGetPage(scan->rs_cbuf);
839
840 if (ScanDirectionIsForward(dir))
841 {
843 *linesleft = PageGetMaxOffsetNumber(page) - (*lineoff) + 1;
844 }
845 else
846 {
847 /*
848 * The previously returned tuple may have been vacuumed since the
849 * previous scan when we use a non-MVCC snapshot, so we must
850 * re-establish the lineoff <= PageGetMaxOffsetNumber(page) invariant
851 */
853 *linesleft = *lineoff;
854 }
855
856 /* lineoff now references the physically previous or next tid */
857 return page;
858}

References Assert, BufferGetPage(), BufferIsValid(), fb(), Min, OffsetNumberNext, OffsetNumberPrev, PageGetMaxOffsetNumber(), HeapScanDescData::rs_cbuf, HeapScanDescData::rs_coffset, HeapScanDescData::rs_inited, and ScanDirectionIsForward.

Referenced by heapgettup().

◆ heapgettup_initial_block()

static pg_noinline BlockNumber heapgettup_initial_block ( HeapScanDesc  scan,
ScanDirection  dir 
)
static

Definition at line 751 of file heapam.c.

752{
753 Assert(!scan->rs_inited);
754 Assert(scan->rs_base.rs_parallel == NULL);
755
756 /* When there are no pages to scan, return InvalidBlockNumber */
757 if (scan->rs_nblocks == 0 || scan->rs_numblocks == 0)
758 return InvalidBlockNumber;
759
760 if (ScanDirectionIsForward(dir))
761 {
762 return scan->rs_startblock;
763 }
764 else
765 {
766 /*
767 * Disable reporting to syncscan logic in a backwards scan; it's not
768 * very likely anyone else is doing the same thing at the same time,
769 * and much more likely that we'll just bollix things for forward
770 * scanners.
771 */
773
774 /*
775 * Start from last page of the scan. Ensure we take into account
776 * rs_numblocks if it's been adjusted by heap_setscanlimits().
777 */
778 if (scan->rs_numblocks != InvalidBlockNumber)
779 return (scan->rs_startblock + scan->rs_numblocks - 1) % scan->rs_nblocks;
780
781 if (scan->rs_startblock > 0)
782 return scan->rs_startblock - 1;
783
784 return scan->rs_nblocks - 1;
785 }
786}

References Assert, fb(), InvalidBlockNumber, HeapScanDescData::rs_base, TableScanDescData::rs_flags, HeapScanDescData::rs_inited, HeapScanDescData::rs_nblocks, HeapScanDescData::rs_numblocks, TableScanDescData::rs_parallel, HeapScanDescData::rs_startblock, and ScanDirectionIsForward.

Referenced by heap_scan_stream_read_next_serial().
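
A small standalone sketch of the backward-scan branch above, assuming the same two cases: when heap_setscanlimits() has set a block count, start at the last block of that range (modulo the relation size); otherwise start just before rs_startblock, wrapping to the last block of the heap. The names below are illustrative, not the real API.

#include <stdint.h>
#include <stdio.h>

/* Stand-in for InvalidBlockNumber; illustration only. */
#define INVALID_BLOCK UINT32_MAX

static uint32_t
initial_block_backward(uint32_t nblocks, uint32_t startblock,
                       uint32_t numblocks /* INVALID_BLOCK if unlimited */)
{
    if (nblocks == 0 || numblocks == 0)
        return INVALID_BLOCK;   /* nothing to scan */

    /* honor an explicit block limit set by the caller */
    if (numblocks != INVALID_BLOCK)
        return (startblock + numblocks - 1) % nblocks;

    /* otherwise start just before the start block, wrapping around */
    return (startblock > 0) ? startblock - 1 : nblocks - 1;
}

int
main(void)
{
    printf("%u\n", initial_block_backward(10, 0, INVALID_BLOCK));   /* 9 */
    printf("%u\n", initial_block_backward(10, 4, INVALID_BLOCK));   /* 3 */
    printf("%u\n", initial_block_backward(10, 8, 5));               /* 2 */
    return 0;
}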

◆ heapgettup_pagemode()

static void heapgettup_pagemode ( HeapScanDesc  scan,
ScanDirection  dir,
int  nkeys,
ScanKey  key 
)
static

Definition at line 1069 of file heapam.c.

1073{
1074 HeapTuple tuple = &(scan->rs_ctup);
1075 Page page;
1078
1079 if (likely(scan->rs_inited))
1080 {
1081 /* continue from previously returned page/tuple */
1082 page = BufferGetPage(scan->rs_cbuf);
1083
1084 lineindex = scan->rs_cindex + dir;
1085 if (ScanDirectionIsForward(dir))
1086 linesleft = scan->rs_ntuples - lineindex;
1087 else
1088 linesleft = scan->rs_cindex;
1089 /* lineindex now references the next or previous visible tid */
1090
1091 goto continue_page;
1092 }
1093
1094 /*
1095 * advance the scan until we find a qualifying tuple or run out of stuff
1096 * to scan
1097 */
1098 while (true)
1099 {
1100 heap_fetch_next_buffer(scan, dir);
1101
1102 /* did we run out of blocks to scan? */
1103 if (!BufferIsValid(scan->rs_cbuf))
1104 break;
1105
1107
1108 /* prune the page and determine visible tuple offsets */
1110 page = BufferGetPage(scan->rs_cbuf);
1111 linesleft = scan->rs_ntuples;
1113
1114 /* block is the same for all tuples, set it once outside the loop */
1116
1117 /* lineindex now references the next or previous visible tid */
1119
1120 for (; linesleft > 0; linesleft--, lineindex += dir)
1121 {
1122 ItemId lpp;
1124
1125 Assert(lineindex < scan->rs_ntuples);
1127 lpp = PageGetItemId(page, lineoff);
1129
1130 tuple->t_data = (HeapTupleHeader) PageGetItem(page, lpp);
1131 tuple->t_len = ItemIdGetLength(lpp);
1133
1134 /* skip any tuples that don't match the scan key */
1135 if (key != NULL &&
1137 nkeys, key))
1138 continue;
1139
1140 scan->rs_cindex = lineindex;
1141 return;
1142 }
1143 }
1144
1145 /* end of scan */
1146 if (BufferIsValid(scan->rs_cbuf))
1147 ReleaseBuffer(scan->rs_cbuf);
1148 scan->rs_cbuf = InvalidBuffer;
1151 tuple->t_data = NULL;
1152 scan->rs_inited = false;
1153}

References Assert, BufferGetBlockNumber(), BufferGetPage(), BufferIsValid(), fb(), heap_fetch_next_buffer(), heap_prepare_pagescan(), HeapKeyTest(), InvalidBlockNumber, InvalidBuffer, ItemIdGetLength, ItemIdIsNormal, ItemPointerSetBlockNumber(), ItemPointerSetOffsetNumber(), likely, PageGetItem(), PageGetItemId(), RelationGetDescr, ReleaseBuffer(), HeapScanDescData::rs_cblock, HeapScanDescData::rs_cbuf, HeapScanDescData::rs_cindex, HeapScanDescData::rs_ctup, HeapScanDescData::rs_inited, HeapScanDescData::rs_ntuples, HeapScanDescData::rs_prefetch_block, TableScanDescData::rs_rd, ScanDirectionIsForward, HeapTupleData::t_data, HeapTupleData::t_len, and HeapTupleData::t_self.

Referenced by heap_getnext(), heap_getnextslot(), and heap_getnextslot_tidrange().

◆ heapgettup_start_page()

static Page heapgettup_start_page ( HeapScanDesc  scan,
ScanDirection  dir,
int linesleft,
OffsetNumber lineoff 
)
static

Definition at line 798 of file heapam.c.

800{
801 Page page;
802
803 Assert(scan->rs_inited);
805
806 /* Caller is responsible for ensuring buffer is locked if needed */
807 page = BufferGetPage(scan->rs_cbuf);
808
810
811 if (ScanDirectionIsForward(dir))
813 else
815
816 /* lineoff now references the physically previous or next tid */
817 return page;
818}

References Assert, BufferGetPage(), BufferIsValid(), fb(), FirstOffsetNumber, PageGetMaxOffsetNumber(), HeapScanDescData::rs_cbuf, HeapScanDescData::rs_inited, and ScanDirectionIsForward.

Referenced by heapgettup().

◆ HeapTupleGetUpdateXid()

◆ HeapTupleHeaderAdvanceConflictHorizon()

void HeapTupleHeaderAdvanceConflictHorizon ( HeapTupleHeader  tuple,
TransactionId snapshotConflictHorizon 
)

Definition at line 8053 of file heapam.c.

8055{
8059
8060 if (tuple->t_infomask & HEAP_MOVED)
8061 {
8062 if (TransactionIdPrecedes(*snapshotConflictHorizon, xvac))
8063 *snapshotConflictHorizon = xvac;
8064 }
8065
8066 /*
8067 * Ignore tuples inserted by an aborted transaction or if the tuple was
8068 * updated/deleted by the inserting transaction.
8069 *
8070 * Look for a committed hint bit, or if no xmin bit is set, check clog.
8071 */
8072 if (HeapTupleHeaderXminCommitted(tuple) ||
8074 {
8075 if (xmax != xmin &&
8076 TransactionIdFollows(xmax, *snapshotConflictHorizon))
8077 *snapshotConflictHorizon = xmax;
8078 }
8079}

References fb(), HEAP_MOVED, HeapTupleHeaderGetUpdateXid(), HeapTupleHeaderGetXmin(), HeapTupleHeaderGetXvac(), HeapTupleHeaderXminCommitted(), HeapTupleHeaderXminInvalid(), HeapTupleHeaderData::t_infomask, TransactionIdDidCommit(), TransactionIdFollows(), and TransactionIdPrecedes().

Referenced by heap_index_delete_tuples(), heap_prune_chain(), and prune_freeze_plan().
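
The function above advances the horizon to whichever xid is later, which only makes sense under PostgreSQL's modulo-2^32 xid ordering. The sketch below is a simplified, standalone illustration of that "take the later xid" step; xid_follows() mimics the signed-difference comparison used for normal xids and deliberately ignores permanent/special xids, so it is an assumption-laden sketch rather than the real TransactionIdFollows().

#include <stdint.h>
#include <stdio.h>

/* Later-than test under modulo-2^32 xid arithmetic (normal xids only). */
static int
xid_follows(uint32_t a, uint32_t b)
{
    return (int32_t) (a - b) > 0;
}

/* Advance the conflict horizon to the candidate if the candidate is later. */
static void
advance_horizon(uint32_t *horizon, uint32_t candidate)
{
    if (xid_follows(candidate, *horizon))
        *horizon = candidate;
}

int
main(void)
{
    uint32_t horizon = 4000000000u;

    advance_horizon(&horizon, 3999999999u);     /* older: no change */
    advance_horizon(&horizon, 100u);            /* wrapped around: newer */
    printf("horizon = %u\n", horizon);          /* prints 100 */
    return 0;
}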

◆ index_delete_check_htid()

static void index_delete_check_htid ( TM_IndexDeleteOp delstate,
Page  page,
OffsetNumber  maxoff,
const ItemPointerData htid,
TM_IndexStatus istatus 
)
inlinestatic

Definition at line 8138 of file heapam.c.

8141{
8143 ItemId iid;
8144
8145 Assert(OffsetNumberIsValid(istatus->idxoffnum));
8146
8147 if (unlikely(indexpagehoffnum > maxoff))
8148 ereport(ERROR,
8150 errmsg_internal("heap tid from index tuple (%u,%u) points past end of heap page line pointer array at offset %u of block %u in index \"%s\"",
8153 istatus->idxoffnum, delstate->iblknum,
8155
8157 if (unlikely(!ItemIdIsUsed(iid)))
8158 ereport(ERROR,
8160 errmsg_internal("heap tid from index tuple (%u,%u) points to unused heap page item at offset %u of block %u in index \"%s\"",
8163 istatus->idxoffnum, delstate->iblknum,
8165
8166 if (ItemIdHasStorage(iid))
8167 {
8168 HeapTupleHeader htup;
8169
8171 htup = (HeapTupleHeader) PageGetItem(page, iid);
8172
8174 ereport(ERROR,
8176 errmsg_internal("heap tid from index tuple (%u,%u) points to heap-only tuple at offset %u of block %u in index \"%s\"",
8179 istatus->idxoffnum, delstate->iblknum,
8181 }
8182}

References Assert, ereport, errcode(), errmsg_internal(), ERROR, fb(), HeapTupleHeaderIsHeapOnly(), ItemIdHasStorage, ItemIdIsNormal, ItemIdIsUsed, ItemPointerGetBlockNumber(), ItemPointerGetOffsetNumber(), OffsetNumberIsValid, PageGetItem(), PageGetItemId(), RelationGetRelationName, and unlikely.

Referenced by heap_index_delete_tuples().

◆ index_delete_sort()

static void index_delete_sort ( TM_IndexDeleteOp delstate)
static

Definition at line 8543 of file heapam.c.

8544{
8545 TM_IndexDelete *deltids = delstate->deltids;
8546 int ndeltids = delstate->ndeltids;
8547
8548 /*
8549 * Shellsort gap sequence (taken from Sedgewick-Incerpi paper).
8550 *
8551 * This implementation is fast with array sizes up to ~4500. This covers
8552 * all supported BLCKSZ values.
8553 */
8554 const int gaps[9] = {1968, 861, 336, 112, 48, 21, 7, 3, 1};
8555
8556 /* Think carefully before changing anything here -- keep swaps cheap */
8557 StaticAssertDecl(sizeof(TM_IndexDelete) <= 8,
8558 "element size exceeds 8 bytes");
8559
8560 for (int g = 0; g < lengthof(gaps); g++)
8561 {
8562 for (int hi = gaps[g], i = hi; i < ndeltids; i++)
8563 {
8564 TM_IndexDelete d = deltids[i];
8565 int j = i;
8566
8567 while (j >= hi && index_delete_sort_cmp(&deltids[j - hi], &d) >= 0)
8568 {
8569 deltids[j] = deltids[j - hi];
8570 j -= hi;
8571 }
8572 deltids[j] = d;
8573 }
8574 }
8575}

References fb(), i, index_delete_sort_cmp(), j, lengthof, and StaticAssertDecl.

Referenced by heap_index_delete_tuples().
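
The same shellsort can be shown standalone: the identical Sedgewick–Incerpi gap sequence, sorting plain (block, offset) pairs into TID order. The Tid struct and tid_cmp() are stand-ins for TM_IndexDelete and index_delete_sort_cmp(); the real comparator additionally asserts that no two TIDs are equal.

#include <stdint.h>
#include <stdio.h>

typedef struct
{
    uint32_t block;
    uint16_t offset;
} Tid;

/* Compare by block number first, then by offset, like index_delete_sort_cmp(). */
static int
tid_cmp(const Tid *a, const Tid *b)
{
    if (a->block != b->block)
        return (a->block < b->block) ? -1 : 1;
    if (a->offset != b->offset)
        return (a->offset < b->offset) ? -1 : 1;
    return 0;
}

/* Shellsort with the same gap sequence as index_delete_sort(). */
static void
tid_shellsort(Tid *tids, int n)
{
    static const int gaps[] = {1968, 861, 336, 112, 48, 21, 7, 3, 1};

    for (int g = 0; g < (int) (sizeof(gaps) / sizeof(gaps[0])); g++)
    {
        for (int hi = gaps[g], i = hi; i < n; i++)
        {
            Tid d = tids[i];
            int j = i;

            while (j >= hi && tid_cmp(&tids[j - hi], &d) >= 0)
            {
                tids[j] = tids[j - hi];
                j -= hi;
            }
            tids[j] = d;
        }
    }
}

int
main(void)
{
    Tid tids[] = {{5, 2}, {1, 7}, {5, 1}, {0, 3}};
    int n = 4;

    tid_shellsort(tids, n);
    for (int i = 0; i < n; i++)
        printf("(%u,%u)\n", tids[i].block, tids[i].offset);
    return 0;
}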

◆ index_delete_sort_cmp()

static int index_delete_sort_cmp ( TM_IndexDelete deltid1,
TM_IndexDelete deltid2 
)
inlinestatic

Definition at line 8507 of file heapam.c.

8508{
8509 ItemPointer tid1 = &deltid1->tid;
8510 ItemPointer tid2 = &deltid2->tid;
8511
8512 {
8515
8516 if (blk1 != blk2)
8517 return (blk1 < blk2) ? -1 : 1;
8518 }
8519 {
8522
8523 if (pos1 != pos2)
8524 return (pos1 < pos2) ? -1 : 1;
8525 }
8526
8527 Assert(false);
8528
8529 return 0;
8530}

References Assert, fb(), ItemPointerGetBlockNumber(), and ItemPointerGetOffsetNumber().

Referenced by index_delete_sort().

◆ initscan()

static void initscan ( HeapScanDesc  scan,
ScanKey  key,
bool  keep_startblock 
)
static

Definition at line 356 of file heapam.c.

357{
359 bool allow_strat;
360 bool allow_sync;
361
362 /*
363 * Determine the number of blocks we have to scan.
364 *
365 * It is sufficient to do this once at scan start, since any tuples added
366 * while the scan is in progress will be invisible to my snapshot anyway.
367 * (That is not true when using a non-MVCC snapshot. However, we couldn't
368 * guarantee to return tuples added after scan start anyway, since they
369 * might go into pages we already scanned. To guarantee consistent
370 * results for a non-MVCC snapshot, the caller must hold some higher-level
371 * lock that ensures the interesting tuple(s) won't change.)
372 */
373 if (scan->rs_base.rs_parallel != NULL)
374 {
376 scan->rs_nblocks = bpscan->phs_nblocks;
377 }
378 else
380
381 /*
382 * If the table is large relative to NBuffers, use a bulk-read access
383 * strategy and enable synchronized scanning (see syncscan.c). Although
384 * the thresholds for these features could be different, we make them the
385 * same so that there are only two behaviors to tune rather than four.
386 * (However, some callers need to be able to disable one or both of these
387 * behaviors, independently of the size of the table; also there is a GUC
388 * variable that can disable synchronized scanning.)
389 *
390 * Note that table_block_parallelscan_initialize has a very similar test;
391 * if you change this, consider changing that one, too.
392 */
394 scan->rs_nblocks > NBuffers / 4)
395 {
397 allow_sync = (scan->rs_base.rs_flags & SO_ALLOW_SYNC) != 0;
398 }
399 else
400 allow_strat = allow_sync = false;
401
402 if (allow_strat)
403 {
404 /* During a rescan, keep the previous strategy object. */
405 if (scan->rs_strategy == NULL)
407 }
408 else
409 {
410 if (scan->rs_strategy != NULL)
412 scan->rs_strategy = NULL;
413 }
414
415 if (scan->rs_base.rs_parallel != NULL)
416 {
417 /* For parallel scan, believe whatever ParallelTableScanDesc says. */
420 else
422
423 /*
424 * If not rescanning, initialize the startblock. Finding the actual
425 * start location is done in table_block_parallelscan_startblock_init,
426 * based on whether an alternative start location has been set with
427 * heap_setscanlimits, or using the syncscan location, when syncscan
428 * is enabled.
429 */
430 if (!keep_startblock)
432 }
433 else
434 {
435 if (keep_startblock)
436 {
437 /*
438 * When rescanning, we want to keep the previous startblock
439 * setting, so that rewinding a cursor doesn't generate surprising
440 * results. Reset the active syncscan setting, though.
441 */
444 else
446 }
448 {
451 }
452 else
453 {
455 scan->rs_startblock = 0;
456 }
457 }
458
460 scan->rs_inited = false;
461 scan->rs_ctup.t_data = NULL;
463 scan->rs_cbuf = InvalidBuffer;
465 scan->rs_ntuples = 0;
466 scan->rs_cindex = 0;
467
468 /*
469 * Initialize to ForwardScanDirection because it is most common and
470 * because heap scans go forward before going backward (e.g. CURSORs).
471 */
474
475 /* page-at-a-time fields are always invalid when not rs_inited */
476
477 /*
478 * copy the scan key, if appropriate
479 */
480 if (key != NULL && scan->rs_base.rs_nkeys > 0)
481 memcpy(scan->rs_base.rs_key, key, scan->rs_base.rs_nkeys * sizeof(ScanKeyData));
482
483 /*
484 * Currently, we only have a stats counter for sequential heap scans (but
485 * e.g for bitmap scans the underlying bitmap index scans will be counted,
486 * and for sample scans we update stats for tuple fetches).
487 */
488 if (scan->rs_base.rs_flags & SO_TYPE_SEQSCAN)
490}

References BAS_BULKREAD, fb(), ForwardScanDirection, FreeAccessStrategy(), GetAccessStrategy(), InvalidBlockNumber, InvalidBuffer, ItemPointerSetInvalid(), NBuffers, pgstat_count_heap_scan, ParallelTableScanDescData::phs_syncscan, RelationGetNumberOfBlocks, RelationUsesLocalBuffers, HeapScanDescData::rs_base, HeapScanDescData::rs_cblock, HeapScanDescData::rs_cbuf, HeapScanDescData::rs_cindex, HeapScanDescData::rs_ctup, HeapScanDescData::rs_dir, TableScanDescData::rs_flags, HeapScanDescData::rs_inited, TableScanDescData::rs_key, HeapScanDescData::rs_nblocks, TableScanDescData::rs_nkeys, HeapScanDescData::rs_ntuples, HeapScanDescData::rs_numblocks, TableScanDescData::rs_parallel, HeapScanDescData::rs_prefetch_block, TableScanDescData::rs_rd, HeapScanDescData::rs_startblock, HeapScanDescData::rs_strategy, SO_ALLOW_STRAT, SO_ALLOW_SYNC, SO_TYPE_SEQSCAN, ss_get_location(), synchronize_seqscans, HeapTupleData::t_data, and HeapTupleData::t_self.

Referenced by heap_beginscan(), and heap_rescan().
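
The threshold decision described in the comment above — use a bulk-read strategy and synchronized scanning only for tables larger than NBuffers / 4, and never for relations using local buffers — condenses to a few lines. The flag bits, choose_scan_behavior(), and the hard-coded buffer count below are stand-ins for this sketch; the real code additionally honors caller flags and the synchronize_seqscans GUC.

#include <stdbool.h>
#include <stdio.h>

#define ALLOW_STRAT 0x01
#define ALLOW_SYNC  0x02

static void
choose_scan_behavior(unsigned flags, unsigned nblocks, unsigned nbuffers,
                     bool is_local, bool *allow_strat, bool *allow_sync)
{
    /* Large table relative to the buffer pool: enable what the caller allows. */
    if (!is_local && nblocks > nbuffers / 4)
    {
        *allow_strat = (flags & ALLOW_STRAT) != 0;
        *allow_sync = (flags & ALLOW_SYNC) != 0;
    }
    else
        *allow_strat = *allow_sync = false;
}

int
main(void)
{
    bool strat, sync;

    /* Large table (50000 blocks) vs. 16384 shared buffers: both enabled. */
    choose_scan_behavior(ALLOW_STRAT | ALLOW_SYNC, 50000, 16384, false,
                         &strat, &sync);
    printf("strat=%d sync=%d\n", strat, sync);

    /* Small table: neither is used, regardless of the flags. */
    choose_scan_behavior(ALLOW_STRAT | ALLOW_SYNC, 100, 16384, false,
                         &strat, &sync);
    printf("strat=%d sync=%d\n", strat, sync);
    return 0;
}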

◆ log_heap_new_cid()

static XLogRecPtr log_heap_new_cid ( Relation  relation,
HeapTuple  tup 
)
static

Definition at line 9140 of file heapam.c.

9141{
9143
9145 HeapTupleHeader hdr = tup->t_data;
9146
9147 Assert(ItemPointerIsValid(&tup->t_self));
9148 Assert(tup->t_tableOid != InvalidOid);
9149
9150 xlrec.top_xid = GetTopTransactionId();
9151 xlrec.target_locator = relation->rd_locator;
9152 xlrec.target_tid = tup->t_self;
9153
9154 /*
9155 * If the tuple got inserted & deleted in the same TX we definitely have a
9156 * combo CID, set cmin and cmax.
9157 */
9158 if (hdr->t_infomask & HEAP_COMBOCID)
9159 {
9162 xlrec.cmin = HeapTupleHeaderGetCmin(hdr);
9163 xlrec.cmax = HeapTupleHeaderGetCmax(hdr);
9164 xlrec.combocid = HeapTupleHeaderGetRawCommandId(hdr);
9165 }
9166 /* No combo CID, so only cmin or cmax can be set by this TX */
9167 else
9168 {
9169 /*
9170 * Tuple inserted.
9171 *
9172 * We need to check for LOCK ONLY because multixacts might be
9173 * transferred to the new tuple in case of FOR KEY SHARE updates in
9174 * which case there will be an xmax, although the tuple just got
9175 * inserted.
9176 */
9177 if (hdr->t_infomask & HEAP_XMAX_INVALID ||
9179 {
9181 xlrec.cmax = InvalidCommandId;
9182 }
9183 /* Tuple from a different tx updated or deleted. */
9184 else
9185 {
9186 xlrec.cmin = InvalidCommandId;
9188 }
9189 xlrec.combocid = InvalidCommandId;
9190 }
9191
9192 /*
9193 * Note that we don't need to register the buffer here, because this
9194 * operation does not modify the page. The insert/update/delete that
9195 * called us certainly did, but that's WAL-logged separately.
9196 */
9199
9200 /* will be looked at irrespective of origin */
9201
9203
9204 return recptr;
9205}

References Assert, fb(), GetTopTransactionId(), HEAP_COMBOCID, HEAP_XMAX_INVALID, HEAP_XMAX_IS_LOCKED_ONLY(), HeapTupleHeaderGetCmax(), HeapTupleHeaderGetCmin(), HeapTupleHeaderGetRawCommandId(), HeapTupleHeaderXminInvalid(), InvalidCommandId, InvalidOid, ItemPointerIsValid(), RelationData::rd_locator, SizeOfHeapNewCid, HeapTupleHeaderData::t_infomask, XLOG_HEAP2_NEW_CID, XLogBeginInsert(), XLogInsert(), and XLogRegisterData().

Referenced by heap_delete(), heap_insert(), heap_multi_insert(), and heap_update().

◆ log_heap_update()

static XLogRecPtr log_heap_update ( Relation  reln,
Buffer  oldbuf,
Buffer  newbuf,
HeapTuple  oldtup,
HeapTuple  newtup,
HeapTuple  old_key_tuple,
bool  all_visible_cleared,
bool  new_all_visible_cleared 
)
static

Definition at line 8918 of file heapam.c.

8922{
8926 uint8 info;
8928 uint16 prefixlen = 0,
8929 suffixlen = 0;
8931 Page page = BufferGetPage(newbuf);
8933 bool init;
8934 int bufflags;
8935
8936 /* Caller should not call me on a non-WAL-logged relation */
8938
8940
8942 info = XLOG_HEAP_HOT_UPDATE;
8943 else
8944 info = XLOG_HEAP_UPDATE;
8945
8946 /*
8947 * If the old and new tuple are on the same page, we only need to log the
8948 * parts of the new tuple that were changed. That saves on the amount of
8949 * WAL we need to write. Currently, we just count any unchanged bytes in
8950 * the beginning and end of the tuple. That's quick to check, and
8951 * perfectly covers the common case that only one field is updated.
8952 *
8953 * We could do this even if the old and new tuple are on different pages,
8954 * but only if we don't make a full-page image of the old page, which is
8955 * difficult to know in advance. Also, if the old tuple is corrupt for
8956 * some reason, it would allow the corruption to propagate to the new page,
8957 * so it seems best to avoid. Under the general assumption that most
8958 * updates tend to create the new tuple version on the same page, there
8959 * isn't much to be gained by doing this across pages anyway.
8960 *
8961 * Skip this if we're taking a full-page image of the new page, as we
8962 * don't include the new tuple in the WAL record in that case. Also
8963 * disable if effective_wal_level='logical', as logical decoding needs to
8964 * be able to read the new tuple in whole from the WAL record alone.
8965 */
8966 if (oldbuf == newbuf && !need_tuple_data &&
8968 {
8969 char *oldp = (char *) oldtup->t_data + oldtup->t_data->t_hoff;
8970 char *newp = (char *) newtup->t_data + newtup->t_data->t_hoff;
8971 int oldlen = oldtup->t_len - oldtup->t_data->t_hoff;
8972 int newlen = newtup->t_len - newtup->t_data->t_hoff;
8973
8974 /* Check for common prefix between old and new tuple */
8975 for (prefixlen = 0; prefixlen < Min(oldlen, newlen); prefixlen++)
8976 {
8977 if (newp[prefixlen] != oldp[prefixlen])
8978 break;
8979 }
8980
8981 /*
8982 * Storing the length of the prefix takes 2 bytes, so we need to save
8983 * at least 3 bytes or there's no point.
8984 */
8985 if (prefixlen < 3)
8986 prefixlen = 0;
8987
8988 /* Same for suffix */
8990 {
8991 if (newp[newlen - suffixlen - 1] != oldp[oldlen - suffixlen - 1])
8992 break;
8993 }
8994 if (suffixlen < 3)
8995 suffixlen = 0;
8996 }
8997
8998 /* Prepare main WAL data chain */
8999 xlrec.flags = 0;
9004 if (prefixlen > 0)
9006 if (suffixlen > 0)
9008 if (need_tuple_data)
9009 {
9011 if (old_key_tuple)
9012 {
9013 if (reln->rd_rel->relreplident == REPLICA_IDENTITY_FULL)
9015 else
9017 }
9018 }
9019
9020 /* If new tuple is the single and first tuple on page... */
9023 {
9024 info |= XLOG_HEAP_INIT_PAGE;
9025 init = true;
9026 }
9027 else
9028 init = false;
9029
9030 /* Prepare WAL data for the old page */
9031 xlrec.old_offnum = ItemPointerGetOffsetNumber(&oldtup->t_self);
9032 xlrec.old_xmax = HeapTupleHeaderGetRawXmax(oldtup->t_data);
9033 xlrec.old_infobits_set = compute_infobits(oldtup->t_data->t_infomask,
9034 oldtup->t_data->t_infomask2);
9035
9036 /* Prepare WAL data for the new page */
9037 xlrec.new_offnum = ItemPointerGetOffsetNumber(&newtup->t_self);
9038 xlrec.new_xmax = HeapTupleHeaderGetRawXmax(newtup->t_data);
9039
9041 if (init)
9043 if (need_tuple_data)
9045
9047 if (oldbuf != newbuf)
9049
9051
9052 /*
9053 * Prepare WAL data for the new tuple.
9054 */
9055 if (prefixlen > 0 || suffixlen > 0)
9056 {
9057 if (prefixlen > 0 && suffixlen > 0)
9058 {
9061 XLogRegisterBufData(0, &prefix_suffix, sizeof(uint16) * 2);
9062 }
9063 else if (prefixlen > 0)
9064 {
9065 XLogRegisterBufData(0, &prefixlen, sizeof(uint16));
9066 }
9067 else
9068 {
9069 XLogRegisterBufData(0, &suffixlen, sizeof(uint16));
9070 }
9071 }
9072
9073 xlhdr.t_infomask2 = newtup->t_data->t_infomask2;
9074 xlhdr.t_infomask = newtup->t_data->t_infomask;
9075 xlhdr.t_hoff = newtup->t_data->t_hoff;
9077
9078 /*
9079 * PG73FORMAT: write bitmap [+ padding] [+ oid] + data
9080 *
9081 * The 'data' doesn't include the common prefix or suffix.
9082 */
9084 if (prefixlen == 0)
9085 {
9087 (char *) newtup->t_data + SizeofHeapTupleHeader,
9089 }
9090 else
9091 {
9092 /*
9093 * Have to write the null bitmap and data after the common prefix as
9094 * two separate rdata entries.
9095 */
9096 /* bitmap [+ padding] [+ oid] */
9097 if (newtup->t_data->t_hoff - SizeofHeapTupleHeader > 0)
9098 {
9100 (char *) newtup->t_data + SizeofHeapTupleHeader,
9101 newtup->t_data->t_hoff - SizeofHeapTupleHeader);
9102 }
9103
9104 /* data after common prefix */
9106 (char *) newtup->t_data + newtup->t_data->t_hoff + prefixlen,
9107 newtup->t_len - newtup->t_data->t_hoff - prefixlen - suffixlen);
9108 }
9109
9110 /* We need to log a tuple identity */
9112 {
9113 /* don't really need this, but it's more comfy to decode */
9114 xlhdr_idx.t_infomask2 = old_key_tuple->t_data->t_infomask2;
9115 xlhdr_idx.t_infomask = old_key_tuple->t_data->t_infomask;
9116 xlhdr_idx.t_hoff = old_key_tuple->t_data->t_hoff;
9117
9119
9120 /* PG73FORMAT: write bitmap [+ padding] [+ oid] + data */
9123 }
9124
9125 /* filtering by origin on a row level is much more efficient */
9127
9128 recptr = XLogInsert(RM_HEAP_ID, info);
9129
9130 return recptr;
9131}

References Assert, BufferGetPage(), compute_infobits(), fb(), FirstOffsetNumber, HeapTupleHeaderGetRawXmax(), HeapTupleIsHeapOnly(), init, ItemPointerGetOffsetNumber(), Min, PageGetMaxOffsetNumber(), REGBUF_KEEP_DATA, REGBUF_STANDARD, REGBUF_WILL_INIT, RelationIsLogicallyLogged, RelationNeedsWAL, SizeOfHeapHeader, SizeofHeapTupleHeader, SizeOfHeapUpdate, XLH_UPDATE_CONTAINS_NEW_TUPLE, XLH_UPDATE_CONTAINS_OLD_KEY, XLH_UPDATE_CONTAINS_OLD_TUPLE, XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED, XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED, XLH_UPDATE_PREFIX_FROM_OLD, XLH_UPDATE_SUFFIX_FROM_OLD, XLOG_HEAP_HOT_UPDATE, XLOG_HEAP_INIT_PAGE, XLOG_HEAP_UPDATE, XLOG_INCLUDE_ORIGIN, XLogBeginInsert(), XLogCheckBufferNeedsBackup(), XLogInsert(), XLogRegisterBufData(), XLogRegisterBuffer(), XLogRegisterData(), and XLogSetRecordFlags().

Referenced by heap_update().
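
The prefix/suffix trick described above can be demonstrated standalone: compare the old and new tuple data byte-by-byte from both ends and discard runs shorter than 3 bytes, since encoding a length field already costs 2 bytes. common_prefix_suffix() below is an illustrative helper, not the WAL-record layout itself.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static void
common_prefix_suffix(const char *oldp, int oldlen,
                     const char *newp, int newlen,
                     uint16_t *prefixlen, uint16_t *suffixlen)
{
    int minlen = (oldlen < newlen) ? oldlen : newlen;
    int p = 0, s = 0;

    /* unchanged bytes at the front */
    while (p < minlen && oldp[p] == newp[p])
        p++;
    if (p < 3)
        p = 0;                  /* not worth the 2-byte length field */

    /* unchanged bytes at the back, not overlapping the claimed prefix */
    while (s < minlen - p && oldp[oldlen - s - 1] == newp[newlen - s - 1])
        s++;
    if (s < 3)
        s = 0;

    *prefixlen = (uint16_t) p;
    *suffixlen = (uint16_t) s;
}

int
main(void)
{
    const char *oldt = "aaaa-HELLO-zzzz";
    const char *newt = "aaaa-WORLD-zzzz";
    uint16_t prefixlen, suffixlen;

    common_prefix_suffix(oldt, (int) strlen(oldt), newt, (int) strlen(newt),
                         &prefixlen, &suffixlen);
    printf("prefix=%u suffix=%u\n", prefixlen, suffixlen);  /* 5 and 5 */
    return 0;
}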

◆ log_heap_visible()

XLogRecPtr log_heap_visible ( Relation  rel,
Buffer  heap_buffer,
Buffer  vm_buffer,
TransactionId  snapshotConflictHorizon,
uint8  vmflags 
)

◆ MultiXactIdGetUpdateXid()

static TransactionId MultiXactIdGetUpdateXid ( TransactionId  xmax,
uint16  t_infomask 
)
static

Definition at line 7607 of file heapam.c.

7608{
7610 MultiXactMember *members;
7611 int nmembers;
7612
7613 Assert(!(t_infomask & HEAP_XMAX_LOCK_ONLY));
7614 Assert(t_infomask & HEAP_XMAX_IS_MULTI);
7615
7616 /*
7617 * Since we know the LOCK_ONLY bit is not set, this cannot be a multi from
7618 * pre-pg_upgrade.
7619 */
7620 nmembers = GetMultiXactIdMembers(xmax, &members, false, false);
7621
7622 if (nmembers > 0)
7623 {
7624 int i;
7625
7626 for (i = 0; i < nmembers; i++)
7627 {
7628 /* Ignore lockers */
7629 if (!ISUPDATE_from_mxstatus(members[i].status))
7630 continue;
7631
7632 /* there can be at most one updater */
7634 update_xact = members[i].xid;
7635#ifndef USE_ASSERT_CHECKING
7636
7637 /*
7638 * in an assert-enabled build, walk the whole array to ensure
7639 * there's no other updater.
7640 */
7641 break;
7642#endif
7643 }
7644
7645 pfree(members);
7646 }
7647
7648 return update_xact;
7649}

References Assert, fb(), GetMultiXactIdMembers(), HEAP_XMAX_IS_MULTI, HEAP_XMAX_LOCK_ONLY, i, InvalidTransactionId, ISUPDATE_from_mxstatus, pfree(), and MultiXactMember::xid.

Referenced by compute_new_xmax_infomask(), FreezeMultiXactId(), heap_lock_updated_tuple(), and HeapTupleGetUpdateXid().
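
A hedged, standalone rendering of the member scan above: at most one member of a multixact can be an updater, so we skip lockers and return the sole updating member's xid. The Member struct and is_update flag are invented simplifications of MultiXactMember and ISUPDATE_from_mxstatus(); zero stands in for InvalidTransactionId.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

typedef struct
{
    uint32_t xid;
    bool     is_update;     /* true for update/delete, false for lockers */
} Member;

static uint32_t
get_update_xid(const Member *members, int nmembers)
{
    uint32_t update_xact = 0;

    for (int i = 0; i < nmembers; i++)
    {
        if (!members[i].is_update)
            continue;           /* ignore lockers */

        /* there can be at most one updater */
        update_xact = members[i].xid;
        break;
    }
    return update_xact;
}

int
main(void)
{
    Member members[] = {{100, false}, {101, false}, {102, true}};

    printf("updater xid = %u\n", get_update_xid(members, 3));   /* 102 */
    return 0;
}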

◆ MultiXactIdWait()

static void MultiXactIdWait ( MultiXactId  multi,
MultiXactStatus  status,
uint16  infomask,
Relation  rel,
const ItemPointerData ctid,
XLTW_Oper  oper,
int remaining 
)
static

Definition at line 7853 of file heapam.c.

7856{
7857 (void) Do_MultiXactIdWait(multi, status, infomask, false,
7858 rel, ctid, oper, remaining, false);
7859}

References Do_MultiXactIdWait(), fb(), oper(), and remaining.

Referenced by heap_delete(), heap_inplace_lock(), heap_lock_tuple(), and heap_update().

◆ page_collect_tuples()

static pg_attribute_always_inline int page_collect_tuples ( HeapScanDesc  scan,
Snapshot  snapshot,
Page  page,
Buffer  buffer,
BlockNumber  block,
int  lines,
bool  all_visible,
bool  check_serializable 
)
static

Definition at line 521 of file heapam.c.

525{
526 Oid relid = RelationGetRelid(scan->rs_base.rs_rd);
527 int ntup = 0;
528 int nvis = 0;
530
531 /* page at a time should have been disabled otherwise */
532 Assert(IsMVCCSnapshot(snapshot));
533
534 /* first find all tuples on the page */
536 {
539
541 continue;
542
543 /*
544 * If the page is not all-visible or we need to check serializability,
545 * maintain enough state to be able to refind the tuple efficiently,
546 * without again first needing to fetch the item and then via that the
547 * tuple.
548 */
549 if (!all_visible || check_serializable)
550 {
551 tup = &batchmvcc.tuples[ntup];
552
554 tup->t_len = ItemIdGetLength(lpp);
555 tup->t_tableOid = relid;
556 ItemPointerSet(&(tup->t_self), block, lineoff);
557 }
558
559 /*
560 * If the page is all visible, these fields otherwise won't be
561 * populated in the loop below.
562 */
563 if (all_visible)
564 {
566 {
567 batchmvcc.visible[ntup] = true;
568 }
569 scan->rs_vistuples[ntup] = lineoff;
570 }
571
572 ntup++;
573 }
574
576
577 /*
578 * Unless the page is all visible, test visibility for all tuples in one go.
579 * That is considerably more efficient than calling
580 * HeapTupleSatisfiesMVCC() one-by-one.
581 */
582 if (all_visible)
583 nvis = ntup;
584 else
585 nvis = HeapTupleSatisfiesMVCCBatch(snapshot, buffer,
586 ntup,
587 &batchmvcc,
588 scan->rs_vistuples);
589
590 /*
591 * So far we don't have a batch API for testing serializability, so do so
592 * one-by-one.
593 */
595 {
596 for (int i = 0; i < ntup; i++)
597 {
599 scan->rs_base.rs_rd,
600 &batchmvcc.tuples[i],
601 buffer, snapshot);
602 }
603 }
604
605 return nvis;
606}

References Assert, fb(), FirstOffsetNumber, HeapCheckForSerializableConflictOut(), HeapTupleSatisfiesMVCCBatch(), i, IsMVCCSnapshot, ItemIdGetLength, ItemIdIsNormal, ItemPointerSet(), MaxHeapTuplesPerPage, PageGetItem(), PageGetItemId(), RelationGetRelid, HeapScanDescData::rs_base, TableScanDescData::rs_rd, HeapScanDescData::rs_vistuples, HeapTupleData::t_data, and unlikely.

Referenced by heap_prepare_pagescan().

◆ ReleaseBulkInsertStatePin()

void ReleaseBulkInsertStatePin ( BulkInsertState  bistate)

Definition at line 2103 of file heapam.c.

2104{
2105 if (bistate->current_buf != InvalidBuffer)
2106 ReleaseBuffer(bistate->current_buf);
2107 bistate->current_buf = InvalidBuffer;
2108
2109 /*
2110 * Despite the name, we also reset bulk relation extension state.
2111 * Otherwise we can end up erroring out due to looking for free space in
2112 * ->next_free of one partition, even though ->next_free was set when
2113 * extending another partition. It could obviously also be bad for
2114 * efficiency to look at existing blocks at offsets from another
2115 * partition, even if we don't error out.
2116 */
2117 bistate->next_free = InvalidBlockNumber;
2118 bistate->last_free = InvalidBlockNumber;
2119}

References BulkInsertStateData::current_buf, InvalidBlockNumber, InvalidBuffer, BulkInsertStateData::last_free, BulkInsertStateData::next_free, and ReleaseBuffer().

Referenced by CopyFrom().

◆ simple_heap_delete()

void simple_heap_delete ( Relation  relation,
const ItemPointerData tid 
)

Definition at line 3265 of file heapam.c.

3266{
3267 TM_Result result;
3268 TM_FailureData tmfd;
3269
3270 result = heap_delete(relation, tid,
3271 GetCurrentCommandId(true), InvalidSnapshot,
3272 true /* wait for commit */ ,
3273 &tmfd, false /* changingPart */ );
3274 switch (result)
3275 {
3276 case TM_SelfModified:
3277 /* Tuple was already updated in current command? */
3278 elog(ERROR, "tuple already updated by self");
3279 break;
3280
3281 case TM_Ok:
3282 /* done successfully */
3283 break;
3284
3285 case TM_Updated:
3286 elog(ERROR, "tuple concurrently updated");
3287 break;
3288
3289 case TM_Deleted:
3290 elog(ERROR, "tuple concurrently deleted");
3291 break;
3292
3293 default:
3294 elog(ERROR, "unrecognized heap_delete status: %u", result);
3295 break;
3296 }
3297}

References elog, ERROR, GetCurrentCommandId(), heap_delete(), InvalidSnapshot, TM_Deleted, TM_Ok, TM_SelfModified, and TM_Updated.

Referenced by CatalogTupleDelete(), and toast_delete_datum().
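As a hedged illustration (hypothetical helper, not from the source), the typical call shape is: fetch a tuple from a relation opened with RowExclusiveLock, then delete it by its TID. Any concurrent update or delete is promoted to an error, and dead index entries are left for VACUUM as usual; CatalogTupleDelete() is essentially this call for system catalogs.

#include "postgres.h"

#include "access/heapam.h"
#include "access/htup_details.h"
#include "utils/rel.h"

/* Hypothetical helper: delete a previously fetched, known-existing row. */
static void
delete_fetched_row(Relation rel, HeapTuple tuple)
{
    simple_heap_delete(rel, &tuple->t_self);
}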

◆ simple_heap_insert()

void simple_heap_insert ( Relation  relation,
HeapTuple  tup 
)

Definition at line 2784 of file heapam.c.

2785{
2786 heap_insert(relation, tup, GetCurrentCommandId(true), 0, NULL);
2787}

References fb(), GetCurrentCommandId(), and heap_insert().

Referenced by CatalogTupleInsert(), CatalogTupleInsertWithInfo(), and InsertOneTuple().
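A hedged sketch of the usual call sequence around it (the helper and the two-column layout are hypothetical): the caller forms the tuple, simple_heap_insert() stamps xmin and fills the TID, and index entries remain the caller's responsibility, which is what CatalogTupleInsert() adds for catalogs.

#include "postgres.h"

#include "access/heapam.h"
#include "access/htup_details.h"
#include "utils/rel.h"

/* Hypothetical: insert one row into a two-column relation. */
static void
insert_one_row(Relation rel, Datum col1, Datum col2)
{
    Datum       values[2] = {col1, col2};
    bool        nulls[2] = {false, false};
    HeapTuple   tup;

    tup = heap_form_tuple(RelationGetDescr(rel), values, nulls);
    simple_heap_insert(rel, tup);   /* fills tup->t_self */
    heap_freetuple(tup);
}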

◆ simple_heap_update()

void simple_heap_update ( Relation  relation,
const ItemPointerData otid,
HeapTuple  tup,
TU_UpdateIndexes update_indexes 
)

Definition at line 4555 of file heapam.c.

4557{
4558 TM_Result result;
4559 TM_FailureData tmfd;
4560 LockTupleMode lockmode;
4561
4562 result = heap_update(relation, otid, tup,
4563 GetCurrentCommandId(true), InvalidSnapshot,
4564 true /* wait for commit */ ,
4565 &tmfd, &lockmode, update_indexes);
4566 switch (result)
4567 {
4568 case TM_SelfModified:
4569 /* Tuple was already updated in current command? */
4570 elog(ERROR, "tuple already updated by self");
4571 break;
4572
4573 case TM_Ok:
4574 /* done successfully */
4575 break;
4576
4577 case TM_Updated:
4578 elog(ERROR, "tuple concurrently updated");
4579 break;
4580
4581 case TM_Deleted:
4582 elog(ERROR, "tuple concurrently deleted");
4583 break;
4584
4585 default:
4586 elog(ERROR, "unrecognized heap_update status: %u", result);
4587 break;
4588 }
4589}

References elog, ERROR, fb(), GetCurrentCommandId(), heap_update(), InvalidSnapshot, TM_Deleted, TM_Ok, TM_SelfModified, and TM_Updated.

Referenced by CatalogTupleUpdate(), and CatalogTupleUpdateWithInfo().
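A hedged sketch of typical usage (hypothetical helper and column layout): the caller builds the replacement tuple with heap_modify_tuple(), updates by the old TID, and then inspects the returned TU_UpdateIndexes value to decide which index entries still need to be inserted; CatalogTupleUpdate() wraps exactly this sequence for catalogs.

#include "postgres.h"

#include "access/heapam.h"
#include "access/htup_details.h"
#include "access/tableam.h"
#include "utils/rel.h"

/* Hypothetical: overwrite column 2 of an already-fetched row. */
static void
update_second_column(Relation rel, HeapTuple oldtup, Datum newval)
{
    Datum       values[2] = {(Datum) 0, newval};
    bool        nulls[2] = {false, false};
    bool        replace[2] = {false, true};
    HeapTuple   newtup;
    TU_UpdateIndexes update_indexes;

    newtup = heap_modify_tuple(oldtup, RelationGetDescr(rel),
                               values, nulls, replace);
    simple_heap_update(rel, &oldtup->t_self, newtup, &update_indexes);

    /*
     * If update_indexes is not TU_None, new index entries must be inserted
     * for newtup; catalog callers do that via CatalogIndexInsert().
     */
    heap_freetuple(newtup);
}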

◆ test_lockmode_for_conflict()

static TM_Result test_lockmode_for_conflict ( MultiXactStatus  status,
TransactionId  xid,
LockTupleMode  mode,
HeapTuple  tup,
bool needwait 
)
static

Definition at line 5675 of file heapam.c.

5678{
5679 MultiXactStatus wantedstatus;
5680
5681 *needwait = false;
5682 wantedstatus = get_mxact_status_for_lock(mode, false);
5683
5684 /*
5685 * Note: we *must* check TransactionIdIsInProgress before
5686 * TransactionIdDidAbort/Commit; see comment at top of heapam_visibility.c
5687 * for an explanation.
5688 */
5689 if (TransactionIdIsCurrentTransactionId(xid))
5690 {
5691 /*
5692 * The tuple has already been locked by our own transaction. This is
5693 * very rare but can happen if multiple transactions are trying to
5694 * lock an ancient version of the same tuple.
5695 */
5696 return TM_SelfModified;
5697 }
5698 else if (TransactionIdIsInProgress(xid))
5699 {
5700 /*
5701 * If the locking transaction is running, what we do depends on
5702 * whether the lock modes conflict: if they do, then we must wait for
5703 * it to finish; otherwise we can fall through to lock this tuple
5704 * version without waiting.
5705 */
5706 if (DoLockModesConflict(LOCKMODE_from_mxstatus(status),
5707 LOCKMODE_from_mxstatus(wantedstatus)))
5708 {
5709 *needwait = true;
5710 }
5711
5712 /*
5713 * If we set needwait above, then this value doesn't matter;
5714 * otherwise, this value signals to caller that it's okay to proceed.
5715 */
5716 return TM_Ok;
5717 }
5718 else if (TransactionIdDidAbort(xid))
5719 return TM_Ok;
5720 else if (TransactionIdDidCommit(xid))
5721 {
5722 /*
5723 * The other transaction committed. If it was only a locker, then the
5724 * lock is completely gone now and we can return success; but if it
5725 * was an update, then what we do depends on whether the two lock
5726 * modes conflict. If they conflict, then we must report error to
5727 * caller. But if they don't, we can fall through to allow the current
5728 * transaction to lock the tuple.
5729 *
5730 * Note: the reason we worry about ISUPDATE here is because as soon as
5731 * a transaction ends, all its locks are gone and meaningless, and
5732 * thus we can ignore them; whereas its updates persist. In the
5733 * TransactionIdIsInProgress case, above, we don't need to check
5734 * because we know the lock is still "alive" and thus a conflict needs
5735 * always be checked.
5736 */
5737 if (!ISUPDATE_from_mxstatus(status))
5738 return TM_Ok;
5739
5740 if (DoLockModesConflict(LOCKMODE_from_mxstatus(status),
5741 LOCKMODE_from_mxstatus(wantedstatus)))
5742 {
5743 /* bummer */
5744 if (!ItemPointerEquals(&tup->t_self, &tup->t_data->t_ctid))
5745 return TM_Updated;
5746 else
5747 return TM_Deleted;
5748 }
5749
5750 return TM_Ok;
5751 }
5752
5753 /* Not in progress, not aborted, not committed -- must have crashed */
5754 return TM_Ok;
5755}

References DoLockModesConflict(), fb(), get_mxact_status_for_lock(), ISUPDATE_from_mxstatus, ItemPointerEquals(), LOCKMODE_from_mxstatus, mode, TM_Deleted, TM_Ok, TM_SelfModified, TM_Updated, TransactionIdDidAbort(), TransactionIdDidCommit(), TransactionIdIsCurrentTransactionId(), and TransactionIdIsInProgress().

Referenced by heap_lock_updated_tuple_rec().
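Since the function is static, the only meaningful usage is the caller-side pattern in heap_lock_updated_tuple_rec(). The following is a simplified, hypothetical sketch of that pattern for a single multixact member, written as if it lived inside heapam.c next to the function; it is not the actual caller.

/* Assumes heapam.c's existing #includes; hypothetical helper for one member. */
static TM_Result
check_one_multixact_member(Relation rel, Buffer buf, HeapTuple tup,
                           MultiXactStatus memstatus, TransactionId memxid,
                           LockTupleMode mode)
{
    bool        needwait;
    TM_Result   res;

    res = test_lockmode_for_conflict(memstatus, memxid, mode, tup, &needwait);

    if (needwait)
    {
        /* Sleep on the in-progress locker without holding the buffer lock. */
        LockBuffer(buf, BUFFER_LOCK_UNLOCK);
        XactLockTableWait(memxid, rel, &tup->t_self, XLTW_LockUpdated);
        /* The real caller re-locks the buffer and restarts from scratch. */
    }

    /* TM_Ok: proceed; TM_Updated/TM_Deleted: report back up the chain. */
    return res;
}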

◆ UpdateXmaxHintBits()

◆ xmax_infomask_changed()

static bool xmax_infomask_changed ( uint16  new_infomask,
uint16  old_infomask 
)
inlinestatic

Definition at line 2819 of file heapam.c.

2820{
2821 const uint16 interesting =
2822 HEAP_XMAX_IS_MULTI | HEAP_XMAX_LOCK_ONLY | HEAP_LOCK_MASK;
2823
2824 if ((new_infomask & interesting) != (old_infomask & interesting))
2825 return true;
2826
2827 return false;
2828}

References fb(), HEAP_LOCK_MASK, HEAP_XMAX_IS_MULTI, and HEAP_XMAX_LOCK_ONLY.

Referenced by heap_delete(), heap_lock_tuple(), and heap_update().
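The callers use it in a re-check-and-retry pattern: before releasing the buffer lock to sleep on a concurrent locker, they remember the tuple's xmax and infomask, and after re-locking they retry the whole operation if either changed. A condensed, hypothetical sketch of that check, written as if it lived inside heapam.c:

/* Assumes heapam.c's existing #includes; hypothetical helper. */
static bool
tuple_lock_state_changed(HeapTupleHeader htup,
                         uint16 old_infomask, TransactionId old_xmax)
{
    /*
     * Either a change in the lock-related infomask bits or a different raw
     * xmax means another backend modified the tuple's locking state while we
     * slept, so the caller must loop back and re-evaluate.
     */
    return xmax_infomask_changed(htup->t_infomask, old_infomask) ||
        !TransactionIdEquals(HeapTupleHeaderGetRawXmax(htup), old_xmax);
}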

Variable Documentation

◆ hwlock

LOCKMODE hwlock

Definition at line 127 of file heapam.c.

◆ lockstatus

int lockstatus

Definition at line 128 of file heapam.c.

◆ MultiXactStatusLock

const int MultiXactStatusLock[MaxMultiXactStatus+1]
static
Initial value:

207{
208 LockTupleKeyShare, /* ForKeyShare */
209 LockTupleShare, /* ForShare */
210 LockTupleNoKeyExclusive, /* ForNoKeyUpdate */
211 LockTupleExclusive, /* ForUpdate */
212 LockTupleNoKeyExclusive, /* NoKeyUpdate */
213 LockTupleExclusive /* Update */
214};

Definition at line 206 of file heapam.c.

◆ [struct]

const struct { ... } tupleLockExtraInfo[]
Initial value:
=
{
 {
 .hwlock = AccessShareLock,
 .lockstatus = MultiXactStatusForKeyShare,
 .updstatus = -1
 },
 {
 .hwlock = RowShareLock,
 .lockstatus = MultiXactStatusForShare,
 .updstatus = -1
 },
 {
 .hwlock = ExclusiveLock,
 .lockstatus = MultiXactStatusForNoKeyUpdate,
 .updstatus = MultiXactStatusNoKeyUpdate
 },
 {
 .hwlock = AccessExclusiveLock,
 .lockstatus = MultiXactStatusForUpdate,
 .updstatus = MultiXactStatusUpdate
 }
}

Referenced by DoesMultiXactIdConflict(), and get_mxact_status_for_lock().
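The table is only consulted through the wrapper macros and get_mxact_status_for_lock(). A sketch of the two lookups, written as if inside heapam.c (the helper itself is hypothetical):

/* Assumes heapam.c's existing #includes; hypothetical helper. */
static void
describe_tuple_lock(LockTupleMode mode, bool is_update,
                    LOCKMODE *hwlock, int *mxstatus)
{
    /* Heavyweight lock taken on the tuple, as in LockTupleTuplock(). */
    *hwlock = tupleLockExtraInfo[mode].hwlock;

    /* MultiXact member status recorded, as in get_mxact_status_for_lock(). */
    *mxstatus = is_update ? tupleLockExtraInfo[mode].updstatus :
        tupleLockExtraInfo[mode].lockstatus;
}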

◆ updstatus

int updstatus

Definition at line 129 of file heapam.c.