PostgreSQL Source Code git master
Loading...
Searching...
No Matches
bufmgr.c File Reference
#include "postgres.h"
#include <sys/file.h>
#include <unistd.h>
#include "access/tableam.h"
#include "access/xloginsert.h"
#include "access/xlogutils.h"
#include "catalog/storage.h"
#include "catalog/storage_xlog.h"
#include "common/hashfn.h"
#include "executor/instrument.h"
#include "lib/binaryheap.h"
#include "miscadmin.h"
#include "pg_trace.h"
#include "pgstat.h"
#include "postmaster/bgwriter.h"
#include "storage/aio.h"
#include "storage/buf_internals.h"
#include "storage/bufmgr.h"
#include "storage/fd.h"
#include "storage/ipc.h"
#include "storage/lmgr.h"
#include "storage/proc.h"
#include "storage/proclist.h"
#include "storage/procsignal.h"
#include "storage/read_stream.h"
#include "storage/smgr.h"
#include "storage/standby.h"
#include "utils/memdebug.h"
#include "utils/ps_status.h"
#include "utils/rel.h"
#include "utils/resowner.h"
#include "utils/timestamp.h"
#include "utils/wait_event.h"
#include "lib/simplehash.h"
#include "lib/sort_template.h"
Include dependency graph for bufmgr.c:

Go to the source code of this file.

Data Structures

struct  PrivateRefCountData
 
struct  PrivateRefCountEntry
 
struct  CkptTsStatus
 
struct  SMgrSortArray
 

Macros

#define BufHdrGetBlock(bufHdr)   ((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ))
 
#define BufferGetLSN(bufHdr)   (PageGetLSN(BufHdrGetBlock(bufHdr)))
 
#define LocalBufHdrGetBlock(bufHdr)    LocalBufferBlockPointers[-((bufHdr)->buf_id + 2)]
 
#define BUF_WRITTEN   0x01
 
#define BUF_REUSABLE   0x02
 
#define RELS_BSEARCH_THRESHOLD   20
 
#define BUF_DROP_FULL_SCAN_THRESHOLD   (uint64) (NBuffers / 32)
 
#define SH_PREFIX   refcount
 
#define SH_ELEMENT_TYPE   PrivateRefCountEntry
 
#define SH_KEY_TYPE   Buffer
 
#define SH_KEY   buffer
 
#define SH_HASH_KEY(tb, key)   murmurhash32((uint32) (key))
 
#define SH_EQUAL(tb, a, b)   ((a) == (b))
 
#define SH_SCOPE   static inline
 
#define SH_DECLARE
 
#define SH_DEFINE
 
#define REFCOUNT_ARRAY_ENTRIES   8
 
#define BufferIsPinned(bufnum)
 
#define ST_SORT   sort_checkpoint_bufferids
 
#define ST_ELEMENT_TYPE   CkptSortItem
 
#define ST_COMPARE(a, b)   ckpt_buforder_comparator(a, b)
 
#define ST_SCOPE   static
 
#define ST_DEFINE
 
#define ST_SORT   sort_pending_writebacks
 
#define ST_ELEMENT_TYPE   PendingWriteback
 
#define ST_COMPARE(a, b)   buffertag_comparator(&a->tag, &b->tag)
 
#define ST_SCOPE   static
 
#define ST_DEFINE
 
#define READV_COUNT_BITS   7
 
#define READV_COUNT_MASK   ((1 << READV_COUNT_BITS) - 1)
 

Typedefs

typedef struct PrivateRefCountData PrivateRefCountData
 
typedef struct PrivateRefCountEntry PrivateRefCountEntry
 
typedef struct CkptTsStatus CkptTsStatus
 
typedef struct SMgrSortArray SMgrSortArray
 

Functions

static void ReservePrivateRefCountEntry (void)
 
static PrivateRefCountEntry * NewPrivateRefCountEntry (Buffer buffer)
 
static PrivateRefCountEntry * GetPrivateRefCountEntry (Buffer buffer, bool do_move)
 
static int32 GetPrivateRefCount (Buffer buffer)
 
static void ForgetPrivateRefCountEntry (PrivateRefCountEntry *ref)
 
static void ResOwnerReleaseBufferIO (Datum res)
 
static char * ResOwnerPrintBufferIO (Datum res)
 
static void ResOwnerReleaseBuffer (Datum res)
 
static char * ResOwnerPrintBuffer (Datum res)
 
static pg_noinline PrivateRefCountEntry * GetPrivateRefCountEntrySlow (Buffer buffer, bool do_move)
 
static Buffer ReadBuffer_common (Relation rel, SMgrRelation smgr, char smgr_persistence, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy)
 
static BlockNumber ExtendBufferedRelCommon (BufferManagerRelation bmr, ForkNumber fork, BufferAccessStrategy strategy, uint32 flags, uint32 extend_by, BlockNumber extend_upto, Buffer *buffers, uint32 *extended_by)
 
static BlockNumber ExtendBufferedRelShared (BufferManagerRelation bmr, ForkNumber fork, BufferAccessStrategy strategy, uint32 flags, uint32 extend_by, BlockNumber extend_upto, Buffer *buffers, uint32 *extended_by)
 
static bool PinBuffer (BufferDesc *buf, BufferAccessStrategy strategy, bool skip_if_not_valid)
 
static void PinBuffer_Locked (BufferDesc *buf)
 
static void UnpinBuffer (BufferDesc *buf)
 
static void UnpinBufferNoOwner (BufferDesc *buf)
 
static void BufferSync (int flags)
 
static int SyncOneBuffer (int buf_id, bool skip_recently_used, WritebackContext *wb_context)
 
static void WaitIO (BufferDesc *buf)
 
static void AbortBufferIO (Buffer buffer)
 
static void shared_buffer_write_error_callback (void *arg)
 
static void local_buffer_write_error_callback (void *arg)
 
static BufferDesc * BufferAlloc (SMgrRelation smgr, char relpersistence, ForkNumber forkNum, BlockNumber blockNum, BufferAccessStrategy strategy, bool *foundPtr, IOContext io_context)
 
static bool AsyncReadBuffers (ReadBuffersOperation *operation, int *nblocks_progress)
 
static void CheckReadBuffersOperation (ReadBuffersOperation *operation, bool is_complete)
 
static pg_attribute_always_inline void TrackBufferHit (IOObject io_object, IOContext io_context, Relation rel, char persistence, SMgrRelation smgr, ForkNumber forknum, BlockNumber blocknum)
 
static Buffer GetVictimBuffer (BufferAccessStrategy strategy, IOContext io_context)
 
static void FlushUnlockedBuffer (BufferDesc *buf, SMgrRelation reln, IOObject io_object, IOContext io_context)
 
static void FlushBuffer (BufferDesc *buf, SMgrRelation reln, IOObject io_object, IOContext io_context)
 
static void FindAndDropRelationBuffers (RelFileLocator rlocator, ForkNumber forkNum, BlockNumber nForkBlock, BlockNumber firstDelBlock)
 
static void RelationCopyStorageUsingBuffer (RelFileLocator srclocator, RelFileLocator dstlocator, ForkNumber forkNum, bool permanent)
 
static void AtProcExit_Buffers (int code, Datum arg)
 
static void CheckForBufferLeaks (void)
 
static int rlocator_comparator (const void *p1, const void *p2)
 
static int buffertag_comparator (const BufferTag *ba, const BufferTag *bb)
 
static int ckpt_buforder_comparator (const CkptSortItem *a, const CkptSortItem *b)
 
static int ts_ckpt_progress_comparator (Datum a, Datum b, void *arg)
 
static void BufferLockAcquire (Buffer buffer, BufferDesc *buf_hdr, BufferLockMode mode)
 
static void BufferLockUnlock (Buffer buffer, BufferDesc *buf_hdr)
 
static bool BufferLockConditional (Buffer buffer, BufferDesc *buf_hdr, BufferLockMode mode)
 
static bool BufferLockHeldByMeInMode (BufferDesc *buf_hdr, BufferLockMode mode)
 
static bool BufferLockHeldByMe (BufferDesc *buf_hdr)
 
static void BufferLockDisown (Buffer buffer, BufferDesc *buf_hdr)
 
static int BufferLockDisownInternal (Buffer buffer, BufferDesc *buf_hdr)
 
static bool BufferLockAttempt (BufferDesc *buf_hdr, BufferLockMode mode)
 
static void BufferLockQueueSelf (BufferDesc *buf_hdr, BufferLockMode mode)
 
static void BufferLockDequeueSelf (BufferDesc *buf_hdr)
 
static void BufferLockWakeup (BufferDesc *buf_hdr, bool unlocked)
 
static void BufferLockProcessRelease (BufferDesc *buf_hdr, BufferLockMode mode, uint64 lockstate)
 
static uint64 BufferLockReleaseSub (BufferLockMode mode)
 
PrefetchBufferResult PrefetchSharedBuffer (SMgrRelation smgr_reln, ForkNumber forkNum, BlockNumber blockNum)
 
PrefetchBufferResult PrefetchBuffer (Relation reln, ForkNumber forkNum, BlockNumber blockNum)
 
bool ReadRecentBuffer (RelFileLocator rlocator, ForkNumber forkNum, BlockNumber blockNum, Buffer recent_buffer)
 
Buffer ReadBuffer (Relation reln, BlockNumber blockNum)
 
Buffer ReadBufferExtended (Relation reln, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy)
 
Buffer ReadBufferWithoutRelcache (RelFileLocator rlocator, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy, bool permanent)
 
Buffer ExtendBufferedRel (BufferManagerRelation bmr, ForkNumber forkNum, BufferAccessStrategy strategy, uint32 flags)
 
BlockNumber ExtendBufferedRelBy (BufferManagerRelation bmr, ForkNumber fork, BufferAccessStrategy strategy, uint32 flags, uint32 extend_by, Buffer *buffers, uint32 *extended_by)
 
Buffer ExtendBufferedRelTo (BufferManagerRelation bmr, ForkNumber fork, BufferAccessStrategy strategy, uint32 flags, BlockNumber extend_to, ReadBufferMode mode)
 
static void ZeroAndLockBuffer (Buffer buffer, ReadBufferMode mode, bool already_valid)
 
static pg_attribute_always_inline Buffer PinBufferForBlock (Relation rel, SMgrRelation smgr, char persistence, ForkNumber forkNum, BlockNumber blockNum, BufferAccessStrategy strategy, IOObject io_object, IOContext io_context, bool *foundPtr)
 
static pg_attribute_always_inline bool StartReadBuffersImpl (ReadBuffersOperation *operation, Buffer *buffers, BlockNumber blockNum, int *nblocks, int flags, bool allow_forwarding)
 
bool StartReadBuffers (ReadBuffersOperation *operation, Buffer *buffers, BlockNumber blockNum, int *nblocks, int flags)
 
bool StartReadBuffer (ReadBuffersOperation *operation, Buffer *buffer, BlockNumber blocknum, int flags)
 
static void ProcessReadBuffersResult (ReadBuffersOperation *operation)
 
bool WaitReadBuffers (ReadBuffersOperation *operation)
 
static void InvalidateBuffer (BufferDesc *buf)
 
static bool InvalidateVictimBuffer (BufferDesc *buf_hdr)
 
uint32 GetPinLimit (void)
 
uint32 GetAdditionalPinLimit (void)
 
void LimitAdditionalPins (uint32 *additional_pins)
 
bool BufferIsLockedByMe (Buffer buffer)
 
bool BufferIsLockedByMeInMode (Buffer buffer, BufferLockMode mode)
 
bool BufferIsDirty (Buffer buffer)
 
void MarkBufferDirty (Buffer buffer)
 
Buffer ReleaseAndReadBuffer (Buffer buffer, Relation relation, BlockNumber blockNum)
 
static void WakePinCountWaiter (BufferDesc *buf)
 
void TrackNewBufferPin (Buffer buf)
 
bool BgBufferSync (WritebackContext *wb_context)
 
void AtEOXact_Buffers (bool isCommit)
 
void InitBufferManagerAccess (void)
 
char * DebugPrintBufferRefcount (Buffer buffer)
 
void CheckPointBuffers (int flags)
 
BlockNumber BufferGetBlockNumber (Buffer buffer)
 
void BufferGetTag (Buffer buffer, RelFileLocator *rlocator, ForkNumber *forknum, BlockNumber *blknum)
 
BlockNumber RelationGetNumberOfBlocksInFork (Relation relation, ForkNumber forkNum)
 
bool BufferIsPermanent (Buffer buffer)
 
XLogRecPtr BufferGetLSNAtomic (Buffer buffer)
 
void DropRelationBuffers (SMgrRelation smgr_reln, ForkNumber *forkNum, int nforks, BlockNumber *firstDelBlock)
 
void DropRelationsAllBuffers (SMgrRelation *smgr_reln, int nlocators)
 
void DropDatabaseBuffers (Oid dbid)
 
void FlushRelationBuffers (Relation rel)
 
void FlushRelationsAllBuffers (SMgrRelation *smgrs, int nrels)
 
void CreateAndCopyRelationData (RelFileLocator src_rlocator, RelFileLocator dst_rlocator, bool permanent)
 
void FlushDatabaseBuffers (Oid dbid)
 
void FlushOneBuffer (Buffer buffer)
 
void ReleaseBuffer (Buffer buffer)
 
void UnlockReleaseBuffer (Buffer buffer)
 
void IncrBufferRefCount (Buffer buffer)
 
static void MarkSharedBufferDirtyHint (Buffer buffer, BufferDesc *bufHdr, uint64 lockstate, bool buffer_std)
 
void MarkBufferDirtyHint (Buffer buffer, bool buffer_std)
 
void UnlockBuffers (void)
 
void UnlockBuffer (Buffer buffer)
 
void LockBufferInternal (Buffer buffer, BufferLockMode mode)
 
bool ConditionalLockBuffer (Buffer buffer)
 
void CheckBufferIsPinnedOnce (Buffer buffer)
 
void LockBufferForCleanup (Buffer buffer)
 
bool HoldingBufferPinThatDelaysRecovery (void)
 
bool ConditionalLockBufferForCleanup (Buffer buffer)
 
bool IsBufferCleanupOK (Buffer buffer)
 
static bool SharedBufferBeginSetHintBits (Buffer buffer, BufferDesc *buf_hdr, uint64 *lockstate)
 
bool BufferBeginSetHintBits (Buffer buffer)
 
void BufferFinishSetHintBits (Buffer buffer, bool mark_dirty, bool buffer_std)
 
bool BufferSetHintBits16 (uint16 *ptr, uint16 val, Buffer buffer)
 
StartBufferIOResult StartSharedBufferIO (BufferDesc *buf, bool forInput, bool wait, PgAioWaitRef *io_wref)
 
StartBufferIOResult StartBufferIO (Buffer buffer, bool forInput, bool wait, PgAioWaitRef *io_wref)
 
void TerminateBufferIO (BufferDesc *buf, bool clear_dirty, uint64 set_flag_bits, bool forget_owner, bool release_aio)
 
uint64 LockBufHdr (BufferDesc *desc)
 
pg_noinline uint64 WaitBufHdrUnlocked (BufferDesc *buf)
 
void WritebackContextInit (WritebackContext *context, int *max_pending)
 
void ScheduleBufferTagForWriteback (WritebackContext *wb_context, IOContext io_context, BufferTag *tag)
 
void IssuePendingWritebacks (WritebackContext *wb_context, IOContext io_context)
 
static bool EvictUnpinnedBufferInternal (BufferDesc *desc, bool *buffer_flushed)
 
bool EvictUnpinnedBuffer (Buffer buf, bool *buffer_flushed)
 
void EvictAllUnpinnedBuffers (int32 *buffers_evicted, int32 *buffers_flushed, int32 *buffers_skipped)
 
void EvictRelUnpinnedBuffers (Relation rel, int32 *buffers_evicted, int32 *buffers_flushed, int32 *buffers_skipped)
 
static bool MarkDirtyUnpinnedBufferInternal (Buffer buf, BufferDesc *desc, bool *buffer_already_dirty)
 
bool MarkDirtyUnpinnedBuffer (Buffer buf, bool *buffer_already_dirty)
 
void MarkDirtyRelUnpinnedBuffers (Relation rel, int32 *buffers_dirtied, int32 *buffers_already_dirty, int32 *buffers_skipped)
 
void MarkDirtyAllUnpinnedBuffers (int32 *buffers_dirtied, int32 *buffers_already_dirty, int32 *buffers_skipped)
 
static pg_attribute_always_inline void buffer_stage_common (PgAioHandle *ioh, bool is_write, bool is_temp)
 
static void buffer_readv_decode_error (PgAioResult result, bool *zeroed_any, bool *ignored_any, uint8 *zeroed_or_error_count, uint8 *checkfail_count, uint8 *first_off)
 
static void buffer_readv_encode_error (PgAioResult *result, bool is_temp, bool zeroed_any, bool ignored_any, uint8 error_count, uint8 zeroed_count, uint8 checkfail_count, uint8 first_error_off, uint8 first_zeroed_off, uint8 first_ignored_off)
 
static pg_attribute_always_inline void buffer_readv_complete_one (PgAioTargetData *td, uint8 buf_off, Buffer buffer, uint8 flags, bool failed, bool is_temp, bool *buffer_invalid, bool *failed_checksum, bool *ignored_checksum, bool *zeroed_buffer)
 
static pg_attribute_always_inline PgAioResult buffer_readv_complete (PgAioHandle *ioh, PgAioResult prior_result, uint8 cb_data, bool is_temp)
 
static void buffer_readv_report (PgAioResult result, const PgAioTargetData *td, int elevel)
 
static void shared_buffer_readv_stage (PgAioHandle *ioh, uint8 cb_data)
 
static PgAioResult shared_buffer_readv_complete (PgAioHandle *ioh, PgAioResult prior_result, uint8 cb_data)
 
static PgAioResult shared_buffer_readv_complete_local (PgAioHandle *ioh, PgAioResult prior_result, uint8 cb_data)
 
static void local_buffer_readv_stage (PgAioHandle *ioh, uint8 cb_data)
 
static PgAioResult local_buffer_readv_complete (PgAioHandle *ioh, PgAioResult prior_result, uint8 cb_data)
 

Variables

bool zero_damaged_pages = false
 
int bgwriter_lru_maxpages = 100
 
double bgwriter_lru_multiplier = 2.0
 
bool track_io_timing = false
 
int effective_io_concurrency = DEFAULT_EFFECTIVE_IO_CONCURRENCY
 
int maintenance_io_concurrency = DEFAULT_MAINTENANCE_IO_CONCURRENCY
 
int io_combine_limit = DEFAULT_IO_COMBINE_LIMIT
 
int io_combine_limit_guc = DEFAULT_IO_COMBINE_LIMIT
 
int io_max_combine_limit = DEFAULT_IO_COMBINE_LIMIT
 
int checkpoint_flush_after = DEFAULT_CHECKPOINT_FLUSH_AFTER
 
int bgwriter_flush_after = DEFAULT_BGWRITER_FLUSH_AFTER
 
int backend_flush_after = DEFAULT_BACKEND_FLUSH_AFTER
 
static BufferDesc * PinCountWaitBuf = NULL
 
static Buffer PrivateRefCountArrayKeys [REFCOUNT_ARRAY_ENTRIES]
 
static struct PrivateRefCountEntry PrivateRefCountArray [REFCOUNT_ARRAY_ENTRIES]
 
static refcount_hash * PrivateRefCountHash = NULL
 
static int32 PrivateRefCountOverflowed = 0
 
static uint32 PrivateRefCountClock = 0
 
static int ReservedRefCountSlot = -1
 
static int PrivateRefCountEntryLast = -1
 
static uint32 MaxProportionalPins
 
const ResourceOwnerDesc buffer_io_resowner_desc
 
const ResourceOwnerDesc buffer_resowner_desc
 
const PgAioHandleCallbacks aio_shared_buffer_readv_cb
 
const PgAioHandleCallbacks aio_local_buffer_readv_cb
 

Macro Definition Documentation

◆ BUF_DROP_FULL_SCAN_THRESHOLD

#define BUF_DROP_FULL_SCAN_THRESHOLD   (uint64) (NBuffers / 32)

Definition at line 95 of file bufmgr.c.

◆ BUF_REUSABLE

#define BUF_REUSABLE   0x02

Definition at line 85 of file bufmgr.c.

◆ BUF_WRITTEN

#define BUF_WRITTEN   0x01

Definition at line 84 of file bufmgr.c.

◆ BufferGetLSN

#define BufferGetLSN (   bufHdr)    (PageGetLSN(BufHdrGetBlock(bufHdr)))

Definition at line 77 of file bufmgr.c.

◆ BufferIsPinned

#define BufferIsPinned (   bufnum)
Value:
( \
 !BufferIsValid(bufnum) ? \
 false \
 : \
 BufferIsLocal(bufnum) ? \
 (LocalRefCount[-(bufnum) - 1] > 0) \
 : \
 (GetPrivateRefCount(bufnum) > 0) \
)
static int32 GetPrivateRefCount(Buffer buffer)
Definition bufmgr.c:542
static bool BufferIsValid(Buffer bufnum)
Definition bufmgr.h:419
int32 * LocalRefCount
Definition localbuf.c:49
static int fb(int x)

Definition at line 599 of file bufmgr.c.

603 : \
605 (LocalRefCount[-(bufnum) - 1] > 0) \
606 : \
608)

◆ BufHdrGetBlock

#define BufHdrGetBlock (   bufHdr)    ((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ))

Definition at line 76 of file bufmgr.c.

◆ LocalBufHdrGetBlock

#define LocalBufHdrGetBlock (   bufHdr)     LocalBufferBlockPointers[-((bufHdr)->buf_id + 2)]

Definition at line 80 of file bufmgr.c.

◆ READV_COUNT_BITS

#define READV_COUNT_BITS   7

◆ READV_COUNT_MASK

#define READV_COUNT_MASK   ((1 << READV_COUNT_BITS) - 1)

◆ REFCOUNT_ARRAY_ENTRIES

#define REFCOUNT_ARRAY_ENTRIES   8

Definition at line 145 of file bufmgr.c.

◆ RELS_BSEARCH_THRESHOLD

#define RELS_BSEARCH_THRESHOLD   20

Definition at line 87 of file bufmgr.c.

◆ SH_DECLARE

#define SH_DECLARE

Definition at line 140 of file bufmgr.c.

◆ SH_DEFINE

#define SH_DEFINE

Definition at line 141 of file bufmgr.c.

◆ SH_ELEMENT_TYPE

#define SH_ELEMENT_TYPE   PrivateRefCountEntry

Definition at line 134 of file bufmgr.c.

◆ SH_EQUAL

#define SH_EQUAL (   tb,
  a,
  b 
)    ((a) == (b))

Definition at line 138 of file bufmgr.c.

◆ SH_HASH_KEY

#define SH_HASH_KEY (   tb,
  key 
)    murmurhash32((uint32) (key))

Definition at line 137 of file bufmgr.c.

◆ SH_KEY

#define SH_KEY   buffer

Definition at line 136 of file bufmgr.c.

◆ SH_KEY_TYPE

#define SH_KEY_TYPE   Buffer

Definition at line 135 of file bufmgr.c.

◆ SH_PREFIX

#define SH_PREFIX   refcount

Definition at line 133 of file bufmgr.c.

◆ SH_SCOPE

#define SH_SCOPE   static inline

Definition at line 139 of file bufmgr.c.

◆ ST_COMPARE [1/2]

#define ST_COMPARE (   a,
  b 
)    ckpt_buforder_comparator(a, b)

Definition at line 3536 of file bufmgr.c.

◆ ST_COMPARE [2/2]

#define ST_COMPARE (   a,
  b 
)    buffertag_comparator(&a->tag, &b->tag)

Definition at line 3536 of file bufmgr.c.

◆ ST_DEFINE [1/2]

#define ST_DEFINE

Definition at line 3538 of file bufmgr.c.

◆ ST_DEFINE [2/2]

#define ST_DEFINE

Definition at line 3538 of file bufmgr.c.

◆ ST_ELEMENT_TYPE [1/2]

#define ST_ELEMENT_TYPE   CkptSortItem

Definition at line 3535 of file bufmgr.c.

◆ ST_ELEMENT_TYPE [2/2]

#define ST_ELEMENT_TYPE   PendingWriteback

Definition at line 3535 of file bufmgr.c.

◆ ST_SCOPE [1/2]

#define ST_SCOPE   static

Definition at line 3537 of file bufmgr.c.

◆ ST_SCOPE [2/2]

#define ST_SCOPE   static

Definition at line 3537 of file bufmgr.c.

◆ ST_SORT [1/2]

Definition at line 3534 of file bufmgr.c.

◆ ST_SORT [2/2]

Definition at line 3534 of file bufmgr.c.

Typedef Documentation

◆ CkptTsStatus

◆ PrivateRefCountData

◆ PrivateRefCountEntry

◆ SMgrSortArray

Function Documentation

◆ AbortBufferIO()

static void AbortBufferIO ( Buffer  buffer)
static

Definition at line 7420 of file bufmgr.c.

7421{
7422 BufferDesc *buf_hdr = GetBufferDescriptor(buffer - 1);
7424
7427
7428 if (!(buf_state & BM_VALID))
7429 {
7432 }
7433 else
7434 {
7437
7438 /* Issue notice if this is not the first failure... */
7439 if (buf_state & BM_IO_ERROR)
7440 {
7441 /* Buffer is pinned, so we can read tag without spinlock */
7444 errmsg("could not write block %u of %s",
7445 buf_hdr->tag.blockNum,
7447 BufTagGetForkNum(&buf_hdr->tag)).str),
7448 errdetail("Multiple failures --- write error might be permanent.")));
7449 }
7450 }
7451
7452 TerminateBufferIO(buf_hdr, false, BM_IO_ERROR, false, false);
7453}
#define BM_TAG_VALID
static ForkNumber BufTagGetForkNum(const BufferTag *tag)
static void UnlockBufHdr(BufferDesc *desc)
#define BM_DIRTY
#define BM_IO_IN_PROGRESS
static RelFileLocator BufTagGetRelFileLocator(const BufferTag *tag)
#define BM_VALID
#define BM_IO_ERROR
static BufferDesc * GetBufferDescriptor(uint32 id)
uint64 LockBufHdr(BufferDesc *desc)
Definition bufmgr.c:7518
void TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint64 set_flag_bits, bool forget_owner, bool release_aio)
Definition bufmgr.c:7358
#define Assert(condition)
Definition c.h:943
uint64_t uint64
Definition c.h:625
int errcode(int sqlerrcode)
Definition elog.c:874
int errdetail(const char *fmt,...) pg_attribute_printf(1
#define WARNING
Definition elog.h:37
#define ereport(elevel,...)
Definition elog.h:152
static char * errmsg
#define relpathperm(rlocator, forknum)
Definition relpath.h:146

References Assert, BM_DIRTY, BM_IO_ERROR, BM_IO_IN_PROGRESS, BM_TAG_VALID, BM_VALID, PrivateRefCountEntry::buffer, BufTagGetForkNum(), BufTagGetRelFileLocator(), ereport, errcode(), errdetail(), errmsg, fb(), GetBufferDescriptor(), LockBufHdr(), relpathperm, TerminateBufferIO(), UnlockBufHdr(), and WARNING.

Referenced by ResOwnerReleaseBufferIO().

◆ AsyncReadBuffers()

static bool AsyncReadBuffers ( ReadBuffersOperation *  operation,
int *  nblocks_progress 
)
static

Definition at line 1929 of file bufmgr.c.

1930{
1931 Buffer *buffers = &operation->buffers[0];
1932 int flags = operation->flags;
1933 ForkNumber forknum = operation->forknum;
1934 char persistence = operation->persistence;
1935 int16 nblocks_done = operation->nblocks_done;
1936 BlockNumber blocknum = operation->blocknum + nblocks_done;
1937 Buffer *io_buffers = &operation->buffers[nblocks_done];
1938 int io_buffers_len = 0;
1940 uint32 ioh_flags = 0;
1945 StartBufferIOResult status;
1946
1947 if (persistence == RELPERSISTENCE_TEMP)
1948 {
1951 }
1952 else
1953 {
1956 }
1957
1958 /*
1959 * When this IO is executed synchronously, either because the caller will
1960 * immediately block waiting for the IO or because IOMETHOD_SYNC is used,
1961 * the AIO subsystem needs to know.
1962 */
1963 if (flags & READ_BUFFERS_SYNCHRONOUSLY)
1965
1966 if (persistence == RELPERSISTENCE_TEMP)
1968
1969 /*
1970 * If zero_damaged_pages is enabled, add the READ_BUFFERS_ZERO_ON_ERROR
1971 * flag. The reason for that is that, hopefully, zero_damaged_pages isn't
1972 * set globally, but on a per-session basis. The completion callback,
1973 * which may be run in other processes, e.g. in IO workers, may have a
1974 * different value of the zero_damaged_pages GUC.
1975 *
1976 * XXX: We probably should eventually use a different flag for
1977 * zero_damaged_pages, so we can report different log levels / error codes
1978 * for zero_damaged_pages and ZERO_ON_ERROR.
1979 */
1982
1983 /*
1984 * For the same reason as with zero_damaged_pages we need to use this
1985 * backend's ignore_checksum_failure value.
1986 */
1989
1990
1991 /*
1992 * To be allowed to report stats in the local completion callback we need
1993 * to prepare to report stats now. This ensures we can safely report the
1994 * checksum failure even in a critical section.
1995 */
1996 pgstat_prepare_report_checksum_failure(operation->smgr->smgr_rlocator.locator.dbOid);
1997
1998 /*
1999 * We must get an IO handle before StartBufferIO(), as pgaio_io_acquire()
2000 * might block, which we don't want after setting IO_IN_PROGRESS. If we
2001 * don't need to do the IO, we'll release the handle.
2002 *
2003 * If we need to wait for IO before we can get a handle, submit
2004 * already-staged IO first, so that other backends don't need to wait.
2005 * There wouldn't be a deadlock risk, as pgaio_io_acquire() just needs to
2006 * wait for already submitted IO, which doesn't require additional locks,
2007 * but it could still cause undesirable waits.
2008 *
2009 * A secondary benefit is that this would allow us to measure the time in
2010 * pgaio_io_acquire() without causing undue timer overhead in the common,
2011 * non-blocking, case. However, currently the pgstats infrastructure
2012 * doesn't really allow that, as it a) asserts that an operation can't
2013 * have time without operations b) doesn't have an API to report
2014 * "accumulated" time.
2015 */
2017 if (unlikely(!ioh))
2018 {
2021 }
2022
2023 operation->foreign_io = false;
2024 pgaio_wref_clear(&operation->io_wref);
2025
2026 /*
2027 * Try to start IO on the first buffer in a new run of blocks. If AIO is
2028 * in progress, be it in this backend or another backend, we just
2029 * associate the wait reference with the operation and wait in
2030 * WaitReadBuffers(). This turns out to be important for performance in
2031 * two workloads:
2032 *
2033 * 1) A read stream that has to read the same block multiple times within
2034 * the readahead distance. This can happen e.g. for the table accesses of
2035 * an index scan.
2036 *
2037 * 2) Concurrent scans by multiple backends on the same relation.
2038 *
2039 * If we were to synchronously wait for the in-progress IO, we'd not be
2040 * able to keep enough I/O in flight.
2041 *
2042 * If we do find there is ongoing I/O for the buffer, we set up a 1-block
2043 * ReadBuffersOperation that WaitReadBuffers then can wait on.
2044 *
2045 * It's possible that another backend has started IO on the buffer but not
2046 * yet set its wait reference. In this case, we have no choice but to wait
2047 * for either the wait reference to be valid or the IO to be done.
2048 */
2049 status = StartBufferIO(buffers[nblocks_done], true, true,
2050 &operation->io_wref);
2051 if (status != BUFFER_IO_READY_FOR_IO)
2052 {
2054 *nblocks_progress = 1;
2055 if (status == BUFFER_IO_ALREADY_DONE)
2056 {
2057 /*
2058 * Someone has already completed this block, we're done.
2059 *
2060 * When IO is necessary, ->nblocks_done is updated in
2061 * ProcessReadBuffersResult(), but that is not called if no IO is
2062 * necessary. Thus update here.
2063 */
2064 operation->nblocks_done += 1;
2065 Assert(operation->nblocks_done <= operation->nblocks);
2066
2067 Assert(!pgaio_wref_valid(&operation->io_wref));
2068
2069 /*
2070 * Report and track this as a 'hit' for this backend, even though
2071 * it must have started out as a miss in PinBufferForBlock(). The
2072 * other backend will track this as a 'read'.
2073 */
2075 operation->rel, operation->persistence,
2076 operation->smgr, operation->forknum,
2077 blocknum);
2078 return false;
2079 }
2080
2081 /* The IO is already in-progress */
2082 Assert(status == BUFFER_IO_IN_PROGRESS);
2083 Assert(pgaio_wref_valid(&operation->io_wref));
2084 operation->foreign_io = true;
2085
2086 return true;
2087 }
2088
2089 Assert(io_buffers[0] == buffers[nblocks_done]);
2090 io_pages[0] = BufferGetBlock(buffers[nblocks_done]);
2091 io_buffers_len = 1;
2092
2093 /*
2094 * NB: As little code as possible should be added between the
2095 * StartBufferIO() above, the further StartBufferIO()s below and the
2096 * smgrstartreadv(), as some of the buffers are now marked as
2097 * IO_IN_PROGRESS and will thus cause other backends to wait.
2098 */
2099
2100 /*
2101 * How many neighboring-on-disk blocks can we scatter-read into other
2102 * buffers at the same time? In this case we don't wait if we see an I/O
2103 * already in progress (see comment above).
2104 */
2105 for (int i = nblocks_done + 1; i < operation->nblocks; i++)
2106 {
2107 /* Must be consecutive block numbers. */
2108 Assert(BufferGetBlockNumber(buffers[i - 1]) ==
2109 BufferGetBlockNumber(buffers[i]) - 1);
2110
2111 status = StartBufferIO(buffers[i], true, false, NULL);
2112 if (status != BUFFER_IO_READY_FOR_IO)
2113 break;
2114
2115 Assert(io_buffers[io_buffers_len] == buffers[i]);
2116
2117 io_pages[io_buffers_len++] = BufferGetBlock(buffers[i]);
2118 }
2119
2120 /* get a reference to wait for in WaitReadBuffers() */
2121 pgaio_io_get_wref(ioh, &operation->io_wref);
2122
2123 /* provide the list of buffers to the completion callbacks */
2125
2127 persistence == RELPERSISTENCE_TEMP ?
2130 flags);
2131
2133
2134 /* ---
2135 * Even though we're trying to issue IO asynchronously, track the time
2136 * in smgrstartreadv():
2137 * - if io_method == IOMETHOD_SYNC, we will always perform the IO
2138 * immediately
2139 * - the io method might not support the IO (e.g. worker IO for a temp
2140 * table)
2141 * ---
2142 */
2144 smgrstartreadv(ioh, operation->smgr, forknum,
2145 blocknum,
2149
2150 if (persistence == RELPERSISTENCE_TEMP)
2152 else
2154
2155 /*
2156 * Track vacuum cost when issuing IO, not after waiting for it. Otherwise
2157 * we could end up issuing a lot of IO in a short timespan, despite a low
2158 * cost limit.
2159 */
2160 if (VacuumCostActive)
2162
2164
2165 return true;
2166}
bool pgaio_wref_valid(PgAioWaitRef *iow)
Definition aio.c:971
PgAioHandle * pgaio_io_acquire(struct ResourceOwnerData *resowner, PgAioReturn *ret)
Definition aio.c:162
void pgaio_wref_clear(PgAioWaitRef *iow)
Definition aio.c:964
void pgaio_io_get_wref(PgAioHandle *ioh, PgAioWaitRef *iow)
Definition aio.c:366
void pgaio_io_set_flag(PgAioHandle *ioh, PgAioHandleFlags flag)
Definition aio.c:330
void pgaio_submit_staged(void)
Definition aio.c:1133
void pgaio_io_release(PgAioHandle *ioh)
Definition aio.c:240
PgAioHandle * pgaio_io_acquire_nb(struct ResourceOwnerData *resowner, PgAioReturn *ret)
Definition aio.c:188
@ PGAIO_HCB_LOCAL_BUFFER_READV
Definition aio.h:200
@ PGAIO_HCB_SHARED_BUFFER_READV
Definition aio.h:198
@ PGAIO_HF_SYNCHRONOUS
Definition aio.h:70
@ PGAIO_HF_REFERENCES_LOCAL
Definition aio.h:60
void pgaio_io_set_handle_data_32(PgAioHandle *ioh, uint32 *data, uint8 len)
void pgaio_io_register_callbacks(PgAioHandle *ioh, PgAioHandleCallbackID cb_id, uint8 cb_data)
uint32 BlockNumber
Definition block.h:31
int Buffer
Definition buf.h:23
StartBufferIOResult
@ BUFFER_IO_IN_PROGRESS
@ BUFFER_IO_ALREADY_DONE
@ BUFFER_IO_READY_FOR_IO
bool track_io_timing
Definition bufmgr.c:192
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition bufmgr.c:4446
bool zero_damaged_pages
Definition bufmgr.c:189
static pg_attribute_always_inline void TrackBufferHit(IOObject io_object, IOContext io_context, Relation rel, char persistence, SMgrRelation smgr, ForkNumber forknum, BlockNumber blocknum)
Definition bufmgr.c:1674
StartBufferIOResult StartBufferIO(Buffer buffer, bool forInput, bool wait, PgAioWaitRef *io_wref)
Definition bufmgr.c:7321
#define READ_BUFFERS_ZERO_ON_ERROR
Definition bufmgr.h:122
static Block BufferGetBlock(Buffer buffer)
Definition bufmgr.h:435
#define MAX_IO_COMBINE_LIMIT
Definition bufmgr.h:175
#define READ_BUFFERS_IGNORE_CHECKSUM_FAILURES
Definition bufmgr.h:126
#define READ_BUFFERS_SYNCHRONOUSLY
Definition bufmgr.h:128
bool ignore_checksum_failure
Definition bufpage.c:27
int16_t int16
Definition c.h:619
#define unlikely(x)
Definition c.h:438
uint32_t uint32
Definition c.h:624
static DataChecksumsWorkerOperation operation
IOContext IOContextForStrategy(BufferAccessStrategy strategy)
Definition freelist.c:712
int VacuumCostPageMiss
Definition globals.c:155
bool VacuumCostActive
Definition globals.c:161
int VacuumCostBalance
Definition globals.c:160
BufferUsage pgBufferUsage
Definition instrument.c:25
int i
Definition isn.c:77
IOObject
Definition pgstat.h:280
@ IOOBJECT_RELATION
Definition pgstat.h:281
@ IOOBJECT_TEMP_RELATION
Definition pgstat.h:282
IOContext
Definition pgstat.h:289
@ IOCONTEXT_NORMAL
Definition pgstat.h:293
@ IOOP_READ
Definition pgstat.h:319
void pgstat_prepare_report_checksum_failure(Oid dboid)
instr_time pgstat_prepare_io_time(bool track_io_guc)
Definition pgstat_io.c:91
void pgstat_count_io_op_time(IOObject io_object, IOContext io_context, IOOp io_op, instr_time start_time, uint32 cnt, uint64 bytes)
Definition pgstat_io.c:122
ForkNumber
Definition relpath.h:56
ResourceOwner CurrentResourceOwner
Definition resowner.c:173
void smgrstartreadv(PgAioHandle *ioh, SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, void **buffers, BlockNumber nblocks)
Definition smgr.c:753
int64 shared_blks_read
Definition instrument.h:27
int64 local_blks_read
Definition instrument.h:31

References Assert, BUFFER_IO_ALREADY_DONE, BUFFER_IO_IN_PROGRESS, BUFFER_IO_READY_FOR_IO, BufferGetBlock(), BufferGetBlockNumber(), CurrentResourceOwner, fb(), i, ignore_checksum_failure, IOCONTEXT_NORMAL, IOContextForStrategy(), IOOBJECT_RELATION, IOOBJECT_TEMP_RELATION, IOOP_READ, BufferUsage::local_blks_read, MAX_IO_COMBINE_LIMIT, operation, PGAIO_HCB_LOCAL_BUFFER_READV, PGAIO_HCB_SHARED_BUFFER_READV, PGAIO_HF_REFERENCES_LOCAL, PGAIO_HF_SYNCHRONOUS, pgaio_io_acquire(), pgaio_io_acquire_nb(), pgaio_io_get_wref(), pgaio_io_register_callbacks(), pgaio_io_release(), pgaio_io_set_flag(), pgaio_io_set_handle_data_32(), pgaio_submit_staged(), pgaio_wref_clear(), pgaio_wref_valid(), pgBufferUsage, pgstat_count_io_op_time(), pgstat_prepare_io_time(), pgstat_prepare_report_checksum_failure(), READ_BUFFERS_IGNORE_CHECKSUM_FAILURES, READ_BUFFERS_SYNCHRONOUSLY, READ_BUFFERS_ZERO_ON_ERROR, BufferUsage::shared_blks_read, smgrstartreadv(), StartBufferIO(), PrivateRefCountEntry::status, track_io_timing, TrackBufferHit(), unlikely, VacuumCostActive, VacuumCostBalance, VacuumCostPageMiss, and zero_damaged_pages.

Referenced by StartReadBuffersImpl(), and WaitReadBuffers().

◆ AtEOXact_Buffers()

void AtEOXact_Buffers ( bool  isCommit)

Definition at line 4199 of file bufmgr.c.

4200{
4202
4204
4206}
static void CheckForBufferLeaks(void)
Definition bufmgr.c:4263
static int32 PrivateRefCountOverflowed
Definition bufmgr.c:266
void AtEOXact_LocalBuffers(bool isCommit)
Definition localbuf.c:1019

References Assert, AtEOXact_LocalBuffers(), CheckForBufferLeaks(), fb(), and PrivateRefCountOverflowed.

Referenced by AbortTransaction(), AutoVacLauncherMain(), BackgroundWriterMain(), CheckpointerMain(), CommitTransaction(), PrepareTransaction(), and WalWriterMain().

◆ AtProcExit_Buffers()

static void AtProcExit_Buffers ( int  code,
Datum  arg 
)
static

Definition at line 4245 of file bufmgr.c.

4246{
4247 UnlockBuffers();
4248
4250
4251 /* localbuf.c needs a chance too */
4253}
void UnlockBuffers(void)
Definition bufmgr.c:5852
void AtProcExit_LocalBuffers(void)
Definition localbuf.c:1030

References AtProcExit_LocalBuffers(), CheckForBufferLeaks(), and UnlockBuffers().

Referenced by InitBufferManagerAccess().

◆ BgBufferSync()

bool BgBufferSync ( WritebackContext wb_context)

Definition at line 3831 of file bufmgr.c.

3832{
3833 /* info obtained from freelist.c */
3834 int strategy_buf_id;
3837
3838 /*
3839 * Information saved between calls so we can determine the strategy
3840 * point's advance rate and avoid scanning already-cleaned buffers.
3841 */
3842 static bool saved_info_valid = false;
3843 static int prev_strategy_buf_id;
3845 static int next_to_clean;
3846 static uint32 next_passes;
3847
3848 /* Moving averages of allocation rate and clean-buffer density */
3849 static float smoothed_alloc = 0;
3850 static float smoothed_density = 10.0;
3851
3852 /* Potentially these could be tunables, but for now, not */
3853 float smoothing_samples = 16;
3854 float scan_whole_pool_milliseconds = 120000.0;
3855
3856 /* Used to compute how far we scan ahead */
3857 long strategy_delta;
3858 int bufs_to_lap;
3859 int bufs_ahead;
3860 float scans_per_alloc;
3863 int min_scan_buffers;
3864
3865 /* Variables for the scanning loop proper */
3866 int num_to_scan;
3867 int num_written;
3868 int reusable_buffers;
3869
3870 /* Variables for final smoothed_density update */
3871 long new_strategy_delta;
3873
3874 /*
3875 * Find out where the clock-sweep currently is, and how many buffer
3876 * allocations have happened since our last call.
3877 */
3879
3880 /* Report buffer alloc counts to pgstat */
3882
3883 /*
3884 * If we're not running the LRU scan, just stop after doing the stats
3885 * stuff. We mark the saved state invalid so that we can recover sanely
3886 * if LRU scan is turned back on later.
3887 */
3888 if (bgwriter_lru_maxpages <= 0)
3889 {
3890 saved_info_valid = false;
3891 return true;
3892 }
3893
3894 /*
3895 * Compute strategy_delta = how many buffers have been scanned by the
3896 * clock-sweep since last time. If first time through, assume none. Then
3897 * see if we are still ahead of the clock-sweep, and if so, how many
3898 * buffers we could scan before we'd catch up with it and "lap" it. Note:
3899 * weird-looking coding of xxx_passes comparisons are to avoid bogus
3900 * behavior when the passes counts wrap around.
3901 */
3902 if (saved_info_valid)
3903 {
3905
3908
3909 Assert(strategy_delta >= 0);
3910
3911 if ((int32) (next_passes - strategy_passes) > 0)
3912 {
3913 /* we're one pass ahead of the strategy point */
3915#ifdef BGW_DEBUG
3916 elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
3920#endif
3921 }
3922 else if (next_passes == strategy_passes &&
3924 {
3925 /* on same pass, but ahead or at least not behind */
3927#ifdef BGW_DEBUG
3928 elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
3932#endif
3933 }
3934 else
3935 {
3936 /*
3937 * We're behind, so skip forward to the strategy point and start
3938 * cleaning from there.
3939 */
3940#ifdef BGW_DEBUG
3941 elog(DEBUG2, "bgwriter behind: bgw %u-%u strategy %u-%u delta=%ld",
3945#endif
3949 }
3950 }
3951 else
3952 {
3953 /*
3954 * Initializing at startup or after LRU scanning had been off. Always
3955 * start at the strategy point.
3956 */
3957#ifdef BGW_DEBUG
3958 elog(DEBUG2, "bgwriter initializing: strategy %u-%u",
3960#endif
3961 strategy_delta = 0;
3965 }
3966
3967 /* Update saved info for next time */
3970 saved_info_valid = true;
3971
3972 /*
3973 * Compute how many buffers had to be scanned for each new allocation, ie,
3974 * 1/density of reusable buffers, and track a moving average of that.
3975 *
3976 * If the strategy point didn't move, we don't update the density estimate
3977 */
3978 if (strategy_delta > 0 && recent_alloc > 0)
3979 {
3983 }
3984
3985 /*
3986 * Estimate how many reusable buffers there are between the current
3987 * strategy point and where we've scanned ahead to, based on the smoothed
3988 * density estimate.
3989 */
3992
3993 /*
3994 * Track a moving average of recent buffer allocations. Here, rather than
3995 * a true average we want a fast-attack, slow-decline behavior: we
3996 * immediately follow any increase.
3997 */
3998 if (smoothed_alloc <= (float) recent_alloc)
4000 else
4003
4004 /* Scale the estimate by a GUC to allow more aggressive tuning. */
4006
4007 /*
4008 * If recent_alloc remains at zero for many cycles, smoothed_alloc will
4009 * eventually underflow to zero, and the underflows produce annoying
4010 * kernel warnings on some platforms. Once upcoming_alloc_est has gone to
4011 * zero, there's no point in tracking smaller and smaller values of
4012 * smoothed_alloc, so just reset it to exactly zero to avoid this
4013 * syndrome. It will pop back up as soon as recent_alloc increases.
4014 */
4015 if (upcoming_alloc_est == 0)
4016 smoothed_alloc = 0;
4017
4018 /*
4019 * Even in cases where there's been little or no buffer allocation
4020 * activity, we want to make a small amount of progress through the buffer
4021 * cache so that as many reusable buffers as possible are clean after an
4022 * idle period.
4023 *
4024 * (scan_whole_pool_milliseconds / BgWriterDelay) computes how many times
4025 * the BGW will be called during the scan_whole_pool time; slice the
4026 * buffer pool into that many sections.
4027 */
4029
4031 {
4032#ifdef BGW_DEBUG
4033 elog(DEBUG2, "bgwriter: alloc_est=%d too small, using min=%d + reusable_est=%d",
4035#endif
4037 }
4038
4039 /*
4040 * Now write out dirty reusable buffers, working forward from the
4041 * next_to_clean point, until we have lapped the strategy scan, or cleaned
4042 * enough buffers to match our estimate of the next cycle's allocation
4043 * requirements, or hit the bgwriter_lru_maxpages limit.
4044 */
4045
4046 num_to_scan = bufs_to_lap;
4047 num_written = 0;
4049
4050 /* Execute the LRU scan */
4051 while (num_to_scan > 0 && reusable_buffers < upcoming_alloc_est)
4052 {
4054 wb_context);
4055
4056 if (++next_to_clean >= NBuffers)
4057 {
4058 next_to_clean = 0;
4059 next_passes++;
4060 }
4061 num_to_scan--;
4062
4063 if (sync_state & BUF_WRITTEN)
4064 {
4067 {
4069 break;
4070 }
4071 }
4072 else if (sync_state & BUF_REUSABLE)
4074 }
4075
4077
4078#ifdef BGW_DEBUG
4079 elog(DEBUG1, "bgwriter: recent_alloc=%u smoothed=%.2f delta=%ld ahead=%d density=%.2f reusable_est=%d upcoming_est=%d scanned=%d wrote=%d reusable=%d",
4082 bufs_to_lap - num_to_scan,
4085#endif
4086
4087 /*
4088 * Consider the above scan as being like a new allocation scan.
4089 * Characterize its density and update the smoothed one based on it. This
4090 * effectively halves the moving average period in cases where both the
4091 * strategy and the background writer are doing some useful scanning,
4092 * which is helpful because a long memory isn't as desirable on the
4093 * density estimates.
4094 */
4095 new_strategy_delta = bufs_to_lap - num_to_scan;
4097 if (new_strategy_delta > 0 && new_recent_alloc > 0)
4098 {
4102
4103#ifdef BGW_DEBUG
4104 elog(DEBUG2, "bgwriter: cleaner density alloc=%u scan=%ld density=%.2f new smoothed=%.2f",
4107#endif
4108 }
4109
4110 /* Return true if OK to hibernate */
4111 return (bufs_to_lap == 0 && recent_alloc == 0);
4112}
int BgWriterDelay
Definition bgwriter.c:59
#define BUF_REUSABLE
Definition bufmgr.c:85
double bgwriter_lru_multiplier
Definition bufmgr.c:191
static int SyncOneBuffer(int buf_id, bool skip_recently_used, WritebackContext *wb_context)
Definition bufmgr.c:4129
int bgwriter_lru_maxpages
Definition bufmgr.c:190
#define BUF_WRITTEN
Definition bufmgr.c:84
int32_t int32
Definition c.h:620
#define DEBUG2
Definition elog.h:30
#define DEBUG1
Definition elog.h:31
#define elog(elevel,...)
Definition elog.h:228
int StrategySyncStart(uint32 *complete_passes, uint32 *num_buf_alloc)
Definition freelist.c:331
int NBuffers
Definition globals.c:144
PgStat_BgWriterStats PendingBgWriterStats
PgStat_Counter buf_written_clean
Definition pgstat.h:246
PgStat_Counter maxwritten_clean
Definition pgstat.h:247
PgStat_Counter buf_alloc
Definition pgstat.h:248

References Assert, bgwriter_lru_maxpages, bgwriter_lru_multiplier, BgWriterDelay, PgStat_BgWriterStats::buf_alloc, BUF_REUSABLE, BUF_WRITTEN, PgStat_BgWriterStats::buf_written_clean, DEBUG1, DEBUG2, elog, fb(), PgStat_BgWriterStats::maxwritten_clean, NBuffers, PendingBgWriterStats, StrategySyncStart(), and SyncOneBuffer().

Referenced by BackgroundWriterMain().

◆ buffer_readv_complete()

static pg_attribute_always_inline PgAioResult buffer_readv_complete ( PgAioHandle ioh,
PgAioResult  prior_result,
uint8  cb_data,
bool  is_temp 
)
static

Definition at line 8676 of file bufmgr.c.

8678{
8684 uint8 error_count = 0;
8685 uint8 zeroed_count = 0;
8686 uint8 ignored_count = 0;
8688 uint64 *io_data;
8689 uint8 handle_data_len;
8690
8691 if (is_temp)
8692 {
8693 Assert(td->smgr.is_temp);
8695 }
8696 else
8697 Assert(!td->smgr.is_temp);
8698
8699 /*
8700 * Iterate over all the buffers affected by this IO and call the
8701 * per-buffer completion function for each buffer.
8702 */
8703 io_data = pgaio_io_get_handle_data(ioh, &handle_data_len);
8704 for (uint8 buf_off = 0; buf_off < handle_data_len; buf_off++)
8705 {
8707 bool failed;
8708 bool failed_verification = false;
8709 bool failed_checksum = false;
8710 bool zeroed_buffer = false;
8711 bool ignored_checksum = false;
8712
8714
8715 /*
8716 * If the entire I/O failed on a lower-level, each buffer needs to be
8717 * marked as failed. In case of a partial read, the first few buffers
8718 * may be ok.
8719 */
8720 failed =
8722 || prior_result.result <= buf_off;
8723
8724 buffer_readv_complete_one(td, buf_off, buf, cb_data, failed, is_temp,
8728 &zeroed_buffer);
8729
8730 /*
8731 * Track information about the number of different kinds of error
8732 * conditions across all pages, as there can be multiple pages failing
8733 * verification as part of one IO.
8734 */
8737 if (zeroed_buffer && zeroed_count++ == 0)
8739 if (ignored_checksum && ignored_count++ == 0)
8741 if (failed_checksum)
8743 }
8744
8745 /*
8746 * If the smgr read succeeded [partially] and page verification failed for
8747 * some of the pages, adjust the IO's result state appropriately.
8748 */
8749 if (prior_result.status != PGAIO_RS_ERROR &&
8750 (error_count > 0 || ignored_count > 0 || zeroed_count > 0))
8751 {
8753 zeroed_count > 0, ignored_count > 0,
8758 }
8759
8760 /*
8761 * For shared relations this reporting is done in
8762 * shared_buffer_readv_complete_local().
8763 */
8764 if (is_temp && checkfail_count > 0)
8767
8768 return result;
8769}
ProcNumber pgaio_io_get_owner(PgAioHandle *ioh)
Definition aio.c:355
uint64 * pgaio_io_get_handle_data(PgAioHandle *ioh, uint8 *len)
void pgaio_result_report(PgAioResult result, const PgAioTargetData *target_data, int elevel)
PgAioTargetData * pgaio_io_get_target_data(PgAioHandle *ioh)
Definition aio_target.c:73
@ PGAIO_RS_ERROR
Definition aio_types.h:84
static pg_attribute_always_inline void buffer_readv_complete_one(PgAioTargetData *td, uint8 buf_off, Buffer buffer, uint8 flags, bool failed, bool is_temp, bool *buffer_invalid, bool *failed_checksum, bool *ignored_checksum, bool *zeroed_buffer)
Definition bufmgr.c:8525
static void buffer_readv_encode_error(PgAioResult *result, bool is_temp, bool zeroed_any, bool ignored_any, uint8 error_count, uint8 zeroed_count, uint8 checkfail_count, uint8 first_error_off, uint8 first_zeroed_off, uint8 first_ignored_off)
Definition bufmgr.c:8430
uint8_t uint8
Definition c.h:622
uint32 result
ProcNumber MyProcNumber
Definition globals.c:92
static char buf[DEFAULT_XLOG_SEG_SIZE]
void pgstat_report_checksum_failures_in_db(Oid dboid, int failurecount)
RelFileLocator rlocator
Definition aio_types.h:65
struct PgAioTargetData::@131 smgr

References Assert, buf, buffer_readv_complete_one(), buffer_readv_encode_error(), BufferIsValid(), RelFileLocator::dbOid, DEBUG1, fb(), PgAioTargetData::is_temp, MyProcNumber, pgaio_io_get_handle_data(), pgaio_io_get_owner(), pgaio_io_get_target_data(), pgaio_result_report(), PGAIO_RS_ERROR, pgstat_report_checksum_failures_in_db(), result, PgAioTargetData::rlocator, and PgAioTargetData::smgr.

Referenced by local_buffer_readv_complete(), and shared_buffer_readv_complete().

◆ buffer_readv_complete_one()

static pg_attribute_always_inline void buffer_readv_complete_one ( PgAioTargetData td,
uint8  buf_off,
Buffer  buffer,
uint8  flags,
bool  failed,
bool  is_temp,
bool buffer_invalid,
bool failed_checksum,
bool ignored_checksum,
bool zeroed_buffer 
)
static

Definition at line 8525 of file bufmgr.c.

8531{
8532 BufferDesc *buf_hdr = is_temp ?
8533 GetLocalBufferDescriptor(-buffer - 1)
8534 : GetBufferDescriptor(buffer - 1);
8535 BufferTag tag = buf_hdr->tag;
8536 char *bufdata = BufferGetBlock(buffer);
8538 int piv_flags;
8539
8540 /* check that the buffer is in the expected state for a read */
8541#ifdef USE_ASSERT_CHECKING
8542 {
8544
8547 /* temp buffers don't use BM_IO_IN_PROGRESS */
8548 if (!is_temp)
8551 }
8552#endif
8553
8554 *buffer_invalid = false;
8555 *failed_checksum = false;
8556 *ignored_checksum = false;
8557 *zeroed_buffer = false;
8558
8559 /*
8560 * We ask PageIsVerified() to only log the message about checksum errors,
8561 * as the completion might be run in any backend (or IO workers). We will
8562 * report checksum errors in buffer_readv_report().
8563 */
8565
8566 /* the local zero_damaged_pages may differ from the definer's */
8569
8570 /*
8571 * If the buffers are marked for zero on error, we want to log that in
8572 * case of a checksum failure.
8573 */
8574 if (flags & READ_BUFFERS_ZERO_ON_ERROR)
8576
8577 /* Check for garbage data. */
8578 if (!failed)
8579 {
8580 /*
8581 * If the buffer is not currently pinned by this backend, e.g. because
8582 * we're completing this IO after an error, the buffer data will have
8583 * been marked as inaccessible when the buffer was unpinned. The AIO
8584 * subsystem holds a pin, but that doesn't prevent the buffer from
8585 * having been marked as inaccessible. The completion might also be
8586 * executed in a different process.
8587 */
8588#ifdef USE_VALGRIND
8589 if (!BufferIsPinned(buffer))
8591#endif
8592
8593 if (!PageIsVerified((Page) bufdata, tag.blockNum, piv_flags,
8595 {
8596 if (flags & READ_BUFFERS_ZERO_ON_ERROR)
8597 {
8598 memset(bufdata, 0, BLCKSZ);
8599 *zeroed_buffer = true;
8600 }
8601 else
8602 {
8603 *buffer_invalid = true;
8604 /* mark buffer as having failed */
8605 failed = true;
8606 }
8607 }
8608 else if (*failed_checksum)
8609 *ignored_checksum = true;
8610
8611 /* undo what we did above */
8612#ifdef USE_VALGRIND
8613 if (!BufferIsPinned(buffer))
8615#endif
8616
8617 /*
8618 * Immediately log a message about the invalid page, but only to the
8619 * server log. The reason to do so immediately is that this may be
8620 * executed in a different backend than the one that originated the
8621 * request. The reason to do so immediately is that the originator
8622 * might not process the query result immediately (because it is busy
8623 * doing another part of query processing) or at all (e.g. if it was
8624 * cancelled or errored out due to another IO also failing). The
8625 * definer of the IO will emit an ERROR or WARNING when processing the
8626 * IO's results
8627 *
8628 * To avoid duplicating the code to emit these log messages, we reuse
8629 * buffer_readv_report().
8630 */
8632 {
8633 PgAioResult result_one = {0};
8634
8639 *zeroed_buffer ? 1 : 0,
8640 *failed_checksum ? 1 : 0,
8643 }
8644 }
8645
8646 /* Terminate I/O and set BM_VALID. */
8647 set_flag_bits = failed ? BM_IO_ERROR : BM_VALID;
8648 if (is_temp)
8650 else
8651 TerminateBufferIO(buf_hdr, false, set_flag_bits, false, true);
8652
8653 /*
8654 * Call the BUFFER_READ_DONE tracepoint in the callback, even though the
8655 * callback may not be executed in the same backend that called
8656 * BUFFER_READ_START. The alternative would be to defer calling the
8657 * tracepoint to a later point (e.g. the local completion callback for
8658 * shared buffer reads), which seems even less helpful.
8659 */
8661 tag.blockNum,
8662 tag.spcOid,
8663 tag.dbOid,
8664 tag.relNumber,
8666 false);
8667}
static uint64 pg_atomic_read_u64(volatile pg_atomic_uint64 *ptr)
Definition atomics.h:467
static BufferDesc * GetLocalBufferDescriptor(uint32 id)
#define BufferIsPinned(bufnum)
Definition bufmgr.c:599
bool PageIsVerified(PageData *page, BlockNumber blkno, int flags, bool *checksum_failure_p)
Definition bufpage.c:94
#define PIV_LOG_LOG
Definition bufpage.h:500
#define PIV_ZERO_BUFFERS_ON_ERROR
Definition bufpage.h:502
PageData * Page
Definition bufpage.h:81
#define PIV_IGNORE_CHECKSUM_FAILURE
Definition bufpage.h:501
#define LOG_SERVER_ONLY
Definition elog.h:33
#define false
void TerminateLocalBufferIO(BufferDesc *bufHdr, bool clear_dirty, uint64 set_flag_bits, bool release_aio)
Definition localbuf.c:578
#define VALGRIND_MAKE_MEM_DEFINED(addr, size)
Definition memdebug.h:26
#define VALGRIND_MAKE_MEM_NOACCESS(addr, size)
Definition memdebug.h:27
#define INVALID_PROC_NUMBER
Definition procnumber.h:26
BlockNumber blockNum
RelFileNumber relNumber
ForkNumber forkNum

References Assert, buftag::blockNum, BM_DIRTY, BM_IO_ERROR, BM_IO_IN_PROGRESS, BM_TAG_VALID, BM_VALID, PrivateRefCountEntry::buffer, buffer_readv_encode_error(), BufferGetBlock(), BufferIsPinned, buftag::dbOid, fb(), buftag::forkNum, GetBufferDescriptor(), GetLocalBufferDescriptor(), INVALID_PROC_NUMBER, LOG_SERVER_ONLY, MyProcNumber, PageIsVerified(), pg_atomic_read_u64(), pgaio_result_report(), PIV_IGNORE_CHECKSUM_FAILURE, PIV_LOG_LOG, PIV_ZERO_BUFFERS_ON_ERROR, READ_BUFFERS_IGNORE_CHECKSUM_FAILURES, READ_BUFFERS_ZERO_ON_ERROR, buftag::relNumber, buftag::spcOid, TerminateBufferIO(), TerminateLocalBufferIO(), VALGRIND_MAKE_MEM_DEFINED, and VALGRIND_MAKE_MEM_NOACCESS.

Referenced by buffer_readv_complete().

◆ buffer_readv_decode_error()

static void buffer_readv_decode_error ( PgAioResult  result,
bool zeroed_any,
bool ignored_any,
uint8 zeroed_or_error_count,
uint8 checkfail_count,
uint8 first_off 
)
inlinestatic

Definition at line 8388 of file bufmgr.c.

8394{
8395 uint32 rem_error = result.error_data;
8396
8397 /* see static asserts in buffer_readv_encode_error */
8398#define READV_COUNT_BITS 7
8399#define READV_COUNT_MASK ((1 << READV_COUNT_BITS) - 1)
8400
8401 *zeroed_any = rem_error & 1;
8402 rem_error >>= 1;
8403
8404 *ignored_any = rem_error & 1;
8405 rem_error >>= 1;
8406
8409
8412
8415}
#define READV_COUNT_BITS
#define READV_COUNT_MASK

References fb(), READV_COUNT_BITS, READV_COUNT_MASK, and result.

Referenced by buffer_readv_encode_error(), buffer_readv_report(), and shared_buffer_readv_complete_local().

◆ buffer_readv_encode_error()

static void buffer_readv_encode_error ( PgAioResult result,
bool  is_temp,
bool  zeroed_any,
bool  ignored_any,
uint8  error_count,
uint8  zeroed_count,
uint8  checkfail_count,
uint8  first_error_off,
uint8  first_zeroed_off,
uint8  first_ignored_off 
)
inlinestatic

Definition at line 8430 of file bufmgr.c.

8440{
8441
8442 uint8 shift = 0;
8446
8448 "PG_IOV_MAX is bigger than reserved space for error data");
8450 "PGAIO_RESULT_ERROR_BITS is insufficient for buffer_readv");
8451
8452 /*
8453 * We only have space to encode one offset - but luckily that's good
8454 * enough. If there is an error, the error is the interesting offset, same
8455 * with a zeroed buffer vs an ignored buffer.
8456 */
8457 if (error_count > 0)
8459 else if (zeroed_count > 0)
8461 else
8463
8464 Assert(!zeroed_any || error_count == 0);
8465
8466 result->error_data = 0;
8467
8468 result->error_data |= zeroed_any << shift;
8469 shift += 1;
8470
8471 result->error_data |= ignored_any << shift;
8472 shift += 1;
8473
8474 result->error_data |= ((uint32) zeroed_or_error_count) << shift;
8475 shift += READV_COUNT_BITS;
8476
8477 result->error_data |= ((uint32) checkfail_count) << shift;
8478 shift += READV_COUNT_BITS;
8479
8480 result->error_data |= ((uint32) first_off) << shift;
8481 shift += READV_COUNT_BITS;
8482
8483 result->id = is_temp ? PGAIO_HCB_LOCAL_BUFFER_READV :
8485
8486 if (error_count > 0)
8487 result->status = PGAIO_RS_ERROR;
8488 else
8489 result->status = PGAIO_RS_WARNING;
8490
8491 /*
8492 * The encoding is complicated enough to warrant cross-checking it against
8493 * the decode function.
8494 */
8495#ifdef USE_ASSERT_CHECKING
8496 {
8497 bool zeroed_any_2,
8502
8507 &first_off_2);
8513 }
8514#endif
8515
8516#undef READV_COUNT_BITS
8517#undef READV_COUNT_MASK
8518}
#define PGAIO_RESULT_ERROR_BITS
Definition aio_types.h:98
@ PGAIO_RS_WARNING
Definition aio_types.h:83
static void buffer_readv_decode_error(PgAioResult result, bool *zeroed_any, bool *ignored_any, uint8 *zeroed_or_error_count, uint8 *checkfail_count, uint8 *first_off)
Definition bufmgr.c:8388
#define StaticAssertDecl(condition, errmessage)
Definition c.h:1008
#define PG_IOV_MAX
Definition pg_iovec.h:47

References Assert, buffer_readv_decode_error(), fb(), PG_IOV_MAX, PGAIO_HCB_LOCAL_BUFFER_READV, PGAIO_HCB_SHARED_BUFFER_READV, PGAIO_RESULT_ERROR_BITS, PGAIO_RS_ERROR, PGAIO_RS_WARNING, READV_COUNT_BITS, result, and StaticAssertDecl.

Referenced by buffer_readv_complete(), and buffer_readv_complete_one().

◆ buffer_readv_report()

static void buffer_readv_report ( PgAioResult  result,
const PgAioTargetData td,
int  elevel 
)
static

Definition at line 8779 of file bufmgr.c.

8781{
8782 int nblocks = td->smgr.nblocks;
8783 BlockNumber first = td->smgr.blockNum;
8784 BlockNumber last = first + nblocks - 1;
8787 RelPathStr rpath =
8789 bool zeroed_any,
8793 first_off;
8795 const char *msg_one,
8796 *msg_mult,
8797 *det_mult,
8798 *hint_mult;
8799
8803 &first_off);
8804
8805 /*
8806 * Treat a read that had both zeroed buffers *and* ignored checksums as a
8807 * special case, it's too irregular to be emitted the same way as the
8808 * other cases.
8809 */
8810 if (zeroed_any && ignored_any)
8811 {
8813 Assert(nblocks > 1); /* same block can't be both zeroed and ignored */
8814 Assert(result.status != PGAIO_RS_ERROR);
8816
8817 ereport(elevel,
8819 errmsg("zeroing %u page(s) and ignoring %u checksum failure(s) among blocks %u..%u of relation \"%s\"",
8820 affected_count, checkfail_count, first, last, rpath.str),
8821 affected_count > 1 ?
8822 errdetail("Block %u held the first zeroed page.",
8823 first + first_off) : 0,
8824 errhint_plural("See server log for details about the other %d invalid block.",
8825 "See server log for details about the other %d invalid blocks.",
8828 return;
8829 }
8830
8831 /*
8832 * The other messages are highly repetitive. To avoid duplicating a long
8833 * and complicated ereport(), gather the translated format strings
8834 * separately and then do one common ereport.
8835 */
8836 if (result.status == PGAIO_RS_ERROR)
8837 {
8838 Assert(!zeroed_any); /* can't have invalid pages when zeroing them */
8840 msg_one = _("invalid page in block %u of relation \"%s\"");
8841 msg_mult = _("%u invalid pages among blocks %u..%u of relation \"%s\"");
8842 det_mult = _("Block %u held the first invalid page.");
8843 hint_mult = _("See server log for the other %u invalid block(s).");
8844 }
8845 else if (zeroed_any && !ignored_any)
8846 {
8848 msg_one = _("invalid page in block %u of relation \"%s\"; zeroing out page");
8849 msg_mult = _("zeroing out %u invalid pages among blocks %u..%u of relation \"%s\"");
8850 det_mult = _("Block %u held the first zeroed page.");
8851 hint_mult = _("See server log for the other %u zeroed block(s).");
8852 }
8853 else if (!zeroed_any && ignored_any)
8854 {
8856 msg_one = _("ignoring checksum failure in block %u of relation \"%s\"");
8857 msg_mult = _("ignoring %u checksum failures among blocks %u..%u of relation \"%s\"");
8858 det_mult = _("Block %u held the first ignored page.");
8859 hint_mult = _("See server log for the other %u ignored block(s).");
8860 }
8861 else
8863
8864 ereport(elevel,
8866 affected_count == 1 ?
8867 errmsg_internal(msg_one, first + first_off, rpath.str) :
8868 errmsg_internal(msg_mult, affected_count, first, last, rpath.str),
8871}
#define pg_unreachable()
Definition c.h:367
#define _(x)
Definition elog.c:95
int int errdetail_internal(const char *fmt,...) pg_attribute_printf(1
int int int errhint_plural(const char *fmt_singular, const char *fmt_plural, unsigned long n,...) pg_attribute_printf(1
int int errmsg_internal(const char *fmt,...) pg_attribute_printf(1
int int errhint_internal(const char *fmt,...) pg_attribute_printf(1
const char * str
#define ERRCODE_DATA_CORRUPTED
int ProcNumber
Definition procnumber.h:24
#define relpathbackend(rlocator, backend, forknum)
Definition relpath.h:141
char str[REL_PATH_STR_MAXLEN+1]
Definition relpath.h:123
BlockNumber blockNum
Definition aio_types.h:66
BlockNumber nblocks
Definition aio_types.h:67
ForkNumber forkNum
Definition aio_types.h:68

References _, Assert, PgAioTargetData::blockNum, buffer_readv_decode_error(), ereport, errcode(), ERRCODE_DATA_CORRUPTED, errdetail(), errdetail_internal(), errhint_internal(), errhint_plural(), errmsg, errmsg_internal(), fb(), PgAioTargetData::forkNum, INVALID_PROC_NUMBER, PgAioTargetData::is_temp, MyProcNumber, PgAioTargetData::nblocks, pg_unreachable, PGAIO_RS_ERROR, relpathbackend, result, PgAioTargetData::rlocator, PgAioTargetData::smgr, and RelPathStr::str.

◆ buffer_stage_common()

static pg_attribute_always_inline void buffer_stage_common ( PgAioHandle ioh,
bool  is_write,
bool  is_temp 
)
static

Definition at line 8281 of file bufmgr.c.

8282{
8283 uint64 *io_data;
8284 uint8 handle_data_len;
8287
8288 io_data = pgaio_io_get_handle_data(ioh, &handle_data_len);
8289
8291
8292 /* iterate over all buffers affected by the vectored readv/writev */
8293 for (int i = 0; i < handle_data_len; i++)
8294 {
8295 Buffer buffer = (Buffer) io_data[i];
8296 BufferDesc *buf_hdr = is_temp ?
8297 GetLocalBufferDescriptor(-buffer - 1)
8298 : GetBufferDescriptor(buffer - 1);
8300
8301 /*
8302 * Check that all the buffers are actually ones that could conceivably
8303 * be done in one IO, i.e. are sequential. This is the last
8304 * buffer-aware code before IO is actually executed and confusion
8305 * about which buffers are targeted by IO can be hard to debug, making
8306 * it worth doing extra-paranoid checks.
8307 */
8308 if (i == 0)
8309 first = buf_hdr->tag;
8310 else
8311 {
8312 Assert(buf_hdr->tag.relNumber == first.relNumber);
8313 Assert(buf_hdr->tag.blockNum == first.blockNum + i);
8314 }
8315
8316 if (is_temp)
8318 else
8320
8321 /* verify the buffer is in the expected state */
8323 if (is_write)
8324 {
8327 }
8328 else
8329 {
8332 }
8333
8334 /* temp buffers don't use BM_IO_IN_PROGRESS */
8335 if (!is_temp)
8337
8339
8340 /*
8341 * Reflect that the buffer is now owned by the AIO subsystem.
8342 *
8343 * For local buffers: This can't be done just via LocalRefCount, as
8344 * one might initially think, as this backend could error out while
8345 * AIO is still in progress, releasing all the pins by the backend
8346 * itself.
8347 *
8348 * This pin is released again in TerminateBufferIO().
8349 */
8350 buf_hdr->io_wref = io_ref;
8351
8352 if (is_temp)
8353 {
8356 }
8357 else
8359
8360 /*
8361 * Ensure the content lock that prevents buffer modifications while
8362 * the buffer is being written out is not released early due to an
8363 * error.
8364 */
8365 if (is_write && !is_temp)
8366 {
8368
8369 /*
8370 * Lock is now owned by AIO subsystem.
8371 */
8372 BufferLockDisown(buffer, buf_hdr);
8373 }
8374
8375 /*
8376 * Stop tracking this buffer via the resowner - the AIO system now
8377 * keeps track.
8378 */
8379 if (!is_temp)
8381 }
8382}
static void pg_atomic_unlocked_write_u64(volatile pg_atomic_uint64 *ptr, uint64 val)
Definition atomics.h:494
#define BUF_REFCOUNT_ONE
static uint64 UnlockBufHdrExt(BufferDesc *desc, uint64 old_buf_state, uint64 set_bits, uint64 unset_bits, int refcount_change)
static void ResourceOwnerForgetBufferIO(ResourceOwner owner, Buffer buffer)
#define BUF_STATE_GET_REFCOUNT(state)
static void BufferLockDisown(Buffer buffer, BufferDesc *buf_hdr)
Definition bufmgr.c:6271
static bool BufferLockHeldByMe(BufferDesc *buf_hdr)
Definition bufmgr.c:6543
#define PG_USED_FOR_ASSERTS_ONLY
Definition c.h:249
BufferTag tag

References Assert, BM_DIRTY, BM_IO_IN_PROGRESS, BM_TAG_VALID, BM_VALID, BUF_REFCOUNT_ONE, BUF_STATE_GET_REFCOUNT, PrivateRefCountEntry::buffer, BufferLockDisown(), BufferLockHeldByMe(), CurrentResourceOwner, fb(), GetBufferDescriptor(), GetLocalBufferDescriptor(), i, LockBufHdr(), pg_atomic_read_u64(), pg_atomic_unlocked_write_u64(), PG_USED_FOR_ASSERTS_ONLY, pgaio_io_get_handle_data(), pgaio_io_get_wref(), ResourceOwnerForgetBufferIO(), and UnlockBufHdrExt().

Referenced by local_buffer_readv_stage(), and shared_buffer_readv_stage().

◆ BufferAlloc()

static pg_attribute_always_inline BufferDesc * BufferAlloc ( SMgrRelation  smgr,
char  relpersistence,
ForkNumber  forkNum,
BlockNumber  blockNum,
BufferAccessStrategy  strategy,
bool foundPtr,
IOContext  io_context 
)
inlinestatic

Definition at line 2188 of file bufmgr.c.

2192{
2193 BufferTag newTag; /* identity of requested block */
2194 uint32 newHash; /* hash value for newTag */
2195 LWLock *newPartitionLock; /* buffer partition lock for it */
2196 int existing_buf_id;
2200 uint64 set_bits = 0;
2201
2202 /* Make sure we will have room to remember the buffer pin */
2205
2206 /* create a tag so we can lookup the buffer */
2207 InitBufferTag(&newTag, &smgr->smgr_rlocator.locator, forkNum, blockNum);
2208
2209 /* determine its hash code and partition lock ID */
2212
2213 /* see if the block is in the buffer pool already */
2216 if (existing_buf_id >= 0)
2217 {
2218 BufferDesc *buf;
2219 bool valid;
2220
2221 /*
2222 * Found it. Now, pin the buffer so no one can steal it from the
2223 * buffer pool, and check to see if the correct data has been loaded
2224 * into the buffer.
2225 */
2227
2228 valid = PinBuffer(buf, strategy, false);
2229
2230 /* Can release the mapping lock as soon as we've pinned it */
2232
2233 *foundPtr = true;
2234
2235 if (!valid)
2236 {
2237 /*
2238 * We can only get here if (a) someone else is still reading in
2239 * the page, (b) a previous read attempt failed, or (c) someone
2240 * called StartReadBuffers() but not yet WaitReadBuffers().
2241 */
2242 *foundPtr = false;
2243 }
2244
2245 return buf;
2246 }
2247
2248 /*
2249 * Didn't find it in the buffer pool. We'll have to initialize a new
2250 * buffer. Remember to unlock the mapping lock while doing the work.
2251 */
2253
2254 /*
2255 * Acquire a victim buffer. Somebody else might try to do the same, we
2256 * don't hold any conflicting locks. If so we'll have to undo our work
2257 * later.
2258 */
2261
2262 /*
2263 * Try to make a hashtable entry for the buffer under its new tag. If
2264 * somebody else inserted another buffer for the tag, we'll release the
2265 * victim buffer we acquired and use the already inserted one.
2266 */
2269 if (existing_buf_id >= 0)
2270 {
2272 bool valid;
2273
2274 /*
2275 * Got a collision. Someone has already done what we were about to do.
2276 * We'll just handle this as if it were found in the buffer pool in
2277 * the first place. First, give up the buffer we were planning to
2278 * use.
2279 *
2280 * We could do this after releasing the partition lock, but then we'd
2281 * have to call ResourceOwnerEnlarge() & ReservePrivateRefCountEntry()
2282 * before acquiring the lock, for the rare case of such a collision.
2283 */
2285
2286 /* remaining code should match code at top of routine */
2287
2289
2290 valid = PinBuffer(existing_buf_hdr, strategy, false);
2291
2292 /* Can release the mapping lock as soon as we've pinned it */
2294
2295 *foundPtr = true;
2296
2297 if (!valid)
2298 {
2299 /*
2300 * We can only get here if (a) someone else is still reading in
2301 * the page, (b) a previous read attempt failed, or (c) someone
2302 * called StartReadBuffers() but not yet WaitReadBuffers().
2303 */
2304 *foundPtr = false;
2305 }
2306
2307 return existing_buf_hdr;
2308 }
2309
2310 /*
2311 * Need to lock the buffer header too in order to change its tag.
2312 */
2314
2315 /* some sanity checks while we hold the buffer header lock */
2318
2319 victim_buf_hdr->tag = newTag;
2320
2321 /*
2322 * Make sure BM_PERMANENT is set for buffers that must be written at every
2323 * checkpoint. Unlogged buffers only need to be written at shutdown
2324 * checkpoints, except for their "init" forks, which need to be treated
2325 * just like permanent relations.
2326 */
2328 if (relpersistence == RELPERSISTENCE_PERMANENT || forkNum == INIT_FORKNUM)
2330
2332 set_bits, 0, 0);
2333
2335
2336 /*
2337 * Buffer contents are currently invalid.
2338 */
2339 *foundPtr = false;
2340
2341 return victim_buf_hdr;
2342}
static void InitBufferTag(BufferTag *tag, const RelFileLocator *rlocator, ForkNumber forkNum, BlockNumber blockNum)
#define BM_PERMANENT
#define BUF_USAGECOUNT_ONE
static LWLock * BufMappingPartitionLock(uint32 hashcode)
int BufTableLookup(BufferTag *tagPtr, uint32 hashcode)
Definition buf_table.c:96
uint32 BufTableHashCode(BufferTag *tagPtr)
Definition buf_table.c:84
int BufTableInsert(BufferTag *tagPtr, uint32 hashcode, int buf_id)
Definition buf_table.c:124
static Buffer GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context)
Definition bufmgr.c:2539
static bool PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy, bool skip_if_not_valid)
Definition bufmgr.c:3272
static void ReservePrivateRefCountEntry(void)
Definition bufmgr.c:309
static void UnpinBuffer(BufferDesc *buf)
Definition bufmgr.c:3456
bool LWLockAcquire(LWLock *lock, LWLockMode mode)
Definition lwlock.c:1150
void LWLockRelease(LWLock *lock)
Definition lwlock.c:1767
@ LW_SHARED
Definition lwlock.h:105
@ LW_EXCLUSIVE
Definition lwlock.h:104
@ INIT_FORKNUM
Definition relpath.h:61
void ResourceOwnerEnlarge(ResourceOwner owner)
Definition resowner.c:449
RelFileLocator locator
RelFileLocatorBackend smgr_rlocator
Definition smgr.h:38

References Assert, BM_DIRTY, BM_IO_IN_PROGRESS, BM_PERMANENT, BM_TAG_VALID, BM_VALID, buf, BUF_STATE_GET_REFCOUNT, BUF_USAGECOUNT_ONE, BufMappingPartitionLock(), BufTableHashCode(), BufTableInsert(), BufTableLookup(), CurrentResourceOwner, fb(), GetBufferDescriptor(), GetVictimBuffer(), INIT_FORKNUM, InitBufferTag(), RelFileLocatorBackend::locator, LockBufHdr(), LW_EXCLUSIVE, LW_SHARED, LWLockAcquire(), LWLockRelease(), PinBuffer(), ReservePrivateRefCountEntry(), ResourceOwnerEnlarge(), SMgrRelationData::smgr_rlocator, UnlockBufHdrExt(), and UnpinBuffer().

Referenced by PinBufferForBlock().

◆ BufferBeginSetHintBits()

bool BufferBeginSetHintBits ( Buffer  buffer)

Definition at line 7042 of file bufmgr.c.

7043{
7046
7047 if (BufferIsLocal(buffer))
7048 {
7049 /*
7050 * NB: Will need to check if there is a write in progress, once it is
7051 * possible for writes to be done asynchronously.
7052 */
7053 return true;
7054 }
7055
7056 buf_hdr = GetBufferDescriptor(buffer - 1);
7057
7059}
#define BufferIsLocal(buffer)
Definition buf.h:37
static bool SharedBufferBeginSetHintBits(Buffer buffer, BufferDesc *buf_hdr, uint64 *lockstate)
Definition bufmgr.c:6951

References PrivateRefCountEntry::buffer, BufferIsLocal, fb(), GetBufferDescriptor(), and SharedBufferBeginSetHintBits().

Referenced by _bt_check_unique(), _bt_killitems(), _hash_kill_items(), fsm_search_avail(), fsm_vacuum_page(), gistkillitems(), and SetHintBitsExt().

◆ BufferFinishSetHintBits()

void BufferFinishSetHintBits ( Buffer  buffer,
bool  mark_dirty,
bool  buffer_std 
)

Definition at line 7070 of file bufmgr.c.

7071{
7072 if (!BufferIsLocal(buffer))
7075
7076 if (mark_dirty)
7078}
bool BufferIsLockedByMeInMode(Buffer buffer, BufferLockMode mode)
Definition bufmgr.c:3087
void MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
Definition bufmgr.c:5821
@ BUFFER_LOCK_SHARE_EXCLUSIVE
Definition bufmgr.h:217
@ BUFFER_LOCK_EXCLUSIVE
Definition bufmgr.h:222

References Assert, PrivateRefCountEntry::buffer, BUFFER_LOCK_EXCLUSIVE, BUFFER_LOCK_SHARE_EXCLUSIVE, BufferIsLocal, BufferIsLockedByMeInMode(), fb(), and MarkBufferDirtyHint().

Referenced by _bt_check_unique(), _bt_killitems(), _hash_kill_items(), fsm_search_avail(), fsm_vacuum_page(), gistkillitems(), and HeapTupleSatisfiesMVCCBatch().

◆ BufferGetBlockNumber()

BlockNumber BufferGetBlockNumber ( Buffer  buffer)

Definition at line 4446 of file bufmgr.c.

4447{
4449
4450 Assert(BufferIsPinned(buffer));
4451
4452 if (BufferIsLocal(buffer))
4453 bufHdr = GetLocalBufferDescriptor(-buffer - 1);
4454 else
4455 bufHdr = GetBufferDescriptor(buffer - 1);
4456
4457 /* pinned, so OK to read tag without spinlock */
4458 return bufHdr->tag.blockNum;
4459}

References Assert, PrivateRefCountEntry::buffer, BufferIsLocal, BufferIsPinned, fb(), GetBufferDescriptor(), and GetLocalBufferDescriptor().

Referenced by _bt_binsrch_insert(), _bt_bottomupdel_pass(), _bt_check_unique(), _bt_checkpage(), _bt_delitems_delete(), _bt_delitems_vacuum(), _bt_doinsert(), _bt_finish_split(), _bt_getroot(), _bt_insert_parent(), _bt_insertonpg(), _bt_mark_page_halfdead(), _bt_moveright(), _bt_newlevel(), _bt_pagedel(), _bt_readpage(), _bt_relandgetbuf(), _bt_restore_meta(), _bt_search(), _bt_simpledel_pass(), _bt_split(), _bt_unlink_halfdead_page(), _hash_addovflpage(), _hash_checkpage(), _hash_doinsert(), _hash_first(), _hash_freeovflpage(), _hash_getnewbuf(), _hash_readnext(), _hash_readpage(), _hash_splitbucket(), allocNewBuffer(), AsyncReadBuffers(), BitmapHeapScanNextBlock(), blinsert(), BloomInitMetapage(), brin_doinsert(), brin_doupdate(), brin_getinsertbuffer(), brin_initialize_empty_new_buffer(), brin_page_cleanup(), brin_xlog_insert_update(), brinbuild(), brinGetTupleForHeapBlock(), btvacuumpage(), check_index_page(), CheckReadBuffersOperation(), collect_corrupt_items(), collectMatchBitmap(), createPostingTree(), dataBeginPlaceToPageLeaf(), dataPrepareDownlink(), doPickSplit(), entryPrepareDownlink(), fill_seq_fork_with_data(), ginDeletePostingPage(), ginEntryInsert(), ginFindParents(), ginFinishSplit(), ginPlaceToPage(), ginRedoDeleteListPages(), ginRedoUpdateMetapage(), gistbufferinginserttuples(), gistbuild(), gistcheckpage(), gistdeletepage(), gistformdownlink(), gistinserttuples(), gistMemorizeAllDownlinks(), gistplacetopage(), gistRelocateBuildBuffersOnSplit(), gistScanPage(), gistvacuumpage(), hash_xlog_add_ovfl_page(), heap_delete(), heap_fetch_next_buffer(), heap_hot_search_buffer(), heap_insert(), heap_multi_insert(), heap_page_prune_opt(), heap_page_would_be_all_visible(), heap_prepare_pagescan(), heap_update(), heap_xlog_confirm(), heap_xlog_lock(), heapam_index_fetch_tuple(), heapam_scan_analyze_next_block(), heapgettup(), heapgettup_pagemode(), index_compute_xid_horizon_for_tuples(), lazy_scan_heap(), lazy_scan_noprune(), lazy_scan_prune(), 
lazy_vacuum_heap_rel(), makeSublist(), moveLeafs(), moveRightIfItNeeded(), pgstathashindex(), prune_freeze_setup(), read_buffers(), read_stream_start_pending_read(), ReadBufferBI(), RelationAddBlocks(), RelationCopyStorageUsingBuffer(), RelationGetBufferForTuple(), RelationPutHeapTuple(), revmap_get_buffer(), revmap_physical_extend(), ScanSourceDatabasePgClassPage(), spgAddNodeAction(), spgbuild(), spgdoinsert(), SpGistSetLastUsedPage(), spgSplitNodeAction(), spgvacuumpage(), spgWalk(), StartReadBuffersImpl(), startScanEntry(), statapprox_heap(), terminate_brin_buildstate(), vacuumLeafPage(), verify_heapam(), visibilitymap_clear(), visibilitymap_get_status(), visibilitymap_pin(), visibilitymap_pin_ok(), and visibilitymap_set().

◆ BufferGetLSNAtomic()

XLogRecPtr BufferGetLSNAtomic ( Buffer  buffer)

Definition at line 4713 of file bufmgr.c.

4714{
4715 /* Make sure we've got a real buffer, and that we hold a pin on it. */
4716 Assert(BufferIsValid(buffer));
4717 Assert(BufferIsPinned(buffer));
4718
4719#ifdef PG_HAVE_8BYTE_SINGLE_COPY_ATOMICITY
4720 return PageGetLSN(BufferGetPage(buffer));
4721#else
4722 {
4723 char *page = BufferGetPage(buffer);
4725 XLogRecPtr lsn;
4726
4727 /*
4728 * If we don't need locking for correctness, fastpath out.
4729 */
4730 if (!XLogHintBitIsNeeded() || BufferIsLocal(buffer))
4731 return PageGetLSN(page);
4732
4733 bufHdr = GetBufferDescriptor(buffer - 1);
4735 lsn = PageGetLSN(page);
4737
4738 return lsn;
4739 }
4740#endif
4741}
static Page BufferGetPage(Buffer buffer)
Definition bufmgr.h:468
static XLogRecPtr PageGetLSN(const PageData *page)
Definition bufpage.h:410
#define XLogHintBitIsNeeded()
Definition xlog.h:123
uint64 XLogRecPtr
Definition xlogdefs.h:21

References Assert, PrivateRefCountEntry::buffer, BufferGetPage(), BufferIsLocal, BufferIsPinned, BufferIsValid(), fb(), GetBufferDescriptor(), LockBufHdr(), PageGetLSN(), UnlockBufHdr(), and XLogHintBitIsNeeded.

Referenced by _bt_drop_lock_and_maybe_pin(), _bt_killitems(), gistdoinsert(), gistFindPath(), gistkillitems(), gistScanPage(), and SetHintBitsExt().

◆ BufferGetTag()

void BufferGetTag ( Buffer  buffer,
RelFileLocator rlocator,
ForkNumber forknum,
BlockNumber blknum 
)

Definition at line 4467 of file bufmgr.c.

4469{
4471
4472 /* Do the same checks as BufferGetBlockNumber. */
4473 Assert(BufferIsPinned(buffer));
4474
4475 if (BufferIsLocal(buffer))
4476 bufHdr = GetLocalBufferDescriptor(-buffer - 1);
4477 else
4478 bufHdr = GetBufferDescriptor(buffer - 1);
4479
4480 /* pinned, so OK to read tag without spinlock */
4481 *rlocator = BufTagGetRelFileLocator(&bufHdr->tag);
4482 *forknum = BufTagGetForkNum(&bufHdr->tag);
4483 *blknum = bufHdr->tag.blockNum;
4484}

References Assert, PrivateRefCountEntry::buffer, BufferIsLocal, BufferIsPinned, BufTagGetForkNum(), BufTagGetRelFileLocator(), fb(), GetBufferDescriptor(), and GetLocalBufferDescriptor().

Referenced by fsm_search_avail(), ginRedoInsertEntry(), heap_inplace_update_and_unlock(), log_newpage_buffer(), ResolveCminCmaxDuringDecoding(), and XLogRegisterBuffer().

◆ BufferIsDirty()

bool BufferIsDirty ( Buffer  buffer)

◆ BufferIsLockedByMe()

bool BufferIsLockedByMe ( Buffer  buffer)

Definition at line 3061 of file bufmgr.c.

3062{
3064
3065 Assert(BufferIsPinned(buffer));
3066
3067 if (BufferIsLocal(buffer))
3068 {
3069 /* Content locks are not maintained for local buffers. */
3070 return true;
3071 }
3072 else
3073 {
3074 bufHdr = GetBufferDescriptor(buffer - 1);
3075 return BufferLockHeldByMe(bufHdr);
3076 }
3077}

References Assert, PrivateRefCountEntry::buffer, BufferIsLocal, BufferIsPinned, BufferLockHeldByMe(), fb(), and GetBufferDescriptor().

Referenced by FlushOneBuffer().

◆ BufferIsLockedByMeInMode()

bool BufferIsLockedByMeInMode ( Buffer  buffer,
BufferLockMode  mode 
)

Definition at line 3087 of file bufmgr.c.

3088{
3090
3091 Assert(BufferIsPinned(buffer));
3092
3093 if (BufferIsLocal(buffer))
3094 {
3095 /* Content locks are not maintained for local buffers. */
3096 return true;
3097 }
3098 else
3099 {
3100 bufHdr = GetBufferDescriptor(buffer - 1);
3102 }
3103}
static bool BufferLockHeldByMeInMode(BufferDesc *buf_hdr, BufferLockMode mode)
Definition bufmgr.c:6525
static PgChecksumMode mode

References Assert, PrivateRefCountEntry::buffer, BufferIsLocal, BufferIsPinned, BufferLockHeldByMeInMode(), fb(), GetBufferDescriptor(), and mode.

Referenced by BufferFinishSetHintBits(), BufferIsDirty(), heap_page_fix_vm_corruption(), HeapTupleSetHintBits(), IsBufferCleanupOK(), MarkBufferDirty(), visibilitymap_set(), and XLogRegisterBuffer().

◆ BufferIsPermanent()

bool BufferIsPermanent ( Buffer  buffer)

Definition at line 4677 of file bufmgr.c.

4678{
4680
4681 /* Local buffers are used only for temp relations. */
4682 if (BufferIsLocal(buffer))
4683 return false;
4684
4685 /* Make sure we've got a real buffer, and that we hold a pin on it. */
4686 Assert(BufferIsValid(buffer));
4687 Assert(BufferIsPinned(buffer));
4688
4689 /*
4690 * BM_PERMANENT can't be changed while we hold a pin on the buffer, so we
4691 * need not bother with the buffer header spinlock. Even if someone else
4692 * changes the buffer header state while we're doing this, the state is
4693 * changed atomically, so we'll read the old value or the new value, but
4694 * not random garbage.
4695 */
4696 bufHdr = GetBufferDescriptor(buffer - 1);
4697 return (pg_atomic_read_u64(&bufHdr->state) & BM_PERMANENT) != 0;
4698}

References Assert, BM_PERMANENT, PrivateRefCountEntry::buffer, BufferIsLocal, BufferIsPinned, BufferIsValid(), fb(), GetBufferDescriptor(), and pg_atomic_read_u64().

Referenced by SetHintBitsExt().

◆ BufferLockAcquire()

static void BufferLockAcquire ( Buffer  buffer,
BufferDesc buf_hdr,
BufferLockMode  mode 
)
inlinestatic

Definition at line 5898 of file bufmgr.c.

5899{
5900 PrivateRefCountEntry *entry;
5901 int extraWaits = 0;
5902
5903 /*
5904 * Get reference to the refcount entry before we hold the lock, it seems
5905 * better to do before holding the lock.
5906 */
5907 entry = GetPrivateRefCountEntry(buffer, true);
5908
5909 /*
5910 * We better not already hold a lock on the buffer.
5911 */
5913
5914 /*
5915 * Lock out cancel/die interrupts until we exit the code section protected
5916 * by the content lock. This ensures that interrupts will not interfere
5917 * with manipulations of data structures in shared memory.
5918 */
5920
5921 for (;;)
5922 {
5923 uint32 wait_event = 0; /* initialized to avoid compiler warning */
5924 bool mustwait;
5925
5926 /*
5927 * Try to grab the lock the first time, we're not in the waitqueue
5928 * yet/anymore.
5929 */
5931
5932 if (likely(!mustwait))
5933 {
5934 break;
5935 }
5936
5937 /*
5938 * Ok, at this point we couldn't grab the lock on the first try. We
5939 * cannot simply queue ourselves to the end of the list and wait to be
5940 * woken up because by now the lock could long have been released.
5941 * Instead add us to the queue and try to grab the lock again. If we
5942 * succeed we need to revert the queuing and be happy, otherwise we
5943 * recheck the lock. If we still couldn't grab it, we know that the
5944 * other locker will see our queue entries when releasing since they
5945 * existed before we checked for the lock.
5946 */
5947
5948 /* add to the queue */
5950
5951 /* we're now guaranteed to be woken up if necessary */
5953
5954 /* ok, grabbed the lock the second time round, need to undo queueing */
5955 if (!mustwait)
5956 {
5958 break;
5959 }
5960
5961 switch (mode)
5962 {
5965 break;
5968 break;
5969 case BUFFER_LOCK_SHARE:
5971 break;
5972 case BUFFER_LOCK_UNLOCK:
5974
5975 }
5977
5978 /*
5979 * Wait until awakened.
5980 *
5981 * It is possible that we get awakened for a reason other than being
5982 * signaled by BufferLockWakeup(). If so, loop back and wait again.
5983 * Once we've gotten the lock, re-increment the sema by the number of
5984 * additional signals received.
5985 */
5986 for (;;)
5987 {
5990 break;
5991 extraWaits++;
5992 }
5993
5995
5996 /* Retrying, allow BufferLockRelease to release waiters again. */
5998 }
5999
6000 /* Remember that we now hold this lock */
6001 entry->data.lockmode = mode;
6002
6003 /*
6004 * Fix the process wait semaphore's count for any absorbed wakeups.
6005 */
6006 while (unlikely(extraWaits-- > 0))
6008}
static uint64 pg_atomic_fetch_and_u64(volatile pg_atomic_uint64 *ptr, uint64 and_)
Definition atomics.h:551
#define BM_LOCK_WAKE_IN_PROGRESS
static bool BufferLockAttempt(BufferDesc *buf_hdr, BufferLockMode mode)
Definition bufmgr.c:6096
static void BufferLockDequeueSelf(BufferDesc *buf_hdr)
Definition bufmgr.c:6203
static void BufferLockQueueSelf(BufferDesc *buf_hdr, BufferLockMode mode)
Definition bufmgr.c:6163
static PrivateRefCountEntry * GetPrivateRefCountEntry(Buffer buffer, bool do_move)
Definition bufmgr.c:507
@ BUFFER_LOCK_SHARE
Definition bufmgr.h:212
@ BUFFER_LOCK_UNLOCK
Definition bufmgr.h:207
#define likely(x)
Definition c.h:437
@ LW_WS_NOT_WAITING
Definition lwlock.h:30
#define HOLD_INTERRUPTS()
Definition miscadmin.h:136
void PGSemaphoreUnlock(PGSemaphore sema)
Definition posix_sema.c:333
void PGSemaphoreLock(PGSemaphore sema)
Definition posix_sema.c:313
PGPROC * MyProc
Definition proc.c:71
PGSemaphore sem
Definition proc.h:258
uint8 lwWaiting
Definition proc.h:283
BufferLockMode lockmode
Definition bufmgr.c:112
PrivateRefCountData data
Definition bufmgr.c:130
static void pgstat_report_wait_start(uint32 wait_event_info)
Definition wait_event.h:67
static void pgstat_report_wait_end(void)
Definition wait_event.h:83

References Assert, BM_LOCK_WAKE_IN_PROGRESS, PrivateRefCountEntry::buffer, BUFFER_LOCK_EXCLUSIVE, BUFFER_LOCK_SHARE, BUFFER_LOCK_SHARE_EXCLUSIVE, BUFFER_LOCK_UNLOCK, BufferLockAttempt(), BufferLockDequeueSelf(), BufferLockQueueSelf(), PrivateRefCountEntry::data, fb(), GetPrivateRefCountEntry(), HOLD_INTERRUPTS, likely, PrivateRefCountData::lockmode, LW_WS_NOT_WAITING, PGPROC::lwWaiting, mode, MyProc, pg_atomic_fetch_and_u64(), pg_unreachable, PGSemaphoreLock(), PGSemaphoreUnlock(), pgstat_report_wait_end(), pgstat_report_wait_start(), PGPROC::sem, and unlikely.

Referenced by FlushUnlockedBuffer(), LockBufferInternal(), and MarkDirtyUnpinnedBufferInternal().

◆ BufferLockAttempt()

static bool BufferLockAttempt ( BufferDesc buf_hdr,
BufferLockMode  mode 
)
inlinestatic

Definition at line 6096 of file bufmgr.c.

6097{
6099
6100 /*
6101 * Read once outside the loop, later iterations will get the newer value
6102 * via compare & exchange.
6103 */
6105
6106 /* loop until we've determined whether we could acquire the lock or not */
6107 while (true)
6108 {
6110 bool lock_free;
6111
6113
6115 {
6116 lock_free = (old_state & BM_LOCK_MASK) == 0;
6117 if (lock_free)
6119 }
6121 {
6123 if (lock_free)
6125 }
6126 else
6127 {
6129 if (lock_free)
6131 }
6132
6133 /*
6134 * Attempt to swap in the state we are expecting. If we didn't see
6135 * lock to be free, that's just the old value. If we saw it as free,
6136 * we'll attempt to mark it acquired. The reason that we always swap
6137 * in the value is that this doubles as a memory barrier. We could try
6138 * to be smarter and only swap in values if we saw the lock as free,
6139 * but benchmark haven't shown it as beneficial so far.
6140 *
6141 * Retry if the value changed since we last looked at it.
6142 */
6145 {
6146 if (lock_free)
6147 {
6148 /* Great! Got the lock. */
6149 return false;
6150 }
6151 else
6152 return true; /* somebody else has the lock */
6153 }
6154 }
6155
6157}
static bool pg_atomic_compare_exchange_u64(volatile pg_atomic_uint64 *ptr, uint64 *expected, uint64 newval)
Definition atomics.h:522
#define BM_LOCK_VAL_SHARED
#define BM_LOCK_VAL_EXCLUSIVE
#define BM_LOCK_MASK
#define BM_LOCK_VAL_SHARE_EXCLUSIVE

References BM_LOCK_MASK, BM_LOCK_VAL_EXCLUSIVE, BM_LOCK_VAL_SHARE_EXCLUSIVE, BM_LOCK_VAL_SHARED, BUFFER_LOCK_EXCLUSIVE, BUFFER_LOCK_SHARE_EXCLUSIVE, fb(), likely, mode, pg_atomic_compare_exchange_u64(), pg_atomic_read_u64(), and pg_unreachable.

Referenced by BufferLockAcquire(), and BufferLockConditional().

◆ BufferLockConditional()

static bool BufferLockConditional ( Buffer  buffer,
BufferDesc buf_hdr,
BufferLockMode  mode 
)
static

Definition at line 6050 of file bufmgr.c.

6051{
6052 PrivateRefCountEntry *entry = GetPrivateRefCountEntry(buffer, true);
6053 bool mustwait;
6054
6055 /*
6056 * As described above, if we're trying to lock a buffer this backend
6057 * already has locked, return false, independent of the existing and
6058 * desired lock level.
6059 */
6060 if (entry->data.lockmode != BUFFER_LOCK_UNLOCK)
6061 return false;
6062
6063 /*
6064 * Lock out cancel/die interrupts until we exit the code section protected
6065 * by the content lock. This ensures that interrupts will not interfere
6066 * with manipulations of data structures in shared memory.
6067 */
6069
6070 /* Check for the lock */
6072
6073 if (mustwait)
6074 {
6075 /* Failed to get lock, so release interrupt holdoff */
6077 }
6078 else
6079 {
6080 entry->data.lockmode = mode;
6081 }
6082
6083 return !mustwait;
6084}
#define RESUME_INTERRUPTS()
Definition miscadmin.h:138

References PrivateRefCountEntry::buffer, BUFFER_LOCK_UNLOCK, BufferLockAttempt(), PrivateRefCountEntry::data, fb(), GetPrivateRefCountEntry(), HOLD_INTERRUPTS, PrivateRefCountData::lockmode, mode, and RESUME_INTERRUPTS.

Referenced by ConditionalLockBuffer(), and GetVictimBuffer().

◆ BufferLockDequeueSelf()

static void BufferLockDequeueSelf ( BufferDesc buf_hdr)
static

Definition at line 6203 of file bufmgr.c.

6204{
6205 bool on_waitlist;
6206
6208
6210 if (on_waitlist)
6211 proclist_delete(&buf_hdr->lock_waiters, MyProcNumber, lwWaitLink);
6212
6213 if (proclist_is_empty(&buf_hdr->lock_waiters) &&
6215 {
6217 }
6218
6219 /* XXX: combine with fetch_and above? */
6221
6222 /* clear waiting state again, nice for debugging */
6223 if (on_waitlist)
6225 else
6226 {
6227 int extraWaits = 0;
6228
6229
6230 /*
 6231 * Somebody else dequeued us and either has woken us up already or
 6232 * will do so shortly. Deal with the superfluous absorption of a
 6233 * wakeup.
6234
6235 /*
6236 * Clear BM_LOCK_WAKE_IN_PROGRESS if somebody woke us before we
6237 * removed ourselves - they'll have set it.
6238 */
6240
6241 /*
6242 * Now wait for the scheduled wakeup, otherwise our ->lwWaiting would
6243 * get reset at some inconvenient point later. Most of the time this
6244 * will immediately return.
6245 */
6246 for (;;)
6247 {
6250 break;
6251 extraWaits++;
6252 }
6253
6254 /*
6255 * Fix the process wait semaphore's count for any absorbed wakeups.
6256 */
6257 while (extraWaits-- > 0)
6259 }
6260}
#define BM_LOCK_HAS_WAITERS
@ LW_WS_WAITING
Definition lwlock.h:31
#define proclist_delete(list, procno, link_member)
Definition proclist.h:187
static bool proclist_is_empty(const proclist_head *list)
Definition proclist.h:38

References BM_LOCK_HAS_WAITERS, BM_LOCK_WAKE_IN_PROGRESS, fb(), LockBufHdr(), LW_WS_NOT_WAITING, LW_WS_WAITING, PGPROC::lwWaiting, MyProc, MyProcNumber, pg_atomic_fetch_and_u64(), pg_atomic_read_u64(), PGSemaphoreLock(), PGSemaphoreUnlock(), proclist_delete, proclist_is_empty(), PGPROC::sem, and UnlockBufHdr().

Referenced by BufferLockAcquire().

◆ BufferLockDisown()

static void BufferLockDisown ( Buffer  buffer,
BufferDesc buf_hdr 
)
inlinestatic

Definition at line 6271 of file bufmgr.c.

6272{
6275}
static int BufferLockDisownInternal(Buffer buffer, BufferDesc *buf_hdr)
Definition bufmgr.c:6285

References PrivateRefCountEntry::buffer, BufferLockDisownInternal(), fb(), and RESUME_INTERRUPTS.

Referenced by buffer_stage_common().

◆ BufferLockDisownInternal()

static int BufferLockDisownInternal ( Buffer  buffer,
BufferDesc buf_hdr 
)
inlinestatic

Definition at line 6285 of file bufmgr.c.

6286{
6289
6290 ref = GetPrivateRefCountEntry(buffer, false);
6291 if (ref == NULL)
6292 elog(ERROR, "lock %d is not held", buffer);
6293 mode = ref->data.lockmode;
6294 ref->data.lockmode = BUFFER_LOCK_UNLOCK;
6295
6296 return mode;
6297}
BufferLockMode
Definition bufmgr.h:206
#define ERROR
Definition elog.h:40

References PrivateRefCountEntry::buffer, BUFFER_LOCK_UNLOCK, elog, ERROR, fb(), GetPrivateRefCountEntry(), and mode.

Referenced by BufferLockDisown(), BufferLockUnlock(), and UnlockReleaseBuffer().

◆ BufferLockHeldByMe()

static bool BufferLockHeldByMe ( BufferDesc buf_hdr)
static

Definition at line 6543 of file bufmgr.c.

6544{
6545 PrivateRefCountEntry *entry =
6547
6548 if (!entry)
6549 return false;
6550 else
6551 return entry->data.lockmode != BUFFER_LOCK_UNLOCK;
6552}
static Buffer BufferDescriptorGetBuffer(const BufferDesc *bdesc)

References BUFFER_LOCK_UNLOCK, BufferDescriptorGetBuffer(), PrivateRefCountEntry::data, fb(), GetPrivateRefCountEntry(), and PrivateRefCountData::lockmode.

Referenced by buffer_stage_common(), BufferIsLockedByMe(), and UnpinBufferNoOwner().

◆ BufferLockHeldByMeInMode()

static bool BufferLockHeldByMeInMode ( BufferDesc buf_hdr,
BufferLockMode  mode 
)
static

Definition at line 6525 of file bufmgr.c.

6526{
6527 PrivateRefCountEntry *entry =
6529
6530 if (!entry)
6531 return false;
6532 else
6533 return entry->data.lockmode == mode;
6534}

References BufferDescriptorGetBuffer(), PrivateRefCountEntry::data, fb(), GetPrivateRefCountEntry(), PrivateRefCountData::lockmode, and mode.

Referenced by BufferIsLockedByMeInMode(), FlushBuffer(), and MarkSharedBufferDirtyHint().

◆ BufferLockProcessRelease()

static void BufferLockProcessRelease ( BufferDesc buf_hdr,
BufferLockMode  mode,
uint64  lockstate 
)
static

Definition at line 6470 of file bufmgr.c.

6471{
6472 bool check_waiters = false;
6473 bool wake_exclusive = false;
6474
6475 /* nobody else can have that kind of lock */
6477
6478 /*
6479 * If we're still waiting for backends to get scheduled, don't wake them
6480 * up again. Otherwise check if we need to look through the waitqueue to
6481 * wake other backends.
6482 */
6485 {
6486 if ((lockstate & BM_LOCK_MASK) == 0)
6487 {
6488 /*
6489 * We released a lock and the lock was, in that moment, free. We
6490 * therefore can wake waiters for any kind of lock.
6491 */
6492 check_waiters = true;
6493 wake_exclusive = true;
6494 }
6496 {
6497 /*
6498 * We released the lock, but another backend still holds a lock.
6499 * We can't have released an exclusive lock, as there couldn't
6500 * have been other lock holders. If we released a share lock, no
6501 * waiters need to be woken up, as there must be other share
6502 * lockers. However, if we held a share-exclusive lock, another
6503 * backend now could acquire a share-exclusive lock.
6504 */
6505 check_waiters = true;
6506 wake_exclusive = false;
6507 }
6508 }
6509
6510 /*
6511 * As waking up waiters requires the spinlock to be acquired, only do so
6512 * if necessary.
6513 */
6514 if (check_waiters)
6516}
static void BufferLockWakeup(BufferDesc *buf_hdr, bool unlocked)
Definition bufmgr.c:6305

References Assert, BM_LOCK_HAS_WAITERS, BM_LOCK_MASK, BM_LOCK_VAL_EXCLUSIVE, BM_LOCK_WAKE_IN_PROGRESS, BUFFER_LOCK_SHARE_EXCLUSIVE, BufferLockWakeup(), fb(), and mode.

Referenced by BufferLockUnlock(), and UnlockReleaseBuffer().

◆ BufferLockQueueSelf()

static void BufferLockQueueSelf ( BufferDesc buf_hdr,
BufferLockMode  mode 
)
static

Definition at line 6163 of file bufmgr.c.

6164{
6165 /*
6166 * If we don't have a PGPROC structure, there's no way to wait. This
6167 * should never occur, since MyProc should only be null during shared
6168 * memory initialization.
6169 */
6170 if (MyProc == NULL)
6171 elog(PANIC, "cannot wait without a PGPROC structure");
6172
6174 elog(PANIC, "queueing for lock while waiting on another one");
6175
6177
6178 /* setting the flag is protected by the spinlock */
6180
6181 /*
6182 * These are currently used both for lwlocks and buffer content locks,
6183 * which is acceptable, although not pretty, because a backend can't wait
6184 * for both types of locks at the same time.
6185 */
6188
6189 proclist_push_tail(&buf_hdr->lock_waiters, MyProcNumber, lwWaitLink);
6190
6191 /* Can release the mutex now */
6193}
static uint64 pg_atomic_fetch_or_u64(volatile pg_atomic_uint64 *ptr, uint64 or_)
Definition atomics.h:560
#define PANIC
Definition elog.h:44
#define proclist_push_tail(list, procno, link_member)
Definition proclist.h:191
uint8 lwWaitMode
Definition proc.h:284

References BM_LOCK_HAS_WAITERS, elog, fb(), LockBufHdr(), LW_WS_NOT_WAITING, LW_WS_WAITING, PGPROC::lwWaiting, PGPROC::lwWaitMode, mode, MyProc, MyProcNumber, PANIC, pg_atomic_fetch_or_u64(), proclist_push_tail, and UnlockBufHdr().

Referenced by BufferLockAcquire().

◆ BufferLockReleaseSub()

static uint64 BufferLockReleaseSub ( BufferLockMode  mode)
inlinestatic

Definition at line 6441 of file bufmgr.c.

6442{
6443 /*
6444 * Turns out that a switch() leads gcc to generate sufficiently worse code
6445 * for this to show up in profiles...
6446 */
6448 return BM_LOCK_VAL_EXCLUSIVE;
6451 else
6452 {
6454 return BM_LOCK_VAL_SHARED;
6455 }
6456
6457 return 0; /* keep compiler quiet */
6458}

References Assert, BM_LOCK_VAL_EXCLUSIVE, BM_LOCK_VAL_SHARE_EXCLUSIVE, BM_LOCK_VAL_SHARED, BUFFER_LOCK_EXCLUSIVE, BUFFER_LOCK_SHARE, BUFFER_LOCK_SHARE_EXCLUSIVE, and mode.

Referenced by BufferLockUnlock(), and UnlockReleaseBuffer().

◆ BufferLockUnlock()

static void BufferLockUnlock ( Buffer  buffer,
BufferDesc buf_hdr 
)
static

Definition at line 6014 of file bufmgr.c.

6015{
6018 uint64 sub;
6019
6021
6022 /*
6023 * Release my hold on lock, after that it can immediately be acquired by
6024 * others, even if we still have to wakeup other waiters.
6025 */
6027
6029
6031
6032 /*
6033 * Now okay to allow cancel/die interrupts.
6034 */
6036}
static uint64 pg_atomic_sub_fetch_u64(volatile pg_atomic_uint64 *ptr, int64 sub_)
Definition atomics.h:578
static void BufferLockProcessRelease(BufferDesc *buf_hdr, BufferLockMode mode, uint64 lockstate)
Definition bufmgr.c:6470
static uint64 BufferLockReleaseSub(BufferLockMode mode)
Definition bufmgr.c:6441

References PrivateRefCountEntry::buffer, BufferLockDisownInternal(), BufferLockProcessRelease(), BufferLockReleaseSub(), fb(), mode, pg_atomic_sub_fetch_u64(), and RESUME_INTERRUPTS.

Referenced by FlushUnlockedBuffer(), MarkDirtyUnpinnedBufferInternal(), ResOwnerReleaseBuffer(), and UnlockBuffer().

◆ BufferLockWakeup()

static void BufferLockWakeup ( BufferDesc buf_hdr,
bool  unlocked 
)
static

Definition at line 6305 of file bufmgr.c.

6306{
6307 bool new_wake_in_progress = false;
6308 bool wake_share_exclusive = true;
6311
6313
6314 /* lock wait list while collecting backends to wake up */
6316
6317 proclist_foreach_modify(iter, &buf_hdr->lock_waiters, lwWaitLink)
6318 {
6319 PGPROC *waiter = GetPGProcByNumber(iter.cur);
6320
6321 /*
6322 * Already woke up a conflicting lock, so skip over this wait list
6323 * entry.
6324 */
6326 continue;
6328 continue;
6329
6330 proclist_delete(&buf_hdr->lock_waiters, iter.cur, lwWaitLink);
6331 proclist_push_tail(&wakeup, iter.cur, lwWaitLink);
6332
6333 /*
6334 * Prevent additional wakeups until retryer gets to run. Backends that
6335 * are just waiting for the lock to become free don't retry
6336 * automatically.
6337 */
6338 new_wake_in_progress = true;
6339
6340 /*
6341 * Signal that the process isn't on the wait list anymore. This allows
6342 * BufferLockDequeueSelf() to remove itself from the waitlist with a
6343 * proclist_delete(), rather than having to check if it has been
6344 * removed from the list.
6345 */
6346 Assert(waiter->lwWaiting == LW_WS_WAITING);
6348
6349 /*
6350 * Don't wakeup further waiters after waking a conflicting waiter.
6351 */
6352 if (waiter->lwWaitMode == BUFFER_LOCK_SHARE)
6353 {
6354 /*
6355 * Share locks conflict with exclusive locks.
6356 */
6357 wake_exclusive = false;
6358 }
6359 else if (waiter->lwWaitMode == BUFFER_LOCK_SHARE_EXCLUSIVE)
6360 {
6361 /*
6362 * Share-exclusive locks conflict with share-exclusive and
6363 * exclusive locks.
6364 */
6365 wake_exclusive = false;
6366 wake_share_exclusive = false;
6367 }
6368 else if (waiter->lwWaitMode == BUFFER_LOCK_EXCLUSIVE)
6369 {
6370 /*
6371 * Exclusive locks conflict with all other locks, there's no point
6372 * in waking up anybody else.
6373 */
6374 break;
6375 }
6376 }
6377
6379
6380 /* unset required flags, and release lock, in one fell swoop */
6381 {
6384
6386 while (true)
6387 {
6389
6390 /* compute desired flags */
6391
6394 else
6396
6397 if (proclist_is_empty(&buf_hdr->lock_waiters))
6399
6400 desired_state &= ~BM_LOCKED; /* release lock */
6401
6404 break;
6405 }
6406 }
6407
6408 /* Awaken any waiters I removed from the queue. */
6409 proclist_foreach_modify(iter, &wakeup, lwWaitLink)
6410 {
6411 PGPROC *waiter = GetPGProcByNumber(iter.cur);
6412
6413 proclist_delete(&wakeup, iter.cur, lwWaitLink);
6414
6415 /*
6416 * Guarantee that lwWaiting being unset only becomes visible once the
6417 * unlink from the list has completed. Otherwise the target backend
6418 * could be woken up for some other reason and enqueue for a new lock - if
6419 * that happens before the list unlink happens, the list would end up
6420 * being corrupted.
6421 *
6422 * The barrier pairs with the LockBufHdr() when enqueuing for another
6423 * lock.
6424 */
6426 waiter->lwWaiting = LW_WS_NOT_WAITING;
6427 PGSemaphoreUnlock(waiter->sem);
6428 }
6429}
#define pg_write_barrier()
Definition atomics.h:155
@ LW_WS_PENDING_WAKEUP
Definition lwlock.h:32
#define GetPGProcByNumber(n)
Definition proc.h:504
static void proclist_init(proclist_head *list)
Definition proclist.h:29
#define proclist_foreach_modify(iter, lhead, link_member)
Definition proclist.h:206
Definition proc.h:179
static TimestampTz wakeup[NUM_WALRCV_WAKEUPS]

References Assert, BM_LOCK_HAS_WAITERS, BM_LOCK_WAKE_IN_PROGRESS, BUFFER_LOCK_EXCLUSIVE, BUFFER_LOCK_SHARE, BUFFER_LOCK_SHARE_EXCLUSIVE, proclist_mutable_iter::cur, fb(), GetPGProcByNumber, LockBufHdr(), LW_WS_NOT_WAITING, LW_WS_PENDING_WAKEUP, LW_WS_WAITING, PGPROC::lwWaiting, PGPROC::lwWaitMode, pg_atomic_compare_exchange_u64(), pg_atomic_read_u64(), pg_write_barrier, PGSemaphoreUnlock(), proclist_delete, proclist_foreach_modify, proclist_init(), proclist_is_empty(), proclist_push_tail, PGPROC::sem, and wakeup.

Referenced by BufferLockProcessRelease().

◆ BufferSetHintBits16()

bool BufferSetHintBits16 ( uint16 ptr,
uint16  val,
Buffer  buffer 
)

Definition at line 7093 of file bufmgr.c.

7094{
7097#ifdef USE_ASSERT_CHECKING
7098 char *page;
7099
7100 /* verify that the address is on the page */
7101 page = BufferGetPage(buffer);
7102 Assert((char *) ptr >= page && (char *) ptr < (page + BLCKSZ));
7103#endif
7104
7105 if (BufferIsLocal(buffer))
7106 {
7107 *ptr = val;
7108
7109 MarkLocalBufferDirty(buffer);
7110
7111 return true;
7112 }
7113
7114 buf_hdr = GetBufferDescriptor(buffer - 1);
7115
7117 {
7118 *ptr = val;
7119
7121
7122 return true;
7123 }
7124
7125 return false;
7126}
static void MarkSharedBufferDirtyHint(Buffer buffer, BufferDesc *bufHdr, uint64 lockstate, bool buffer_std)
Definition bufmgr.c:5696
long val
Definition informix.c:689
void MarkLocalBufferDirty(Buffer buffer)
Definition localbuf.c:492

References Assert, PrivateRefCountEntry::buffer, BufferGetPage(), BufferIsLocal, fb(), GetBufferDescriptor(), MarkLocalBufferDirty(), MarkSharedBufferDirtyHint(), SharedBufferBeginSetHintBits(), and val.

Referenced by SetHintBitsExt().

◆ BufferSync()

static void BufferSync ( int  flags)
static

Definition at line 3552 of file bufmgr.c.

3553{
3555 int buf_id;
3556 int num_to_scan;
3557 int num_spaces;
3558 int num_processed;
3559 int num_written;
3561 Oid last_tsid;
3563 int i;
3564 uint64 mask = BM_DIRTY;
3566
3567 /*
3568 * Unless this is a shutdown checkpoint or we have been explicitly told,
3569 * we write only permanent, dirty buffers. But at shutdown or end of
3570 * recovery, we write all dirty buffers.
3571 */
3574 mask |= BM_PERMANENT;
3575
3576 /*
3577 * Loop over all buffers, and mark the ones that need to be written with
3578 * BM_CHECKPOINT_NEEDED. Count them as we go (num_to_scan), so that we
3579 * can estimate how much work needs to be done.
3580 *
3581 * This allows us to write only those pages that were dirty when the
3582 * checkpoint began, and not those that get dirtied while it proceeds.
3583 * Whenever a page with BM_CHECKPOINT_NEEDED is written out, either by us
3584 * later in this function, or by normal backends or the bgwriter cleaning
3585 * scan, the flag is cleared. Any buffer dirtied after this point won't
3586 * have the flag set.
3587 *
3588 * Note that if we fail to write some buffer, we may leave buffers with
3589 * BM_CHECKPOINT_NEEDED still set. This is OK since any such buffer would
3590 * certainly need to be written for the next checkpoint attempt, too.
3591 */
3592 num_to_scan = 0;
3593 for (buf_id = 0; buf_id < NBuffers; buf_id++)
3594 {
3596 uint64 set_bits = 0;
3597
3598 /*
3599 * Header spinlock is enough to examine BM_DIRTY, see comment in
3600 * SyncOneBuffer.
3601 */
3603
3604 if ((buf_state & mask) == mask)
3605 {
3606 CkptSortItem *item;
3607
3609
3610 item = &CkptBufferIds[num_to_scan++];
3611 item->buf_id = buf_id;
3612 item->tsId = bufHdr->tag.spcOid;
3613 item->relNumber = BufTagGetRelNumber(&bufHdr->tag);
3614 item->forkNum = BufTagGetForkNum(&bufHdr->tag);
3615 item->blockNum = bufHdr->tag.blockNum;
3616 }
3617
3619 set_bits, 0,
3620 0);
3621
3622 /* Check for barrier events in case NBuffers is large. */
3625 }
3626
3627 if (num_to_scan == 0)
3628 return; /* nothing to do */
3629
3631
3633
3634 /*
3635 * Sort buffers that need to be written to reduce the likelihood of random
3636 * IO. The sorting is also important for the implementation of balancing
3637 * writes between tablespaces. Without balancing writes we'd potentially
3638 * end up writing to the tablespaces one-by-one; possibly overloading the
3639 * underlying system.
3640 */
3642
3643 num_spaces = 0;
3644
3645 /*
3646 * Allocate progress status for each tablespace with buffers that need to
3647 * be flushed. This requires the to-be-flushed array to be sorted.
3648 */
3650 for (i = 0; i < num_to_scan; i++)
3651 {
3652 CkptTsStatus *s;
3653 Oid cur_tsid;
3654
3656
3657 /*
3658 * Grow array of per-tablespace status structs, every time a new
3659 * tablespace is found.
3660 */
3662 {
3663 Size sz;
3664
3665 num_spaces++;
3666
3667 /*
3668 * Not worth adding grow-by-power-of-2 logic here - even with a
3669 * few hundred tablespaces this should be fine.
3670 */
3671 sz = sizeof(CkptTsStatus) * num_spaces;
3672
3673 if (per_ts_stat == NULL)
3675 else
3677
3678 s = &per_ts_stat[num_spaces - 1];
3679 memset(s, 0, sizeof(*s));
3680 s->tsId = cur_tsid;
3681
3682 /*
3683 * The first buffer in this tablespace. As CkptBufferIds is sorted
3684 * by tablespace all (s->num_to_scan) buffers in this tablespace
3685 * will follow afterwards.
3686 */
3687 s->index = i;
3688
3689 /*
3690 * progress_slice will be determined once we know how many buffers
3691 * are in each tablespace, i.e. after this loop.
3692 */
3693
3695 }
3696 else
3697 {
3698 s = &per_ts_stat[num_spaces - 1];
3699 }
3700
3701 s->num_to_scan++;
3702
3703 /* Check for barrier events. */
3706 }
3707
3708 Assert(num_spaces > 0);
3709
3710 /*
3711 * Build a min-heap over the write-progress in the individual tablespaces,
3712 * and compute how large a portion of the total progress a single
3713 * processed buffer is.
3714 */
3717 NULL);
3718
3719 for (i = 0; i < num_spaces; i++)
3720 {
3722
3723 ts_stat->progress_slice = (float8) num_to_scan / ts_stat->num_to_scan;
3724
3726 }
3727
3729
3730 /*
3731 * Iterate through to-be-checkpointed buffers and write the ones (still)
3732 * marked with BM_CHECKPOINT_NEEDED. The writes are balanced between
3733 * tablespaces; otherwise the sorting would lead to only one tablespace
3734 * receiving writes at a time, making inefficient use of the hardware.
3735 */
3736 num_processed = 0;
3737 num_written = 0;
3738 while (!binaryheap_empty(ts_heap))
3739 {
3743
3744 buf_id = CkptBufferIds[ts_stat->index].buf_id;
3745 Assert(buf_id != -1);
3746
3747 bufHdr = GetBufferDescriptor(buf_id);
3748
3749 num_processed++;
3750
3751 /*
3752 * We don't need to acquire the lock here, because we're only looking
3753 * at a single bit. It's possible that someone else writes the buffer
3754 * and clears the flag right after we check, but that doesn't matter
3755 * since SyncOneBuffer will then do nothing. However, there is a
3756 * further race condition: it's conceivable that between the time we
3757 * examine the bit here and the time SyncOneBuffer acquires the lock,
3758 * someone else not only wrote the buffer but replaced it with another
3759 * page and dirtied it. In that improbable case, SyncOneBuffer will
3760 * write the buffer though we didn't need to. It doesn't seem worth
3761 * guarding against this, though.
3762 */
3764 {
3765 if (SyncOneBuffer(buf_id, false, &wb_context) & BUF_WRITTEN)
3766 {
3769 num_written++;
3770 }
3771 }
3772
3773 /*
3774 * Measure progress independent of actually having to flush the buffer
3775 * - otherwise writing becomes unbalanced.
3776 */
3777 ts_stat->progress += ts_stat->progress_slice;
3778 ts_stat->num_scanned++;
3779 ts_stat->index++;
3780
3781 /* Have all the buffers from the tablespace been processed? */
3782 if (ts_stat->num_scanned == ts_stat->num_to_scan)
3783 {
3785 }
3786 else
3787 {
3788 /* update heap with the new progress */
3790 }
3791
3792 /*
3793 * Sleep to throttle our I/O rate.
3794 *
3795 * (This will check for barrier events even if it doesn't sleep.)
3796 */
3797 CheckpointWriteDelay(flags, (double) num_processed / num_to_scan);
3798 }
3799
3800 /*
3801 * Issue all pending flushes. Only checkpointer calls BufferSync(), so
3802 * IOContext will always be IOCONTEXT_NORMAL.
3803 */
3805
3807 per_ts_stat = NULL;
3809
3810 /*
3811 * Update checkpoint statistics. As noted above, this doesn't include
3812 * buffers written by other backends or bgwriter scan.
3813 */
3815
3817}
void binaryheap_build(binaryheap *heap)
Definition binaryheap.c:136
void binaryheap_replace_first(binaryheap *heap, bh_node_type d)
Definition binaryheap.c:253
bh_node_type binaryheap_first(binaryheap *heap)
Definition binaryheap.c:175
bh_node_type binaryheap_remove_first(binaryheap *heap)
Definition binaryheap.c:190
void binaryheap_free(binaryheap *heap)
Definition binaryheap.c:73
void binaryheap_add_unordered(binaryheap *heap, bh_node_type d)
Definition binaryheap.c:114
binaryheap * binaryheap_allocate(int capacity, binaryheap_comparator compare, void *arg)
Definition binaryheap.c:37
#define binaryheap_empty(h)
Definition binaryheap.h:65
CkptSortItem * CkptBufferIds
Definition buf_init.c:28
static RelFileNumber BufTagGetRelNumber(const BufferTag *tag)
#define BM_CHECKPOINT_NEEDED
static int ts_ckpt_progress_comparator(Datum a, Datum b, void *arg)
Definition bufmgr.c:7655
int checkpoint_flush_after
Definition bufmgr.c:223
void WritebackContextInit(WritebackContext *context, int *max_pending)
Definition bufmgr.c:7678
void IssuePendingWritebacks(WritebackContext *wb_context, IOContext io_context)
Definition bufmgr.c:7740
double float8
Definition c.h:714
size_t Size
Definition c.h:689
void CheckpointWriteDelay(int flags, double progress)
volatile sig_atomic_t ProcSignalBarrierPending
Definition globals.c:40
void * repalloc(void *pointer, Size size)
Definition mcxt.c:1632
void pfree(void *pointer)
Definition mcxt.c:1616
void * palloc(Size size)
Definition mcxt.c:1387
PgStat_CheckpointerStats PendingCheckpointerStats
static Datum PointerGetDatum(const void *X)
Definition postgres.h:342
static Pointer DatumGetPointer(Datum X)
Definition postgres.h:332
#define InvalidOid
unsigned int Oid
void ProcessProcSignalBarrier(void)
Definition procsignal.c:503
int ckpt_bufs_written
Definition xlog.h:179
ForkNumber forkNum
RelFileNumber relNumber
BlockNumber blockNum
float8 progress_slice
Definition bufmgr.c:164
int num_to_scan
Definition bufmgr.c:167
PgStat_Counter buffers_written
Definition pgstat.h:270
CheckpointStatsData CheckpointStats
Definition xlog.c:216
#define CHECKPOINT_FLUSH_UNLOGGED
Definition xlog.h:155
#define CHECKPOINT_END_OF_RECOVERY
Definition xlog.h:152
#define CHECKPOINT_IS_SHUTDOWN
Definition xlog.h:151

References Assert, binaryheap_add_unordered(), binaryheap_allocate(), binaryheap_build(), binaryheap_empty, binaryheap_first(), binaryheap_free(), binaryheap_remove_first(), binaryheap_replace_first(), CkptSortItem::blockNum, BM_CHECKPOINT_NEEDED, BM_DIRTY, BM_PERMANENT, CkptSortItem::buf_id, BUF_WRITTEN, PgStat_CheckpointerStats::buffers_written, BufTagGetForkNum(), BufTagGetRelNumber(), CHECKPOINT_END_OF_RECOVERY, checkpoint_flush_after, CHECKPOINT_FLUSH_UNLOGGED, CHECKPOINT_IS_SHUTDOWN, CheckpointStats, CheckpointWriteDelay(), CheckpointStatsData::ckpt_bufs_written, CkptBufferIds, DatumGetPointer(), fb(), CkptSortItem::forkNum, GetBufferDescriptor(), i, CkptTsStatus::index, InvalidOid, IOCONTEXT_NORMAL, IssuePendingWritebacks(), LockBufHdr(), NBuffers, CkptTsStatus::num_to_scan, palloc(), PendingCheckpointerStats, pfree(), pg_atomic_read_u64(), PointerGetDatum(), ProcessProcSignalBarrier(), ProcSignalBarrierPending, CkptTsStatus::progress_slice, CkptSortItem::relNumber, repalloc(), SyncOneBuffer(), ts_ckpt_progress_comparator(), CkptTsStatus::tsId, CkptSortItem::tsId, UnlockBufHdrExt(), and WritebackContextInit().

Referenced by CheckPointBuffers().

◆ buffertag_comparator()

static int buffertag_comparator ( const BufferTag ba,
const BufferTag bb 
)
inlinestatic

Definition at line 7590 of file bufmgr.c.

7591{
7592 int ret;
7595
7598
7600
7601 if (ret != 0)
7602 return ret;
7603
7605 return -1;
7607 return 1;
7608
7609 if (ba->blockNum < bb->blockNum)
7610 return -1;
7611 if (ba->blockNum > bb->blockNum)
7612 return 1;
7613
7614 return 0;
7615}
static int rlocator_comparator(const void *p1, const void *p2)
Definition bufmgr.c:7491

References BufTagGetForkNum(), BufTagGetRelFileLocator(), fb(), and rlocator_comparator().

◆ CheckBufferIsPinnedOnce()

void CheckBufferIsPinnedOnce ( Buffer  buffer)

Definition at line 6637 of file bufmgr.c.

6638{
6639 if (BufferIsLocal(buffer))
6640 {
6641 if (LocalRefCount[-buffer - 1] != 1)
6642 elog(ERROR, "incorrect local pin count: %d",
6643 LocalRefCount[-buffer - 1]);
6644 }
6645 else
6646 {
6647 if (GetPrivateRefCount(buffer) != 1)
6648 elog(ERROR, "incorrect local pin count: %d",
6649 GetPrivateRefCount(buffer));
6650 }
6651}

References PrivateRefCountEntry::buffer, BufferIsLocal, elog, ERROR, GetPrivateRefCount(), and LocalRefCount.

Referenced by GetVictimBuffer(), lazy_scan_heap(), and LockBufferForCleanup().

◆ CheckForBufferLeaks()

static void CheckForBufferLeaks ( void  )
static

Definition at line 4263 of file bufmgr.c.

4264{
4265#ifdef USE_ASSERT_CHECKING
4266 int RefCountErrors = 0;
4268 int i;
4269 char *s;
4270
4271 /* check the array */
4272 for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
4273 {
4275 {
4276 res = &PrivateRefCountArray[i];
4277
4279 elog(WARNING, "buffer refcount leak: %s", s);
4280 pfree(s);
4281
4283 }
4284 }
4285
4286 /* if necessary search the hash */
4288 {
4289 refcount_iterator iter;
4290
4292 while ((res = refcount_iterate(PrivateRefCountHash, &iter)) != NULL)
4293 {
4295 elog(WARNING, "buffer refcount leak: %s", s);
4296 pfree(s);
4298 }
4299 }
4300
4301 Assert(RefCountErrors == 0);
4302#endif
4303}
#define InvalidBuffer
Definition buf.h:25
static Buffer PrivateRefCountArrayKeys[REFCOUNT_ARRAY_ENTRIES]
Definition bufmgr.c:263
static refcount_hash * PrivateRefCountHash
Definition bufmgr.c:265
char * DebugPrintBufferRefcount(Buffer buffer)
Definition bufmgr.c:4389
#define REFCOUNT_ARRAY_ENTRIES
Definition bufmgr.c:145
static struct PrivateRefCountEntry PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES]
Definition bufmgr.c:264

References Assert, PrivateRefCountEntry::buffer, DebugPrintBufferRefcount(), elog, fb(), i, InvalidBuffer, pfree(), PrivateRefCountArray, PrivateRefCountArrayKeys, PrivateRefCountHash, PrivateRefCountOverflowed, REFCOUNT_ARRAY_ENTRIES, and WARNING.

Referenced by AtEOXact_Buffers(), and AtProcExit_Buffers().

◆ CheckPointBuffers()

void CheckPointBuffers ( int  flags)

Definition at line 4432 of file bufmgr.c.

4433{
4434 BufferSync(flags);
4435}
static void BufferSync(int flags)
Definition bufmgr.c:3552

References BufferSync().

Referenced by CheckPointGuts().

◆ CheckReadBuffersOperation()

static void CheckReadBuffersOperation ( ReadBuffersOperation operation,
bool  is_complete 
)
static

Definition at line 1647 of file bufmgr.c.

1648{
1649#ifdef USE_ASSERT_CHECKING
1650 Assert(operation->nblocks_done <= operation->nblocks);
1651 Assert(!is_complete || operation->nblocks == operation->nblocks_done);
1652
1653 for (int i = 0; i < operation->nblocks; i++)
1654 {
1655 Buffer buffer = operation->buffers[i];
1656 BufferDesc *buf_hdr = BufferIsLocal(buffer) ?
1657 GetLocalBufferDescriptor(-buffer - 1) :
1658 GetBufferDescriptor(buffer - 1);
1659
1660 Assert(BufferGetBlockNumber(buffer) == operation->blocknum + i);
1662
1663 if (i < operation->nblocks_done)
1665 }
1666#endif
1667}

References Assert, BM_TAG_VALID, BM_VALID, PrivateRefCountEntry::buffer, BufferGetBlockNumber(), BufferIsLocal, fb(), GetBufferDescriptor(), GetLocalBufferDescriptor(), i, operation, and pg_atomic_read_u64().

Referenced by StartReadBuffersImpl(), and WaitReadBuffers().

◆ ckpt_buforder_comparator()

static int ckpt_buforder_comparator ( const CkptSortItem a,
const CkptSortItem b 
)
inlinestatic

Definition at line 7624 of file bufmgr.c.

7625{
7626 /* compare tablespace */
7627 if (a->tsId < b->tsId)
7628 return -1;
7629 else if (a->tsId > b->tsId)
7630 return 1;
7631 /* compare relation */
7632 if (a->relNumber < b->relNumber)
7633 return -1;
7634 else if (a->relNumber > b->relNumber)
7635 return 1;
7636 /* compare fork */
7637 else if (a->forkNum < b->forkNum)
7638 return -1;
7639 else if (a->forkNum > b->forkNum)
7640 return 1;
7641 /* compare block number */
7642 else if (a->blockNum < b->blockNum)
7643 return -1;
7644 else if (a->blockNum > b->blockNum)
7645 return 1;
7646 /* equal page IDs are unlikely, but not impossible */
7647 return 0;
7648}
int b
Definition isn.c:74
int a
Definition isn.c:73

References a, and b.

◆ ConditionalLockBuffer()

bool ConditionalLockBuffer ( Buffer  buffer)

Definition at line 6617 of file bufmgr.c.

6618{
6619 BufferDesc *buf;
6620
6621 Assert(BufferIsPinned(buffer));
6622 if (BufferIsLocal(buffer))
6623 return true; /* act as though we got it */
6624
6625 buf = GetBufferDescriptor(buffer - 1);
6626
6628}
static bool BufferLockConditional(Buffer buffer, BufferDesc *buf_hdr, BufferLockMode mode)
Definition bufmgr.c:6050

References Assert, buf, PrivateRefCountEntry::buffer, BUFFER_LOCK_EXCLUSIVE, BufferIsLocal, BufferIsPinned, BufferLockConditional(), and GetBufferDescriptor().

Referenced by _bt_conditionallockbuf(), BloomNewBuffer(), ConditionalLockBufferForCleanup(), GinNewBuffer(), gistNewBuffer(), RelationGetBufferForTuple(), spgdoinsert(), SpGistGetBuffer(), SpGistNewBuffer(), and SpGistUpdateMetaPage().

◆ ConditionalLockBufferForCleanup()

bool ConditionalLockBufferForCleanup ( Buffer  buffer)

Definition at line 6843 of file bufmgr.c.

6844{
6847 refcount;
6848
6849 Assert(BufferIsValid(buffer));
6850
6851 /* see AIO related comment in LockBufferForCleanup() */
6852
6853 if (BufferIsLocal(buffer))
6854 {
6855 refcount = LocalRefCount[-buffer - 1];
6856 /* There should be exactly one pin */
6857 Assert(refcount > 0);
6858 if (refcount != 1)
6859 return false;
6860 /* Nobody else to wait for */
6861 return true;
6862 }
6863
6864 /* There should be exactly one local pin */
6865 refcount = GetPrivateRefCount(buffer);
6866 Assert(refcount);
6867 if (refcount != 1)
6868 return false;
6869
6870 /* Try to acquire lock */
6871 if (!ConditionalLockBuffer(buffer))
6872 return false;
6873
6874 bufHdr = GetBufferDescriptor(buffer - 1);
6877
6878 Assert(refcount > 0);
6879 if (refcount == 1)
6880 {
6881 /* Successfully acquired exclusive lock with pincount 1 */
6883 return true;
6884 }
6885
6886 /* Failed, so release the lock */
6889 return false;
6890}
bool ConditionalLockBuffer(Buffer buffer)
Definition bufmgr.c:6617
static void LockBuffer(Buffer buffer, BufferLockMode mode)
Definition bufmgr.h:334

References Assert, BUF_STATE_GET_REFCOUNT, PrivateRefCountEntry::buffer, BUFFER_LOCK_UNLOCK, BufferIsLocal, BufferIsValid(), ConditionalLockBuffer(), fb(), GetBufferDescriptor(), GetPrivateRefCount(), LocalRefCount, LockBuffer(), LockBufHdr(), and UnlockBufHdr().

Referenced by _hash_finish_split(), _hash_getbuf_with_condlock_cleanup(), heap_page_prune_opt(), and lazy_scan_heap().

◆ CreateAndCopyRelationData()

void CreateAndCopyRelationData ( RelFileLocator  src_rlocator,
RelFileLocator  dst_rlocator,
bool  permanent 
)

Definition at line 5462 of file bufmgr.c.

5464{
5465 char relpersistence;
5468
5469 /* Set the relpersistence. */
5470 relpersistence = permanent ?
5472
5475
5476 /*
5477 * Create and copy all forks of the relation. During create database we
5478 * have a separate cleanup mechanism which deletes complete database
5479 * directory. Therefore, each individual relation doesn't need to be
5480 * registered for cleanup.
5481 */
5482 RelationCreateStorage(dst_rlocator, relpersistence, false);
5483
5484 /* copy main fork. */
5486 permanent);
5487
5488 /* copy those extra forks that exist */
5489 for (ForkNumber forkNum = MAIN_FORKNUM + 1;
5490 forkNum <= MAX_FORKNUM; forkNum++)
5491 {
5492 if (smgrexists(src_rel, forkNum))
5493 {
5494 smgrcreate(dst_rel, forkNum, false);
5495
5496 /*
5497 * WAL log creation if the relation is persistent, or this is the
5498 * init fork of an unlogged relation.
5499 */
5500 if (permanent || forkNum == INIT_FORKNUM)
5501 log_smgrcreate(&dst_rlocator, forkNum);
5502
5503 /* Copy a fork's data, block by block. */
5505 permanent);
5506 }
5507 }
5508}
static void RelationCopyStorageUsingBuffer(RelFileLocator srclocator, RelFileLocator dstlocator, ForkNumber forkNum, bool permanent)
Definition bufmgr.c:5348
@ MAIN_FORKNUM
Definition relpath.h:58
#define MAX_FORKNUM
Definition relpath.h:70
SMgrRelation smgropen(RelFileLocator rlocator, ProcNumber backend)
Definition smgr.c:240
void smgrcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo)
Definition smgr.c:481
bool smgrexists(SMgrRelation reln, ForkNumber forknum)
Definition smgr.c:462
SMgrRelation RelationCreateStorage(RelFileLocator rlocator, char relpersistence, bool register_delete)
Definition storage.c:122
void log_smgrcreate(const RelFileLocator *rlocator, ForkNumber forkNum)
Definition storage.c:187

References fb(), INIT_FORKNUM, INVALID_PROC_NUMBER, log_smgrcreate(), MAIN_FORKNUM, MAX_FORKNUM, RelationCopyStorageUsingBuffer(), RelationCreateStorage(), smgrcreate(), smgrexists(), and smgropen().

Referenced by CreateDatabaseUsingWalLog().

◆ DebugPrintBufferRefcount()

char * DebugPrintBufferRefcount ( Buffer  buffer)

Definition at line 4389 of file bufmgr.c.

4390{
4391 BufferDesc *buf;
4393 char *result;
4394 ProcNumber backend;
4396
4397 Assert(BufferIsValid(buffer));
4398 if (BufferIsLocal(buffer))
4399 {
4400 buf = GetLocalBufferDescriptor(-buffer - 1);
4401 loccount = LocalRefCount[-buffer - 1];
4402 backend = MyProcNumber;
4403 }
4404 else
4405 {
4406 buf = GetBufferDescriptor(buffer - 1);
4407 loccount = GetPrivateRefCount(buffer);
4408 backend = INVALID_PROC_NUMBER;
4409 }
4410
4411 /* theoretically we should lock the bufHdr here */
4412 buf_state = pg_atomic_read_u64(&buf->state);
4413
4414 result = psprintf("[%03d] (rel=%s, blockNum=%u, flags=0x%" PRIx64 ", refcount=%u %d)",
4415 buffer,
4417 BufTagGetForkNum(&buf->tag)).str,
4418 buf->tag.blockNum, buf_state & BUF_FLAG_MASK,
4420 return result;
4421}
#define BUF_FLAG_MASK
char * psprintf(const char *fmt,...)
Definition psprintf.c:43

References Assert, buf, BUF_FLAG_MASK, BUF_STATE_GET_REFCOUNT, PrivateRefCountEntry::buffer, BufferIsLocal, BufferIsValid(), BufTagGetForkNum(), BufTagGetRelFileLocator(), fb(), GetBufferDescriptor(), GetLocalBufferDescriptor(), GetPrivateRefCount(), INVALID_PROC_NUMBER, LocalRefCount, MyProcNumber, pg_atomic_read_u64(), psprintf(), relpathbackend, and result.

Referenced by buffer_call_start_io(), buffer_call_terminate_io(), CheckForBufferLeaks(), CheckForLocalBufferLeaks(), and ResOwnerPrintBuffer().

◆ DropDatabaseBuffers()

void DropDatabaseBuffers ( Oid  dbid)

Definition at line 5115 of file bufmgr.c.

5116{
5117 int i;
5118
5119 /*
5120 * We needn't consider local buffers, since by assumption the target
5121 * database isn't our own.
5122 */
5123
5124 for (i = 0; i < NBuffers; i++)
5125 {
5127
5128 /*
5129 * As in DropRelationBuffers, an unlocked precheck should be safe and
5130 * saves some cycles.
5131 */
5132 if (bufHdr->tag.dbOid != dbid)
5133 continue;
5134
5136 if (bufHdr->tag.dbOid == dbid)
5137 InvalidateBuffer(bufHdr); /* releases spinlock */
5138 else
5140 }
5141}
static void InvalidateBuffer(BufferDesc *buf)
Definition bufmgr.c:2361

References fb(), GetBufferDescriptor(), i, InvalidateBuffer(), LockBufHdr(), NBuffers, and UnlockBufHdr().

Referenced by createdb_failure_callback(), dbase_redo(), dropdb(), and movedb().

◆ DropRelationBuffers()

void DropRelationBuffers ( SMgrRelation  smgr_reln,
ForkNumber forkNum,
int  nforks,
BlockNumber firstDelBlock 
)

Definition at line 4765 of file bufmgr.c.

4767{
4768 int i;
4769 int j;
4770 RelFileLocatorBackend rlocator;
4773
4774 rlocator = smgr_reln->smgr_rlocator;
4775
4776 /* If it's a local relation, it's localbuf.c's problem. */
4777 if (RelFileLocatorBackendIsTemp(rlocator))
4778 {
4779 if (rlocator.backend == MyProcNumber)
4780 DropRelationLocalBuffers(rlocator.locator, forkNum, nforks,
4782
4783 return;
4784 }
4785
4786 /*
4787 * To remove all the pages of the specified relation forks from the buffer
4788 * pool, we need to scan the entire buffer pool but we can optimize it by
4789 * finding the buffers from BufMapping table provided we know the exact
4790 * size of each fork of the relation. The exact size is required to ensure
4791 * that we don't leave any buffer for the relation being dropped as
4792 * otherwise the background writer or checkpointer can lead to a PANIC
4793 * error while flushing buffers corresponding to files that don't exist.
4794 *
4795 * To know the exact size, we rely on the size cached for each fork by us
4796 * during recovery which limits the optimization to recovery and on
4797 * standbys but we can easily extend it once we have shared cache for
4798 * relation size.
4799 *
4800 * In recovery, we cache the value returned by the first lseek(SEEK_END)
4801 * and the future writes keep the cached value up-to-date. See
4802 * smgrextend. It is possible that the value of the first lseek is smaller
4803 * than the actual number of existing blocks in the file due to buggy
4804 * Linux kernels that might not have accounted for the recent write. But
4805 * that should be fine because there must not be any buffers after that
4806 * file size.
4807 */
4808 for (i = 0; i < nforks; i++)
4809 {
4810 /* Get the number of blocks for a relation's fork */
4812
4814 {
4816 break;
4817 }
4818
4819 /* calculate the number of blocks to be invalidated */
4821 }
4822
4823 /*
4824 * We apply the optimization iff the total number of blocks to invalidate
4825 * is below the BUF_DROP_FULL_SCAN_THRESHOLD.
4826 */
4829 {
4830 for (j = 0; j < nforks; j++)
4831 FindAndDropRelationBuffers(rlocator.locator, forkNum[j],
4833 return;
4834 }
4835
4836 for (i = 0; i < NBuffers; i++)
4837 {
4839
4840 /*
4841 * We can make this a tad faster by prechecking the buffer tag before
4842 * we attempt to lock the buffer; this saves a lot of lock
4843 * acquisitions in typical cases. It should be safe because the
4844 * caller must have AccessExclusiveLock on the relation, or some other
4845 * reason to be certain that no one is loading new pages of the rel
4846 * into the buffer pool. (Otherwise we might well miss such pages
4847 * entirely.) Therefore, while the tag might be changing while we
4848 * look at it, it can't be changing *to* a value we care about, only
4849 * *away* from such a value. So false negatives are impossible, and
4850 * false positives are safe because we'll recheck after getting the
4851 * buffer lock.
4852 *
4853 * We could check forkNum and blockNum as well as the rlocator, but
4854 * the incremental win from doing so seems small.
4855 */
4856 if (!BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator.locator))
4857 continue;
4858
4860
4861 for (j = 0; j < nforks; j++)
4862 {
4863 if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator.locator) &&
4864 BufTagGetForkNum(&bufHdr->tag) == forkNum[j] &&
4865 bufHdr->tag.blockNum >= firstDelBlock[j])
4866 {
4867 InvalidateBuffer(bufHdr); /* releases spinlock */
4868 break;
4869 }
4870 }
4871 if (j >= nforks)
4873 }
4874}
#define InvalidBlockNumber
Definition block.h:33
static bool BlockNumberIsValid(BlockNumber blockNumber)
Definition block.h:71
static bool BufTagMatchesRelFileLocator(const BufferTag *tag, const RelFileLocator *rlocator)
#define BUF_DROP_FULL_SCAN_THRESHOLD
Definition bufmgr.c:95
static void FindAndDropRelationBuffers(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber nForkBlock, BlockNumber firstDelBlock)
Definition bufmgr.c:5055
int j
Definition isn.c:78
void DropRelationLocalBuffers(RelFileLocator rlocator, ForkNumber *forkNum, int nforks, BlockNumber *firstDelBlock)
Definition localbuf.c:681
#define RelFileLocatorBackendIsTemp(rlocator)
BlockNumber smgrnblocks_cached(SMgrRelation reln, ForkNumber forknum)
Definition smgr.c:847

References RelFileLocatorBackend::backend, BlockNumberIsValid(), BUF_DROP_FULL_SCAN_THRESHOLD, BufTagGetForkNum(), BufTagMatchesRelFileLocator(), DropRelationLocalBuffers(), fb(), FindAndDropRelationBuffers(), GetBufferDescriptor(), i, InvalidateBuffer(), InvalidBlockNumber, j, RelFileLocatorBackend::locator, LockBufHdr(), MAX_FORKNUM, MyProcNumber, NBuffers, RelFileLocatorBackendIsTemp, smgrnblocks_cached(), and UnlockBufHdr().

Referenced by smgrtruncate().

◆ DropRelationsAllBuffers()

void DropRelationsAllBuffers ( SMgrRelation smgr_reln,
int  nlocators 
)

Definition at line 4885 of file bufmgr.c.

4886{
4887 int i;
4888 int n = 0;
4889 SMgrRelation *rels;
4890 BlockNumber (*block)[MAX_FORKNUM + 1];
4893 bool cached = true;
4894 bool use_bsearch;
4895
4896 if (nlocators == 0)
4897 return;
4898
4899 rels = palloc_array(SMgrRelation, nlocators); /* non-local relations */
4900
4901 /* If it's a local relation, it's localbuf.c's problem. */
4902 for (i = 0; i < nlocators; i++)
4903 {
4904 if (RelFileLocatorBackendIsTemp(smgr_reln[i]->smgr_rlocator))
4905 {
4906 if (smgr_reln[i]->smgr_rlocator.backend == MyProcNumber)
4907 DropRelationAllLocalBuffers(smgr_reln[i]->smgr_rlocator.locator);
4908 }
4909 else
4910 rels[n++] = smgr_reln[i];
4911 }
4912
4913 /*
4914 * If there are no non-local relations, then we're done. Release the
4915 * memory and return.
4916 */
4917 if (n == 0)
4918 {
4919 pfree(rels);
4920 return;
4921 }
4922
4923 /*
4924 * This is used to remember the number of blocks for all of the given
4925 * relations' forks.
4926 */
4927 block = (BlockNumber (*)[MAX_FORKNUM + 1])
4928 palloc(sizeof(BlockNumber) * n * (MAX_FORKNUM + 1));
4929
4930 /*
4931 * We can avoid scanning the entire buffer pool if we know the exact size
4932 * of each of the given relation forks. See DropRelationBuffers.
4933 */
4934 for (i = 0; i < n && cached; i++)
4935 {
4936 for (int j = 0; j <= MAX_FORKNUM; j++)
4937 {
4938 /* Get the number of blocks for a relation's fork. */
4939 block[i][j] = smgrnblocks_cached(rels[i], j);
4940
4941 /* We need only consider the relation forks that exist. */
4942 if (block[i][j] == InvalidBlockNumber)
4943 {
4944 if (!smgrexists(rels[i], j))
4945 continue;
4946 cached = false;
4947 break;
4948 }
4949
4950 /* calculate the total number of blocks to be invalidated */
4951 nBlocksToInvalidate += block[i][j];
4952 }
4953 }
4954
4955 /*
4956 * We apply the optimization iff the total number of blocks to invalidate
4957 * is below the BUF_DROP_FULL_SCAN_THRESHOLD.
4958 */
4960 {
4961 for (i = 0; i < n; i++)
4962 {
4963 for (int j = 0; j <= MAX_FORKNUM; j++)
4964 {
2965 /* ignore relation forks that don't exist */
4966 if (!BlockNumberIsValid(block[i][j]))
4967 continue;
4968
4969 /* drop all the buffers for a particular relation fork */
4970 FindAndDropRelationBuffers(rels[i]->smgr_rlocator.locator,
4971 j, block[i][j], 0);
4972 }
4973 }
4974
4975 pfree(block);
4976 pfree(rels);
4977 return;
4978 }
4979
4980 pfree(block);
4981 locators = palloc_array(RelFileLocator, n); /* non-local relations */
4982 for (i = 0; i < n; i++)
4983 locators[i] = rels[i]->smgr_rlocator.locator;
4984
4985 /*
4986 * For low number of relations to drop just use a simple walk through, to
4987 * save the bsearch overhead. The threshold to use is more a guess than
4988 * an exactly determined value, as it depends on many factors (CPU and RAM
4989 * speeds, amount of shared buffers etc.).
4990 */
4992
4993 /* sort the list of rlocators if necessary */
4994 if (use_bsearch)
4996
4997 for (i = 0; i < NBuffers; i++)
4998 {
4999 RelFileLocator *rlocator = NULL;
5001
5002 /*
5003 * As in DropRelationBuffers, an unlocked precheck should be safe and
5004 * saves some cycles.
5005 */
5006
5007 if (!use_bsearch)
5008 {
5009 int j;
5010
5011 for (j = 0; j < n; j++)
5012 {
5014 {
5015 rlocator = &locators[j];
5016 break;
5017 }
5018 }
5019 }
5020 else
5021 {
5022 RelFileLocator locator;
5023
5024 locator = BufTagGetRelFileLocator(&bufHdr->tag);
5025 rlocator = bsearch(&locator,
5026 locators, n, sizeof(RelFileLocator),
5028 }
5029
5030 /* buffer doesn't belong to any of the given relfilelocators; skip it */
5031 if (rlocator == NULL)
5032 continue;
5033
5035 if (BufTagMatchesRelFileLocator(&bufHdr->tag, rlocator))
5036 InvalidateBuffer(bufHdr); /* releases spinlock */
5037 else
5039 }
5040
5041 pfree(locators);
5042 pfree(rels);
5043}
#define RELS_BSEARCH_THRESHOLD
Definition bufmgr.c:87
#define palloc_array(type, count)
Definition fe_memutils.h:76
void DropRelationAllLocalBuffers(RelFileLocator rlocator)
Definition localbuf.c:718
#define qsort(a, b, c, d)
Definition port.h:495

References BlockNumberIsValid(), BUF_DROP_FULL_SCAN_THRESHOLD, BufTagGetRelFileLocator(), BufTagMatchesRelFileLocator(), DropRelationAllLocalBuffers(), fb(), FindAndDropRelationBuffers(), GetBufferDescriptor(), i, InvalidateBuffer(), InvalidBlockNumber, j, LockBufHdr(), MAX_FORKNUM, MyProcNumber, NBuffers, palloc(), palloc_array, pfree(), qsort, RelFileLocatorBackendIsTemp, RELS_BSEARCH_THRESHOLD, rlocator_comparator(), smgrexists(), smgrnblocks_cached(), and UnlockBufHdr().

Referenced by smgrdounlinkall().

◆ EvictAllUnpinnedBuffers()

void EvictAllUnpinnedBuffers ( int32 buffers_evicted,
int32 buffers_flushed,
int32 buffers_skipped 
)

Definition at line 7982 of file bufmgr.c.

7984{
7985 *buffers_evicted = 0;
7986 *buffers_skipped = 0;
7987 *buffers_flushed = 0;
7988
7989 for (int buf = 1; buf <= NBuffers; buf++)
7990 {
7991 BufferDesc *desc = GetBufferDescriptor(buf - 1);
7993 bool buffer_flushed;
7994
7996
7998 if (!(buf_state & BM_VALID))
7999 continue;
8000
8003
8004 LockBufHdr(desc);
8005
8007 (*buffers_evicted)++;
8008 else
8009 (*buffers_skipped)++;
8010
8011 if (buffer_flushed)
8012 (*buffers_flushed)++;
8013 }
8014}
static bool EvictUnpinnedBufferInternal(BufferDesc *desc, bool *buffer_flushed)
Definition bufmgr.c:7891
#define CHECK_FOR_INTERRUPTS()
Definition miscadmin.h:125
pg_atomic_uint64 state

References BM_VALID, buf, CHECK_FOR_INTERRUPTS, CurrentResourceOwner, EvictUnpinnedBufferInternal(), fb(), GetBufferDescriptor(), LockBufHdr(), NBuffers, pg_atomic_read_u64(), ReservePrivateRefCountEntry(), ResourceOwnerEnlarge(), and BufferDesc::state.

Referenced by pg_buffercache_evict_all().

◆ EvictRelUnpinnedBuffers()

void EvictRelUnpinnedBuffers ( Relation  rel,
int32 buffers_evicted,
int32 buffers_flushed,
int32 buffers_skipped 
)

Definition at line 8032 of file bufmgr.c.

8034{
8036
8037 *buffers_skipped = 0;
8038 *buffers_evicted = 0;
8039 *buffers_flushed = 0;
8040
8041 for (int buf = 1; buf <= NBuffers; buf++)
8042 {
8043 BufferDesc *desc = GetBufferDescriptor(buf - 1);
8045 bool buffer_flushed;
8046
8048
8049 /* An unlocked precheck should be safe and saves some cycles. */
8050 if ((buf_state & BM_VALID) == 0 ||
8052 continue;
8053
8054 /* Make sure we can pin the buffer. */
8057
8058 buf_state = LockBufHdr(desc);
8059
8060 /* recheck, could have changed without the lock */
8061 if ((buf_state & BM_VALID) == 0 ||
8063 {
8064 UnlockBufHdr(desc);
8065 continue;
8066 }
8067
8069 (*buffers_evicted)++;
8070 else
8071 (*buffers_skipped)++;
8072
8073 if (buffer_flushed)
8074 (*buffers_flushed)++;
8075 }
8076}
#define RelationUsesLocalBuffers(relation)
Definition rel.h:648
RelFileLocator rd_locator
Definition rel.h:57

References Assert, BM_VALID, buf, BufTagMatchesRelFileLocator(), CHECK_FOR_INTERRUPTS, CurrentResourceOwner, EvictUnpinnedBufferInternal(), fb(), GetBufferDescriptor(), LockBufHdr(), NBuffers, pg_atomic_read_u64(), RelationData::rd_locator, RelationUsesLocalBuffers, ReservePrivateRefCountEntry(), ResourceOwnerEnlarge(), BufferDesc::state, BufferDesc::tag, and UnlockBufHdr().

Referenced by evict_rel(), and pg_buffercache_evict_relation().

◆ EvictUnpinnedBuffer()

bool EvictUnpinnedBuffer ( Buffer  buf,
bool buffer_flushed 
)

◆ EvictUnpinnedBufferInternal()

static bool EvictUnpinnedBufferInternal ( BufferDesc desc,
bool buffer_flushed 
)
static

Definition at line 7891 of file bufmgr.c.

7892{
7894 bool result;
7895
7896 *buffer_flushed = false;
7897
7900
7901 if ((buf_state & BM_VALID) == 0)
7902 {
7903 UnlockBufHdr(desc);
7904 return false;
7905 }
7906
7907 /* Check that it's not pinned already. */
7909 {
7910 UnlockBufHdr(desc);
7911 return false;
7912 }
7913
7914 PinBuffer_Locked(desc); /* releases spinlock */
7915
7916 /* If it was dirty, try to clean it once. */
7917 if (buf_state & BM_DIRTY)
7918 {
7920 *buffer_flushed = true;
7921 }
7922
7923 /* This will return false if it becomes dirty or someone else pins it. */
7925
7926 UnpinBuffer(desc);
7927
7928 return result;
7929}
#define BM_LOCKED
static void FlushUnlockedBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object, IOContext io_context)
Definition bufmgr.c:4626
static void PinBuffer_Locked(BufferDesc *buf)
Definition bufmgr.c:3388
static bool InvalidateVictimBuffer(BufferDesc *buf_hdr)
Definition bufmgr.c:2462

References Assert, BM_DIRTY, BM_LOCKED, BM_VALID, BUF_STATE_GET_REFCOUNT, fb(), FlushUnlockedBuffer(), InvalidateVictimBuffer(), IOCONTEXT_NORMAL, IOOBJECT_RELATION, pg_atomic_read_u64(), PinBuffer_Locked(), result, BufferDesc::state, UnlockBufHdr(), and UnpinBuffer().

Referenced by EvictAllUnpinnedBuffers(), EvictRelUnpinnedBuffers(), and EvictUnpinnedBuffer().

◆ ExtendBufferedRel()

Buffer ExtendBufferedRel ( BufferManagerRelation  bmr,
ForkNumber  forkNum,
BufferAccessStrategy  strategy,
uint32  flags 
)

Definition at line 979 of file bufmgr.c.

983{
984 Buffer buf;
985 uint32 extend_by = 1;
986
987 ExtendBufferedRelBy(bmr, forkNum, strategy, flags, extend_by,
988 &buf, &extend_by);
989
990 return buf;
991}
BlockNumber ExtendBufferedRelBy(BufferManagerRelation bmr, ForkNumber fork, BufferAccessStrategy strategy, uint32 flags, uint32 extend_by, Buffer *buffers, uint32 *extended_by)
Definition bufmgr.c:1011

References buf, ExtendBufferedRelBy(), and fb().

Referenced by _bt_allocbuf(), _hash_getnewbuf(), BloomNewBuffer(), brinbuild(), brinbuildempty(), fill_seq_fork_with_data(), ginbuildempty(), GinNewBuffer(), gistbuildempty(), gistNewBuffer(), ReadBuffer_common(), revmap_physical_extend(), and SpGistNewBuffer().

◆ ExtendBufferedRelBy()

BlockNumber ExtendBufferedRelBy ( BufferManagerRelation  bmr,
ForkNumber  fork,
BufferAccessStrategy  strategy,
uint32  flags,
uint32  extend_by,
Buffer buffers,
uint32 extended_by 
)

Definition at line 1011 of file bufmgr.c.

1018{
1019 Assert((bmr.rel != NULL) != (bmr.smgr != NULL));
1020 Assert(bmr.smgr == NULL || bmr.relpersistence != '\0');
1021 Assert(extend_by > 0);
1022
1023 if (bmr.relpersistence == '\0')
1024 bmr.relpersistence = bmr.rel->rd_rel->relpersistence;
1025
1026 return ExtendBufferedRelCommon(bmr, fork, strategy, flags,
1028 buffers, extended_by);
1029}
static BlockNumber ExtendBufferedRelCommon(BufferManagerRelation bmr, ForkNumber fork, BufferAccessStrategy strategy, uint32 flags, uint32 extend_by, BlockNumber extend_upto, Buffer *buffers, uint32 *extended_by)
Definition bufmgr.c:2742

References Assert, ExtendBufferedRelCommon(), fb(), and InvalidBlockNumber.

Referenced by ExtendBufferedRel(), grow_rel(), and RelationAddBlocks().

◆ ExtendBufferedRelCommon()

static BlockNumber ExtendBufferedRelCommon ( BufferManagerRelation  bmr,
ForkNumber  fork,
BufferAccessStrategy  strategy,
uint32  flags,
uint32  extend_by,
BlockNumber  extend_upto,
Buffer buffers,
uint32 extended_by 
)
static

Definition at line 2742 of file bufmgr.c.

2750{
2752
2754 BMR_GET_SMGR(bmr)->smgr_rlocator.locator.spcOid,
2755 BMR_GET_SMGR(bmr)->smgr_rlocator.locator.dbOid,
2756 BMR_GET_SMGR(bmr)->smgr_rlocator.locator.relNumber,
2757 BMR_GET_SMGR(bmr)->smgr_rlocator.backend,
2758 extend_by);
2759
2760 if (bmr.relpersistence == RELPERSISTENCE_TEMP)
2763 buffers, &extend_by);
2764 else
2765 first_block = ExtendBufferedRelShared(bmr, fork, strategy, flags,
2767 buffers, &extend_by);
2769
2771 BMR_GET_SMGR(bmr)->smgr_rlocator.locator.spcOid,
2772 BMR_GET_SMGR(bmr)->smgr_rlocator.locator.dbOid,
2773 BMR_GET_SMGR(bmr)->smgr_rlocator.locator.relNumber,
2774 BMR_GET_SMGR(bmr)->smgr_rlocator.backend,
2775 *extended_by,
2776 first_block);
2777
2778 return first_block;
2779}
static BlockNumber ExtendBufferedRelShared(BufferManagerRelation bmr, ForkNumber fork, BufferAccessStrategy strategy, uint32 flags, uint32 extend_by, BlockNumber extend_upto, Buffer *buffers, uint32 *extended_by)
Definition bufmgr.c:2786
#define BMR_GET_SMGR(bmr)
Definition bufmgr.h:118
BlockNumber ExtendBufferedRelLocal(BufferManagerRelation bmr, ForkNumber fork, uint32 flags, uint32 extend_by, BlockNumber extend_upto, Buffer *buffers, uint32 *extended_by)
Definition localbuf.c:347

References BMR_GET_SMGR, ExtendBufferedRelLocal(), ExtendBufferedRelShared(), and fb().

Referenced by ExtendBufferedRelBy(), and ExtendBufferedRelTo().

◆ ExtendBufferedRelShared()

static BlockNumber ExtendBufferedRelShared ( BufferManagerRelation  bmr,
ForkNumber  fork,
BufferAccessStrategy  strategy,
uint32  flags,
uint32  extend_by,
BlockNumber  extend_upto,
Buffer buffers,
uint32 extended_by 
)
static

Definition at line 2786 of file bufmgr.c.

2794{
2798
2800
2801 /*
2802 * Acquire victim buffers for extension without holding extension lock.
2803 * Writing out victim buffers is the most expensive part of extending the
2804 * relation, particularly when doing so requires WAL flushes. Zeroing out
2805 * the buffers is also quite expensive, so do that before holding the
2806 * extension lock as well.
2807 *
2808 * These pages are pinned by us and not valid. While we hold the pin they
2809 * can't be acquired as victim buffers by another backend.
2810 */
2811 for (uint32 i = 0; i < extend_by; i++)
2812 {
2814
2815 buffers[i] = GetVictimBuffer(strategy, io_context);
2817
2818 /* new buffers are zero-filled */
2819 MemSet(buf_block, 0, BLCKSZ);
2820 }
2821
2822 /*
2823 * Lock relation against concurrent extensions, unless requested not to.
2824 *
2825 * We use the same extension lock for all forks. That's unnecessarily
2826 * restrictive, but currently extensions for forks don't happen often
2827 * enough to make it worth locking more granularly.
2828 *
2829 * Note that another backend might have extended the relation by the time
2830 * we get the lock.
2831 */
2832 if (!(flags & EB_SKIP_EXTENSION_LOCK))
2834
2835 /*
2836 * If requested, invalidate size cache, so that smgrnblocks asks the
2837 * kernel.
2838 */
2839 if (flags & EB_CLEAR_SIZE_CACHE)
2840 BMR_GET_SMGR(bmr)->smgr_cached_nblocks[fork] = InvalidBlockNumber;
2841
2843
2844 /*
2845 * Now that we have the accurate relation size, check if the caller wants
2846 * us to extend to only up to a specific size. If there were concurrent
2847 * extensions, we might have acquired too many buffers and need to release
2848 * them.
2849 */
2851 {
2853
2855 extend_by = 0;
2856 else if ((uint64) first_block + extend_by > extend_upto)
2858
2859 for (uint32 i = extend_by; i < orig_extend_by; i++)
2860 {
2861 BufferDesc *buf_hdr = GetBufferDescriptor(buffers[i] - 1);
2862
2864 }
2865
2866 if (extend_by == 0)
2867 {
2868 if (!(flags & EB_SKIP_EXTENSION_LOCK))
2871 return first_block;
2872 }
2873 }
2874
2875 /* Fail if relation is already at maximum possible length */
2877 ereport(ERROR,
2879 errmsg("cannot extend relation %s beyond %u blocks",
2880 relpath(BMR_GET_SMGR(bmr)->smgr_rlocator, fork).str,
2881 MaxBlockNumber)));
2882
2883 /*
2884 * Insert buffers into buffer table, mark as IO_IN_PROGRESS.
2885 *
2886 * This needs to happen before we extend the relation, because as soon as
2887 * we do, other backends can start to read in those pages.
2888 */
2889 for (uint32 i = 0; i < extend_by; i++)
2890 {
2891 Buffer victim_buf = buffers[i];
2893 BufferTag tag;
2894 uint32 hash;
2896 int existing_id;
2897
2898 /* in case we need to pin an existing buffer below */
2901
2902 InitBufferTag(&tag, &BMR_GET_SMGR(bmr)->smgr_rlocator.locator, fork,
2903 first_block + i);
2904 hash = BufTableHashCode(&tag);
2906
2908
2910
2911 /*
2912 * We get here only in the corner case where we are trying to extend
2913 * the relation but we found a pre-existing buffer. This can happen
2914 * because a prior attempt at extending the relation failed, and
2915 * because mdread doesn't complain about reads beyond EOF (when
2916 * zero_damaged_pages is ON) and so a previous attempt to read a block
2917 * beyond EOF could have left a "valid" zero-filled buffer.
2918 *
2919 * This has also been observed when relation was overwritten by
2920 * external process. Since the legitimate cases should always have
2921 * left a zero-filled buffer, complain if not PageIsNew.
2922 */
2923 if (existing_id >= 0)
2924 {
2927 bool valid;
2928
2929 /*
2930 * Pin the existing buffer before releasing the partition lock,
2931 * preventing it from being evicted.
2932 */
2933 valid = PinBuffer(existing_hdr, strategy, false);
2934
2937
2940
2941 if (valid && !PageIsNew((Page) buf_block))
2942 ereport(ERROR,
2943 (errmsg("unexpected data beyond EOF in block %u of relation \"%s\"",
2944 existing_hdr->tag.blockNum,
2945 relpath(BMR_GET_SMGR(bmr)->smgr_rlocator, fork).str)));
2946
2947 /*
2948 * We *must* do smgr[zero]extend before succeeding, else the page
2949 * will not be reserved by the kernel, and the next P_NEW call
2950 * will decide to return the same page. Clear the BM_VALID bit,
2951 * do StartSharedBufferIO() and proceed.
2952 *
2953 * Loop to handle the very small possibility that someone re-sets
2954 * BM_VALID between our clearing it and StartSharedBufferIO
2955 * inspecting it.
2956 */
2957 while (true)
2958 {
2960
2962
2964
2966 break;
2967 }
2968 }
2969 else
2970 {
2972 uint64 set_bits = 0;
2973
2975
2976 /* some sanity checks while we hold the buffer header lock */
2979
2980 victim_buf_hdr->tag = tag;
2981
2983 if (bmr.relpersistence == RELPERSISTENCE_PERMANENT || fork == INIT_FORKNUM)
2985
2987 set_bits, 0,
2988 0);
2989
2991
2992 /* XXX: could combine the locked operations in it with the above */
2994 }
2995 }
2996
2998
2999 /*
3000 * Note: if smgrzeroextend fails, we will end up with buffers that are
3001 * allocated but not marked BM_VALID. The next relation extension will
3002 * still select the same block number (because the relation didn't get any
3003 * longer on disk) and so future attempts to extend the relation will find
3004 * the same buffers (if they have not been recycled) but come right back
3005 * here to try smgrzeroextend again.
3006 *
3007 * We don't need to set checksum for all-zero pages.
3008 */
3010
3011 /*
3012 * Release the file-extension lock; it's now OK for someone else to extend
3013 * the relation some more.
3014 *
3015 * We remove IO_IN_PROGRESS after this, as waking up waiting backends can
3016 * take noticeable time.
3017 */
3018 if (!(flags & EB_SKIP_EXTENSION_LOCK))
3020
3022 io_start, 1, extend_by * BLCKSZ);
3023
3024 /* Set BM_VALID, terminate IO, and wake up any waiters */
3025 for (uint32 i = 0; i < extend_by; i++)
3026 {
3027 Buffer buf = buffers[i];
3029 bool lock = false;
3030
3031 if (flags & EB_LOCK_FIRST && i == 0)
3032 lock = true;
3033 else if (flags & EB_LOCK_TARGET)
3034 {
3036 if (first_block + i + 1 == extend_upto)
3037 lock = true;
3038 }
3039
3040 if (lock)
3042
3043 TerminateBufferIO(buf_hdr, false, BM_VALID, true, false);
3044 }
3045
3047
3049
3050 return first_block;
3051}
#define MaxBlockNumber
Definition block.h:35
#define BufHdrGetBlock(bufHdr)
Definition bufmgr.c:76
StartBufferIOResult StartSharedBufferIO(BufferDesc *buf, bool forInput, bool wait, PgAioWaitRef *io_wref)
Definition bufmgr.c:7241
void LimitAdditionalPins(uint32 *additional_pins)
Definition bufmgr.c:2724
void * Block
Definition bufmgr.h:26
@ EB_LOCK_TARGET
Definition bufmgr.h:93
@ EB_CLEAR_SIZE_CACHE
Definition bufmgr.h:90
@ EB_SKIP_EXTENSION_LOCK
Definition bufmgr.h:75
@ EB_LOCK_FIRST
Definition bufmgr.h:87
static bool PageIsNew(const PageData *page)
Definition bufpage.h:258
#define MemSet(start, val, len)
Definition c.h:1107
void LockRelationForExtension(Relation relation, LOCKMODE lockmode)
Definition lmgr.c:424
void UnlockRelationForExtension(Relation relation, LOCKMODE lockmode)
Definition lmgr.c:474
#define ExclusiveLock
Definition lockdefs.h:42
@ IOOP_EXTEND
Definition pgstat.h:318
static unsigned hash(unsigned *uv, int n)
Definition rege_dfa.c:715
#define relpath(rlocator, forknum)
Definition relpath.h:150
BlockNumber smgrnblocks(SMgrRelation reln, ForkNumber forknum)
Definition smgr.c:819
void smgrzeroextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, int nblocks, bool skipFsync)
Definition smgr.c:649
int64 shared_blks_written
Definition instrument.h:29

References Assert, BM_DIRTY, BM_PERMANENT, BM_TAG_VALID, BM_VALID, BMR_GET_SMGR, buf, BUF_STATE_GET_REFCOUNT, BUF_USAGECOUNT_ONE, BUFFER_IO_ALREADY_DONE, BUFFER_LOCK_EXCLUSIVE, BufferDescriptorGetBuffer(), BufHdrGetBlock, BufMappingPartitionLock(), BufTableHashCode(), BufTableInsert(), CurrentResourceOwner, EB_CLEAR_SIZE_CACHE, EB_LOCK_FIRST, EB_LOCK_TARGET, EB_SKIP_EXTENSION_LOCK, ereport, errcode(), errmsg, ERROR, ExclusiveLock, fb(), GetBufferDescriptor(), GetVictimBuffer(), hash(), i, INIT_FORKNUM, InitBufferTag(), InvalidBlockNumber, IOContextForStrategy(), IOOBJECT_RELATION, IOOP_EXTEND, LimitAdditionalPins(), LockBuffer(), LockBufHdr(), LockRelationForExtension(), LW_EXCLUSIVE, LWLockAcquire(), LWLockRelease(), MaxBlockNumber, MemSet, PageIsNew(), pg_atomic_fetch_and_u64(), pgBufferUsage, pgstat_count_io_op_time(), pgstat_prepare_io_time(), PinBuffer(), relpath, ReservePrivateRefCountEntry(), ResourceOwnerEnlarge(), BufferUsage::shared_blks_written, smgrnblocks(), smgrzeroextend(), StartSharedBufferIO(), str, TerminateBufferIO(), track_io_timing, UnlockBufHdrExt(), UnlockRelationForExtension(), and UnpinBuffer().

Referenced by ExtendBufferedRelCommon().

◆ ExtendBufferedRelTo()

Buffer ExtendBufferedRelTo ( BufferManagerRelation  bmr,
ForkNumber  fork,
BufferAccessStrategy  strategy,
uint32  flags,
BlockNumber  extend_to,
ReadBufferMode  mode 
)

Definition at line 1040 of file bufmgr.c.

1046{
1048 uint32 extended_by = 0;
1049 Buffer buffer = InvalidBuffer;
1050 Buffer buffers[64];
1051
1052 Assert((bmr.rel != NULL) != (bmr.smgr != NULL));
1053 Assert(bmr.smgr == NULL || bmr.relpersistence != '\0');
1055
1056 if (bmr.relpersistence == '\0')
1057 bmr.relpersistence = bmr.rel->rd_rel->relpersistence;
1058
1059 /*
1060 * If desired, create the file if it doesn't exist. If
1061 * smgr_cached_nblocks[fork] is positive then it must exist, no need for
1062 * an smgrexists call.
1063 */
1064 if ((flags & EB_CREATE_FORK_IF_NEEDED) &&
1065 (BMR_GET_SMGR(bmr)->smgr_cached_nblocks[fork] == 0 ||
1066 BMR_GET_SMGR(bmr)->smgr_cached_nblocks[fork] == InvalidBlockNumber) &&
1068 {
1070
1071 /* recheck, fork might have been created concurrently */
1074
1076 }
1077
1078 /*
1079 * If requested, invalidate size cache, so that smgrnblocks asks the
1080 * kernel.
1081 */
1082 if (flags & EB_CLEAR_SIZE_CACHE)
1083 BMR_GET_SMGR(bmr)->smgr_cached_nblocks[fork] = InvalidBlockNumber;
1084
1085 /*
1086 * Estimate how many pages we'll need to extend by. This avoids acquiring
1087 * unnecessarily many victim buffers.
1088 */
1090
1091 /*
1092 * Since no-one else can be looking at the page contents yet, there is no
1093 * difference between an exclusive lock and a cleanup-strength lock. Note
1094 * that we pass the original mode to ReadBuffer_common() below, when
1095 * falling back to reading the buffer to a concurrent relation extension.
1096 */
1098 flags |= EB_LOCK_TARGET;
1099
1100 while (current_size < extend_to)
1101 {
1102 uint32 num_pages = lengthof(buffers);
1104
1105 if ((uint64) current_size + num_pages > extend_to)
1106 num_pages = extend_to - current_size;
1107
1108 first_block = ExtendBufferedRelCommon(bmr, fork, strategy, flags,
1109 num_pages, extend_to,
1110 buffers, &extended_by);
1111
1113 Assert(num_pages != 0 || current_size >= extend_to);
1114
1115 for (uint32 i = 0; i < extended_by; i++)
1116 {
1117 if (first_block + i != extend_to - 1)
1118 ReleaseBuffer(buffers[i]);
1119 else
1120 buffer = buffers[i];
1121 }
1122 }
1123
1124 /*
1125 * It's possible that another backend concurrently extended the relation.
1126 * In that case read the buffer.
1127 *
1128 * XXX: Should we control this via a flag?
1129 */
1130 if (buffer == InvalidBuffer)
1131 {
1132 Assert(extended_by == 0);
1133 buffer = ReadBuffer_common(bmr.rel, BMR_GET_SMGR(bmr), bmr.relpersistence,
1134 fork, extend_to - 1, mode, strategy);
1135 }
1136
1137 return buffer;
1138}
static Buffer ReadBuffer_common(Relation rel, SMgrRelation smgr, char smgr_persistence, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy)
Definition bufmgr.c:1285
void ReleaseBuffer(Buffer buffer)
Definition bufmgr.c:5586
@ EB_PERFORMING_RECOVERY
Definition bufmgr.h:78
@ EB_CREATE_FORK_IF_NEEDED
Definition bufmgr.h:84
@ RBM_ZERO_AND_CLEANUP_LOCK
Definition bufmgr.h:49
@ RBM_ZERO_AND_LOCK
Definition bufmgr.h:47
#define lengthof(array)
Definition c.h:873
static int64 current_size

References Assert, BMR_GET_SMGR, PrivateRefCountEntry::buffer, current_size, EB_CLEAR_SIZE_CACHE, EB_CREATE_FORK_IF_NEEDED, EB_LOCK_TARGET, EB_PERFORMING_RECOVERY, ExclusiveLock, ExtendBufferedRelCommon(), fb(), i, InvalidBlockNumber, InvalidBuffer, lengthof, LockRelationForExtension(), mode, RBM_ZERO_AND_CLEANUP_LOCK, RBM_ZERO_AND_LOCK, ReadBuffer_common(), ReleaseBuffer(), smgrcreate(), smgrexists(), smgrnblocks(), and UnlockRelationForExtension().

Referenced by fsm_extend(), vm_extend(), and XLogReadBufferExtended().

◆ FindAndDropRelationBuffers()

static void FindAndDropRelationBuffers ( RelFileLocator  rlocator,
ForkNumber  forkNum,
BlockNumber  nForkBlock,
BlockNumber  firstDelBlock 
)
static

Definition at line 5055 of file bufmgr.c.

5058{
5059 BlockNumber curBlock;
5060
5061 for (curBlock = firstDelBlock; curBlock < nForkBlock; curBlock++)
5062 {
5063 uint32 bufHash; /* hash value for tag */
5064 BufferTag bufTag; /* identity of requested block */
5065 LWLock *bufPartitionLock; /* buffer partition lock for it */
5066 int buf_id;
5068
5069 /* create a tag so we can lookup the buffer */
5070 InitBufferTag(&bufTag, &rlocator, forkNum, curBlock);
5071
5072 /* determine its hash code and partition lock ID */
5075
5076 /* Check that it is in the buffer pool. If not, do nothing. */
5078 buf_id = BufTableLookup(&bufTag, bufHash);
5080
5081 if (buf_id < 0)
5082 continue;
5083
5084 bufHdr = GetBufferDescriptor(buf_id);
5085
5086 /*
5087 * We need to lock the buffer header and recheck if the buffer is
5088 * still associated with the same block because the buffer could be
5089 * evicted by some other backend loading blocks for a different
5090 * relation after we release lock on the BufMapping table.
5091 */
5093
5094 if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator) &&
5095 BufTagGetForkNum(&bufHdr->tag) == forkNum &&
5096 bufHdr->tag.blockNum >= firstDelBlock)
5097 InvalidateBuffer(bufHdr); /* releases spinlock */
5098 else
5100 }
5101}

References BufMappingPartitionLock(), BufTableHashCode(), BufTableLookup(), BufTagGetForkNum(), BufTagMatchesRelFileLocator(), fb(), GetBufferDescriptor(), InitBufferTag(), InvalidateBuffer(), LockBufHdr(), LW_SHARED, LWLockAcquire(), LWLockRelease(), and UnlockBufHdr().

Referenced by DropRelationBuffers(), and DropRelationsAllBuffers().

◆ FlushBuffer()

static void FlushBuffer ( BufferDesc buf,
SMgrRelation  reln,
IOObject  io_object,
IOContext  io_context 
)
static

Definition at line 4503 of file bufmgr.c.

4505{
4507 ErrorContextCallback errcallback;
4510
4513
4514 /*
4515 * Try to start an I/O operation. If StartBufferIO returns false, then
4516 * someone else flushed the buffer before we could, so we need not do
4517 * anything.
4518 */
4519 if (StartSharedBufferIO(buf, false, true, NULL) == BUFFER_IO_ALREADY_DONE)
4520 return;
4521
4522 /* Setup error traceback support for ereport() */
4524 errcallback.arg = buf;
4525 errcallback.previous = error_context_stack;
4526 error_context_stack = &errcallback;
4527
4528 /* Find smgr relation for buffer */
4529 if (reln == NULL)
4531
4533 buf->tag.blockNum,
4534 reln->smgr_rlocator.locator.spcOid,
4535 reln->smgr_rlocator.locator.dbOid,
4536 reln->smgr_rlocator.locator.relNumber);
4537
4538 /*
4539 * As we hold at least a share-exclusive lock on the buffer, the LSN
4540 * cannot change during the flush (and thus can't be torn).
4541 */
4543
4544 /*
4545 * Force XLOG flush up to buffer's LSN. This implements the basic WAL
4546 * rule that log updates must hit disk before any of the data-file changes
4547 * they describe do.
4548 *
4549 * However, this rule does not apply to unlogged relations, which will be
4550 * lost after a crash anyway. Most unlogged relation pages do not bear
4551 * LSNs since we never emit WAL records for them, and therefore flushing
4552 * up through the buffer LSN would be useless, but harmless. However,
4553 * some index AMs use LSNs internally to detect concurrent page
4554 * modifications, and therefore unlogged index pages bear "fake" LSNs
4555 * generated by XLogGetFakeLSN. It is unlikely but possible that the fake
4556 * LSN counter could advance past the WAL insertion point; and if it did
4557 * happen, attempting to flush WAL through that location would fail, with
4558 * disastrous system-wide consequences. To make sure that can't happen,
4559 * skip the flush if the buffer isn't permanent.
4560 */
4561 if (pg_atomic_read_u64(&buf->state) & BM_PERMANENT)
4563
4564 /*
4565 * Now it's safe to write the buffer to disk. Note that no one else should
4566 * have been able to write it, while we were busy with log flushing,
4567 * because we got the exclusive right to perform I/O by setting the
4568 * BM_IO_IN_PROGRESS bit.
4569 */
4571
4572 /* Update page checksum if desired. */
4573 PageSetChecksum((Page) bufBlock, buf->tag.blockNum);
4574
4576
4578 BufTagGetForkNum(&buf->tag),
4579 buf->tag.blockNum,
4580 bufBlock,
4581 false);
4582
4583 /*
4584 * When a strategy is in use, only flushes of dirty buffers already in the
4585 * strategy ring are counted as strategy writes (IOCONTEXT
4586 * [BULKREAD|BULKWRITE|VACUUM] IOOP_WRITE) for the purpose of IO
4587 * statistics tracking.
4588 *
4589 * If a shared buffer initially added to the ring must be flushed before
4590 * being used, this is counted as an IOCONTEXT_NORMAL IOOP_WRITE.
4591 *
4592 * If a shared buffer which was added to the ring later because the
4593 * current strategy buffer is pinned or in use or because all strategy
4594 * buffers were dirty and rejected (for BAS_BULKREAD operations only)
4595 * requires flushing, this is counted as an IOCONTEXT_NORMAL IOOP_WRITE
4596 * (from_ring will be false).
4597 *
4598 * When a strategy is not in use, the write can only be a "regular" write
4599 * of a dirty shared buffer (IOCONTEXT_NORMAL IOOP_WRITE).
4600 */
4603
4605
4606 /*
4607 * Mark the buffer as clean and end the BM_IO_IN_PROGRESS state.
4608 */
4609 TerminateBufferIO(buf, true, 0, true, false);
4610
4612 buf->tag.blockNum,
4613 reln->smgr_rlocator.locator.spcOid,
4614 reln->smgr_rlocator.locator.dbOid,
4615 reln->smgr_rlocator.locator.relNumber);
4616
4617 /* Pop the error context stack */
4618 error_context_stack = errcallback.previous;
4619}
#define BufferGetLSN(bufHdr)
Definition bufmgr.c:77
static void shared_buffer_write_error_callback(void *arg)
Definition bufmgr.c:7459
void PageSetChecksum(Page page, BlockNumber blkno)
Definition bufpage.c:1518
ErrorContextCallback * error_context_stack
Definition elog.c:99
@ IOOP_WRITE
Definition pgstat.h:320
static void smgrwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const void *buffer, bool skipFsync)
Definition smgr.h:131
struct ErrorContextCallback * previous
Definition elog.h:299
void(* callback)(void *arg)
Definition elog.h:300
void XLogFlush(XLogRecPtr record)
Definition xlog.c:2801

References ErrorContextCallback::arg, Assert, BM_PERMANENT, buf, BUFFER_IO_ALREADY_DONE, BUFFER_LOCK_EXCLUSIVE, BUFFER_LOCK_SHARE_EXCLUSIVE, BufferGetLSN, BufferLockHeldByMeInMode(), BufHdrGetBlock, BufTagGetForkNum(), BufTagGetRelFileLocator(), ErrorContextCallback::callback, error_context_stack, fb(), INVALID_PROC_NUMBER, IOOBJECT_RELATION, IOOP_WRITE, PageSetChecksum(), pg_atomic_read_u64(), pgBufferUsage, pgstat_count_io_op_time(), pgstat_prepare_io_time(), ErrorContextCallback::previous, BufferUsage::shared_blks_written, shared_buffer_write_error_callback(), smgropen(), smgrwrite(), StartSharedBufferIO(), TerminateBufferIO(), track_io_timing, and XLogFlush().

Referenced by FlushOneBuffer(), FlushUnlockedBuffer(), and GetVictimBuffer().

◆ FlushDatabaseBuffers()

void FlushDatabaseBuffers ( Oid  dbid)

Definition at line 5526 of file bufmgr.c.

5527{
5528 int i;
5530
5531 for (i = 0; i < NBuffers; i++)
5532 {
5534
5536
5537 /*
5538 * As in DropRelationBuffers, an unlocked precheck should be safe and
5539 * saves some cycles.
5540 */
5541 if (bufHdr->tag.dbOid != dbid)
5542 continue;
5543
5544 /* Make sure we can handle the pin */
5547
5549 if (bufHdr->tag.dbOid == dbid &&
5551 {
5555 }
5556 else
5558 }
5559}

References BM_DIRTY, BM_VALID, CurrentResourceOwner, fb(), FlushUnlockedBuffer(), GetBufferDescriptor(), i, IOCONTEXT_NORMAL, IOOBJECT_RELATION, LockBufHdr(), NBuffers, PinBuffer_Locked(), ReservePrivateRefCountEntry(), ResourceOwnerEnlarge(), UnlockBufHdr(), and UnpinBuffer().

Referenced by dbase_redo().

◆ FlushOneBuffer()

void FlushOneBuffer ( Buffer  buffer)

Definition at line 5566 of file bufmgr.c.

5567{
5569
5570 /* currently not needed, but no fundamental reason not to support */
5571 Assert(!BufferIsLocal(buffer));
5572
5573 Assert(BufferIsPinned(buffer));
5574
5575 bufHdr = GetBufferDescriptor(buffer - 1);
5576
5577 Assert(BufferIsLockedByMe(buffer));
5578
5580}
static void FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object, IOContext io_context)
Definition bufmgr.c:4503
bool BufferIsLockedByMe(Buffer buffer)
Definition bufmgr.c:3061

References Assert, PrivateRefCountEntry::buffer, BufferIsLocal, BufferIsLockedByMe(), BufferIsPinned, fb(), FlushBuffer(), GetBufferDescriptor(), IOCONTEXT_NORMAL, and IOOBJECT_RELATION.

Referenced by hash_xlog_init_bitmap_page(), hash_xlog_init_meta_page(), invalidate_one_block(), and XLogReadBufferForRedoExtended().

◆ FlushRelationBuffers()

void FlushRelationBuffers ( Relation  rel)

Definition at line 5162 of file bufmgr.c.

5163{
5164 int i;
5166 SMgrRelation srel = RelationGetSmgr(rel);
5167
5168 if (RelationUsesLocalBuffers(rel))
5169 {
5170 for (i = 0; i < NLocBuffer; i++)
5171 {
5173
5175 if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rel->rd_locator) &&
5176 ((buf_state = pg_atomic_read_u64(&bufHdr->state)) &
5177 (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
5178 {
5179 ErrorContextCallback errcallback;
5180
5181 /* Setup error traceback support for ereport() */
5183 errcallback.arg = bufHdr;
5184 errcallback.previous = error_context_stack;
5185 error_context_stack = &errcallback;
5186
5187 /* Make sure we can handle the pin */
5190
5191 /*
5192 * Pin/unpin mostly to make valgrind work, but it also seems
5193 * like the right thing to do.
5194 */
5195 PinLocalBuffer(bufHdr, false);
5196
5197
5198 FlushLocalBuffer(bufHdr, srel);
5199
5201
5202 /* Pop the error context stack */
5203 error_context_stack = errcallback.previous;
5204 }
5205 }
5206
5207 return;
5208 }
5209
5210 for (i = 0; i < NBuffers; i++)
5211 {
5213
5215
5216 /*
5217 * As in DropRelationBuffers, an unlocked precheck should be safe and
5218 * saves some cycles.
5219 */
5221 continue;
5222
5223 /* Make sure we can handle the pin */
5226
5228 if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rel->rd_locator) &&
5230 {
5234 }
5235 else
5237 }
5238}
static void local_buffer_write_error_callback(void *arg)
Definition bufmgr.c:7475
void FlushLocalBuffer(BufferDesc *bufHdr, SMgrRelation reln)
Definition localbuf.c:183
void UnpinLocalBuffer(Buffer buffer)
Definition localbuf.c:857
bool PinLocalBuffer(BufferDesc *buf_hdr, bool adjust_usagecount)
Definition localbuf.c:821
int NLocBuffer
Definition localbuf.c:45
static SMgrRelation RelationGetSmgr(Relation rel)
Definition rel.h:578

References ErrorContextCallback::arg, BM_DIRTY, BM_VALID, BufferDescriptorGetBuffer(), BufTagMatchesRelFileLocator(), ErrorContextCallback::callback, CurrentResourceOwner, error_context_stack, fb(), FlushLocalBuffer(), FlushUnlockedBuffer(), GetBufferDescriptor(), GetLocalBufferDescriptor(), i, IOCONTEXT_NORMAL, IOOBJECT_RELATION, local_buffer_write_error_callback(), LockBufHdr(), NBuffers, NLocBuffer, pg_atomic_read_u64(), PinBuffer_Locked(), PinLocalBuffer(), ErrorContextCallback::previous, RelationData::rd_locator, RelationGetSmgr(), RelationUsesLocalBuffers, ReservePrivateRefCountEntry(), ResourceOwnerEnlarge(), UnlockBufHdr(), UnpinBuffer(), and UnpinLocalBuffer().

Referenced by fill_seq_with_data(), heapam_relation_copy_data(), and index_copy_data().

◆ FlushRelationsAllBuffers()

void FlushRelationsAllBuffers ( SMgrRelation smgrs,
int  nrels 
)

Definition at line 5250 of file bufmgr.c.

5251{
5252 int i;
5254 bool use_bsearch;
5255
5256 if (nrels == 0)
5257 return;
5258
5259 /* fill-in array for qsort */
5261
5262 for (i = 0; i < nrels; i++)
5263 {
5264 Assert(!RelFileLocatorBackendIsTemp(smgrs[i]->smgr_rlocator));
5265
5266 srels[i].rlocator = smgrs[i]->smgr_rlocator.locator;
5267 srels[i].srel = smgrs[i];
5268 }
5269
5270 /*
5271 * Save the bsearch overhead for low number of relations to sync. See
5272 * DropRelationsAllBuffers for details.
5273 */
5275
5276 /* sort the list of SMgrRelations if necessary */
5277 if (use_bsearch)
5278 qsort(srels, nrels, sizeof(SMgrSortArray), rlocator_comparator);
5279
5280 for (i = 0; i < NBuffers; i++)
5281 {
5285
5286 /*
5287 * As in DropRelationBuffers, an unlocked precheck should be safe and
5288 * saves some cycles.
5289 */
5290
5291 if (!use_bsearch)
5292 {
5293 int j;
5294
5295 for (j = 0; j < nrels; j++)
5296 {
5297 if (BufTagMatchesRelFileLocator(&bufHdr->tag, &srels[j].rlocator))
5298 {
5299 srelent = &srels[j];
5300 break;
5301 }
5302 }
5303 }
5304 else
5305 {
5306 RelFileLocator rlocator;
5307
5308 rlocator = BufTagGetRelFileLocator(&bufHdr->tag);
5309 srelent = bsearch(&rlocator,
5310 srels, nrels, sizeof(SMgrSortArray),
5312 }
5313
5314 /* buffer doesn't belong to any of the given relfilelocators; skip it */
5315 if (srelent == NULL)
5316 continue;
5317
5318 /* Make sure we can handle the pin */
5321
5323 if (BufTagMatchesRelFileLocator(&bufHdr->tag, &srelent->rlocator) &&
5325 {
5329 }
5330 else
5332 }
5333
5334 pfree(srels);
5335}

References Assert, BM_DIRTY, BM_VALID, BufTagGetRelFileLocator(), BufTagMatchesRelFileLocator(), CurrentResourceOwner, fb(), FlushUnlockedBuffer(), GetBufferDescriptor(), i, IOCONTEXT_NORMAL, IOOBJECT_RELATION, j, LockBufHdr(), NBuffers, palloc_array, pfree(), PinBuffer_Locked(), qsort, RelFileLocatorBackendIsTemp, RELS_BSEARCH_THRESHOLD, ReservePrivateRefCountEntry(), ResourceOwnerEnlarge(), rlocator_comparator(), UnlockBufHdr(), and UnpinBuffer().

Referenced by smgrdosyncall().

◆ FlushUnlockedBuffer()

static void FlushUnlockedBuffer ( BufferDesc buf,
SMgrRelation  reln,
IOObject  io_object,
IOContext  io_context 
)
static

Definition at line 4626 of file bufmgr.c.

4628{
4630
4633 BufferLockUnlock(buffer, buf);
4634}
static void BufferLockAcquire(Buffer buffer, BufferDesc *buf_hdr, BufferLockMode mode)
Definition bufmgr.c:5898
static void BufferLockUnlock(Buffer buffer, BufferDesc *buf_hdr)
Definition bufmgr.c:6014

References buf, PrivateRefCountEntry::buffer, BUFFER_LOCK_SHARE_EXCLUSIVE, BufferDescriptorGetBuffer(), BufferLockAcquire(), BufferLockUnlock(), fb(), FlushBuffer(), IOCONTEXT_NORMAL, and IOOBJECT_RELATION.

Referenced by EvictUnpinnedBufferInternal(), FlushDatabaseBuffers(), FlushRelationBuffers(), FlushRelationsAllBuffers(), and SyncOneBuffer().

◆ ForgetPrivateRefCountEntry()

static void ForgetPrivateRefCountEntry ( PrivateRefCountEntry ref)
static

Definition at line 565 of file bufmgr.c.

566{
567 Assert(ref->data.refcount == 0);
568 Assert(ref->data.lockmode == BUFFER_LOCK_UNLOCK);
569
570 if (ref >= &PrivateRefCountArray[0] &&
572 {
573 ref->buffer = InvalidBuffer;
575
576
577 /*
578 * Mark the just used entry as reserved - in many scenarios that
579 * allows us to avoid ever having to search the array/hash for free
580 * entries.
581 */
583 }
584 else
585 {
589 }
590}
static int ReservedRefCountSlot
Definition bufmgr.c:268

References Assert, BUFFER_LOCK_UNLOCK, fb(), InvalidBuffer, PrivateRefCountArray, PrivateRefCountArrayKeys, PrivateRefCountHash, PrivateRefCountOverflowed, REFCOUNT_ARRAY_ENTRIES, and ReservedRefCountSlot.

Referenced by UnlockReleaseBuffer(), and UnpinBufferNoOwner().

◆ GetAdditionalPinLimit()

uint32 GetAdditionalPinLimit ( void  )

Definition at line 2698 of file bufmgr.c.

2699{
2701
2702 /*
2703 * We get the number of "overflowed" pins for free, but don't know the
2704 * number of pins in PrivateRefCountArray. The cost of calculating that
2705 * exactly doesn't seem worth it, so just assume the max.
2706 */
2708
2709 /* Is this backend already holding more than its fair share? */
2711 return 0;
2712
2714}
static uint32 MaxProportionalPins
Definition bufmgr.c:271

References fb(), MaxProportionalPins, PrivateRefCountOverflowed, and REFCOUNT_ARRAY_ENTRIES.

Referenced by LimitAdditionalPins(), and read_stream_start_pending_read().

◆ GetPinLimit()

uint32 GetPinLimit ( void  )

Definition at line 2686 of file bufmgr.c.

2687{
 /*
  * Return the advisory per-backend limit on concurrently held buffer pins.
  * MaxProportionalPins is computed once at backend startup (see
  * InitBufferManagerAccess()); this is a cheap read-only accessor.
  */
2688 return MaxProportionalPins;
2689}

References MaxProportionalPins.

Referenced by GetAccessStrategy(), and read_stream_begin_impl().

◆ GetPrivateRefCount()

static int32 GetPrivateRefCount ( Buffer  buffer)
inlinestatic

Definition at line 542 of file bufmgr.c.

543{
545
546 Assert(BufferIsValid(buffer));
547 Assert(!BufferIsLocal(buffer));
548
549 /*
550 * Not moving the entry - that's ok for the current users, but we might
551 * want to change this one day.
552 */
553 ref = GetPrivateRefCountEntry(buffer, false);
554
555 if (ref == NULL)
556 return 0;
557 return ref->data.refcount;
558}

References Assert, PrivateRefCountEntry::buffer, BufferIsLocal, BufferIsValid(), fb(), and GetPrivateRefCountEntry().

Referenced by CheckBufferIsPinnedOnce(), ConditionalLockBufferForCleanup(), DebugPrintBufferRefcount(), HoldingBufferPinThatDelaysRecovery(), InvalidateBuffer(), InvalidateVictimBuffer(), IsBufferCleanupOK(), and MarkSharedBufferDirtyHint().

◆ GetPrivateRefCountEntry()

static PrivateRefCountEntry * GetPrivateRefCountEntry ( Buffer  buffer,
bool  do_move 
)
inlinestatic

Definition at line 507 of file bufmgr.c.

508{
509 Assert(BufferIsValid(buffer));
510 Assert(!BufferIsLocal(buffer));
511
512 /*
513 * It's very common to look up the same buffer repeatedly. To make that
514 * fast, we have a one-entry cache.
515 *
516 * In contrast to the loop in GetPrivateRefCountEntrySlow(), here it is
517 * faster to check PrivateRefCountArray[].buffer, as in the case of a hit
518 * fewer addresses are computed and fewer cachelines are accessed. Whereas
519 * in GetPrivateRefCountEntrySlow()'s case, checking
520 * PrivateRefCountArrayKeys saves a lot of memory accesses.
521 */
522 if (likely(PrivateRefCountEntryLast != -1) &&
524 {
526 }
527
528 /*
529 * The code for the cached lookup is small enough to be worth inlining
530 * into the caller. In the miss case however, that empirically doesn't
531 * seem worth it.
532 */
533 return GetPrivateRefCountEntrySlow(buffer, do_move);
534}
static pg_noinline PrivateRefCountEntry * GetPrivateRefCountEntrySlow(Buffer buffer, bool do_move)
Definition bufmgr.c:419
static int PrivateRefCountEntryLast
Definition bufmgr.c:269

References Assert, PrivateRefCountEntry::buffer, BufferIsLocal, BufferIsValid(), fb(), GetPrivateRefCountEntrySlow(), likely, PrivateRefCountArray, and PrivateRefCountEntryLast.

Referenced by BufferLockAcquire(), BufferLockConditional(), BufferLockDisownInternal(), BufferLockHeldByMe(), BufferLockHeldByMeInMode(), GetPrivateRefCount(), IncrBufferRefCount(), PinBuffer(), PinBuffer_Locked(), ResOwnerReleaseBuffer(), SharedBufferBeginSetHintBits(), UnlockReleaseBuffer(), and UnpinBufferNoOwner().

◆ GetPrivateRefCountEntrySlow()

static pg_noinline PrivateRefCountEntry * GetPrivateRefCountEntrySlow ( Buffer  buffer,
bool  do_move 
)
static

Definition at line 419 of file bufmgr.c.

420{
422 int match = -1;
423 int i;
424
425 /*
426 * First search for references in the array, that'll be sufficient in the
427 * majority of cases.
428 */
429 for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
430 {
431 if (PrivateRefCountArrayKeys[i] == buffer)
432 {
433 match = i;
434 /* see ReservePrivateRefCountEntry() for why we don't return */
435 }
436 }
437
438 if (likely(match != -1))
439 {
440 /* update cache for the next lookup */
442
443 return &PrivateRefCountArray[match];
444 }
445
446 /*
447 * By here we know that the buffer, if already pinned, isn't residing in
448 * the array.
449 *
450 * Only look up the buffer in the hashtable if we've previously overflowed
451 * into it.
452 */
454 return NULL;
455
457
458 if (res == NULL)
459 return NULL;
460 else if (!do_move)
461 {
462 /* caller doesn't want us to move the hash entry into the array */
463 return res;
464 }
465 else
466 {
467 /* move buffer from hashtable into the free array slot */
470
471 /* Save data and delete from hashtable while res is still valid */
472 data = res->data;
476
477 /* Ensure there's a free array slot */
479
480 /* Use up the reserved slot */
484 Assert(free->buffer == InvalidBuffer);
485
486 /* and fill it */
487 free->buffer = buffer;
488 free->data = data;
490 /* update cache for the next lookup */
492
494
495 return free;
496 }
497}
const void * data
#define free(a)

References Assert, PrivateRefCountEntry::buffer, PrivateRefCountEntry::data, data, fb(), free, i, InvalidBuffer, likely, PrivateRefCountArray, PrivateRefCountArrayKeys, PrivateRefCountEntryLast, PrivateRefCountHash, PrivateRefCountOverflowed, REFCOUNT_ARRAY_ENTRIES, ReservedRefCountSlot, and ReservePrivateRefCountEntry().

Referenced by GetPrivateRefCountEntry().

◆ GetVictimBuffer()

static Buffer GetVictimBuffer ( BufferAccessStrategy  strategy,
IOContext  io_context 
)
static

Definition at line 2539 of file bufmgr.c.

2540{
2542 Buffer buf;
2544 bool from_ring;
2545
2546 /*
2547 * Ensure, before we pin a victim buffer, that there's a free refcount
2548 * entry and resource owner slot for the pin.
2549 */
2552
2553 /* we return here if a prospective victim buffer gets used concurrently */
2554again:
2555
2556 /*
2557 * Select a victim buffer. The buffer is returned pinned and owned by
2558 * this backend.
2559 */
2562
2563 /*
2564 * We shouldn't have any other pins for this buffer.
2565 */
2567
2568 /*
2569 * If the buffer was dirty, try to write it out. There is a race
2570 * condition here, another backend could dirty the buffer between
2571 * StrategyGetBuffer() checking that it is not in use and invalidating the
2572 * buffer below. That's addressed by InvalidateVictimBuffer() verifying
2573 * that the buffer is not dirty.
2574 */
2575 if (buf_state & BM_DIRTY)
2576 {
2579
2580 /*
2581 * We need a share-exclusive lock on the buffer contents to write it
2582 * out (else we might write invalid data, eg because someone else is
2583 * compacting the page contents while we write). We must use a
2584 * conditional lock acquisition here to avoid deadlock. Even though
2585 * the buffer was not pinned (and therefore surely not locked) when
2586 * StrategyGetBuffer returned it, someone else could have pinned and
2587 * (share-)exclusive-locked it by the time we get here. If we try to
2588 * get the lock unconditionally, we'd block waiting for them; if they
2589 * later block waiting for us, deadlock ensues. (This has been
2590 * observed to happen when two backends are both trying to split btree
2591 * index pages, and the second one just happens to be trying to split
2592 * the page the first one got from StrategyGetBuffer.)
2593 */
2595 {
2596 /*
2597 * Someone else has locked the buffer, so give it up and loop back
2598 * to get another one.
2599 */
2601 goto again;
2602 }
2603
2604 /*
2605 * If using a nondefault strategy, and this victim came from the
2606 * strategy ring, let the strategy decide whether to reject it when
2607 * reusing it would require a WAL flush. This only applies to
2608 * permanent buffers; unlogged buffers can have fake LSNs, so
2609 * XLogNeedsFlush() is not meaningful for them.
2610 *
2611 * We need to hold the content lock in at least share-exclusive mode
2612 * to safely inspect the page LSN, so this couldn't have been done
2613 * inside StrategyGetBuffer().
2614 */
2615 if (strategy && from_ring &&
2619 {
2621 goto again;
2622 }
2623
2624 /* OK, do the I/O */
2627
2629 &buf_hdr->tag);
2630 }
2631
2632
2633 if (buf_state & BM_VALID)
2634 {
2635 /*
2636 * When a BufferAccessStrategy is in use, blocks evicted from shared
2637 * buffers are counted as IOOP_EVICT in the corresponding context
2638 * (e.g. IOCONTEXT_BULKWRITE). Shared buffers are evicted by a
2639 * strategy in two cases: 1) while initially claiming buffers for the
2640 * strategy ring 2) to replace an existing strategy ring buffer
2641 * because it is pinned or in use and cannot be reused.
2642 *
2643 * Blocks evicted from buffers already in the strategy ring are
2644 * counted as IOOP_REUSE in the corresponding strategy context.
2645 *
2646 * At this point, we can accurately count evictions and reuses,
2647 * because we have successfully claimed the valid buffer. Previously,
2648 * we may have been forced to release the buffer due to concurrent
2649 * pinners or erroring out.
2650 */
2652 from_ring ? IOOP_REUSE : IOOP_EVICT, 1, 0);
2653 }
2654
2655 /*
2656 * If the buffer has an entry in the buffer mapping table, delete it. This
2657 * can fail because another backend could have pinned or dirtied the
2658 * buffer.
2659 */
2661 {
2663 goto again;
2664 }
2665
2666 /* a final set of sanity checks */
2667#ifdef USE_ASSERT_CHECKING
2669
2672
2674#endif
2675
2676 return buf;
2677}
WritebackContext BackendWritebackContext
Definition buf_init.c:27
void CheckBufferIsPinnedOnce(Buffer buffer)
Definition bufmgr.c:6637
void UnlockReleaseBuffer(Buffer buffer)
Definition bufmgr.c:5603
void ScheduleBufferTagForWriteback(WritebackContext *wb_context, IOContext io_context, BufferTag *tag)
Definition bufmgr.c:7690
BufferDesc * StrategyGetBuffer(BufferAccessStrategy strategy, uint64 *buf_state, bool *from_ring)
Definition freelist.c:184
bool StrategyRejectBuffer(BufferAccessStrategy strategy, BufferDesc *buf, bool from_ring)
Definition freelist.c:752
@ IOOP_EVICT
Definition pgstat.h:311
@ IOOP_REUSE
Definition pgstat.h:314
void pgstat_count_io_op(IOObject io_object, IOContext io_context, IOOp io_op, uint32 cnt, uint64 bytes)
Definition pgstat_io.c:68
bool XLogNeedsFlush(XLogRecPtr record)
Definition xlog.c:3163

References Assert, BackendWritebackContext, BM_DIRTY, BM_PERMANENT, BM_TAG_VALID, BM_VALID, buf, BUF_STATE_GET_REFCOUNT, BUFFER_LOCK_SHARE_EXCLUSIVE, BUFFER_LOCK_UNLOCK, BufferDescriptorGetBuffer(), BufferGetLSN, BufferLockConditional(), CheckBufferIsPinnedOnce(), CurrentResourceOwner, fb(), FlushBuffer(), InvalidateVictimBuffer(), IOOBJECT_RELATION, IOOP_EVICT, IOOP_REUSE, LockBuffer(), pg_atomic_read_u64(), pgstat_count_io_op(), ReservePrivateRefCountEntry(), ResourceOwnerEnlarge(), ScheduleBufferTagForWriteback(), StrategyGetBuffer(), StrategyRejectBuffer(), UnlockReleaseBuffer(), UnpinBuffer(), and XLogNeedsFlush().

Referenced by BufferAlloc(), and ExtendBufferedRelShared().

◆ HoldingBufferPinThatDelaysRecovery()

bool HoldingBufferPinThatDelaysRecovery ( void  )

Definition at line 6817 of file bufmgr.c.

6818{
6820
6821 /*
6822 * If we get woken slowly then it's possible that the Startup process was
6823 * already woken by other backends before we got here. Also possible that
6824 * we get here by multiple interrupts or interrupts at inappropriate
6825 * times, so make sure we do nothing if the bufid is not set.
6826 */
6827 if (bufid < 0)
6828 return false;
6829
6830 if (GetPrivateRefCount(bufid + 1) > 0)
6831 return true;
6832
6833 return false;
6834}
int GetStartupBufferPinWaitBufId(void)
Definition proc.c:771

References fb(), GetPrivateRefCount(), and GetStartupBufferPinWaitBufId().

Referenced by CheckRecoveryConflictDeadlock(), and ProcessRecoveryConflictInterrupt().

◆ IncrBufferRefCount()

void IncrBufferRefCount ( Buffer  buffer)

Definition at line 5670 of file bufmgr.c.

5671{
5672 Assert(BufferIsPinned(buffer));
5674 if (BufferIsLocal(buffer))
5675 LocalRefCount[-buffer - 1]++;
5676 else
5677 {
5679
5680 ref = GetPrivateRefCountEntry(buffer, true);
5681 Assert(ref != NULL);
5682 ref->data.refcount++;
5683 }
5685}
static void ResourceOwnerRememberBuffer(ResourceOwner owner, Buffer buffer)

References Assert, PrivateRefCountEntry::buffer, BufferIsLocal, BufferIsPinned, CurrentResourceOwner, fb(), GetPrivateRefCountEntry(), LocalRefCount, ResourceOwnerEnlarge(), and ResourceOwnerRememberBuffer().

Referenced by _bt_steppage(), btrestrpos(), entryLoadMoreItems(), ReadBufferBI(), RelationAddBlocks(), scanPostingTree(), startScanEntry(), and tts_buffer_heap_store_tuple().

◆ InitBufferManagerAccess()

void InitBufferManagerAccess ( void  )

Definition at line 4216 of file bufmgr.c.

4217{
4218 /*
4219 * An advisory limit on the number of pins each backend should hold, based
4220 * on shared_buffers and the maximum number of connections possible.
4221 * That's very pessimistic, but outside toy-sized shared_buffers it should
4222 * allow plenty of pins. LimitAdditionalPins() and
4223 * GetAdditionalPinLimit() can be used to check the remaining balance.
4224 */
4226
4229
4231
4232 /*
4233 * AtProcExit_Buffers needs LWLock access, and thereby has to be called at
4234 * the corresponding phase of backend shutdown.
4235 */
4236 Assert(MyProc != NULL);
4238}
static void AtProcExit_Buffers(int code, Datum arg)
Definition bufmgr.c:4245
int MaxBackends
Definition globals.c:149
void on_shmem_exit(pg_on_exit_callback function, Datum arg)
Definition ipc.c:372
MemoryContext CurrentMemoryContext
Definition mcxt.c:160
#define NUM_AUXILIARY_PROCS
Definition proc.h:527

References Assert, AtProcExit_Buffers(), CurrentMemoryContext, fb(), MaxBackends, MaxProportionalPins, MyProc, NBuffers, NUM_AUXILIARY_PROCS, on_shmem_exit(), PrivateRefCountArray, PrivateRefCountArrayKeys, and PrivateRefCountHash.

Referenced by BaseInit().

◆ InvalidateBuffer()

static void InvalidateBuffer ( BufferDesc buf)
static

Definition at line 2361 of file bufmgr.c.

2362{
2364 uint32 oldHash; /* hash value for oldTag */
2365 LWLock *oldPartitionLock; /* buffer partition lock for it */
2368
2369 /* Save the original buffer tag before dropping the spinlock */
2370 oldTag = buf->tag;
2371
2373
2374 /*
2375 * Need to compute the old tag's hashcode and partition lock ID. XXX is it
2376 * worth storing the hashcode in BufferDesc so we need not recompute it
2377 * here? Probably not.
2378 */
2381
2382retry:
2383
2384 /*
2385 * Acquire exclusive mapping lock in preparation for changing the buffer's
2386 * association.
2387 */
2389
2390 /* Re-lock the buffer header */
2392
2393 /* If it's changed while we were waiting for lock, do nothing */
2394 if (!BufferTagsEqual(&buf->tag, &oldTag))
2395 {
2398 return;
2399 }
2400
2401 /*
2402 * We assume the reason for it to be pinned is that either we were
2403 * asynchronously reading the page in before erroring out or someone else
2404 * is flushing the page out. Wait for the IO to finish. (This could be
2405 * an infinite loop if the refcount is messed up... it would be nice to
2406 * time out after awhile, but there seems no way to be sure how many loops
2407 * may be needed. Note that if the other guy has pinned the buffer but
2408 * not yet done StartBufferIO, WaitIO will fall through and we'll
2409 * effectively be busy-looping here.)
2410 */
2412 {
2415 /* safety check: should definitely not be our *own* pin */
2417 elog(ERROR, "buffer is pinned in InvalidateBuffer");
2418 WaitIO(buf);
2419 goto retry;
2420 }
2421
2422 /*
2423 * An invalidated buffer should not have any backends waiting to lock the
2424 * buffer, therefore BM_LOCK_WAKE_IN_PROGRESS should not be set.
2425 */
2427
2428 /*
2429 * Clear out the buffer's tag and flags. We must do this to ensure that
2430 * linear scans of the buffer array don't think the buffer is valid.
2431 */
2433 ClearBufferTag(&buf->tag);
2434
2436 0,
2438 0);
2439
2440 /*
2441 * Remove the buffer from the lookup hashtable, if it was in there.
2442 */
2443 if (oldFlags & BM_TAG_VALID)
2445
2446 /*
2447 * Done with mapping lock.
2448 */
2450}
#define BUF_USAGECOUNT_MASK
static bool BufferTagsEqual(const BufferTag *tag1, const BufferTag *tag2)
static void ClearBufferTag(BufferTag *tag)
void BufTableDelete(BufferTag *tagPtr, uint32 hashcode)
Definition buf_table.c:154
static void WaitIO(BufferDesc *buf)
Definition bufmgr.c:7139

References Assert, BM_LOCK_WAKE_IN_PROGRESS, BM_TAG_VALID, buf, BUF_FLAG_MASK, BUF_STATE_GET_REFCOUNT, BUF_USAGECOUNT_MASK, BufferDescriptorGetBuffer(), BufferTagsEqual(), BufMappingPartitionLock(), BufTableDelete(), BufTableHashCode(), ClearBufferTag(), elog, ERROR, fb(), GetPrivateRefCount(), LockBufHdr(), LW_EXCLUSIVE, LWLockAcquire(), LWLockRelease(), UnlockBufHdr(), UnlockBufHdrExt(), and WaitIO().

Referenced by DropDatabaseBuffers(), DropRelationBuffers(), DropRelationsAllBuffers(), and FindAndDropRelationBuffers().

◆ InvalidateVictimBuffer()

static bool InvalidateVictimBuffer ( BufferDesc buf_hdr)
static

Definition at line 2462 of file bufmgr.c.

2463{
2465 uint32 hash;
2467 BufferTag tag;
2468
2470
2471 /* have buffer pinned, so it's safe to read tag without lock */
2472 tag = buf_hdr->tag;
2473
2474 hash = BufTableHashCode(&tag);
2476
2478
2479 /* lock the buffer header */
2481
2482 /*
2483 * We have the buffer pinned, so nobody else should have been able to
2484 * unset this concurrently.
2485 */
2488 Assert(BufferTagsEqual(&buf_hdr->tag, &tag));
2489
2490 /*
2491 * If somebody else pinned the buffer since, or even worse, dirtied it,
2492 * give up on this buffer: It's clearly in use.
2493 */
2495 {
2497
2500
2501 return false;
2502 }
2503
2504 /*
2505 * An invalidated buffer should not have any backends waiting to lock the
2506 * buffer, therefore BM_LOCK_WAKE_IN_PROGRESS should not be set.
2507 */
2509
2510 /*
2511 * Clear out the buffer's tag and flags and usagecount. This is not
2512 * strictly required, as BM_TAG_VALID/BM_VALID needs to be checked before
2513 * doing anything with the buffer. But currently it's beneficial, as the
2514 * cheaper pre-check for several linear scans of shared buffers use the
2515 * tag (see e.g. FlushDatabaseBuffers()).
2516 */
2517 ClearBufferTag(&buf_hdr->tag);
2519 0,
2521 0);
2522
2524
2525 /* finally delete buffer from the buffer mapping table */
2526 BufTableDelete(&tag, hash);
2527
2529
2534
2535 return true;
2536}

References Assert, BM_DIRTY, BM_LOCK_WAKE_IN_PROGRESS, BM_TAG_VALID, BM_VALID, BUF_FLAG_MASK, BUF_STATE_GET_REFCOUNT, BUF_USAGECOUNT_MASK, BufferDescriptorGetBuffer(), BufferTagsEqual(), BufMappingPartitionLock(), BufTableDelete(), BufTableHashCode(), ClearBufferTag(), fb(), GetPrivateRefCount(), hash(), LockBufHdr(), LW_EXCLUSIVE, LWLockAcquire(), LWLockRelease(), pg_atomic_read_u64(), UnlockBufHdr(), and UnlockBufHdrExt().

Referenced by EvictUnpinnedBufferInternal(), and GetVictimBuffer().

◆ IsBufferCleanupOK()

bool IsBufferCleanupOK ( Buffer  buffer)

Definition at line 6901 of file bufmgr.c.

6902{
6905
6906 Assert(BufferIsValid(buffer));
6907
6908 /* see AIO related comment in LockBufferForCleanup() */
6909
6910 if (BufferIsLocal(buffer))
6911 {
6912 /* There should be exactly one pin */
6913 if (LocalRefCount[-buffer - 1] != 1)
6914 return false;
6915 /* Nobody else to wait for */
6916 return true;
6917 }
6918
6919 /* There should be exactly one local pin */
6920 if (GetPrivateRefCount(buffer) != 1)
6921 return false;
6922
6923 bufHdr = GetBufferDescriptor(buffer - 1);
6924
6925 /* caller must hold exclusive lock on buffer */
6927
6929
6932 {
6933 /* pincount is OK. */
6935 return true;
6936 }
6937
6939 return false;
6940}

References Assert, BUF_STATE_GET_REFCOUNT, PrivateRefCountEntry::buffer, BUFFER_LOCK_EXCLUSIVE, BufferIsLocal, BufferIsLockedByMeInMode(), BufferIsValid(), fb(), GetBufferDescriptor(), GetPrivateRefCount(), LocalRefCount, LockBufHdr(), and UnlockBufHdr().

Referenced by _hash_doinsert(), _hash_expandtable(), _hash_splitbucket(), and hashbucketcleanup().

◆ IssuePendingWritebacks()

void IssuePendingWritebacks ( WritebackContext wb_context,
IOContext  io_context 
)

Definition at line 7740 of file bufmgr.c.

7741{
7743 int i;
7744
7745 if (wb_context->nr_pending == 0)
7746 return;
7747
7748 /*
7749 * Executing the writes in-order can make them a lot faster, and allows to
7750 * merge writeback requests to consecutive blocks into larger writebacks.
7751 */
7752 sort_pending_writebacks(wb_context->pending_writebacks,
7753 wb_context->nr_pending);
7754
7756
7757 /*
7758 * Coalesce neighbouring writes, but nothing else. For that we iterate
7759 * through the, now sorted, array of pending flushes, and look forward to
7760 * find all neighbouring (or identical) writes.
7761 */
7762 for (i = 0; i < wb_context->nr_pending; i++)
7763 {
7767 int ahead;
7768 BufferTag tag;
7770 Size nblocks = 1;
7771
7772 cur = &wb_context->pending_writebacks[i];
7773 tag = cur->tag;
7775
7776 /*
7777 * Peek ahead, into following writeback requests, to see if they can
7778 * be combined with the current one.
7779 */
7780 for (ahead = 0; i + ahead + 1 < wb_context->nr_pending; ahead++)
7781 {
7782
7783 next = &wb_context->pending_writebacks[i + ahead + 1];
7784
7785 /* different file, stop */
7787 BufTagGetRelFileLocator(&next->tag)) ||
7788 BufTagGetForkNum(&cur->tag) != BufTagGetForkNum(&next->tag))
7789 break;
7790
7791 /* ok, block queued twice, skip */
7792 if (cur->tag.blockNum == next->tag.blockNum)
7793 continue;
7794
7795 /* only merge consecutive writes */
7796 if (cur->tag.blockNum + 1 != next->tag.blockNum)
7797 break;
7798
7799 nblocks++;
7800 cur = next;
7801 }
7802
7803 i += ahead;
7804
7805 /* and finally tell the kernel to write the data to storage */
7807 smgrwriteback(reln, BufTagGetForkNum(&tag), tag.blockNum, nblocks);
7808 }
7809
7810 /*
7811 * Assume that writeback requests are only issued for buffers containing
7812 * blocks of permanent relations.
7813 */
7815 IOOP_WRITEBACK, io_start, wb_context->nr_pending, 0);
7816
7817 wb_context->nr_pending = 0;
7818}
static int32 next
Definition blutils.c:225
struct cursor * cur
Definition ecpg.c:29
@ IOOP_WRITEBACK
Definition pgstat.h:315
#define RelFileLocatorEquals(locator1, locator2)
void smgrwriteback(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, BlockNumber nblocks)
Definition smgr.c:805

References buftag::blockNum, BufTagGetForkNum(), BufTagGetRelFileLocator(), cur, fb(), i, INVALID_PROC_NUMBER, IOOBJECT_RELATION, IOOP_WRITEBACK, next, pgstat_count_io_op_time(), pgstat_prepare_io_time(), RelFileLocatorEquals, smgropen(), smgrwriteback(), and track_io_timing.

Referenced by BufferSync(), and ScheduleBufferTagForWriteback().

◆ LimitAdditionalPins()

void LimitAdditionalPins ( uint32 additional_pins)

Definition at line 2724 of file bufmgr.c.

2725{
2726 uint32 limit;
2727
 /* Requesting at most one additional pin is always allowed. */
2728 if (*additional_pins <= 1)
2729 return;
2730
 /*
  * Clamp the caller's request to this backend's remaining fair-share pin
  * budget, but never reduce it below one: a backend must always be able
  * to acquire at least one more pin to make progress.
  */
2731 limit = GetAdditionalPinLimit();
2732 limit = Max(limit, 1);
2733 if (limit < *additional_pins)
2734 *additional_pins = limit;
2735}
uint32 GetAdditionalPinLimit(void)
Definition bufmgr.c:2698
#define Max(x, y)
Definition c.h:1085

References fb(), GetAdditionalPinLimit(), and Max.

Referenced by ExtendBufferedRelShared().

◆ local_buffer_readv_complete()

static PgAioResult local_buffer_readv_complete ( PgAioHandle ioh,
PgAioResult  prior_result,
uint8  cb_data 
)
static

Definition at line 8931 of file bufmgr.c.

8933{
8935}
static pg_attribute_always_inline PgAioResult buffer_readv_complete(PgAioHandle *ioh, PgAioResult prior_result, uint8 cb_data, bool is_temp)
Definition bufmgr.c:8676

References buffer_readv_complete(), and fb().

◆ local_buffer_readv_stage()

static void local_buffer_readv_stage ( PgAioHandle ioh,
uint8  cb_data 
)
static

Definition at line 8925 of file bufmgr.c.

8926{
8927 buffer_stage_common(ioh, false, true);
8928}
static pg_attribute_always_inline void buffer_stage_common(PgAioHandle *ioh, bool is_write, bool is_temp)
Definition bufmgr.c:8281

References buffer_stage_common(), and fb().

◆ local_buffer_write_error_callback()

static void local_buffer_write_error_callback ( void arg)
static

Definition at line 7475 of file bufmgr.c.

7476{
7478
7479 if (bufHdr != NULL)
7480 errcontext("writing block %u of relation \"%s\"",
7481 bufHdr->tag.blockNum,
7484 BufTagGetForkNum(&bufHdr->tag)).str);
7485}
Datum arg
Definition elog.c:1322
#define errcontext
Definition elog.h:200

References arg, BufTagGetForkNum(), BufTagGetRelFileLocator(), errcontext, fb(), MyProcNumber, and relpathbackend.

Referenced by FlushRelationBuffers().

◆ LockBufferForCleanup()

void LockBufferForCleanup ( Buffer  buffer)

Definition at line 6670 of file bufmgr.c.

6671{
6673 TimestampTz waitStart = 0;
6674 bool waiting = false;
6675 bool logged_recovery_conflict = false;
6676
6677 Assert(BufferIsPinned(buffer));
6679
6681
6682 /*
6683 * We do not yet need to be worried about in-progress AIOs holding a pin,
6684 * as we, so far, only support doing reads via AIO and this function can
6685 * only be called once the buffer is valid (i.e. no read can be in
6686 * flight).
6687 */
6688
6689 /* Nobody else to wait for */
6690 if (BufferIsLocal(buffer))
6691 return;
6692
6693 bufHdr = GetBufferDescriptor(buffer - 1);
6694
6695 for (;;)
6696 {
6698 uint64 unset_bits = 0;
6699
6700 /* Try to acquire lock */
6703
6706 {
6707 /* Successfully acquired exclusive lock with pincount 1 */
6709
6710 /*
6711 * Emit the log message if recovery conflict on buffer pin was
6712 * resolved but the startup process waited longer than
6713 * deadlock_timeout for it.
6714 */
6717 waitStart, GetCurrentTimestamp(),
6718 NULL, false);
6719
6720 if (waiting)
6721 {
6722 /* reset ps display to remove the suffix if we added one */
6724 waiting = false;
6725 }
6726 return;
6727 }
6728 /* Failed, so mark myself as waiting for pincount 1 */
6730 {
6733 elog(ERROR, "multiple backends attempting to wait for pincount 1");
6734 }
6735 bufHdr->wait_backend_pgprocno = MyProcNumber;
6739 0);
6741
6742 /* Wait to be signaled by UnpinBuffer() */
6743 if (InHotStandby)
6744 {
6745 if (!waiting)
6746 {
6747 /* adjust the process title to indicate that it's waiting */
6748 set_ps_display_suffix("waiting");
6749 waiting = true;
6750 }
6751
6752 /*
6753 * Emit the log message if the startup process is waiting longer
6754 * than deadlock_timeout for recovery conflict on buffer pin.
6755 *
6756 * Skip this if first time through because the startup process has
6757 * not started waiting yet in this case. So, the wait start
6758 * timestamp is set after this logic.
6759 */
6760 if (waitStart != 0 && !logged_recovery_conflict)
6761 {
6763
6764 if (TimestampDifferenceExceeds(waitStart, now,
6766 {
6768 waitStart, now, NULL, true);
6770 }
6771 }
6772
6773 /*
6774 * Set the wait start timestamp if logging is enabled and first
6775 * time through.
6776 */
6777 if (log_recovery_conflict_waits && waitStart == 0)
6778 waitStart = GetCurrentTimestamp();
6779
6780 /* Publish the bufid that Startup process waits on */
6781 SetStartupBufferPinWaitBufId(buffer - 1);
6782 /* Set alarm and then wait to be signaled by UnpinBuffer() */
6784 /* Reset the published bufid */
6786 }
6787 else
6789
6790 /*
6791 * Remove flag marking us as waiter. Normally this will not be set
6792 * anymore, but ProcWaitForSignal() can return for other signals as
6793 * well. We take care to only reset the flag if we're the waiter, as
6794 * theoretically another backend could have started waiting. That's
6795 * impossible with the current usages due to table level locking, but
6796 * better be safe.
6797 */
6799 if ((buf_state & BM_PIN_COUNT_WAITER) != 0 &&
6800 bufHdr->wait_backend_pgprocno == MyProcNumber)
6802
6804 0, unset_bits,
6805 0);
6806
6808 /* Loop back and try again */
6809 }
6810}
bool TimestampDifferenceExceeds(TimestampTz start_time, TimestampTz stop_time, int msec)
Definition timestamp.c:1775
TimestampTz GetCurrentTimestamp(void)
Definition timestamp.c:1639
Datum now(PG_FUNCTION_ARGS)
Definition timestamp.c:1603
#define BM_PIN_COUNT_WAITER
static BufferDesc * PinCountWaitBuf
Definition bufmgr.c:228
int64 TimestampTz
Definition timestamp.h:39
void set_ps_display_remove_suffix(void)
Definition ps_status.c:440
void set_ps_display_suffix(const char *suffix)
Definition ps_status.c:388
int DeadlockTimeout
Definition proc.c:62
void SetStartupBufferPinWaitBufId(int bufid)
Definition proc.c:759
void ProcWaitForSignal(uint32 wait_event_info)
Definition proc.c:2015
void ResolveRecoveryConflictWithBufferPin(void)
Definition standby.c:795
bool log_recovery_conflict_waits
Definition standby.c:43
void LogRecoveryConflict(RecoveryConflictReason reason, TimestampTz wait_start, TimestampTz now, VirtualTransactionId *wait_list, bool still_waiting)
Definition standby.c:275
@ RECOVERY_CONFLICT_BUFFERPIN
Definition standby.h:49
static volatile sig_atomic_t waiting
#define InHotStandby
Definition xlogutils.h:60

References Assert, BM_PIN_COUNT_WAITER, BUF_STATE_GET_REFCOUNT, PrivateRefCountEntry::buffer, BUFFER_LOCK_EXCLUSIVE, BUFFER_LOCK_UNLOCK, BufferIsLocal, BufferIsPinned, CheckBufferIsPinnedOnce(), DeadlockTimeout, elog, ERROR, fb(), GetBufferDescriptor(), GetCurrentTimestamp(), InHotStandby, LockBuffer(), LockBufHdr(), log_recovery_conflict_waits, LogRecoveryConflict(), MyProcNumber, now(), PinCountWaitBuf, ProcWaitForSignal(), RECOVERY_CONFLICT_BUFFERPIN, ResolveRecoveryConflictWithBufferPin(), set_ps_display_remove_suffix(), set_ps_display_suffix(), SetStartupBufferPinWaitBufId(), TimestampDifferenceExceeds(), UnlockBufHdr(), UnlockBufHdrExt(), and waiting.

Referenced by _bt_upgradelockbufcleanup(), ginVacuumPostingTree(), hashbulkdelete(), heap_force_common(), lazy_scan_heap(), XLogReadBufferForRedoExtended(), and ZeroAndLockBuffer().

◆ LockBufferInternal()

void LockBufferInternal ( Buffer  buffer,
BufferLockMode  mode 
)

Definition at line 6574 of file bufmgr.c.

6575{
6577
6578 /*
6579 * We can't wait if we haven't got a PGPROC. This should only occur
6580 * during bootstrap or shared memory initialization. Put an Assert here
6581 * to catch unsafe coding practices.
6582 */
6584
6585 /* handled in LockBuffer() wrapper */
6587
6588 Assert(BufferIsPinned(buffer));
6589 if (BufferIsLocal(buffer))
6590 return; /* local buffers need no lock */
6591
6592 buf_hdr = GetBufferDescriptor(buffer - 1);
6593
6594 /*
6595 * Test the most frequent lock modes first. While a switch (mode) would be
6596 * nice, at least gcc generates considerably worse code for it.
6597 *
6598 * Call BufferLockAcquire() with a constant argument for mode, to generate
6599 * more efficient code for the different lock modes.
6600 */
6601 if (mode == BUFFER_LOCK_SHARE)
6603 else if (mode == BUFFER_LOCK_EXCLUSIVE)
6607 else
6608 elog(ERROR, "unrecognized buffer lock mode: %d", mode);
6609}
bool IsUnderPostmaster
Definition globals.c:122

References Assert, PrivateRefCountEntry::buffer, BUFFER_LOCK_EXCLUSIVE, BUFFER_LOCK_SHARE, BUFFER_LOCK_SHARE_EXCLUSIVE, BUFFER_LOCK_UNLOCK, BufferIsLocal, BufferIsPinned, BufferLockAcquire(), elog, ERROR, fb(), GetBufferDescriptor(), IsUnderPostmaster, mode, and MyProc.

Referenced by LockBuffer().

◆ LockBufHdr()

uint64 LockBufHdr ( BufferDesc desc)

Definition at line 7518 of file bufmgr.c.

7519{
7521
7523
7524 while (true)
7525 {
7526 /*
7527 * Always try once to acquire the lock directly, without setting up
7528 * the spin-delay infrastructure. The work necessary for that shows up
7529 * in profiles and is rarely necessary.
7530 */
7532 if (likely(!(old_buf_state & BM_LOCKED)))
7533 break; /* got lock */
7534
7535 /* and then spin without atomic operations until lock is released */
7536 {
7538
7540
7541 while (old_buf_state & BM_LOCKED)
7542 {
7545 }
7547 }
7548
7549 /*
7550 * Retry. The lock might obviously already be re-acquired by the time
7551 * we're attempting to get it again.
7552 */
7553 }
7554
7555 return old_buf_state | BM_LOCKED;
7556}
void perform_spin_delay(SpinDelayStatus *status)
Definition s_lock.c:126
void finish_spin_delay(SpinDelayStatus *status)
Definition s_lock.c:186
#define init_local_spin_delay(status)
Definition s_lock.h:749

References Assert, BM_LOCKED, BufferDescriptorGetBuffer(), BufferIsLocal, fb(), finish_spin_delay(), init_local_spin_delay, likely, perform_spin_delay(), pg_atomic_fetch_or_u64(), pg_atomic_read_u64(), and BufferDesc::state.

Referenced by AbortBufferIO(), apw_dump_now(), buffer_stage_common(), BufferAlloc(), BufferGetLSNAtomic(), BufferLockDequeueSelf(), BufferLockQueueSelf(), BufferLockWakeup(), BufferSync(), ConditionalLockBufferForCleanup(), create_toy_buffer(), DropDatabaseBuffers(), DropRelationBuffers(), DropRelationsAllBuffers(), EvictAllUnpinnedBuffers(), EvictRelUnpinnedBuffers(), EvictUnpinnedBuffer(), ExtendBufferedRelShared(), FindAndDropRelationBuffers(), FlushDatabaseBuffers(), FlushRelationBuffers(), FlushRelationsAllBuffers(), InvalidateBuffer(), InvalidateVictimBuffer(), IsBufferCleanupOK(), LockBufferForCleanup(), MarkDirtyAllUnpinnedBuffers(), MarkDirtyRelUnpinnedBuffers(), MarkDirtyUnpinnedBuffer(), MarkSharedBufferDirtyHint(), pg_buffercache_os_pages_internal(), pg_buffercache_pages(), StartSharedBufferIO(), SyncOneBuffer(), TerminateBufferIO(), UnlockBuffers(), WaitIO(), and WakePinCountWaiter().

◆ MarkBufferDirty()

void MarkBufferDirty ( Buffer  buffer)

Definition at line 3147 of file bufmgr.c.

3148{
3152
3153 if (!BufferIsValid(buffer))
3154 elog(ERROR, "bad buffer ID: %d", buffer);
3155
3156 if (BufferIsLocal(buffer))
3157 {
3158 MarkLocalBufferDirty(buffer);
3159 return;
3160 }
3161
3162 bufHdr = GetBufferDescriptor(buffer - 1);
3163
3164 Assert(BufferIsPinned(buffer));
3166
3167 /*
3168 * NB: We have to wait for the buffer header spinlock to be not held, as
3169 * TerminateBufferIO() relies on the spinlock.
3170 */
3172 for (;;)
3173 {
3176
3178
3181
3183 buf_state))
3184 break;
3185 }
3186
3187 /*
3188 * If the buffer was not dirty already, do vacuum accounting.
3189 */
3190 if (!(old_buf_state & BM_DIRTY))
3191 {
3193 if (VacuumCostActive)
3195 }
3196}
pg_noinline uint64 WaitBufHdrUnlocked(BufferDesc *buf)
Definition bufmgr.c:7566
int VacuumCostPageDirty
Definition globals.c:156
int64 shared_blks_dirtied
Definition instrument.h:28

References Assert, BM_DIRTY, BM_LOCKED, BUF_STATE_GET_REFCOUNT, PrivateRefCountEntry::buffer, BUFFER_LOCK_EXCLUSIVE, BufferIsLocal, BufferIsLockedByMeInMode(), BufferIsPinned, BufferIsValid(), elog, ERROR, fb(), GetBufferDescriptor(), MarkLocalBufferDirty(), pg_atomic_compare_exchange_u64(), pg_atomic_read_u64(), pgBufferUsage, BufferUsage::shared_blks_dirtied, VacuumCostActive, VacuumCostBalance, VacuumCostPageDirty, and WaitBufHdrUnlocked().

Referenced by _bt_clear_incomplete_split(), _bt_dedup_pass(), _bt_delitems_delete(), _bt_delitems_vacuum(), _bt_getroot(), _bt_insertonpg(), _bt_mark_page_halfdead(), _bt_newlevel(), _bt_restore_meta(), _bt_set_cleanup_info(), _bt_split(), _bt_unlink_halfdead_page(), _hash_addovflpage(), _hash_doinsert(), _hash_expandtable(), _hash_freeovflpage(), _hash_init(), _hash_splitbucket(), _hash_squeezebucket(), _hash_vacuum_one_page(), addLeafTuple(), brin_doinsert(), brin_doupdate(), brin_initialize_empty_new_buffer(), brin_xlog_createidx(), brin_xlog_desummarize_page(), brin_xlog_insert_update(), brin_xlog_revmap_extend(), brin_xlog_samepage_update(), brin_xlog_update(), brinbuild(), brinbuildempty(), brinRevmapDesummarizeRange(), btree_xlog_dedup(), btree_xlog_delete(), btree_xlog_insert(), btree_xlog_mark_page_halfdead(), btree_xlog_newroot(), btree_xlog_split(), btree_xlog_unlink_page(), btree_xlog_vacuum(), createPostingTree(), dataExecPlaceToPageInternal(), dataExecPlaceToPageLeaf(), doPickSplit(), entryExecPlaceToPage(), fill_seq_fork_with_data(), FreeSpaceMapPrepareTruncateRel(), generic_redo(), GenericXLogFinish(), ginbuild(), ginbuildempty(), ginbulkdelete(), ginDeletePostingPage(), ginHeapTupleFastInsert(), ginPlaceToPage(), ginRedoClearIncompleteSplit(), ginRedoCreatePTree(), ginRedoDeleteListPages(), ginRedoDeletePage(), ginRedoInsert(), ginRedoInsertListPage(), ginRedoUpdateMetapage(), ginRedoVacuumDataLeafPage(), ginUpdateStats(), ginVacuumPostingTreeLeaf(), gistbuild(), gistbuildempty(), gistdeletepage(), gistplacetopage(), gistprunepage(), gistRedoClearFollowRight(), gistRedoDeleteRecord(), gistRedoPageDelete(), gistRedoPageSplitRecord(), gistRedoPageUpdateRecord(), gistvacuumpage(), hash_xlog_add_ovfl_page(), hash_xlog_delete(), hash_xlog_init_bitmap_page(), hash_xlog_init_meta_page(), hash_xlog_insert(), hash_xlog_move_page_contents(), hash_xlog_split_allocate_page(), hash_xlog_split_cleanup(), hash_xlog_split_complete(), hash_xlog_squeeze_page(), 
hash_xlog_update_meta_page(), hash_xlog_vacuum_one_page(), hashbucketcleanup(), hashbulkdelete(), heap_abort_speculative(), heap_delete(), heap_finish_speculative(), heap_force_common(), heap_inplace_update_and_unlock(), heap_insert(), heap_lock_tuple(), heap_lock_updated_tuple_rec(), heap_multi_insert(), heap_page_prune_and_freeze(), heap_update(), heap_xlog_confirm(), heap_xlog_delete(), heap_xlog_inplace(), heap_xlog_insert(), heap_xlog_lock(), heap_xlog_lock_updated(), heap_xlog_multi_insert(), heap_xlog_prune_freeze(), heap_xlog_update(), lazy_scan_new_or_empty(), lazy_vacuum_heap_page(), log_newpage_range(), MarkDirtyUnpinnedBufferInternal(), moveLeafs(), nextval_internal(), ProcessSingleRelationFork(), RelationAddBlocks(), RelationCopyStorageUsingBuffer(), RelationGetBufferForTuple(), revmap_physical_extend(), saveNodeLink(), seq_redo(), SetSequence(), shiftList(), spgAddNodeAction(), spgbuild(), SpGistUpdateMetaPage(), spgRedoAddLeaf(), spgRedoAddNode(), spgRedoMoveLeafs(), spgRedoPickSplit(), spgRedoSplitTuple(), spgRedoVacuumLeaf(), spgRedoVacuumRedirect(), spgRedoVacuumRoot(), spgSplitNodeAction(), vacuumLeafPage(), vacuumLeafRoot(), vacuumRedirectAndPlaceholder(), visibilitymap_clear(), visibilitymap_prepare_truncate(), visibilitymap_set(), writeListPage(), and XLogReadBufferForRedoExtended().

◆ MarkBufferDirtyHint()

◆ MarkDirtyAllUnpinnedBuffers()

void MarkDirtyAllUnpinnedBuffers ( int32 buffers_dirtied,
int32 buffers_already_dirty,
int32 buffers_skipped 
)

Definition at line 8232 of file bufmgr.c.

8235{
8236 *buffers_dirtied = 0;
8238 *buffers_skipped = 0;
8239
8240 for (int buf = 1; buf <= NBuffers; buf++)
8241 {
8242 BufferDesc *desc = GetBufferDescriptor(buf - 1);
8245
8247
8249 if (!(buf_state & BM_VALID))
8250 continue;
8251
8254
8255 LockBufHdr(desc);
8256
8258 (*buffers_dirtied)++;
8259 else if (buffer_already_dirty)
8260 (*buffers_already_dirty)++;
8261 else
8262 (*buffers_skipped)++;
8263 }
8264}
static bool MarkDirtyUnpinnedBufferInternal(Buffer buf, BufferDesc *desc, bool *buffer_already_dirty)
Definition bufmgr.c:8083

References BM_VALID, buf, CHECK_FOR_INTERRUPTS, CurrentResourceOwner, fb(), GetBufferDescriptor(), LockBufHdr(), MarkDirtyUnpinnedBufferInternal(), NBuffers, pg_atomic_read_u64(), ReservePrivateRefCountEntry(), ResourceOwnerEnlarge(), and BufferDesc::state.

Referenced by pg_buffercache_mark_dirty_all().

◆ MarkDirtyRelUnpinnedBuffers()

void MarkDirtyRelUnpinnedBuffers ( Relation  rel,
int32 buffers_dirtied,
int32 buffers_already_dirty,
int32 buffers_skipped 
)

Definition at line 8175 of file bufmgr.c.

8179{
8181
8182 *buffers_dirtied = 0;
8184 *buffers_skipped = 0;
8185
8186 for (int buf = 1; buf <= NBuffers; buf++)
8187 {
8188 BufferDesc *desc = GetBufferDescriptor(buf - 1);
8191
8193
8194 /* An unlocked precheck should be safe and saves some cycles. */
8195 if ((buf_state & BM_VALID) == 0 ||
8197 continue;
8198
8199 /* Make sure we can pin the buffer. */
8202
8203 buf_state = LockBufHdr(desc);
8204
8205 /* recheck, could have changed without the lock */
8206 if ((buf_state & BM_VALID) == 0 ||
8208 {
8209 UnlockBufHdr(desc);
8210 continue;
8211 }
8212
8214 (*buffers_dirtied)++;
8215 else if (buffer_already_dirty)
8216 (*buffers_already_dirty)++;
8217 else
8218 (*buffers_skipped)++;
8219 }
8220}

References Assert, BM_VALID, buf, BufTagMatchesRelFileLocator(), CHECK_FOR_INTERRUPTS, CurrentResourceOwner, fb(), GetBufferDescriptor(), LockBufHdr(), MarkDirtyUnpinnedBufferInternal(), NBuffers, pg_atomic_read_u64(), RelationData::rd_locator, RelationUsesLocalBuffers, ReservePrivateRefCountEntry(), ResourceOwnerEnlarge(), BufferDesc::state, BufferDesc::tag, and UnlockBufHdr().

Referenced by pg_buffercache_mark_dirty_relation().

◆ MarkDirtyUnpinnedBuffer()

bool MarkDirtyUnpinnedBuffer ( Buffer  buf,
bool buffer_already_dirty 
)

Definition at line 8139 of file bufmgr.c.

8140{
8141 BufferDesc *desc;
8142 bool buffer_dirtied = false;
8143
8145
8146 /* Make sure we can pin the buffer. */
8149
8150 desc = GetBufferDescriptor(buf - 1);
8151 LockBufHdr(desc);
8152
8154 /* Both can not be true at the same time */
8156
8157 return buffer_dirtied;
8158}

References Assert, buf, BufferIsLocal, CurrentResourceOwner, fb(), GetBufferDescriptor(), LockBufHdr(), MarkDirtyUnpinnedBufferInternal(), ReservePrivateRefCountEntry(), and ResourceOwnerEnlarge().

Referenced by pg_buffercache_mark_dirty().

◆ MarkDirtyUnpinnedBufferInternal()

static bool MarkDirtyUnpinnedBufferInternal ( Buffer  buf,
BufferDesc desc,
bool buffer_already_dirty 
)
static

Definition at line 8083 of file bufmgr.c.

8085{
8087 bool result = false;
8088
8089 *buffer_already_dirty = false;
8090
8093
8094 if ((buf_state & BM_VALID) == 0)
8095 {
8096 UnlockBufHdr(desc);
8097 return false;
8098 }
8099
8100 /* Check that it's not pinned already. */
8102 {
8103 UnlockBufHdr(desc);
8104 return false;
8105 }
8106
8107 /* Pin the buffer and then release the buffer spinlock */
8108 PinBuffer_Locked(desc);
8109
8110 /* If it was not already dirty, mark it as dirty. */
8111 if (!(buf_state & BM_DIRTY))
8112 {
8115 result = true;
8116 BufferLockUnlock(buf, desc);
8117 }
8118 else
8119 *buffer_already_dirty = true;
8120
8121 UnpinBuffer(desc);
8122
8123 return result;
8124}
void MarkBufferDirty(Buffer buffer)
Definition bufmgr.c:3147

References Assert, BM_DIRTY, BM_LOCKED, BM_VALID, buf, BUF_STATE_GET_REFCOUNT, BUFFER_LOCK_EXCLUSIVE, BufferLockAcquire(), BufferLockUnlock(), fb(), MarkBufferDirty(), pg_atomic_read_u64(), PinBuffer_Locked(), result, BufferDesc::state, UnlockBufHdr(), and UnpinBuffer().

Referenced by MarkDirtyAllUnpinnedBuffers(), MarkDirtyRelUnpinnedBuffers(), and MarkDirtyUnpinnedBuffer().

◆ MarkSharedBufferDirtyHint()

static void MarkSharedBufferDirtyHint ( Buffer  buffer,
BufferDesc bufHdr,
uint64  lockstate,
bool  buffer_std 
)
inlinestatic

Definition at line 5696 of file bufmgr.c.

5698{
5699 Page page = BufferGetPage(buffer);
5700
5701 Assert(GetPrivateRefCount(buffer) > 0);
5702
5703 /* here, either share-exclusive or exclusive lock is OK */
5706
5707 /*
5708 * This routine might get called many times on the same page, if we are
5709 * making the first scan after commit of an xact that added/deleted many
5710 * tuples. So, be as quick as we can if the buffer is already dirty.
5711 *
5712 * As we are holding (at least) a share-exclusive lock, nobody could have
5713 * cleaned or dirtied the page concurrently, so we can just rely on the
5714 * previously fetched value here without any danger of races.
5715 */
5716 if (unlikely(!(lockstate & BM_DIRTY)))
5717 {
5719 bool wal_log = false;
5721
5722 /*
5723 * If we need to protect hint bit updates from torn writes, WAL-log a
5724 * full page image of the page. This full page image is only necessary
5725 * if the hint bit update is the first change to the page since the
5726 * last checkpoint.
5727 *
5728 * We don't check full_page_writes here because that logic is included
5729 * when we call XLogInsert() since the value changes dynamically.
5730 */
5732 {
5733 /*
5734 * If we must not write WAL, due to a relfilelocator-specific
5735 * condition or being in recovery, don't dirty the page. We can
5736 * set the hint, just not dirty the page as a result so the hint
5737 * is lost when we evict the page or shutdown.
5738 *
5739 * See src/backend/storage/page/README for longer discussion.
5740 */
5741 if (RecoveryInProgress() ||
5743 return;
5744
5745 wal_log = true;
5746 }
5747
5748 /*
5749 * We must mark the page dirty before we emit the WAL record, as per
5750 * the usual rules, to ensure that BufferSync()/SyncOneBuffer() try to
5751 * flush the buffer, even if we haven't inserted the WAL record yet.
5752 * As we hold at least a share-exclusive lock, checkpoints will wait
5753 * for this backend to be done with the buffer before continuing. If
5754 * we did it the other way round, a checkpoint could start between
5755 * writing the WAL record and marking the buffer dirty.
5756 */
5758
5759 /*
5760 * It should not be possible for the buffer to already be dirty, see
5761 * comment above.
5762 */
5766 BM_DIRTY,
5767 0, 0);
5768
5769 /*
5770 * If the block is already dirty because we either made a change or
5771 * set a hint already, then we don't need to write a full page image.
5772 * Note that aggressive cleaning of blocks dirtied by hint bit setting
5773 * would increase the call rate. Bulk setting of hint bits would
5774 * reduce the call rate...
5775 */
5776 if (wal_log)
5777 lsn = XLogSaveBufferForHint(buffer, buffer_std);
5778
5779 if (XLogRecPtrIsValid(lsn))
5780 {
5781 /*
5782 * Set the page LSN if we wrote a backup block. To allow backends
5783 * that only hold a share lock on the buffer to read the LSN in a
5784 * tear-free manner, we set the page LSN while holding the buffer
5785 * header lock. This allows any reader of an LSN who holds only a
5786 * share lock to also obtain a buffer header lock before using
5787 * PageGetLSN() to read the LSN in a tear free way. This is done
5788 * in BufferGetLSNAtomic().
5789 *
5790 * If checksums are enabled, you might think we should reset the
5791 * checksum here. That will happen when the page is written
5792 * sometime later in this checkpoint cycle.
5793 */
5795 PageSetLSN(page, lsn);
5797 }
5798
5800 if (VacuumCostActive)
5802 }
5803}
static void PageSetLSN(Page page, XLogRecPtr lsn)
Definition bufpage.h:416
bool RelFileLocatorSkippingWAL(RelFileLocator rlocator)
Definition storage.c:573
bool RecoveryInProgress(void)
Definition xlog.c:6830
#define XLogRecPtrIsValid(r)
Definition xlogdefs.h:29
#define InvalidXLogRecPtr
Definition xlogdefs.h:28
XLogRecPtr XLogSaveBufferForHint(Buffer buffer, bool buffer_std)

References Assert, BM_DIRTY, BM_PERMANENT, BUF_STATE_GET_REFCOUNT, PrivateRefCountEntry::buffer, BUFFER_LOCK_EXCLUSIVE, BUFFER_LOCK_SHARE_EXCLUSIVE, BufferGetPage(), BufferLockHeldByMeInMode(), BufTagGetRelFileLocator(), fb(), GetPrivateRefCount(), InvalidXLogRecPtr, LockBufHdr(), PageSetLSN(), pgBufferUsage, RecoveryInProgress(), RelFileLocatorSkippingWAL(), BufferUsage::shared_blks_dirtied, unlikely, UnlockBufHdr(), UnlockBufHdrExt(), VacuumCostActive, VacuumCostBalance, VacuumCostPageDirty, XLogHintBitIsNeeded, XLogRecPtrIsValid, and XLogSaveBufferForHint().

Referenced by BufferSetHintBits16(), and MarkBufferDirtyHint().

◆ NewPrivateRefCountEntry()

static PrivateRefCountEntry * NewPrivateRefCountEntry ( Buffer  buffer)
static

Definition at line 388 of file bufmgr.c.

389{
391
392 /* only allowed to be called when a reservation has been made */
394
395 /* use up the reserved entry */
397
398 /* and fill it */
400 res->buffer = buffer;
401 res->data.refcount = 0;
403
404 /* update cache for the next lookup */
406
408
409 return res;
410}

References Assert, PrivateRefCountEntry::buffer, BUFFER_LOCK_UNLOCK, PrivateRefCountEntry::data, PrivateRefCountData::lockmode, PrivateRefCountArray, PrivateRefCountArrayKeys, PrivateRefCountEntryLast, PrivateRefCountData::refcount, and ReservedRefCountSlot.

Referenced by TrackNewBufferPin().

◆ PinBuffer()

static bool PinBuffer ( BufferDesc buf,
BufferAccessStrategy  strategy,
bool  skip_if_not_valid 
)
static

Definition at line 3272 of file bufmgr.c.

3274{
3276 bool result;
3278
3281
3282 ref = GetPrivateRefCountEntry(b, true);
3283
3284 if (ref == NULL)
3285 {
3288
3290 for (;;)
3291 {
3293 return false;
3294
3295 /*
3296 * We're not allowed to increase the refcount while the buffer
3297 * header spinlock is held. Wait for the lock to be released.
3298 */
3300 {
3302
3303 /* perform checks at the top of the loop again */
3304 continue;
3305 }
3306
3308
3309 /* increase refcount */
3311
3312 if (strategy == NULL)
3313 {
3314 /* Default case: increase usagecount unless already max. */
3317 }
3318 else
3319 {
3320 /*
3321 * Ring buffers shouldn't evict others from pool. Thus we
3322 * don't make usagecount more than 1.
3323 */
3326 }
3327
3329 buf_state))
3330 {
3331 result = (buf_state & BM_VALID) != 0;
3332
3334 break;
3335 }
3336 }
3337 }
3338 else
3339 {
3340 /*
3341 * If we previously pinned the buffer, it is likely to be valid, but
3342 * it may not be if StartReadBuffers() was called and
3343 * WaitReadBuffers() hasn't been called yet. We'll check by loading
3344 * the flags without locking. This is racy, but it's OK to return
3345 * false spuriously: when WaitReadBuffers() calls StartBufferIO(),
3346 * it'll see that it's now valid.
3347 *
3348 * Note: We deliberately avoid a Valgrind client request here.
3349 * Individual access methods can optionally superimpose buffer page
3350 * client requests on top of our client requests to enforce that
3351 * buffers are only accessed while locked (and pinned). It's possible
3352 * that the buffer page is legitimately non-accessible here. We
3353 * cannot meddle with that.
3354 */
3355 result = (pg_atomic_read_u64(&buf->state) & BM_VALID) != 0;
3356
3357 Assert(ref->data.refcount > 0);
3358 ref->data.refcount++;
3360 }
3361
3362 return result;
3363}
#define BM_MAX_USAGE_COUNT
#define BUF_STATE_GET_USAGECOUNT(state)
void TrackNewBufferPin(Buffer buf)
Definition bufmgr.c:3512

References Assert, b, BM_LOCKED, BM_MAX_USAGE_COUNT, BM_VALID, buf, BUF_REFCOUNT_ONE, BUF_STATE_GET_USAGECOUNT, BUF_USAGECOUNT_ONE, BufferDescriptorGetBuffer(), BufferIsLocal, CurrentResourceOwner, fb(), GetPrivateRefCountEntry(), pg_atomic_compare_exchange_u64(), pg_atomic_read_u64(), ReservedRefCountSlot, ResourceOwnerRememberBuffer(), result, TrackNewBufferPin(), unlikely, and WaitBufHdrUnlocked().

Referenced by BufferAlloc(), ExtendBufferedRelShared(), and ReadRecentBuffer().

◆ PinBuffer_Locked()

static void PinBuffer_Locked ( BufferDesc buf)
static

Definition at line 3388 of file bufmgr.c.

3389{
3391
3392 /*
3393 * As explained, We don't expect any preexisting pins. That allows us to
3394 * manipulate the PrivateRefCount after releasing the spinlock
3395 */
3397
3398 /*
3399 * Since we hold the buffer spinlock, we can update the buffer state and
3400 * release the lock in one operation.
3401 */
3403
3405 0, 0, 1);
3406
3408}

References Assert, buf, BufferDescriptorGetBuffer(), fb(), GetPrivateRefCountEntry(), pg_atomic_read_u64(), TrackNewBufferPin(), and UnlockBufHdrExt().

Referenced by EvictUnpinnedBufferInternal(), FlushDatabaseBuffers(), FlushRelationBuffers(), FlushRelationsAllBuffers(), MarkDirtyUnpinnedBufferInternal(), and SyncOneBuffer().

◆ PinBufferForBlock()

static pg_attribute_always_inline Buffer PinBufferForBlock ( Relation  rel,
SMgrRelation  smgr,
char  persistence,
ForkNumber  forkNum,
BlockNumber  blockNum,
BufferAccessStrategy  strategy,
IOObject  io_object,
IOContext  io_context,
bool foundPtr 
)
static

Definition at line 1232 of file bufmgr.c.

1241{
1243
1244 Assert(blockNum != P_NEW);
1245
1246 /* Persistence should be set before */
1247 Assert((persistence == RELPERSISTENCE_TEMP ||
1248 persistence == RELPERSISTENCE_PERMANENT ||
1249 persistence == RELPERSISTENCE_UNLOGGED));
1250
1251 TRACE_POSTGRESQL_BUFFER_READ_START(forkNum, blockNum,
1255 smgr->smgr_rlocator.backend);
1256
1257 if (persistence == RELPERSISTENCE_TEMP)
1258 bufHdr = LocalBufferAlloc(smgr, forkNum, blockNum, foundPtr);
1259 else
1260 bufHdr = BufferAlloc(smgr, persistence, forkNum, blockNum,
1261 strategy, foundPtr, io_context);
1262
1263 if (*foundPtr)
1264 TrackBufferHit(io_object, io_context, rel, persistence, smgr, forkNum, blockNum);
1265
1266 if (rel)
1267 {
1268 /*
1269 * While pgBufferUsage's "read" counter isn't bumped unless we reach
1270 * WaitReadBuffers() (so, not for hits, and not for buffers that are
1271 * zeroed instead), the per-relation stats always count them.
1272 */
1274 }
1275
1277}
static BufferDesc * BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, BlockNumber blockNum, BufferAccessStrategy strategy, bool *foundPtr, IOContext io_context)
Definition bufmgr.c:2188
#define P_NEW
Definition bufmgr.h:200
BufferDesc * LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum, bool *foundPtr)
Definition localbuf.c:119
#define pgstat_count_buffer_read(rel)
Definition pgstat.h:742
RelFileNumber relNumber

References Assert, RelFileLocatorBackend::backend, BufferAlloc(), BufferDescriptorGetBuffer(), RelFileLocator::dbOid, fb(), LocalBufferAlloc(), RelFileLocatorBackend::locator, P_NEW, pgstat_count_buffer_read, RelFileLocator::relNumber, SMgrRelationData::smgr_rlocator, RelFileLocator::spcOid, and TrackBufferHit().

Referenced by ReadBuffer_common(), and StartReadBuffersImpl().

◆ PrefetchBuffer()

PrefetchBufferResult PrefetchBuffer ( Relation  reln,
ForkNumber  forkNum,
BlockNumber  blockNum 
)

Definition at line 787 of file bufmgr.c.

788{
790 Assert(BlockNumberIsValid(blockNum));
791
793 {
794 /* see comments in ReadBufferExtended */
798 errmsg("cannot access temporary tables of other sessions")));
799
800 /* pass it off to localbuf.c */
801 return PrefetchLocalBuffer(RelationGetSmgr(reln), forkNum, blockNum);
802 }
803 else
804 {
805 /* pass it to the shared buffer version */
806 return PrefetchSharedBuffer(RelationGetSmgr(reln), forkNum, blockNum);
807 }
808}
PrefetchBufferResult PrefetchSharedBuffer(SMgrRelation smgr_reln, ForkNumber forkNum, BlockNumber blockNum)
Definition bufmgr.c:697
PrefetchBufferResult PrefetchLocalBuffer(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum)
Definition localbuf.c:72
#define RELATION_IS_OTHER_TEMP(relation)
Definition rel.h:669
#define RelationIsValid(relation)
Definition rel.h:491

References Assert, BlockNumberIsValid(), ereport, errcode(), errmsg, ERROR, fb(), PrefetchLocalBuffer(), PrefetchSharedBuffer(), RELATION_IS_OTHER_TEMP, RelationGetSmgr(), RelationIsValid, and RelationUsesLocalBuffers.

Referenced by count_nondeletable_pages(), invalidate_one_block(), and pg_prewarm().

◆ PrefetchSharedBuffer()

PrefetchBufferResult PrefetchSharedBuffer ( SMgrRelation  smgr_reln,
ForkNumber  forkNum,
BlockNumber  blockNum 
)

Definition at line 697 of file bufmgr.c.

700{
702 BufferTag newTag; /* identity of requested block */
703 uint32 newHash; /* hash value for newTag */
704 LWLock *newPartitionLock; /* buffer partition lock for it */
705 int buf_id;
706
707 Assert(BlockNumberIsValid(blockNum));
708
709 /* create a tag so we can lookup the buffer */
710 InitBufferTag(&newTag, &smgr_reln->smgr_rlocator.locator,
711 forkNum, blockNum);
712
713 /* determine its hash code and partition lock ID */
716
717 /* see if the block is in the buffer pool already */
719 buf_id = BufTableLookup(&newTag, newHash);
721
722 /* If not in buffers, initiate prefetch */
723 if (buf_id < 0)
724 {
725#ifdef USE_PREFETCH
726 /*
727 * Try to initiate an asynchronous read. This returns false in
728 * recovery if the relation file doesn't exist.
729 */
730 if ((io_direct_flags & IO_DIRECT_DATA) == 0 &&
731 smgrprefetch(smgr_reln, forkNum, blockNum, 1))
732 {
733 result.initiated_io = true;
734 }
735#endif /* USE_PREFETCH */
736 }
737 else
738 {
739 /*
740 * Report the buffer it was in at that time. The caller may be able
741 * to avoid a buffer table lookup, but it's not pinned and it must be
742 * rechecked!
743 */
744 result.recent_buffer = buf_id + 1;
745 }
746
747 /*
748 * If the block *is* in buffers, we do nothing. This is not really ideal:
749 * the block might be just about to be evicted, which would be stupid
750 * since we know we are going to need it soon. But the only easy answer
751 * is to bump the usage_count, which does not seem like a great solution:
752 * when the caller does ultimately touch the block, usage_count would get
753 * bumped again, resulting in too much favoritism for blocks that are
754 * involved in a prefetch sequence. A real fix would involve some
755 * additional per-buffer state, and it's not clear that there's enough of
756 * a problem to justify that.
757 */
758
759 return result;
760}
int io_direct_flags
Definition fd.c:172
#define IO_DIRECT_DATA
Definition fd.h:54
bool smgrprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, int nblocks)
Definition smgr.c:678

References Assert, BlockNumberIsValid(), BufMappingPartitionLock(), BufTableHashCode(), BufTableLookup(), fb(), InitBufferTag(), InvalidBuffer, IO_DIRECT_DATA, io_direct_flags, LW_SHARED, LWLockAcquire(), LWLockRelease(), result, and smgrprefetch().

Referenced by PrefetchBuffer(), and XLogPrefetcherNextBlock().

◆ ProcessReadBuffersResult()

static void ProcessReadBuffersResult ( ReadBuffersOperation *  operation)
static

Definition at line 1705 of file bufmgr.c.

1706{
1707 PgAioReturn *aio_ret = &operation->io_return;
1709 int newly_read_blocks = 0;
1710
1711 Assert(pgaio_wref_valid(&operation->io_wref));
1712 Assert(aio_ret->result.status != PGAIO_RS_UNKNOWN);
1713
1714 /*
1715 * SMGR reports the number of blocks successfully read as the result of
1716 * the IO operation. Thus we can simply add that to ->nblocks_done.
1717 */
1718
1719 if (likely(rs != PGAIO_RS_ERROR))
1720 newly_read_blocks = aio_ret->result.result;
1721
1722 if (rs == PGAIO_RS_ERROR || rs == PGAIO_RS_WARNING)
1723 pgaio_result_report(aio_ret->result, &aio_ret->target_data,
1724 rs == PGAIO_RS_ERROR ? ERROR : WARNING);
1725 else if (aio_ret->result.status == PGAIO_RS_PARTIAL)
1726 {
1727 /*
1728 * We'll retry, so we just emit a debug message to the server log (or
1729 * not even that in prod scenarios).
1730 */
1731 pgaio_result_report(aio_ret->result, &aio_ret->target_data, DEBUG1);
1732 elog(DEBUG3, "partial read, will retry");
1733 }
1734
1737
1738 operation->nblocks_done += newly_read_blocks;
1739
1740 Assert(operation->nblocks_done <= operation->nblocks);
1741}
PgAioResultStatus
Definition aio_types.h:79
@ PGAIO_RS_UNKNOWN
Definition aio_types.h:80
@ PGAIO_RS_PARTIAL
Definition aio_types.h:82
#define DEBUG3
Definition elog.h:29
uint32 status
Definition aio_types.h:108
PgAioResult result
Definition aio_types.h:132

References Assert, DEBUG1, DEBUG3, elog, ERROR, fb(), likely, MAX_IO_COMBINE_LIMIT, operation, pgaio_result_report(), PGAIO_RS_ERROR, PGAIO_RS_PARTIAL, PGAIO_RS_UNKNOWN, PGAIO_RS_WARNING, pgaio_wref_valid(), PgAioReturn::result, PgAioResult::status, and WARNING.

Referenced by WaitReadBuffers().

◆ ReadBuffer()

Buffer ReadBuffer ( Relation  reln,
BlockNumber  blockNum 
)

Definition at line 879 of file bufmgr.c.

880{
882}
Buffer ReadBufferExtended(Relation reln, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy)
Definition bufmgr.c:926
@ RBM_NORMAL
Definition bufmgr.h:46

References fb(), MAIN_FORKNUM, RBM_NORMAL, and ReadBufferExtended().

Referenced by _bt_allocbuf(), _bt_getbuf(), _bt_relandgetbuf(), _bt_search_insert(), _hash_getbuf(), _hash_getbuf_with_condlock_cleanup(), blbulkdelete(), blinsert(), BloomNewBuffer(), brin_getinsertbuffer(), brinGetStats(), brinGetTupleForHeapBlock(), brinRevmapDesummarizeRange(), brinRevmapInitialize(), bt_metap(), bt_multi_page_stats(), bt_page_items_internal(), bt_page_stats_internal(), ginFindLeafPage(), ginFindParents(), ginGetStats(), ginHeapTupleFastInsert(), ginInsertCleanup(), GinNewBuffer(), ginStepRight(), ginUpdateStats(), gistBufferingFindCorrectParent(), gistbufferinginserttuples(), gistdoinsert(), gistFindCorrectParent(), gistFindPath(), gistfixsplit(), gistGetMaxLevel(), gistkillitems(), gistNewBuffer(), gistProcessItup(), gistScanPage(), heap_abort_speculative(), heap_delete(), heap_fetch(), heap_finish_speculative(), heap_force_common(), heap_get_latest_tid(), heap_index_delete_tuples(), heap_lock_tuple(), heap_update(), heapam_index_fetch_tuple(), initBloomState(), pg_visibility(), pgstatginindex_internal(), read_seq_tuple(), RelationGetBufferForTuple(), ReleaseAndReadBuffer(), revmap_get_buffer(), revmap_physical_extend(), scanGetCandidate(), scanPendingInsert(), shiftList(), spgdoinsert(), spgGetCache(), SpGistGetBuffer(), SpGistNewBuffer(), SpGistUpdateMetaPage(), and spgWalk().

◆ ReadBuffer_common()

static pg_attribute_always_inline Buffer ReadBuffer_common ( Relation  rel,
SMgrRelation  smgr,
char  smgr_persistence,
ForkNumber  forkNum,
BlockNumber  blockNum,
ReadBufferMode  mode,
BufferAccessStrategy  strategy 
)
static

Definition at line 1285 of file bufmgr.c.

1289{
1291 Buffer buffer;
1292 int flags;
1293 char persistence;
1294
1295 /*
1296 * Backward compatibility path, most code should use ExtendBufferedRel()
1297 * instead, as acquiring the extension lock inside ExtendBufferedRel()
1298 * scales a lot better.
1299 */
1300 if (unlikely(blockNum == P_NEW))
1301 {
1303
1304 /*
1305 * Since no-one else can be looking at the page contents yet, there is
1306 * no difference between an exclusive lock and a cleanup-strength
1307 * lock.
1308 */
1310 flags |= EB_LOCK_FIRST;
1311
1312 return ExtendBufferedRel(BMR_REL(rel), forkNum, strategy, flags);
1313 }
1314
1315 if (rel)
1316 persistence = rel->rd_rel->relpersistence;
1317 else
1318 persistence = smgr_persistence;
1319
1322 {
1323 bool found;
1326
1327 if (persistence == RELPERSISTENCE_TEMP)
1328 {
1331 }
1332 else
1333 {
1334 io_context = IOContextForStrategy(strategy);
1336 }
1337
1338 buffer = PinBufferForBlock(rel, smgr, persistence,
1339 forkNum, blockNum, strategy,
1340 io_object, io_context, &found);
1341 ZeroAndLockBuffer(buffer, mode, found);
1342 return buffer;
1343 }
1344
1345 /*
1346 * Signal that we are going to immediately wait. If we're immediately
1347 * waiting, there is no benefit in actually executing the IO
1348 * asynchronously, it would just add dispatch overhead.
1349 */
1351 if (mode == RBM_ZERO_ON_ERROR)
1353 operation.smgr = smgr;
1354 operation.rel = rel;
1355 operation.persistence = persistence;
1356 operation.forknum = forkNum;
1357 operation.strategy = strategy;
1359 &buffer,
1360 blockNum,
1361 flags))
1363
1364 return buffer;
1365}
Buffer ExtendBufferedRel(BufferManagerRelation bmr, ForkNumber forkNum, BufferAccessStrategy strategy, uint32 flags)
Definition bufmgr.c:979
static void ZeroAndLockBuffer(Buffer buffer, ReadBufferMode mode, bool already_valid)
Definition bufmgr.c:1146
static pg_attribute_always_inline Buffer PinBufferForBlock(Relation rel, SMgrRelation smgr, char persistence, ForkNumber forkNum, BlockNumber blockNum, BufferAccessStrategy strategy, IOObject io_object, IOContext io_context, bool *foundPtr)
Definition bufmgr.c:1232
bool WaitReadBuffers(ReadBuffersOperation *operation)
Definition bufmgr.c:1750
bool StartReadBuffer(ReadBuffersOperation *operation, Buffer *buffer, BlockNumber blocknum, int flags)
Definition bufmgr.c:1628
@ RBM_ZERO_ON_ERROR
Definition bufmgr.h:51
#define BMR_REL(p_rel)
Definition bufmgr.h:114
Form_pg_class rd_rel
Definition rel.h:111

References BMR_REL, PrivateRefCountEntry::buffer, EB_LOCK_FIRST, EB_SKIP_EXTENSION_LOCK, ExtendBufferedRel(), fb(), IOCONTEXT_NORMAL, IOContextForStrategy(), IOOBJECT_RELATION, IOOBJECT_TEMP_RELATION, mode, operation, P_NEW, PinBufferForBlock(), RBM_ZERO_AND_CLEANUP_LOCK, RBM_ZERO_AND_LOCK, RBM_ZERO_ON_ERROR, RelationData::rd_rel, READ_BUFFERS_SYNCHRONOUSLY, READ_BUFFERS_ZERO_ON_ERROR, StartReadBuffer(), unlikely, WaitReadBuffers(), and ZeroAndLockBuffer().

Referenced by ExtendBufferedRelTo(), ReadBufferExtended(), and ReadBufferWithoutRelcache().

◆ ReadBufferExtended()

Buffer ReadBufferExtended ( Relation  reln,
ForkNumber  forkNum,
BlockNumber  blockNum,
ReadBufferMode  mode,
BufferAccessStrategy  strategy 
)
inline

Definition at line 926 of file bufmgr.c.

928{
929 Buffer buf;
930
931 /*
932 * Reject attempts to read non-local temporary relations; we would be
933 * likely to get wrong data since we have no visibility into the owning
934 * session's local buffers.
935 */
939 errmsg("cannot access temporary tables of other sessions")));
940
941 /*
942 * Read the buffer, and update pgstat counters to reflect a cache hit or
943 * miss.
944 */
946 forkNum, blockNum, mode, strategy);
947
948 return buf;
949}

References buf, ereport, errcode(), errmsg, ERROR, fb(), mode, ReadBuffer_common(), RELATION_IS_OTHER_TEMP, and RelationGetSmgr().

Referenced by _hash_getbuf_with_strategy(), _hash_getinitbuf(), _hash_getnewbuf(), BloomInitMetapage(), bt_recheck_sibling_links(), btvacuumpage(), count_nondeletable_pages(), create_toy_buffer(), fsm_readbuf(), get_raw_page_internal(), gin_check_parent_keys_consistency(), gin_check_posting_tree_parent_keys_consistency(), gin_refind_parent(), ginbulkdelete(), ginScanPostingTreeToDelete(), ginVacuumPostingTree(), ginVacuumPostingTreeLeaves(), gistvacuum_delete_empty_pages(), gistvacuumpage(), heapam_scan_sample_next_block(), log_newpage_range(), modify_rel_block(), palloc_btree_page(), pgstat_btree_page(), pgstat_gist_page(), pgstat_hash_page(), pgstat_heap(), pgstatindex_impl(), ProcessSingleRelationFork(), ReadBuffer(), ReadBufferBI(), spgprocesspending(), and vm_readbuf().

◆ ReadBufferWithoutRelcache()

Buffer ReadBufferWithoutRelcache ( RelFileLocator  rlocator,
ForkNumber  forkNum,
BlockNumber  blockNum,
ReadBufferMode  mode,
BufferAccessStrategy  strategy,
bool  permanent 
)

Definition at line 963 of file bufmgr.c.

966{
967 SMgrRelation smgr = smgropen(rlocator, INVALID_PROC_NUMBER);
968
969 return ReadBuffer_common(NULL, smgr,
971 forkNum, blockNum,
972 mode, strategy);
973}

References fb(), INVALID_PROC_NUMBER, mode, ReadBuffer_common(), and smgropen().

Referenced by RelationCopyStorageUsingBuffer(), ScanSourceDatabasePgClass(), and XLogReadBufferExtended().

◆ ReadRecentBuffer()

bool ReadRecentBuffer ( RelFileLocator  rlocator,
ForkNumber  forkNum,
BlockNumber  blockNum,
Buffer  recent_buffer 
)

Definition at line 818 of file bufmgr.c.

820{
822 BufferTag tag;
824
825 Assert(BufferIsValid(recent_buffer));
826
829 InitBufferTag(&tag, &rlocator, forkNum, blockNum);
830
831 if (BufferIsLocal(recent_buffer))
832 {
833 int b = -recent_buffer - 1;
834
837
838 /* Is it still valid and holding the right tag? */
839 if ((buf_state & BM_VALID) && BufferTagsEqual(&tag, &bufHdr->tag))
840 {
841 PinLocalBuffer(bufHdr, true);
842
844
845 return true;
846 }
847 }
848 else
849 {
850 bufHdr = GetBufferDescriptor(recent_buffer - 1);
851
852 /*
853 * Is it still valid and holding the right tag? We do an unlocked tag
854 * comparison first, to make it unlikely that we'll increment the
855 * usage counter of the wrong buffer, if someone calls us with a very
856 * out of date recent_buffer. Then we'll check it again if we get the
857 * pin.
858 */
859 if (BufferTagsEqual(&tag, &bufHdr->tag) &&
860 PinBuffer(bufHdr, NULL, true))
861 {
862 if (BufferTagsEqual(&tag, &bufHdr->tag))
863 {
865 return true;
866 }
868 }
869 }
870
871 return false;
872}
int64 local_blks_hit
Definition instrument.h:30
int64 shared_blks_hit
Definition instrument.h:26

References Assert, b, BM_VALID, BufferIsLocal, BufferIsValid(), BufferTagsEqual(), CurrentResourceOwner, fb(), GetBufferDescriptor(), GetLocalBufferDescriptor(), InitBufferTag(), BufferUsage::local_blks_hit, pg_atomic_read_u64(), pgBufferUsage, PinBuffer(), PinLocalBuffer(), ReservePrivateRefCountEntry(), ResourceOwnerEnlarge(), BufferUsage::shared_blks_hit, and UnpinBuffer().

Referenced by invalidate_one_block(), and XLogReadBufferExtended().

◆ RelationCopyStorageUsingBuffer()

static void RelationCopyStorageUsingBuffer ( RelFileLocator  srclocator,
RelFileLocator  dstlocator,
ForkNumber  forkNum,
bool  permanent 
)
static

Definition at line 5348 of file bufmgr.c.

5351{
5352 Buffer srcBuf;
5353 Buffer dstBuf;
5354 Page srcPage;
5355 Page dstPage;
5356 bool use_wal;
5357 BlockNumber nblocks;
5358 BlockNumber blkno;
5365
5366 /*
5367 * In general, we want to write WAL whenever wal_level > 'minimal', but we
5368 * can skip it when copying any fork of an unlogged relation other than
5369 * the init fork.
5370 */
5371 use_wal = XLogIsNeeded() && (permanent || forkNum == INIT_FORKNUM);
5372
5373 /* Get number of blocks in the source relation. */
5375 forkNum);
5376
5377 /* Nothing to copy; just return. */
5378 if (nblocks == 0)
5379 return;
5380
5381 /*
5382 * Bulk extend the destination relation of the same size as the source
5383 * relation before starting to copy block by block.
5384 */
5385 memset(buf.data, 0, BLCKSZ);
5386 smgrextend(smgropen(dstlocator, INVALID_PROC_NUMBER), forkNum, nblocks - 1,
5387 buf.data, true);
5388
5389 /* This is a bulk operation, so use buffer access strategies. */
5392
5393 /* Initialize streaming read */
5394 p.current_blocknum = 0;
5395 p.last_exclusive = nblocks;
5397
5398 /*
5399 * It is safe to use batchmode as block_range_read_stream_cb takes no
5400 * locks.
5401 */
5405 src_smgr,
5407 forkNum,
5409 &p,
5410 0);
5411
5412 /* Iterate over each block of the source relation file. */
5413 for (blkno = 0; blkno < nblocks; blkno++)
5414 {
5416
5417 /* Read block from source relation. */
5421
5425 permanent);
5427
5429
5430 /* Copy page data from the source to the destination. */
5433
5434 /* WAL-log the copied page. */
5435 if (use_wal)
5437
5439
5442 }
5445
5448}
Buffer ReadBufferWithoutRelcache(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy, bool permanent)
Definition bufmgr.c:963
@ BAS_BULKREAD
Definition bufmgr.h:37
@ BAS_BULKWRITE
Definition bufmgr.h:39
memcpy(sums, checksumBaseOffsets, sizeof(checksumBaseOffsets))
BufferAccessStrategy GetAccessStrategy(BufferAccessStrategyType btype)
Definition freelist.c:426
void FreeAccessStrategy(BufferAccessStrategy strategy)
Definition freelist.c:608
#define START_CRIT_SECTION()
Definition miscadmin.h:152
#define END_CRIT_SECTION()
Definition miscadmin.h:154
ReadStream * read_stream_begin_smgr_relation(int flags, BufferAccessStrategy strategy, SMgrRelation smgr, char smgr_persistence, ForkNumber forknum, ReadStreamBlockNumberCB callback, void *callback_private_data, size_t per_buffer_data_size)
Buffer read_stream_next_buffer(ReadStream *stream, void **per_buffer_data)
void read_stream_end(ReadStream *stream)
BlockNumber block_range_read_stream_cb(ReadStream *stream, void *callback_private_data, void *per_buffer_data)
#define READ_STREAM_USE_BATCHING
Definition read_stream.h:64
#define READ_STREAM_FULL
Definition read_stream.h:43
void smgrextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const void *buffer, bool skipFsync)
Definition smgr.c:620
#define XLogIsNeeded()
Definition xlog.h:112
XLogRecPtr log_newpage_buffer(Buffer buffer, bool page_std)

References Assert, BAS_BULKREAD, BAS_BULKWRITE, block_range_read_stream_cb(), buf, BUFFER_LOCK_SHARE, BufferGetBlockNumber(), BufferGetPage(), CHECK_FOR_INTERRUPTS, BlockRangeReadStreamPrivate::current_blocknum, END_CRIT_SECTION, fb(), FreeAccessStrategy(), GetAccessStrategy(), INIT_FORKNUM, INVALID_PROC_NUMBER, InvalidBuffer, BlockRangeReadStreamPrivate::last_exclusive, LockBuffer(), log_newpage_buffer(), MarkBufferDirty(), memcpy(), RBM_ZERO_AND_LOCK, read_stream_begin_smgr_relation(), read_stream_end(), READ_STREAM_FULL, read_stream_next_buffer(), READ_STREAM_USE_BATCHING, ReadBufferWithoutRelcache(), smgrextend(), smgrnblocks(), smgropen(), START_CRIT_SECTION, UnlockReleaseBuffer(), and XLogIsNeeded.

Referenced by CreateAndCopyRelationData().

◆ RelationGetNumberOfBlocksInFork()

BlockNumber RelationGetNumberOfBlocksInFork ( Relation  relation,
ForkNumber  forkNum 
)

Definition at line 4645 of file bufmgr.c.

4646{
4647 if (RELKIND_HAS_TABLE_AM(relation->rd_rel->relkind))
4648 {
4649 /*
4650 * Not every table AM uses BLCKSZ wide fixed size blocks. Therefore
4651 * tableam returns the size in bytes - but for the purpose of this
4652 * routine, we want the number of blocks. Therefore divide, rounding
4653 * up.
4654 */
4656
4657 szbytes = table_relation_size(relation, forkNum);
4658
4659 return (szbytes + (BLCKSZ - 1)) / BLCKSZ;
4660 }
4661 else if (RELKIND_HAS_STORAGE(relation->rd_rel->relkind))
4662 {
4663 return smgrnblocks(RelationGetSmgr(relation), forkNum);
4664 }
4665 else
4666 Assert(false);
4667
4668 return 0; /* keep compiler quiet */
4669}
static uint64 table_relation_size(Relation rel, ForkNumber forkNumber)
Definition tableam.h:1940

References Assert, fb(), RelationData::rd_rel, RelationGetSmgr(), smgrnblocks(), and table_relation_size().

Referenced by _hash_getnewbuf(), _hash_init(), autoprewarm_database_main(), get_raw_page_internal(), pg_prewarm(), and ProcessSingleRelationFork().

◆ ReleaseAndReadBuffer()

Buffer ReleaseAndReadBuffer ( Buffer  buffer,
Relation  relation,
BlockNumber  blockNum 
)

Definition at line 3212 of file bufmgr.c.

3215{
3216 ForkNumber forkNum = MAIN_FORKNUM;
3218
3219 if (BufferIsValid(buffer))
3220 {
3221 Assert(BufferIsPinned(buffer));
3222 if (BufferIsLocal(buffer))
3223 {
3224 bufHdr = GetLocalBufferDescriptor(-buffer - 1);
3225 if (bufHdr->tag.blockNum == blockNum &&
3226 BufTagMatchesRelFileLocator(&bufHdr->tag, &relation->rd_locator) &&
3227 BufTagGetForkNum(&bufHdr->tag) == forkNum)
3228 return buffer;
3229 UnpinLocalBuffer(buffer);
3230 }
3231 else
3232 {
3233 bufHdr = GetBufferDescriptor(buffer - 1);
3234 /* we have pin, so it's ok to examine tag without spinlock */
3235 if (bufHdr->tag.blockNum == blockNum &&
3236 BufTagMatchesRelFileLocator(&bufHdr->tag, &relation->rd_locator) &&
3237 BufTagGetForkNum(&bufHdr->tag) == forkNum)
3238 return buffer;
3240 }
3241 }
3242
3243 return ReadBuffer(relation, blockNum);
3244}
Buffer ReadBuffer(Relation reln, BlockNumber blockNum)
Definition bufmgr.c:879

References Assert, PrivateRefCountEntry::buffer, BufferIsLocal, BufferIsPinned, BufferIsValid(), BufTagGetForkNum(), BufTagMatchesRelFileLocator(), fb(), GetBufferDescriptor(), GetLocalBufferDescriptor(), MAIN_FORKNUM, RelationData::rd_locator, ReadBuffer(), UnpinBuffer(), and UnpinLocalBuffer().

Referenced by ginFindLeafPage().

◆ ReleaseBuffer()

void ReleaseBuffer ( Buffer  buffer)

Definition at line 5586 of file bufmgr.c.

5587{
5588 if (!BufferIsValid(buffer))
5589 elog(ERROR, "bad buffer ID: %d", buffer);
5590
5591 if (BufferIsLocal(buffer))
5592 UnpinLocalBuffer(buffer);
5593 else
5594 UnpinBuffer(GetBufferDescriptor(buffer - 1));
5595}

References PrivateRefCountEntry::buffer, BufferIsLocal, BufferIsValid(), elog, ERROR, GetBufferDescriptor(), UnpinBuffer(), and UnpinLocalBuffer().

Referenced by _bt_allocbuf(), _bt_pagedel(), _bt_search_insert(), _bt_unlink_halfdead_page(), _hash_dropbuf(), _hash_getbuf_with_condlock_cleanup(), autoprewarm_database_main(), BitmapHeapScanNextBlock(), blinsert(), BloomNewBuffer(), brin_getinsertbuffer(), brin_vacuum_scan(), bringetbitmap(), brinGetTupleForHeapBlock(), brininsert(), brinRevmapTerminate(), brinsummarize(), buffer_create_toy(), collect_corrupt_items(), collect_visibility_data(), entryLoadMoreItems(), ExecEndIndexOnlyScan(), ExtendBufferedRelTo(), FreeBulkInsertState(), freeGinBtreeStack(), fsm_search(), get_actual_variable_endpoint(), GetRecordedFreeSpace(), ginFindParents(), ginFinishSplit(), ginFreeScanKeys(), ginInsertCleanup(), GinNewBuffer(), gistdoinsert(), gistFindCorrectParent(), gistNewBuffer(), gistvacuum_delete_empty_pages(), grow_rel(), heap_abort_speculative(), heap_delete(), heap_endscan(), heap_fetch(), heap_fetch_next_buffer(), heap_force_common(), heap_insert(), heap_lock_tuple(), heap_lock_updated_tuple_rec(), heap_multi_insert(), heap_rescan(), heap_update(), heap_vac_scan_next_block(), heap_xlog_delete(), heap_xlog_insert(), heap_xlog_lock(), heap_xlog_lock_updated(), heap_xlog_multi_insert(), heap_xlog_update(), heapam_index_fetch_end(), heapam_index_fetch_tuple(), heapam_scan_sample_next_block(), heapam_tuple_lock(), heapgettup(), heapgettup_pagemode(), lazy_scan_heap(), lazy_vacuum_heap_rel(), pg_prewarm(), pg_visibility(), pg_visibility_map(), pgstatindex_impl(), read_buffers(), read_rel_block_ll(), read_stream_for_blocks(), read_stream_reset(), ReadBufferBI(), RelationAddBlocks(), ReleaseBulkInsertStatePin(), revmap_get_buffer(), spgdoinsert(), SpGistGetBuffer(), SpGistNewBuffer(), SpGistUpdateMetaPage(), statapprox_heap(), summarize_range(), terminate_brin_buildstate(), tts_buffer_heap_clear(), tts_buffer_heap_materialize(), tts_buffer_heap_store_tuple(), verify_heapam(), visibilitymap_count(), visibilitymap_get_status(), visibilitymap_pin(), and XLogReadBufferExtended().

◆ ReservePrivateRefCountEntry()

static void ReservePrivateRefCountEntry ( void  )
static

Definition at line 309 of file bufmgr.c.

310{
311 /* Already reserved (or freed), nothing to do */
312 if (ReservedRefCountSlot != -1)
313 return;
314
315 /*
 316 * First search for a free entry in the array, that'll be sufficient in the
317 * majority of cases.
318 */
319 {
320 int i;
321
322 for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
323 {
325 {
327
328 /*
329 * We could return immediately, but iterating till the end of
330 * the array allows compiler-autovectorization.
331 */
332 }
333 }
334
335 if (ReservedRefCountSlot != -1)
336 return;
337 }
338
339 /*
340 * No luck. All array entries are full. Move one array entry into the hash
341 * table.
342 */
343 {
344 /*
345 * Move entry from the current clock position in the array into the
346 * hashtable. Use that slot.
347 */
348 int victim_slot;
351 bool found;
352
353 /* select victim slot */
357
358 /* Better be used, otherwise we shouldn't get here. */
362
363 /* enter victim array entry into hashtable */
366 &found);
367 Assert(!found);
368 /* move data from the entry in the array to the hash entry */
369 hashent->data = victim_entry->data;
370
371 /* clear the now free array slot */
373 victim_entry->buffer = InvalidBuffer;
374
375 /* clear the whole data member, just for future proofing */
376 memset(&victim_entry->data, 0, sizeof(victim_entry->data));
377 victim_entry->data.refcount = 0;
378 victim_entry->data.lockmode = BUFFER_LOCK_UNLOCK;
379
381 }
382}
static uint32 PrivateRefCountClock
Definition bufmgr.c:267

References Assert, PrivateRefCountEntry::buffer, BUFFER_LOCK_UNLOCK, fb(), i, InvalidBuffer, PrivateRefCountArray, PrivateRefCountArrayKeys, PrivateRefCountClock, PrivateRefCountHash, PrivateRefCountOverflowed, REFCOUNT_ARRAY_ENTRIES, and ReservedRefCountSlot.

Referenced by BufferAlloc(), EvictAllUnpinnedBuffers(), EvictRelUnpinnedBuffers(), EvictUnpinnedBuffer(), ExtendBufferedRelShared(), FlushDatabaseBuffers(), FlushRelationBuffers(), FlushRelationsAllBuffers(), GetPrivateRefCountEntrySlow(), GetVictimBuffer(), MarkDirtyAllUnpinnedBuffers(), MarkDirtyRelUnpinnedBuffers(), MarkDirtyUnpinnedBuffer(), ReadRecentBuffer(), and SyncOneBuffer().

◆ ResOwnerPrintBuffer()

static char * ResOwnerPrintBuffer ( Datum  res)
static

Definition at line 7881 of file bufmgr.c.

7882{
7884}
static int32 DatumGetInt32(Datum X)
Definition postgres.h:202

References DatumGetInt32(), and DebugPrintBufferRefcount().

◆ ResOwnerPrintBufferIO()

static char * ResOwnerPrintBufferIO ( Datum  res)
static

Definition at line 7831 of file bufmgr.c.

7832{
7833 Buffer buffer = DatumGetInt32(res);
7834
7835 return psprintf("lost track of buffer IO on buffer %d", buffer);
7836}

References PrivateRefCountEntry::buffer, DatumGetInt32(), and psprintf().

◆ ResOwnerReleaseBuffer()

static void ResOwnerReleaseBuffer ( Datum  res)
static

Definition at line 7845 of file bufmgr.c.

7846{
7847 Buffer buffer = DatumGetInt32(res);
7848
7849 /* Like ReleaseBuffer, but don't call ResourceOwnerForgetBuffer */
7850 if (!BufferIsValid(buffer))
7851 elog(ERROR, "bad buffer ID: %d", buffer);
7852
7853 if (BufferIsLocal(buffer))
7855 else
7856 {
7858
7859 ref = GetPrivateRefCountEntry(buffer, false);
7860
7861 /* not having a private refcount would imply resowner corruption */
7862 Assert(ref != NULL);
7863
7864 /*
7865 * If the buffer was locked at the time of the resowner release,
7866 * release the lock now. This should only happen after errors.
7867 */
7868 if (ref->data.lockmode != BUFFER_LOCK_UNLOCK)
7869 {
7870 BufferDesc *buf = GetBufferDescriptor(buffer - 1);
7871
7872 HOLD_INTERRUPTS(); /* match the upcoming RESUME_INTERRUPTS */
7873 BufferLockUnlock(buffer, buf);
7874 }
7875
7877 }
7878}
static void UnpinBufferNoOwner(BufferDesc *buf)
Definition bufmgr.c:3465
void UnpinLocalBufferNoOwner(Buffer buffer)
Definition localbuf.c:864

References Assert, buf, PrivateRefCountEntry::buffer, BUFFER_LOCK_UNLOCK, BufferIsLocal, BufferIsValid(), BufferLockUnlock(), DatumGetInt32(), elog, ERROR, fb(), GetBufferDescriptor(), GetPrivateRefCountEntry(), HOLD_INTERRUPTS, UnpinBufferNoOwner(), and UnpinLocalBufferNoOwner().

◆ ResOwnerReleaseBufferIO()

static void ResOwnerReleaseBufferIO ( Datum  res)
static

Definition at line 7823 of file bufmgr.c.

7824{
7825 Buffer buffer = DatumGetInt32(res);
7826
7827 AbortBufferIO(buffer);
7828}
static void AbortBufferIO(Buffer buffer)
Definition bufmgr.c:7420

References AbortBufferIO(), PrivateRefCountEntry::buffer, and DatumGetInt32().

◆ rlocator_comparator()

static int rlocator_comparator ( const void *  p1,
const void *  p2 
)
static

Definition at line 7491 of file bufmgr.c.

7492{
7493 RelFileLocator n1 = *(const RelFileLocator *) p1;
7494 RelFileLocator n2 = *(const RelFileLocator *) p2;
7495
7496 if (n1.relNumber < n2.relNumber)
7497 return -1;
7498 else if (n1.relNumber > n2.relNumber)
7499 return 1;
7500
7501 if (n1.dbOid < n2.dbOid)
7502 return -1;
7503 else if (n1.dbOid > n2.dbOid)
7504 return 1;
7505
7506 if (n1.spcOid < n2.spcOid)
7507 return -1;
7508 else if (n1.spcOid > n2.spcOid)
7509 return 1;
7510 else
7511 return 0;
7512}

References fb().

Referenced by buffertag_comparator(), DropRelationsAllBuffers(), and FlushRelationsAllBuffers().

◆ ScheduleBufferTagForWriteback()

void ScheduleBufferTagForWriteback ( WritebackContext *  wb_context,
IOContext  io_context,
BufferTag *  tag 
)

Definition at line 7690 of file bufmgr.c.

7692{
7693 PendingWriteback *pending;
7694
7695 /*
7696 * As pg_flush_data() doesn't do anything with fsync disabled, there's no
7697 * point in tracking in that case.
7698 */
7700 !enableFsync)
7701 return;
7702
7703 /*
7704 * Add buffer to the pending writeback array, unless writeback control is
7705 * disabled.
7706 */
7707 if (*wb_context->max_pending > 0)
7708 {
7710
7711 pending = &wb_context->pending_writebacks[wb_context->nr_pending++];
7712
7713 pending->tag = *tag;
7714 }
7715
7716 /*
7717 * Perform pending flushes if the writeback limit is exceeded. This
7718 * includes the case where previously an item has been added, but control
7719 * is now disabled.
7720 */
7721 if (wb_context->nr_pending >= *wb_context->max_pending)
7723}
bool enableFsync
Definition globals.c:131
#define WRITEBACK_MAX_PENDING_FLUSHES

References Assert, enableFsync, fb(), IO_DIRECT_DATA, io_direct_flags, IssuePendingWritebacks(), PendingWriteback::tag, and WRITEBACK_MAX_PENDING_FLUSHES.

Referenced by GetVictimBuffer(), and SyncOneBuffer().

◆ shared_buffer_readv_complete()

static PgAioResult shared_buffer_readv_complete ( PgAioHandle *  ioh,
PgAioResult  prior_result,
uint8  cb_data 
)
static

Definition at line 8880 of file bufmgr.c.

8882{
8884}

References buffer_readv_complete(), and fb().

◆ shared_buffer_readv_complete_local()

static PgAioResult shared_buffer_readv_complete_local ( PgAioHandle *  ioh,
PgAioResult  prior_result,
uint8  cb_data 
)
static

◆ shared_buffer_readv_stage()

static void shared_buffer_readv_stage ( PgAioHandle *  ioh,
uint8  cb_data 
)
static

Definition at line 8874 of file bufmgr.c.

8875{
8876 buffer_stage_common(ioh, false, false);
8877}

References buffer_stage_common(), and fb().

◆ shared_buffer_write_error_callback()

static void shared_buffer_write_error_callback ( void *  arg)
static

Definition at line 7459 of file bufmgr.c.

7460{
7462
7463 /* Buffer is pinned, so we can read the tag without locking the spinlock */
7464 if (bufHdr != NULL)
7465 errcontext("writing block %u of relation \"%s\"",
7466 bufHdr->tag.blockNum,
7468 BufTagGetForkNum(&bufHdr->tag)).str);
7469}

References arg, BufTagGetForkNum(), BufTagGetRelFileLocator(), errcontext, fb(), and relpathperm.

Referenced by FlushBuffer().

◆ SharedBufferBeginSetHintBits()

static bool SharedBufferBeginSetHintBits ( Buffer  buffer,
BufferDesc *  buf_hdr,
uint64 *  lockstate 
)
inlinestatic

Definition at line 6951 of file bufmgr.c.

6952{
6956
6957 ref = GetPrivateRefCountEntry(buffer, true);
6958
6959 if (ref == NULL)
6960 elog(ERROR, "buffer is not pinned");
6961
6962 mode = ref->data.lockmode;
6963 if (mode == BUFFER_LOCK_UNLOCK)
6964 elog(ERROR, "buffer is not locked");
6965
6966 /* we're done if we are already holding a sufficient lock level */
6968 {
6970 return true;
6971 }
6972
6973 /*
6974 * We are only holding a share lock right now, try to upgrade it to
6975 * SHARE_EXCLUSIVE.
6976 */
6978
6980 while (true)
6981 {
6983
6985
6986 /*
6987 * Can't upgrade if somebody else holds the lock in exclusive or
6988 * share-exclusive mode.
6989 */
6991 {
6992 return false;
6993 }
6994
6995 /* currently held lock state */
6997
6998 /* new lock level */
7000
7003 {
7004 ref->data.lockmode = BUFFER_LOCK_SHARE_EXCLUSIVE;
7006
7007 return true;
7008 }
7009 }
7010}

References Assert, BM_LOCK_VAL_EXCLUSIVE, BM_LOCK_VAL_SHARE_EXCLUSIVE, BM_LOCK_VAL_SHARED, PrivateRefCountEntry::buffer, BUFFER_LOCK_EXCLUSIVE, BUFFER_LOCK_SHARE, BUFFER_LOCK_SHARE_EXCLUSIVE, BUFFER_LOCK_UNLOCK, elog, ERROR, fb(), GetPrivateRefCountEntry(), likely, mode, pg_atomic_compare_exchange_u64(), pg_atomic_read_u64(), and unlikely.

Referenced by BufferBeginSetHintBits(), and BufferSetHintBits16().

◆ StartBufferIO()

StartBufferIOResult StartBufferIO ( Buffer  buffer,
bool  forInput,
bool  wait,
PgAioWaitRef *  io_wref 
)

Definition at line 7321 of file bufmgr.c.

7322{
7324
7325 if (BufferIsLocal(buffer))
7326 {
7327 buf_hdr = GetLocalBufferDescriptor(-buffer - 1);
7328
7329 return StartLocalBufferIO(buf_hdr, forInput, wait, io_wref);
7330 }
7331 else
7332 {
7333 buf_hdr = GetBufferDescriptor(buffer - 1);
7334
7335 return StartSharedBufferIO(buf_hdr, forInput, wait, io_wref);
7336 }
7337}
StartBufferIOResult StartLocalBufferIO(BufferDesc *bufHdr, bool forInput, bool wait, PgAioWaitRef *io_wref)
Definition localbuf.c:524

References PrivateRefCountEntry::buffer, BufferIsLocal, fb(), GetBufferDescriptor(), GetLocalBufferDescriptor(), StartLocalBufferIO(), and StartSharedBufferIO().

Referenced by AsyncReadBuffers().

◆ StartReadBuffer()

bool StartReadBuffer ( ReadBuffersOperation operation,
Buffer buffer,
BlockNumber  blocknum,
int  flags 
)

Definition at line 1628 of file bufmgr.c.

1632{
 /*
  * Convenience wrapper for reading exactly one block: delegates to
  * StartReadBuffersImpl() with forwarding disabled. The return value
  * indicates whether I/O was started, i.e. whether WaitReadBuffers()
  * must be called on the operation afterwards.
  */
1633 int nblocks = 1;
1634 bool result;
1635
1636 result = StartReadBuffersImpl(operation, buffer, blocknum, &nblocks, flags,
1637 false /* single block, no forwarding */ );
 /* A one-block request can never be shortened below one block. */
1638 Assert(nblocks == 1); /* single block can't be short */
1639
1640 return result;
1641}
static pg_attribute_always_inline bool StartReadBuffersImpl(ReadBuffersOperation *operation, Buffer *buffers, BlockNumber blockNum, int *nblocks, int flags, bool allow_forwarding)
Definition bufmgr.c:1368

References Assert, PrivateRefCountEntry::buffer, operation, result, and StartReadBuffersImpl().

Referenced by read_stream_next_buffer(), and ReadBuffer_common().

◆ StartReadBuffers()

bool StartReadBuffers ( ReadBuffersOperation operation,
Buffer buffers,
BlockNumber  blockNum,
int nblocks,
int  flags 
)

Definition at line 1609 of file bufmgr.c.

1614{
 /*
  * Multi-block read entry point: delegates to StartReadBuffersImpl() with
  * allow_forwarding = true, so buffers[] may contain already-pinned
  * buffers forwarded from an earlier, split StartReadBuffers() call.
  */
1615 return StartReadBuffersImpl(operation, buffers, blockNum, nblocks, flags,
1616 true /* expect forwarded buffers */ );
1617}

References operation, and StartReadBuffersImpl().

Referenced by read_buffers(), and read_stream_start_pending_read().

◆ StartReadBuffersImpl()

static pg_attribute_always_inline bool StartReadBuffersImpl ( ReadBuffersOperation operation,
Buffer buffers,
BlockNumber  blockNum,
int nblocks,
int  flags,
bool  allow_forwarding 
)
static

Definition at line 1368 of file bufmgr.c.

1374{
1375 int actual_nblocks = *nblocks;
1376 int maxcombine = 0;
1377 bool did_start_io;
1380
1381 Assert(*nblocks == 1 || allow_forwarding);
1382 Assert(*nblocks > 0);
1383 Assert(*nblocks <= MAX_IO_COMBINE_LIMIT);
1384
1385 if (operation->persistence == RELPERSISTENCE_TEMP)
1386 {
1389 }
1390 else
1391 {
1394 }
1395
1396 for (int i = 0; i < actual_nblocks; ++i)
1397 {
1398 bool found;
1399
1400 if (allow_forwarding && buffers[i] != InvalidBuffer)
1401 {
1403
1404 /*
1405 * This is a buffer that was pinned by an earlier call to
1406 * StartReadBuffers(), but couldn't be handled in one operation at
1407 * that time. The operation was split, and the caller has passed
1408 * an already pinned buffer back to us to handle the rest of the
1409 * operation. It must continue at the expected block number.
1410 */
1411 Assert(BufferGetBlockNumber(buffers[i]) == blockNum + i);
1412
1413 /*
1414 * It might be an already valid buffer (a hit) that followed the
1415 * final contiguous block of an earlier I/O (a miss) marking the
1416 * end of it, or a buffer that some other backend has since made
1417 * valid by performing the I/O for us, in which case we can handle
1418 * it as a hit now. It is safe to check for a BM_VALID flag with
1419 * a relaxed load, because we got a fresh view of it while pinning
1420 * it in the previous call.
1421 *
1422 * On the other hand if we don't see BM_VALID yet, it must be an
1423 * I/O that was split by the previous call and we need to try to
1424 * start a new I/O from this block. We're also racing against any
1425 * other backend that might start the I/O or even manage to mark
1426 * it BM_VALID after this check, but StartBufferIO() will handle
1427 * those cases.
1428 */
1429 if (BufferIsLocal(buffers[i]))
1430 bufHdr = GetLocalBufferDescriptor(-buffers[i] - 1);
1431 else
1432 bufHdr = GetBufferDescriptor(buffers[i] - 1);
1434 found = pg_atomic_read_u64(&bufHdr->state) & BM_VALID;
1435 }
1436 else
1437 {
1438 buffers[i] = PinBufferForBlock(operation->rel,
1439 operation->smgr,
1440 operation->persistence,
1441 operation->forknum,
1442 blockNum + i,
1443 operation->strategy,
1445 &found);
1446 }
1447
1448 if (found)
1449 {
1450 /*
1451 * We have a hit. If it's the first block in the requested range,
1452 * we can return it immediately and report that WaitReadBuffers()
1453 * does not need to be called. If the initial value of *nblocks
1454 * was larger, the caller will have to call again for the rest.
1455 */
1456 if (i == 0)
1457 {
1458 *nblocks = 1;
1459
1460#ifdef USE_ASSERT_CHECKING
1461
1462 /*
1463 * Initialize enough of ReadBuffersOperation to make
1464 * CheckReadBuffersOperation() work. Outside of assertions
1465 * that's not necessary when no IO is issued.
1466 */
1467 operation->buffers = buffers;
1468 operation->blocknum = blockNum;
1469 operation->nblocks = 1;
1470 operation->nblocks_done = 1;
1472#endif
1473 return false;
1474 }
1475
1476 /*
1477 * Otherwise we already have an I/O to perform, but this block
1478 * can't be included as it is already valid. Split the I/O here.
1479 * There may or may not be more blocks requiring I/O after this
1480 * one, we haven't checked, but they can't be contiguous with this
1481 * one in the way. We'll leave this buffer pinned, forwarding it
1482 * to the next call, avoiding the need to unpin it here and re-pin
1483 * it in the next call.
1484 */
1485 actual_nblocks = i;
1486 break;
1487 }
1488 else
1489 {
1490 /*
1491 * Check how many blocks we can cover with the same IO. The smgr
1492 * implementation might e.g. be limited due to a segment boundary.
1493 */
1494 if (i == 0 && actual_nblocks > 1)
1495 {
1497 operation->forknum,
1498 blockNum);
1500 {
1501 elog(DEBUG2, "limiting nblocks at %u from %u to %u",
1502 blockNum, actual_nblocks, maxcombine);
1504 }
1505 }
1506 }
1507 }
1508 *nblocks = actual_nblocks;
1509
1510 /* Populate information needed for I/O. */
1511 operation->buffers = buffers;
1512 operation->blocknum = blockNum;
1513 operation->flags = flags;
1514 operation->nblocks = actual_nblocks;
1515 operation->nblocks_done = 0;
1516 pgaio_wref_clear(&operation->io_wref);
1517
1518 /*
1519 * When using AIO, start the IO in the background. If not, issue prefetch
1520 * requests if desired by the caller.
1521 *
1522 * The reason we have a dedicated path for IOMETHOD_SYNC here is to
1523 * de-risk the introduction of AIO somewhat. It's a large architectural
1524 * change, with lots of chances for unanticipated performance effects.
1525 *
1526 * Use of IOMETHOD_SYNC already leads to not actually performing IO
1527 * asynchronously, but without the check here we'd execute IO earlier than
1528 * we used to. Eventually this IOMETHOD_SYNC specific path should go away.
1529 */
1530 if (io_method != IOMETHOD_SYNC)
1531 {
1532 /*
1533 * Try to start IO asynchronously. It's possible that no IO needs to
1534 * be started, if another backend already performed the IO.
1535 *
1536 * Note that if an IO is started, it might not cover the entire
1537 * requested range, e.g. because an intermediary block has been read
1538 * in by another backend. In that case any "trailing" buffers we
1539 * already pinned above will be "forwarded" by read_stream.c to the
1540 * next call to StartReadBuffers().
1541 *
1542 * This is signalled to the caller by decrementing *nblocks *and*
1543 * reducing operation->nblocks. The latter is done here, but not below
1544 * WaitReadBuffers(), as in WaitReadBuffers() we can't "shorten" the
1545 * overall read size anymore, we need to retry until done in its
1546 * entirety or until failed.
1547 */
1549
1550 operation->nblocks = *nblocks;
1551 }
1552 else
1553 {
1555
1556 if (flags & READ_BUFFERS_ISSUE_ADVICE)
1557 {
1558 /*
1559 * In theory we should only do this if PinBufferForBlock() had to
1560 * allocate new buffers above. That way, if two calls to
1561 * StartReadBuffers() were made for the same blocks before
1562 * WaitReadBuffers(), only the first would issue the advice.
1563 * That'd be a better simulation of true asynchronous I/O, which
1564 * would only start the I/O once, but isn't done here for
1565 * simplicity.
1566 */
1567 smgrprefetch(operation->smgr,
1568 operation->forknum,
1569 blockNum,
1571 }
1572
1573 /*
1574 * Indicate that WaitReadBuffers() should be called. WaitReadBuffers()
1575 * will initiate the necessary IO.
1576 */
1577 did_start_io = true;
1578 }
1579
1581
1582 return did_start_io;
1583}
int io_method
Definition aio.c:74
@ IOMETHOD_SYNC
Definition aio.h:34
static void CheckReadBuffersOperation(ReadBuffersOperation *operation, bool is_complete)
Definition bufmgr.c:1647
static bool AsyncReadBuffers(ReadBuffersOperation *operation, int *nblocks_progress)
Definition bufmgr.c:1929
#define READ_BUFFERS_ISSUE_ADVICE
Definition bufmgr.h:124
uint32 smgrmaxcombine(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
Definition smgr.c:697

References Assert, AsyncReadBuffers(), BM_TAG_VALID, BM_VALID, BufferGetBlockNumber(), BufferIsLocal, CheckReadBuffersOperation(), DEBUG2, elog, fb(), GetBufferDescriptor(), GetLocalBufferDescriptor(), i, InvalidBuffer, io_method, IOCONTEXT_NORMAL, IOContextForStrategy(), IOMETHOD_SYNC, IOOBJECT_RELATION, IOOBJECT_TEMP_RELATION, MAX_IO_COMBINE_LIMIT, operation, pg_atomic_read_u64(), pgaio_wref_clear(), PinBufferForBlock(), READ_BUFFERS_ISSUE_ADVICE, READ_BUFFERS_SYNCHRONOUSLY, smgrmaxcombine(), smgrprefetch(), and unlikely.

Referenced by StartReadBuffer(), and StartReadBuffers().

◆ StartSharedBufferIO()

StartBufferIOResult StartSharedBufferIO ( BufferDesc buf,
bool  forInput,
bool  wait,
PgAioWaitRef io_wref 
)

Definition at line 7241 of file bufmgr.c.

7242{
7244
7246
7247 for (;;)
7248 {
7250
7252 break;
7253
7254 /* Join the existing IO */
7255 if (io_wref != NULL && pgaio_wref_valid(&buf->io_wref))
7256 {
7257 *io_wref = buf->io_wref;
7259
7260 return BUFFER_IO_IN_PROGRESS;
7261 }
7262 else if (!wait)
7263 {
7265 return BUFFER_IO_IN_PROGRESS;
7266 }
7267 else
7268 {
7269 /*
7270 * With wait = true, we always have to wait if the caller has
7271 * passed io_wref = NULL.
7272 *
7273 * Even with io_wref != NULL, we have to wait if the buffer's wait
7274 * ref is not valid but the IO is in progress; someone else
7275 * started IO but hasn't set the wait ref yet. We have no choice
7276 * but to wait until the IO completes.
7277 */
7279
7280 /*
7281 * If this backend currently has staged IO, submit it before
7282 * waiting for in-progress IO, to avoid potential deadlocks and
7283 * unnecessary delays.
7284 */
7286
7287 WaitIO(buf);
7288 }
7289 }
7290
7291 /* Once we get here, there is definitely no I/O active on this buffer */
7292
7293 /* Check if someone else already did the I/O */
7294 if (forInput ? (buf_state & BM_VALID) : !(buf_state & BM_DIRTY))
7295 {
7298 }
7299
7300 /*
7301 * No IO in progress and not already done; we will start IO. It's possible
7302 * that the IO was in progress but we're not done, because the IO errored
7303 * out. We'll do the IO ourselves.
7304 */
7307 0);
7308
7311
7313}
static void ResourceOwnerRememberBufferIO(ResourceOwner owner, Buffer buffer)

References BM_DIRTY, BM_IO_IN_PROGRESS, BM_VALID, buf, BUFFER_IO_ALREADY_DONE, BUFFER_IO_IN_PROGRESS, BUFFER_IO_READY_FOR_IO, BufferDescriptorGetBuffer(), CurrentResourceOwner, fb(), LockBufHdr(), pgaio_submit_staged(), pgaio_wref_valid(), ResourceOwnerEnlarge(), ResourceOwnerRememberBufferIO(), UnlockBufHdr(), UnlockBufHdrExt(), and WaitIO().

Referenced by buffer_call_start_io(), ExtendBufferedRelShared(), FlushBuffer(), read_rel_block_ll(), StartBufferIO(), and ZeroAndLockBuffer().

◆ SyncOneBuffer()

static int SyncOneBuffer ( int  buf_id,
bool  skip_recently_used,
WritebackContext wb_context 
)
static

Definition at line 4129 of file bufmgr.c.

4130{
4132 int result = 0;
4134 BufferTag tag;
4135
4136 /* Make sure we can handle the pin */
4139
4140 /*
4141 * Check whether buffer needs writing.
4142 *
4143 * We can make this check without taking the buffer content lock so long
4144 * as we mark pages dirty in access methods *before* logging changes with
4145 * XLogInsert(): if someone marks the buffer dirty just after our check we
4146 * don't worry because our checkpoint.redo points before log record for
4147 * upcoming changes and so we are not required to write such dirty buffer.
4148 */
4150
4153 {
4155 }
4156 else if (skip_recently_used)
4157 {
4158 /* Caller told us not to write recently-used buffers */
4160 return result;
4161 }
4162
4163 if (!(buf_state & BM_VALID) || !(buf_state & BM_DIRTY))
4164 {
4165 /* It's clean, so nothing to do */
4167 return result;
4168 }
4169
4170 /*
4171 * Pin it, share-exclusive-lock it, write it. (FlushBuffer will do
4172 * nothing if the buffer is clean by the time we've locked it.)
4173 */
4175
4177
4178 tag = bufHdr->tag;
4179
4181
4182 /*
4183 * SyncOneBuffer() is only called by checkpointer and bgwriter, so
4184 * IOContext will always be IOCONTEXT_NORMAL.
4185 */
4187
4188 return result | BUF_WRITTEN;
4189}

References BM_DIRTY, BM_VALID, BUF_REUSABLE, BUF_STATE_GET_REFCOUNT, BUF_STATE_GET_USAGECOUNT, BUF_WRITTEN, CurrentResourceOwner, fb(), FlushUnlockedBuffer(), GetBufferDescriptor(), IOCONTEXT_NORMAL, IOOBJECT_RELATION, LockBufHdr(), PinBuffer_Locked(), ReservePrivateRefCountEntry(), ResourceOwnerEnlarge(), result, ScheduleBufferTagForWriteback(), UnlockBufHdr(), and UnpinBuffer().

Referenced by BgBufferSync(), and BufferSync().

◆ TerminateBufferIO()

void TerminateBufferIO ( BufferDesc buf,
bool  clear_dirty,
uint64  set_flag_bits,
bool  forget_owner,
bool  release_aio 
)

Definition at line 7358 of file bufmgr.c.

7360{
7363 int refcount_change = 0;
7364
7366
7369
7370 /* Clear earlier errors, if this IO failed, it'll be marked again */
7372
7373 if (clear_dirty)
7375
7376 if (release_aio)
7377 {
7378 /* release ownership by the AIO subsystem */
7380 refcount_change = -1;
7381 pgaio_wref_clear(&buf->io_wref);
7382 }
7383
7387
7388 if (forget_owner)
7391
7393
7394 /*
7395 * Support LockBufferForCleanup()
7396 *
7397 * We may have just released the last pin other than the waiter's. In most
7398 * cases, this backend holds another pin on the buffer. But, if, for
7399 * example, this backend is completing an IO issued by another backend, it
7400 * may be time to wake the waiter.
7401 */
7404}
static ConditionVariable * BufferDescriptorGetIOCV(const BufferDesc *bdesc)
static void WakePinCountWaiter(BufferDesc *buf)
Definition bufmgr.c:3420
void ConditionVariableBroadcast(ConditionVariable *cv)

References Assert, BM_CHECKPOINT_NEEDED, BM_DIRTY, BM_IO_ERROR, BM_IO_IN_PROGRESS, BM_PIN_COUNT_WAITER, buf, BUF_STATE_GET_REFCOUNT, BufferDescriptorGetBuffer(), BufferDescriptorGetIOCV(), ConditionVariableBroadcast(), CurrentResourceOwner, fb(), LockBufHdr(), pgaio_wref_clear(), ResourceOwnerForgetBufferIO(), UnlockBufHdrExt(), and WakePinCountWaiter().

Referenced by AbortBufferIO(), buffer_call_terminate_io(), buffer_readv_complete_one(), ExtendBufferedRelShared(), FlushBuffer(), and ZeroAndLockBuffer().

◆ TrackBufferHit()

◆ TrackNewBufferPin()

void TrackNewBufferPin ( Buffer  buf)
inline

Definition at line 3512 of file bufmgr.c.

3513{
3515
3517 ref->data.refcount++;
3518
3520
3521 /*
3522 * This is the first pin for this page by this backend, mark its page as
3523 * defined to valgrind. While the page contents might not actually be
3524 * valid yet, we don't currently guarantee that such pages are marked
3525 * undefined or non-accessible.
3526 *
3527 * It's not necessarily the prettiest to do this here, but otherwise we'd
3528 * need this block of code in multiple places.
3529 */
3531 BLCKSZ);
3532}
static PrivateRefCountEntry * NewPrivateRefCountEntry(Buffer buffer)
Definition bufmgr.c:388

References buf, BufHdrGetBlock, CurrentResourceOwner, fb(), GetBufferDescriptor(), NewPrivateRefCountEntry(), ResourceOwnerRememberBuffer(), and VALGRIND_MAKE_MEM_DEFINED.

Referenced by GetBufferFromRing(), PinBuffer(), PinBuffer_Locked(), and StrategyGetBuffer().

◆ ts_ckpt_progress_comparator()

static int ts_ckpt_progress_comparator ( Datum  a,
Datum  b,
void arg 
)
static

Definition at line 7655 of file bufmgr.c.

7656{
7659
7660 /* we want a min-heap, so return 1 when a < b */
7661 if (sa->progress < sb->progress)
7662 return 1;
7663 else if (sa->progress == sb->progress)
7664 return 0;
7665 else
7666 return -1;
7667}

References a, b, DatumGetPointer(), and fb().

Referenced by BufferSync().

◆ UnlockBuffer()

void UnlockBuffer ( Buffer  buffer)

Definition at line 6558 of file bufmgr.c.

6559{
6561
6562 Assert(BufferIsPinned(buffer));
6563 if (BufferIsLocal(buffer))
6564 return; /* local buffers need no lock */
6565
6566 buf_hdr = GetBufferDescriptor(buffer - 1);
6567 BufferLockUnlock(buffer, buf_hdr);
6568}

References Assert, PrivateRefCountEntry::buffer, BufferIsLocal, BufferIsPinned, BufferLockUnlock(), fb(), and GetBufferDescriptor().

Referenced by LockBuffer().

◆ UnlockBuffers()

void UnlockBuffers ( void  )

Definition at line 5852 of file bufmgr.c.

5853{
5855
5856 if (buf)
5857 {
5859 uint64 unset_bits = 0;
5860
5862
5863 /*
5864 * Don't complain if flag bit not set; it could have been reset but we
5865 * got a cancel/die interrupt before getting the signal.
5866 */
5867 if ((buf_state & BM_PIN_COUNT_WAITER) != 0 &&
5868 buf->wait_backend_pgprocno == MyProcNumber)
5870
5872 0, unset_bits,
5873 0);
5874
5876 }
5877}

References BM_PIN_COUNT_WAITER, buf, fb(), LockBufHdr(), MyProcNumber, PinCountWaitBuf, and UnlockBufHdrExt().

Referenced by AbortSubTransaction(), AbortTransaction(), AtProcExit_Buffers(), AutoVacLauncherMain(), BackgroundWriterMain(), CheckpointerMain(), and WalWriterMain().

◆ UnlockReleaseBuffer()

void UnlockReleaseBuffer ( Buffer  buffer)

Definition at line 5603 of file bufmgr.c.

5604{
5605 int mode;
5606 BufferDesc *buf;
5608 uint64 sub;
5610
5611 Assert(BufferIsPinned(buffer));
5612
5613 if (BufferIsLocal(buffer))
5614 {
5615 UnpinLocalBuffer(buffer);
5616 return;
5617 }
5618
5620
5621 buf = GetBufferDescriptor(buffer - 1);
5622
5624
5625 /* compute state modification for lock release */
5627
5628 /* compute state modification for pin release */
5629 ref = GetPrivateRefCountEntry(buffer, false);
5630 Assert(ref != NULL);
5631 Assert(ref->data.refcount > 0);
5632 ref->data.refcount--;
5633
5634 /* no more backend local pins, reduce shared pin count */
5635 if (likely(ref->data.refcount == 0))
5636 {
5637 /* See comment in UnpinBufferNoOwner() */
5639
5640 sub |= BUF_REFCOUNT_ONE;
5642 }
5643
5644 /* perform the lock and pin release in one atomic op */
5645 lockstate = pg_atomic_sub_fetch_u64(&buf->state, sub);
5646
5647 /* wake up waiters for the lock */
5649
5650 /* wake up waiter for the pin release */
5653
5654 /*
5655 * Now okay to allow cancel/die interrupts again, which were held when the
5656 * lock was acquired.
5657 */
5659}
static void ResourceOwnerForgetBuffer(ResourceOwner owner, Buffer buffer)
static void ForgetPrivateRefCountEntry(PrivateRefCountEntry *ref)
Definition bufmgr.c:565

References Assert, BM_PIN_COUNT_WAITER, buf, BUF_REFCOUNT_ONE, PrivateRefCountEntry::buffer, BufferIsLocal, BufferIsPinned, BufferLockDisownInternal(), BufferLockProcessRelease(), BufferLockReleaseSub(), BufHdrGetBlock, CurrentResourceOwner, fb(), ForgetPrivateRefCountEntry(), GetBufferDescriptor(), GetPrivateRefCountEntry(), likely, mode, pg_atomic_sub_fetch_u64(), ResourceOwnerForgetBuffer(), RESUME_INTERRUPTS, UnpinLocalBuffer(), VALGRIND_MAKE_MEM_NOACCESS, and WakePinCountWaiter().

Referenced by _bt_clear_incomplete_split(), _bt_relbuf(), _bt_restore_meta(), _hash_relbuf(), allocNewBuffer(), AlterSequence(), blbulkdelete(), blgetbitmap(), blinsert(), BloomInitMetapage(), blvacuumcleanup(), brin_doinsert(), brin_doupdate(), brin_evacuate_page(), brin_getinsertbuffer(), brin_xlog_createidx(), brin_xlog_desummarize_page(), brin_xlog_insert_update(), brin_xlog_revmap_extend(), brin_xlog_samepage_update(), brin_xlog_update(), brinbuild(), brinbuildempty(), brinGetStats(), brinRevmapDesummarizeRange(), bt_metap(), bt_multi_page_stats(), bt_page_items_internal(), bt_page_stats_internal(), bt_recheck_sibling_links(), btree_xlog_dedup(), btree_xlog_delete(), btree_xlog_insert(), btree_xlog_mark_page_halfdead(), btree_xlog_newroot(), btree_xlog_split(), btree_xlog_unlink_page(), btree_xlog_vacuum(), collect_corrupt_items(), collect_visibility_data(), count_nondeletable_pages(), createPostingTree(), doPickSplit(), entryLoadMoreItems(), fill_seq_fork_with_data(), flushCachedPage(), FreeSpaceMapPrepareTruncateRel(), fsm_search(), fsm_set_and_search(), fsm_vacuum_page(), generic_redo(), get_raw_page_internal(), GetVictimBuffer(), gin_check_parent_keys_consistency(), gin_check_posting_tree_parent_keys_consistency(), gin_refind_parent(), ginbuild(), ginbuildempty(), ginbulkdelete(), ginGetStats(), ginHeapTupleFastInsert(), ginInsertCleanup(), ginPlaceToPage(), ginRedoClearIncompleteSplit(), ginRedoCreatePTree(), ginRedoDeleteListPages(), ginRedoDeletePage(), ginRedoInsert(), ginRedoInsertListPage(), ginRedoSplit(), ginRedoUpdateMetapage(), ginRedoVacuumDataLeafPage(), ginRedoVacuumPage(), ginScanPostingTreeToDelete(), ginStepRight(), ginUpdateStats(), ginvacuumcleanup(), ginVacuumPostingTree(), ginVacuumPostingTreeLeaves(), gistbufferinginserttuples(), gistbuild(), gistbuildempty(), gistdoinsert(), gistFindCorrectParent(), gistFindPath(), gistGetMaxLevel(), gistinserttuples(), gistkillitems(), gistplacetopage(), gistProcessItup(), gistRedoClearFollowRight(), 
gistRedoDeleteRecord(), gistRedoPageDelete(), gistRedoPageSplitRecord(), gistRedoPageUpdateRecord(), gistScanPage(), gistvacuum_delete_empty_pages(), gistvacuumpage(), hash_xlog_add_ovfl_page(), hash_xlog_delete(), hash_xlog_init_bitmap_page(), hash_xlog_init_meta_page(), hash_xlog_insert(), hash_xlog_move_page_contents(), hash_xlog_split_allocate_page(), hash_xlog_split_cleanup(), hash_xlog_split_complete(), hash_xlog_split_page(), hash_xlog_squeeze_page(), hash_xlog_update_meta_page(), hash_xlog_vacuum_one_page(), heap_delete(), heap_fetch(), heap_finish_speculative(), heap_force_common(), heap_get_latest_tid(), heap_index_delete_tuples(), heap_insert(), heap_lock_updated_tuple_rec(), heap_multi_insert(), heap_update(), heap_xlog_confirm(), heap_xlog_delete(), heap_xlog_inplace(), heap_xlog_insert(), heap_xlog_lock(), heap_xlog_lock_updated(), heap_xlog_multi_insert(), heap_xlog_prune_freeze(), heap_xlog_update(), heapam_scan_analyze_next_tuple(), initBloomState(), invalidate_one_block(), lazy_scan_heap(), lazy_scan_new_or_empty(), lazy_vacuum_heap_rel(), log_newpage_range(), modify_rel_block(), moveLeafs(), nextval_internal(), palloc_btree_page(), pg_get_sequence_data(), pg_sequence_last_value(), pg_visibility(), pgstat_gist_page(), pgstat_heap(), pgstatginindex_internal(), pgstathashindex(), pgstatindex_impl(), ProcessSingleRelationFork(), RelationCopyStorageUsingBuffer(), RelationGetBufferForTuple(), ResetSequence(), revmap_physical_extend(), scanGetCandidate(), scanPendingInsert(), scanPostingTree(), ScanSourceDatabasePgClass(), seq_redo(), SequenceChangePersistence(), SetSequence(), shiftList(), spgAddNodeAction(), spgbuild(), spgdoinsert(), spgGetCache(), SpGistGetBuffer(), SpGistUpdateMetaPage(), spgMatchNodeAction(), spgprocesspending(), spgRedoAddLeaf(), spgRedoAddNode(), spgRedoMoveLeafs(), spgRedoPickSplit(), spgRedoSplitTuple(), spgRedoVacuumLeaf(), spgRedoVacuumRedirect(), spgRedoVacuumRoot(), spgSplitNodeAction(), spgvacuumpage(), spgWalk(), 
statapprox_heap(), verify_heapam(), verifyBackupPageConsistency(), visibilitymap_prepare_truncate(), writeListPage(), xlog_redo(), and XLogRecordPageWithFreeSpace().

◆ UnpinBuffer()

◆ UnpinBufferNoOwner()

static void UnpinBufferNoOwner ( BufferDesc buf)
static

Definition at line 3465 of file bufmgr.c.

3466{
3469
3471
3472 /* not moving as we're likely deleting it soon anyway */
3473 ref = GetPrivateRefCountEntry(b, false);
3474 Assert(ref != NULL);
3475 Assert(ref->data.refcount > 0);
3476 ref->data.refcount--;
3477 if (ref->data.refcount == 0)
3478 {
3480
3481 /*
3482 * Mark buffer non-accessible to Valgrind.
3483 *
3484 * Note that the buffer may have already been marked non-accessible
3485 * within access method code that enforces that buffers are only
3486 * accessed while a buffer lock is held.
3487 */
3489
3490 /*
3491 * I'd better not still hold the buffer content lock. Can't use
3492 * BufferIsLockedByMe(), as that asserts the buffer is pinned.
3493 */
3495
3496 /* decrement the shared reference count */
3498
3499 /* Support LockBufferForCleanup() */
3502
3504 }
3505}
static uint64 pg_atomic_fetch_sub_u64(volatile pg_atomic_uint64 *ptr, int64 sub_)
Definition atomics.h:541

References Assert, b, BM_PIN_COUNT_WAITER, buf, BUF_REFCOUNT_ONE, BufferDescriptorGetBuffer(), BufferIsLocal, BufferLockHeldByMe(), BufHdrGetBlock, fb(), ForgetPrivateRefCountEntry(), GetPrivateRefCountEntry(), pg_atomic_fetch_sub_u64(), VALGRIND_MAKE_MEM_NOACCESS, and WakePinCountWaiter().

Referenced by ResOwnerReleaseBuffer(), and UnpinBuffer().

◆ WaitBufHdrUnlocked()

pg_noinline uint64 WaitBufHdrUnlocked ( BufferDesc buf)

◆ WaitIO()

static void WaitIO ( BufferDesc buf)
static

Definition at line 7139 of file bufmgr.c.

7140{
7142
7143 /*
7144 * Should never end up here with unsubmitted IO, as no AIO unaware code
7145 * may be used while in batch mode and AIO aware code needs to have
7146 * submitted all staged IO to avoid deadlocks & slowness.
7147 */
7149
7151 for (;;)
7152 {
7155
7156 /*
7157 * It may not be necessary to acquire the spinlock to check the flag
7158 * here, but since this test is essential for correctness, we'd better
7159 * play it safe.
7160 */
7162
7163 /*
7164 * Copy the wait reference while holding the spinlock. This protects
7165 * against a concurrent TerminateBufferIO() in another backend from
7166 * clearing the wref while it's being read.
7167 */
7168 iow = buf->io_wref;
7170
7171 /* no IO in progress, we don't need to wait */
7173 break;
7174
7175 /*
7176 * The buffer has asynchronous IO in progress, wait for it to
7177 * complete.
7178 */
7179 if (pgaio_wref_valid(&iow))
7180 {
7182
7183 /*
7184 * The AIO subsystem internally uses condition variables and thus
7185 * might remove this backend from the BufferDesc's CV. While that
7186 * wouldn't cause a correctness issue (the first CV sleep just
7187 * immediately returns if not already registered), it seems worth
7188 * avoiding unnecessary loop iterations, given that we take care
7189 * to do so at the start of the function.
7190 */
7192 continue;
7193 }
7194
7195 /* wait on BufferDesc->cv, e.g. for concurrent synchronous IO */
7197 }
7199}
bool pgaio_have_staged(void)
Definition aio.c:1117
void pgaio_wref_wait(PgAioWaitRef *iow)
Definition aio.c:991
bool ConditionVariableCancelSleep(void)
void ConditionVariablePrepareToSleep(ConditionVariable *cv)
void ConditionVariableSleep(ConditionVariable *cv, uint32 wait_event_info)

References Assert, BM_IO_IN_PROGRESS, buf, BufferDescriptorGetIOCV(), ConditionVariableCancelSleep(), ConditionVariablePrepareToSleep(), ConditionVariableSleep(), fb(), LockBufHdr(), pgaio_have_staged(), pgaio_wref_valid(), pgaio_wref_wait(), and UnlockBufHdr().

Referenced by InvalidateBuffer(), and StartSharedBufferIO().

◆ WaitReadBuffers()

bool WaitReadBuffers ( ReadBuffersOperation operation)

Definition at line 1750 of file bufmgr.c.

1751{
1752 PgAioReturn *aio_ret = &operation->io_return;
1755 bool needed_wait = false;
1756
1757 if (operation->persistence == RELPERSISTENCE_TEMP)
1758 {
1761 }
1762 else
1763 {
1766 }
1767
1768 /*
1769 * If we get here without an IO operation having been issued, the
1770 * io_method == IOMETHOD_SYNC path must have been used. Otherwise the
1771 * caller should not have called WaitReadBuffers().
1772 *
1773 * In the case of IOMETHOD_SYNC, we start - as we used to before the
1774 * introduction of AIO - the IO in WaitReadBuffers(). This is done as part
1775 * of the retry logic below, no extra code is required.
1776 *
1777 * This path is expected to eventually go away.
1778 */
1779 if (!pgaio_wref_valid(&operation->io_wref) && io_method != IOMETHOD_SYNC)
1780 elog(ERROR, "waiting for read operation that didn't read");
1781
1782 /*
1783 * To handle partial reads, and IOMETHOD_SYNC, we re-issue IO until we're
1784 * done. We may need multiple retries, not just because we could get
1785 * multiple partial reads, but also because some of the remaining
1786 * to-be-read buffers may have been read in by other backends, limiting
1787 * the IO size.
1788 */
1789 while (true)
1790 {
1792
1794
1795 /*
1796 * If there is an IO associated with the operation, we may need to
1797 * wait for it.
1798 */
1799 if (pgaio_wref_valid(&operation->io_wref))
1800 {
1801 /*
1802 * Track the time spent waiting for the IO to complete. As
1803 * tracking a wait even if we don't actually need to wait
1804 *
1805 * a) is not cheap, due to the timestamping overhead
1806 *
1807 * b) reports some time as waiting, even if we never waited
1808 *
1809 * we first check if we already know the IO is complete.
1810 *
1811 * Note that operation->io_return is uninitialized for foreign IO,
1812 * so we cannot use the cheaper PGAIO_RS_UNKNOWN pre-check.
1813 */
1814 if ((operation->foreign_io || aio_ret->result.status == PGAIO_RS_UNKNOWN) &&
1815 !pgaio_wref_check_done(&operation->io_wref))
1816 {
1818
1819 pgaio_wref_wait(&operation->io_wref);
1820 needed_wait = true;
1821
1822 /*
1823 * The IO operation itself was already counted earlier, in
1824 * AsyncReadBuffers(), this just accounts for the wait time.
1825 */
1827 io_start, 0, 0);
1828 }
1829 else
1830 {
1832 }
1833
1834 if (unlikely(operation->foreign_io))
1835 {
1836 Buffer buffer = operation->buffers[operation->nblocks_done];
1837 BufferDesc *desc = BufferIsLocal(buffer) ?
1838 GetLocalBufferDescriptor(-buffer - 1) :
1839 GetBufferDescriptor(buffer - 1);
1841
1842 if (buf_state & BM_VALID)
1843 {
1844 BlockNumber blocknum = operation->blocknum + operation->nblocks_done;
1845
1846 operation->nblocks_done += 1;
1847 Assert(operation->nblocks_done <= operation->nblocks);
1848
1849 /*
1850 * Track this as a 'hit' for this backend. The backend
1851 * performing the IO will track it as a 'read'.
1852 */
1854 operation->rel, operation->persistence,
1855 operation->smgr, operation->forknum,
1856 blocknum);
1857 }
1858
1859 /*
1860 * If the foreign IO failed and left the buffer invalid,
1861 * nblocks_done is not incremented. The retry loop below will
1862 * call AsyncReadBuffers() which will attempt the IO itself.
1863 */
1864 }
1865 else
1866 {
1867 /*
1868 * We now are sure the IO completed. Check the results. This
1869 * includes reporting on errors if there were any.
1870 */
1872 }
1873 }
1874
1875 /*
1876 * Most of the time, the one IO we already started, will read in
1877 * everything. But we need to deal with partial reads and buffers not
1878 * needing IO anymore.
1879 */
1880 if (operation->nblocks_done == operation->nblocks)
1881 break;
1882
1884
1885 /*
1886 * If the IO completed only partially, we need to perform additional
1887 * work, consider that a form of having had to wait.
1888 */
1889 needed_wait = true;
1890
1891 /*
1892 * This may only complete the IO partially, either because some
1893 * buffers were already valid, or because of a partial read.
1894 *
1895 * NB: In contrast to after the AsyncReadBuffers() call in
1896 * StartReadBuffers(), we do *not* reduce
1897 * ReadBuffersOperation->nblocks here, callers expect the full
1898 * operation to be completed at this point (as more operations may
1899 * have been queued).
1900 */
1902 }
1903
1905
1906 /* NB: READ_DONE tracepoint was already executed in completion callback */
1907 return needed_wait;
1908}
bool pgaio_wref_check_done(PgAioWaitRef *iow)
Definition aio.c:1005
static void ProcessReadBuffersResult(ReadBuffersOperation *operation)
Definition bufmgr.c:1705

References Assert, AsyncReadBuffers(), BM_VALID, PrivateRefCountEntry::buffer, BufferIsLocal, CHECK_FOR_INTERRUPTS, CheckReadBuffersOperation(), elog, ERROR, fb(), GetBufferDescriptor(), GetLocalBufferDescriptor(), io_method, IOCONTEXT_NORMAL, IOContextForStrategy(), IOMETHOD_SYNC, IOOBJECT_RELATION, IOOBJECT_TEMP_RELATION, IOOP_READ, operation, pg_atomic_read_u64(), PGAIO_RS_UNKNOWN, pgaio_wref_check_done(), pgaio_wref_valid(), pgaio_wref_wait(), pgstat_count_io_op_time(), pgstat_prepare_io_time(), ProcessReadBuffersResult(), BufferDesc::state, track_io_timing, TrackBufferHit(), and unlikely.

Referenced by read_buffers(), read_stream_next_buffer(), and ReadBuffer_common().

◆ WakePinCountWaiter()

static void WakePinCountWaiter ( BufferDesc *buf)
static

Definition at line 3420 of file bufmgr.c.

3421{
3422 /*
3423 * Acquire the buffer header lock, re-check that there's a waiter. Another
3424 * backend could have unpinned this buffer, and already woken up the
3425 * waiter.
3426 *
3427 * There's no danger of the buffer being replaced after we unpinned it
3428 * above, as it's pinned by the waiter. The waiter removes
3429 * BM_PIN_COUNT_WAITER if it stops waiting for a reason other than this
3430 * backend waking it up.
3431 */
3433
3436 {
3437 /* we just released the last pin other than the waiter's */
3438 int wait_backend_pgprocno = buf->wait_backend_pgprocno;
3439
3442 0);
3443 ProcSendSignal(wait_backend_pgprocno);
3444 }
3445 else
3447}
void ProcSendSignal(ProcNumber procNumber)
Definition proc.c:2027

References BM_PIN_COUNT_WAITER, buf, BUF_STATE_GET_REFCOUNT, fb(), LockBufHdr(), ProcSendSignal(), UnlockBufHdr(), and UnlockBufHdrExt().

Referenced by TerminateBufferIO(), UnlockReleaseBuffer(), and UnpinBufferNoOwner().

◆ WritebackContextInit()

void WritebackContextInit ( WritebackContext *context,
int *max_pending 
)

Definition at line 7678 of file bufmgr.c.

7679{
7680 Assert(*max_pending <= WRITEBACK_MAX_PENDING_FLUSHES);
7681
7682 context->max_pending = max_pending;
7683 context->nr_pending = 0;
7684}

References Assert, WritebackContext::max_pending, WritebackContext::nr_pending, and WRITEBACK_MAX_PENDING_FLUSHES.

Referenced by BackgroundWriterMain(), BufferManagerShmemAttach(), BufferManagerShmemInit(), and BufferSync().

◆ ZeroAndLockBuffer()

static void ZeroAndLockBuffer ( Buffer  buffer,
ReadBufferMode  mode,
bool  already_valid 
)
static

Definition at line 1146 of file bufmgr.c.

1147{
1149 bool need_to_zero;
1150 bool isLocalBuf = BufferIsLocal(buffer);
1152
1154
1155 if (already_valid)
1156 {
1157 /*
1158 * If the caller already knew the buffer was valid, we can skip some
1159 * header interaction. The caller just wants to lock the buffer.
1160 */
1161 need_to_zero = false;
1162 }
1163 else
1164 {
1165 if (isLocalBuf)
1166 {
1167 /* Simple case for non-shared buffers. */
1168 bufHdr = GetLocalBufferDescriptor(-buffer - 1);
1169 sbres = StartLocalBufferIO(bufHdr, true, true, NULL);
1170 }
1171 else
1172 {
1173 /*
1174 * Take BM_IO_IN_PROGRESS, or discover that BM_VALID has been set
1175 * concurrently. Even though we aren't doing I/O, that ensures
1176 * that we don't zero a page that someone else has pinned. An
1177 * exclusive content lock wouldn't be enough, because readers are
1178 * allowed to drop the content lock after determining that a tuple
1179 * is visible (see buffer access rules in README).
1180 */
1181 bufHdr = GetBufferDescriptor(buffer - 1);
1182 sbres = StartSharedBufferIO(bufHdr, true, true, NULL);
1183 }
1184
1187 }
1188
1189 if (need_to_zero)
1190 {
1191 memset(BufferGetPage(buffer), 0, BLCKSZ);
1192
1193 /*
1194 * Grab the buffer content lock before marking the page as valid, to
1195 * make sure that no other backend sees the zeroed page before the
1196 * caller has had a chance to initialize it.
1197 *
1198 * Since no-one else can be looking at the page contents yet, there is
1199 * no difference between an exclusive lock and a cleanup-strength
1200 * lock. (Note that we cannot use LockBuffer() or
1201 * LockBufferForCleanup() here, because they assert that the buffer is
1202 * already valid.)
1203 */
1204 if (!isLocalBuf)
1206
1207 /* Set BM_VALID, terminate IO, and wake up any waiters */
1208 if (isLocalBuf)
1209 TerminateLocalBufferIO(bufHdr, false, BM_VALID, false);
1210 else
1211 TerminateBufferIO(bufHdr, false, BM_VALID, true, false);
1212 }
1213 else if (!isLocalBuf)
1214 {
1215 /*
1216 * The buffer is valid, so we can't zero it. The caller still expects
1217 * the page to be locked on return.
1218 */
1219 if (mode == RBM_ZERO_AND_LOCK)
1221 else
1222 LockBufferForCleanup(buffer);
1223 }
1224}
void LockBufferForCleanup(Buffer buffer)
Definition bufmgr.c:6670

References Assert, BM_VALID, PrivateRefCountEntry::buffer, BUFFER_IO_IN_PROGRESS, BUFFER_IO_READY_FOR_IO, BUFFER_LOCK_EXCLUSIVE, BufferGetPage(), BufferIsLocal, fb(), GetBufferDescriptor(), GetLocalBufferDescriptor(), LockBuffer(), LockBufferForCleanup(), mode, RBM_ZERO_AND_CLEANUP_LOCK, RBM_ZERO_AND_LOCK, StartLocalBufferIO(), StartSharedBufferIO(), TerminateBufferIO(), and TerminateLocalBufferIO().

Referenced by ReadBuffer_common().

Variable Documentation

◆ aio_local_buffer_readv_cb

const PgAioHandleCallbacks aio_local_buffer_readv_cb
Initial value:
= {
.complete_local = local_buffer_readv_complete,
}
static PgAioResult local_buffer_readv_complete(PgAioHandle *ioh, PgAioResult prior_result, uint8 cb_data)
Definition bufmgr.c:8931
static void local_buffer_readv_stage(PgAioHandle *ioh, uint8 cb_data)
Definition bufmgr.c:8925
static void buffer_readv_report(PgAioResult result, const PgAioTargetData *td, int elevel)
Definition bufmgr.c:8779

Definition at line 8947 of file bufmgr.c.

8947 {
8948 .stage = local_buffer_readv_stage,
8949
8950 /*
8951 * Note that this, in contrast to the shared_buffers case, uses
8952 * complete_local, as only the issuing backend has access to the required
8953 * datastructures. This is important in case the IO completion may be
8954 * consumed incidentally by another backend.
8955 */
8956 .complete_local = local_buffer_readv_complete,
8957 .report = buffer_readv_report,
8958};

◆ aio_shared_buffer_readv_cb

const PgAioHandleCallbacks aio_shared_buffer_readv_cb
Initial value:
= {
.complete_shared = shared_buffer_readv_complete,
}
static PgAioResult shared_buffer_readv_complete_local(PgAioHandle *ioh, PgAioResult prior_result, uint8 cb_data)
Definition bufmgr.c:8894
static void shared_buffer_readv_stage(PgAioHandle *ioh, uint8 cb_data)
Definition bufmgr.c:8874
static PgAioResult shared_buffer_readv_complete(PgAioHandle *ioh, PgAioResult prior_result, uint8 cb_data)
Definition bufmgr.c:8880

Definition at line 8938 of file bufmgr.c.

8938 {
8940 .complete_shared = shared_buffer_readv_complete,
8941 /* need a local callback to report checksum failures */
8942 .complete_local = shared_buffer_readv_complete_local,
8943 .report = buffer_readv_report,
8944};

◆ backend_flush_after

int backend_flush_after = DEFAULT_BACKEND_FLUSH_AFTER

Definition at line 225 of file bufmgr.c.

Referenced by BufferManagerShmemAttach(), and BufferManagerShmemInit().

◆ bgwriter_flush_after

int bgwriter_flush_after = DEFAULT_BGWRITER_FLUSH_AFTER

Definition at line 224 of file bufmgr.c.

Referenced by BackgroundWriterMain().

◆ bgwriter_lru_maxpages

int bgwriter_lru_maxpages = 100

Definition at line 190 of file bufmgr.c.

Referenced by BgBufferSync().

◆ bgwriter_lru_multiplier

double bgwriter_lru_multiplier = 2.0

Definition at line 191 of file bufmgr.c.

Referenced by BgBufferSync().

◆ buffer_io_resowner_desc

const ResourceOwnerDesc buffer_io_resowner_desc
Initial value:
=
{
.name = "buffer io",
.release_priority = RELEASE_PRIO_BUFFER_IOS,
.ReleaseResource = ResOwnerReleaseBufferIO,
.DebugPrint = ResOwnerPrintBufferIO
}
static void ResOwnerReleaseBufferIO(Datum res)
Definition bufmgr.c:7823
static char * ResOwnerPrintBufferIO(Datum res)
Definition bufmgr.c:7831
#define RELEASE_PRIO_BUFFER_IOS
Definition resowner.h:62
@ RESOURCE_RELEASE_BEFORE_LOCKS
Definition resowner.h:54

Definition at line 285 of file bufmgr.c.

286{
287 .name = "buffer io",
288 .release_phase = RESOURCE_RELEASE_BEFORE_LOCKS,
289 .release_priority = RELEASE_PRIO_BUFFER_IOS,
290 .ReleaseResource = ResOwnerReleaseBufferIO,
291 .DebugPrint = ResOwnerPrintBufferIO
292};

Referenced by ResourceOwnerForgetBufferIO(), and ResourceOwnerRememberBufferIO().

◆ buffer_resowner_desc

const ResourceOwnerDesc buffer_resowner_desc
Initial value:
=
{
.name = "buffer",
.release_priority = RELEASE_PRIO_BUFFER_PINS,
.ReleaseResource = ResOwnerReleaseBuffer,
.DebugPrint = ResOwnerPrintBuffer
}
static void ResOwnerReleaseBuffer(Datum res)
Definition bufmgr.c:7845
static char * ResOwnerPrintBuffer(Datum res)
Definition bufmgr.c:7881
#define RELEASE_PRIO_BUFFER_PINS
Definition resowner.h:63

Definition at line 294 of file bufmgr.c.

295{
296 .name = "buffer",
297 .release_phase = RESOURCE_RELEASE_BEFORE_LOCKS,
298 .release_priority = RELEASE_PRIO_BUFFER_PINS,
299 .ReleaseResource = ResOwnerReleaseBuffer,
300 .DebugPrint = ResOwnerPrintBuffer
301};

Referenced by ResourceOwnerForgetBuffer(), and ResourceOwnerRememberBuffer().

◆ checkpoint_flush_after

int checkpoint_flush_after = DEFAULT_CHECKPOINT_FLUSH_AFTER

Definition at line 223 of file bufmgr.c.

Referenced by BufferSync().

◆ effective_io_concurrency

◆ io_combine_limit

◆ io_combine_limit_guc

int io_combine_limit_guc = DEFAULT_IO_COMBINE_LIMIT

Definition at line 216 of file bufmgr.c.

Referenced by assign_io_max_combine_limit().

◆ io_max_combine_limit

◆ maintenance_io_concurrency

◆ MaxProportionalPins

uint32 MaxProportionalPins
static

Definition at line 271 of file bufmgr.c.

Referenced by GetAdditionalPinLimit(), GetPinLimit(), and InitBufferManagerAccess().

◆ PinCountWaitBuf

BufferDesc* PinCountWaitBuf = NULL
static

Definition at line 228 of file bufmgr.c.

Referenced by LockBufferForCleanup(), and UnlockBuffers().

◆ PrivateRefCountArray

◆ PrivateRefCountArrayKeys

◆ PrivateRefCountClock

uint32 PrivateRefCountClock = 0
static

Definition at line 267 of file bufmgr.c.

Referenced by ReservePrivateRefCountEntry().

◆ PrivateRefCountEntryLast

int PrivateRefCountEntryLast = -1
static

◆ PrivateRefCountHash

◆ PrivateRefCountOverflowed

◆ ReservedRefCountSlot

int ReservedRefCountSlot = -1
static

◆ track_io_timing

◆ zero_damaged_pages

bool zero_damaged_pages = false

Definition at line 189 of file bufmgr.c.

Referenced by AsyncReadBuffers(), mdreadv(), and read_rel_block_ll().