PostgreSQL Source Code git master
Loading...
Searching...
No Matches
bufmgr.c File Reference
#include "postgres.h"
#include <sys/file.h>
#include <unistd.h>
#include "access/tableam.h"
#include "access/xloginsert.h"
#include "access/xlogutils.h"
#include "catalog/storage.h"
#include "catalog/storage_xlog.h"
#include "common/hashfn.h"
#include "executor/instrument.h"
#include "lib/binaryheap.h"
#include "miscadmin.h"
#include "pg_trace.h"
#include "pgstat.h"
#include "postmaster/bgwriter.h"
#include "storage/aio.h"
#include "storage/buf_internals.h"
#include "storage/bufmgr.h"
#include "storage/fd.h"
#include "storage/ipc.h"
#include "storage/lmgr.h"
#include "storage/proc.h"
#include "storage/proclist.h"
#include "storage/procsignal.h"
#include "storage/read_stream.h"
#include "storage/smgr.h"
#include "storage/standby.h"
#include "utils/memdebug.h"
#include "utils/ps_status.h"
#include "utils/rel.h"
#include "utils/resowner.h"
#include "utils/timestamp.h"
#include "utils/wait_event.h"
#include "lib/simplehash.h"
#include "lib/sort_template.h"
Include dependency graph for bufmgr.c:

Go to the source code of this file.

Data Structures

struct  PrivateRefCountData
 
struct  PrivateRefCountEntry
 
struct  CkptTsStatus
 
struct  SMgrSortArray
 

Macros

#define BufHdrGetBlock(bufHdr)   ((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ))
 
#define BufferGetLSN(bufHdr)   (PageGetLSN(BufHdrGetBlock(bufHdr)))
 
#define LocalBufHdrGetBlock(bufHdr)    LocalBufferBlockPointers[-((bufHdr)->buf_id + 2)]
 
#define BUF_WRITTEN   0x01
 
#define BUF_REUSABLE   0x02
 
#define RELS_BSEARCH_THRESHOLD   20
 
#define BUF_DROP_FULL_SCAN_THRESHOLD   (uint64) (NBuffers / 32)
 
#define SH_PREFIX   refcount
 
#define SH_ELEMENT_TYPE   PrivateRefCountEntry
 
#define SH_KEY_TYPE   Buffer
 
#define SH_KEY   buffer
 
#define SH_HASH_KEY(tb, key)   murmurhash32((uint32) (key))
 
#define SH_EQUAL(tb, a, b)   ((a) == (b))
 
#define SH_SCOPE   static inline
 
#define SH_DECLARE
 
#define SH_DEFINE
 
#define REFCOUNT_ARRAY_ENTRIES   8
 
#define BufferIsPinned(bufnum)
 
#define ST_SORT   sort_checkpoint_bufferids
 
#define ST_ELEMENT_TYPE   CkptSortItem
 
#define ST_COMPARE(a, b)   ckpt_buforder_comparator(a, b)
 
#define ST_SCOPE   static
 
#define ST_DEFINE
 
#define ST_SORT   sort_pending_writebacks
 
#define ST_ELEMENT_TYPE   PendingWriteback
 
#define ST_COMPARE(a, b)   buffertag_comparator(&a->tag, &b->tag)
 
#define ST_SCOPE   static
 
#define ST_DEFINE
 
#define READV_COUNT_BITS   7
 
#define READV_COUNT_MASK   ((1 << READV_COUNT_BITS) - 1)
 

Typedefs

typedef struct PrivateRefCountData PrivateRefCountData
 
typedef struct PrivateRefCountEntry PrivateRefCountEntry
 
typedef struct CkptTsStatus CkptTsStatus
 
typedef struct SMgrSortArray SMgrSortArray
 

Functions

static void ReservePrivateRefCountEntry (void)
 
static PrivateRefCountEntry * NewPrivateRefCountEntry (Buffer buffer)
 
static PrivateRefCountEntry * GetPrivateRefCountEntry (Buffer buffer, bool do_move)
 
static int32 GetPrivateRefCount (Buffer buffer)
 
static void ForgetPrivateRefCountEntry (PrivateRefCountEntry *ref)
 
static void ResOwnerReleaseBufferIO (Datum res)
 
static char * ResOwnerPrintBufferIO (Datum res)
 
static void ResOwnerReleaseBuffer (Datum res)
 
static char * ResOwnerPrintBuffer (Datum res)
 
static pg_noinline PrivateRefCountEntry * GetPrivateRefCountEntrySlow (Buffer buffer, bool do_move)
 
static Buffer ReadBuffer_common (Relation rel, SMgrRelation smgr, char smgr_persistence, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy)
 
static BlockNumber ExtendBufferedRelCommon (BufferManagerRelation bmr, ForkNumber fork, BufferAccessStrategy strategy, uint32 flags, uint32 extend_by, BlockNumber extend_upto, Buffer *buffers, uint32 *extended_by)
 
static BlockNumber ExtendBufferedRelShared (BufferManagerRelation bmr, ForkNumber fork, BufferAccessStrategy strategy, uint32 flags, uint32 extend_by, BlockNumber extend_upto, Buffer *buffers, uint32 *extended_by)
 
static bool PinBuffer (BufferDesc *buf, BufferAccessStrategy strategy, bool skip_if_not_valid)
 
static void PinBuffer_Locked (BufferDesc *buf)
 
static void UnpinBuffer (BufferDesc *buf)
 
static void UnpinBufferNoOwner (BufferDesc *buf)
 
static void BufferSync (int flags)
 
static int SyncOneBuffer (int buf_id, bool skip_recently_used, WritebackContext *wb_context)
 
static void WaitIO (BufferDesc *buf)
 
static void AbortBufferIO (Buffer buffer)
 
static void shared_buffer_write_error_callback (void *arg)
 
static void local_buffer_write_error_callback (void *arg)
 
static BufferDesc * BufferAlloc (SMgrRelation smgr, char relpersistence, ForkNumber forkNum, BlockNumber blockNum, BufferAccessStrategy strategy, bool *foundPtr, IOContext io_context)
 
static bool AsyncReadBuffers (ReadBuffersOperation *operation, int *nblocks_progress)
 
static void CheckReadBuffersOperation (ReadBuffersOperation *operation, bool is_complete)
 
static Buffer GetVictimBuffer (BufferAccessStrategy strategy, IOContext io_context)
 
static void FlushUnlockedBuffer (BufferDesc *buf, SMgrRelation reln, IOObject io_object, IOContext io_context)
 
static void FlushBuffer (BufferDesc *buf, SMgrRelation reln, IOObject io_object, IOContext io_context)
 
static void FindAndDropRelationBuffers (RelFileLocator rlocator, ForkNumber forkNum, BlockNumber nForkBlock, BlockNumber firstDelBlock)
 
static void RelationCopyStorageUsingBuffer (RelFileLocator srclocator, RelFileLocator dstlocator, ForkNumber forkNum, bool permanent)
 
static void AtProcExit_Buffers (int code, Datum arg)
 
static void CheckForBufferLeaks (void)
 
static int rlocator_comparator (const void *p1, const void *p2)
 
static int buffertag_comparator (const BufferTag *ba, const BufferTag *bb)
 
static int ckpt_buforder_comparator (const CkptSortItem *a, const CkptSortItem *b)
 
static int ts_ckpt_progress_comparator (Datum a, Datum b, void *arg)
 
static void BufferLockAcquire (Buffer buffer, BufferDesc *buf_hdr, BufferLockMode mode)
 
static void BufferLockUnlock (Buffer buffer, BufferDesc *buf_hdr)
 
static bool BufferLockConditional (Buffer buffer, BufferDesc *buf_hdr, BufferLockMode mode)
 
static bool BufferLockHeldByMeInMode (BufferDesc *buf_hdr, BufferLockMode mode)
 
static bool BufferLockHeldByMe (BufferDesc *buf_hdr)
 
static void BufferLockDisown (Buffer buffer, BufferDesc *buf_hdr)
 
static int BufferLockDisownInternal (Buffer buffer, BufferDesc *buf_hdr)
 
static bool BufferLockAttempt (BufferDesc *buf_hdr, BufferLockMode mode)
 
static void BufferLockQueueSelf (BufferDesc *buf_hdr, BufferLockMode mode)
 
static void BufferLockDequeueSelf (BufferDesc *buf_hdr)
 
static void BufferLockWakeup (BufferDesc *buf_hdr, bool unlocked)
 
static void BufferLockProcessRelease (BufferDesc *buf_hdr, BufferLockMode mode, uint64 lockstate)
 
static uint64 BufferLockReleaseSub (BufferLockMode mode)
 
PrefetchBufferResult PrefetchSharedBuffer (SMgrRelation smgr_reln, ForkNumber forkNum, BlockNumber blockNum)
 
PrefetchBufferResult PrefetchBuffer (Relation reln, ForkNumber forkNum, BlockNumber blockNum)
 
bool ReadRecentBuffer (RelFileLocator rlocator, ForkNumber forkNum, BlockNumber blockNum, Buffer recent_buffer)
 
Buffer ReadBuffer (Relation reln, BlockNumber blockNum)
 
Buffer ReadBufferExtended (Relation reln, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy)
 
Buffer ReadBufferWithoutRelcache (RelFileLocator rlocator, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy, bool permanent)
 
Buffer ExtendBufferedRel (BufferManagerRelation bmr, ForkNumber forkNum, BufferAccessStrategy strategy, uint32 flags)
 
BlockNumber ExtendBufferedRelBy (BufferManagerRelation bmr, ForkNumber fork, BufferAccessStrategy strategy, uint32 flags, uint32 extend_by, Buffer *buffers, uint32 *extended_by)
 
Buffer ExtendBufferedRelTo (BufferManagerRelation bmr, ForkNumber fork, BufferAccessStrategy strategy, uint32 flags, BlockNumber extend_to, ReadBufferMode mode)
 
static void ZeroAndLockBuffer (Buffer buffer, ReadBufferMode mode, bool already_valid)
 
static pg_attribute_always_inline Buffer PinBufferForBlock (Relation rel, SMgrRelation smgr, char persistence, ForkNumber forkNum, BlockNumber blockNum, BufferAccessStrategy strategy, bool *foundPtr)
 
static pg_attribute_always_inline bool StartReadBuffersImpl (ReadBuffersOperation *operation, Buffer *buffers, BlockNumber blockNum, int *nblocks, int flags, bool allow_forwarding)
 
bool StartReadBuffers (ReadBuffersOperation *operation, Buffer *buffers, BlockNumber blockNum, int *nblocks, int flags)
 
bool StartReadBuffer (ReadBuffersOperation *operation, Buffer *buffer, BlockNumber blocknum, int flags)
 
static bool ReadBuffersCanStartIOOnce (Buffer buffer, bool nowait)
 
static bool ReadBuffersCanStartIO (Buffer buffer, bool nowait)
 
static void ProcessReadBuffersResult (ReadBuffersOperation *operation)
 
void WaitReadBuffers (ReadBuffersOperation *operation)
 
static void InvalidateBuffer (BufferDesc *buf)
 
static bool InvalidateVictimBuffer (BufferDesc *buf_hdr)
 
uint32 GetPinLimit (void)
 
uint32 GetAdditionalPinLimit (void)
 
void LimitAdditionalPins (uint32 *additional_pins)
 
bool BufferIsLockedByMe (Buffer buffer)
 
bool BufferIsLockedByMeInMode (Buffer buffer, BufferLockMode mode)
 
bool BufferIsDirty (Buffer buffer)
 
void MarkBufferDirty (Buffer buffer)
 
Buffer ReleaseAndReadBuffer (Buffer buffer, Relation relation, BlockNumber blockNum)
 
static void WakePinCountWaiter (BufferDesc *buf)
 
void TrackNewBufferPin (Buffer buf)
 
bool BgBufferSync (WritebackContext *wb_context)
 
void AtEOXact_Buffers (bool isCommit)
 
void InitBufferManagerAccess (void)
 
char * DebugPrintBufferRefcount (Buffer buffer)
 
void CheckPointBuffers (int flags)
 
BlockNumber BufferGetBlockNumber (Buffer buffer)
 
void BufferGetTag (Buffer buffer, RelFileLocator *rlocator, ForkNumber *forknum, BlockNumber *blknum)
 
BlockNumber RelationGetNumberOfBlocksInFork (Relation relation, ForkNumber forkNum)
 
bool BufferIsPermanent (Buffer buffer)
 
XLogRecPtr BufferGetLSNAtomic (Buffer buffer)
 
void DropRelationBuffers (SMgrRelation smgr_reln, ForkNumber *forkNum, int nforks, BlockNumber *firstDelBlock)
 
void DropRelationsAllBuffers (SMgrRelation *smgr_reln, int nlocators)
 
void DropDatabaseBuffers (Oid dbid)
 
void FlushRelationBuffers (Relation rel)
 
void FlushRelationsAllBuffers (SMgrRelation *smgrs, int nrels)
 
void CreateAndCopyRelationData (RelFileLocator src_rlocator, RelFileLocator dst_rlocator, bool permanent)
 
void FlushDatabaseBuffers (Oid dbid)
 
void FlushOneBuffer (Buffer buffer)
 
void ReleaseBuffer (Buffer buffer)
 
void UnlockReleaseBuffer (Buffer buffer)
 
void IncrBufferRefCount (Buffer buffer)
 
static void MarkSharedBufferDirtyHint (Buffer buffer, BufferDesc *bufHdr, uint64 lockstate, bool buffer_std)
 
void MarkBufferDirtyHint (Buffer buffer, bool buffer_std)
 
void UnlockBuffers (void)
 
void UnlockBuffer (Buffer buffer)
 
void LockBufferInternal (Buffer buffer, BufferLockMode mode)
 
bool ConditionalLockBuffer (Buffer buffer)
 
void CheckBufferIsPinnedOnce (Buffer buffer)
 
void LockBufferForCleanup (Buffer buffer)
 
bool HoldingBufferPinThatDelaysRecovery (void)
 
bool ConditionalLockBufferForCleanup (Buffer buffer)
 
bool IsBufferCleanupOK (Buffer buffer)
 
static bool SharedBufferBeginSetHintBits (Buffer buffer, BufferDesc *buf_hdr, uint64 *lockstate)
 
bool BufferBeginSetHintBits (Buffer buffer)
 
void BufferFinishSetHintBits (Buffer buffer, bool mark_dirty, bool buffer_std)
 
bool BufferSetHintBits16 (uint16 *ptr, uint16 val, Buffer buffer)
 
bool StartBufferIO (BufferDesc *buf, bool forInput, bool nowait)
 
void TerminateBufferIO (BufferDesc *buf, bool clear_dirty, uint64 set_flag_bits, bool forget_owner, bool release_aio)
 
uint64 LockBufHdr (BufferDesc *desc)
 
pg_noinline uint64 WaitBufHdrUnlocked (BufferDesc *buf)
 
void WritebackContextInit (WritebackContext *context, int *max_pending)
 
void ScheduleBufferTagForWriteback (WritebackContext *wb_context, IOContext io_context, BufferTag *tag)
 
void IssuePendingWritebacks (WritebackContext *wb_context, IOContext io_context)
 
static bool EvictUnpinnedBufferInternal (BufferDesc *desc, bool *buffer_flushed)
 
bool EvictUnpinnedBuffer (Buffer buf, bool *buffer_flushed)
 
void EvictAllUnpinnedBuffers (int32 *buffers_evicted, int32 *buffers_flushed, int32 *buffers_skipped)
 
void EvictRelUnpinnedBuffers (Relation rel, int32 *buffers_evicted, int32 *buffers_flushed, int32 *buffers_skipped)
 
static bool MarkDirtyUnpinnedBufferInternal (Buffer buf, BufferDesc *desc, bool *buffer_already_dirty)
 
bool MarkDirtyUnpinnedBuffer (Buffer buf, bool *buffer_already_dirty)
 
void MarkDirtyRelUnpinnedBuffers (Relation rel, int32 *buffers_dirtied, int32 *buffers_already_dirty, int32 *buffers_skipped)
 
void MarkDirtyAllUnpinnedBuffers (int32 *buffers_dirtied, int32 *buffers_already_dirty, int32 *buffers_skipped)
 
static pg_attribute_always_inline void buffer_stage_common (PgAioHandle *ioh, bool is_write, bool is_temp)
 
static void buffer_readv_decode_error (PgAioResult result, bool *zeroed_any, bool *ignored_any, uint8 *zeroed_or_error_count, uint8 *checkfail_count, uint8 *first_off)
 
static void buffer_readv_encode_error (PgAioResult *result, bool is_temp, bool zeroed_any, bool ignored_any, uint8 error_count, uint8 zeroed_count, uint8 checkfail_count, uint8 first_error_off, uint8 first_zeroed_off, uint8 first_ignored_off)
 
static pg_attribute_always_inline void buffer_readv_complete_one (PgAioTargetData *td, uint8 buf_off, Buffer buffer, uint8 flags, bool failed, bool is_temp, bool *buffer_invalid, bool *failed_checksum, bool *ignored_checksum, bool *zeroed_buffer)
 
static pg_attribute_always_inline PgAioResult buffer_readv_complete (PgAioHandle *ioh, PgAioResult prior_result, uint8 cb_data, bool is_temp)
 
static void buffer_readv_report (PgAioResult result, const PgAioTargetData *td, int elevel)
 
static void shared_buffer_readv_stage (PgAioHandle *ioh, uint8 cb_data)
 
static PgAioResult shared_buffer_readv_complete (PgAioHandle *ioh, PgAioResult prior_result, uint8 cb_data)
 
static PgAioResult shared_buffer_readv_complete_local (PgAioHandle *ioh, PgAioResult prior_result, uint8 cb_data)
 
static void local_buffer_readv_stage (PgAioHandle *ioh, uint8 cb_data)
 
static PgAioResult local_buffer_readv_complete (PgAioHandle *ioh, PgAioResult prior_result, uint8 cb_data)
 

Variables

bool zero_damaged_pages = false
 
int bgwriter_lru_maxpages = 100
 
double bgwriter_lru_multiplier = 2.0
 
bool track_io_timing = false
 
int effective_io_concurrency = DEFAULT_EFFECTIVE_IO_CONCURRENCY
 
int maintenance_io_concurrency = DEFAULT_MAINTENANCE_IO_CONCURRENCY
 
int io_combine_limit = DEFAULT_IO_COMBINE_LIMIT
 
int io_combine_limit_guc = DEFAULT_IO_COMBINE_LIMIT
 
int io_max_combine_limit = DEFAULT_IO_COMBINE_LIMIT
 
int checkpoint_flush_after = DEFAULT_CHECKPOINT_FLUSH_AFTER
 
int bgwriter_flush_after = DEFAULT_BGWRITER_FLUSH_AFTER
 
int backend_flush_after = DEFAULT_BACKEND_FLUSH_AFTER
 
static BufferDesc * PinCountWaitBuf = NULL
 
static Buffer PrivateRefCountArrayKeys [REFCOUNT_ARRAY_ENTRIES]
 
static struct PrivateRefCountEntry PrivateRefCountArray [REFCOUNT_ARRAY_ENTRIES]
 
static refcount_hash * PrivateRefCountHash = NULL
 
static int32 PrivateRefCountOverflowed = 0
 
static uint32 PrivateRefCountClock = 0
 
static int ReservedRefCountSlot = -1
 
static int PrivateRefCountEntryLast = -1
 
static uint32 MaxProportionalPins
 
const ResourceOwnerDesc buffer_io_resowner_desc
 
const ResourceOwnerDesc buffer_resowner_desc
 
const PgAioHandleCallbacks aio_shared_buffer_readv_cb
 
const PgAioHandleCallbacks aio_local_buffer_readv_cb
 

Macro Definition Documentation

◆ BUF_DROP_FULL_SCAN_THRESHOLD

#define BUF_DROP_FULL_SCAN_THRESHOLD   (uint64) (NBuffers / 32)

Definition at line 95 of file bufmgr.c.

◆ BUF_REUSABLE

#define BUF_REUSABLE   0x02

Definition at line 85 of file bufmgr.c.

◆ BUF_WRITTEN

#define BUF_WRITTEN   0x01

Definition at line 84 of file bufmgr.c.

◆ BufferGetLSN

#define BufferGetLSN (   bufHdr)    (PageGetLSN(BufHdrGetBlock(bufHdr)))

Definition at line 77 of file bufmgr.c.

◆ BufferIsPinned

#define BufferIsPinned (   bufnum)
Value:
( \
	!BufferIsValid(bufnum) ? \
		false \
	: \
		BufferIsLocal(bufnum) ? \
			(LocalRefCount[-(bufnum) - 1] > 0) \
		: \
			(GetPrivateRefCount(bufnum) > 0) \
)
static int32 GetPrivateRefCount(Buffer buffer)
Definition bufmgr.c:542
static bool BufferIsValid(Buffer bufnum)
Definition bufmgr.h:421
int32 * LocalRefCount
Definition localbuf.c:49
static int fb(int x)

Definition at line 599 of file bufmgr.c.

603 : \
605 (LocalRefCount[-(bufnum) - 1] > 0) \
606 : \
608)

◆ BufHdrGetBlock

#define BufHdrGetBlock (   bufHdr)    ((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ))

Definition at line 76 of file bufmgr.c.

◆ LocalBufHdrGetBlock

#define LocalBufHdrGetBlock (   bufHdr)     LocalBufferBlockPointers[-((bufHdr)->buf_id + 2)]

Definition at line 80 of file bufmgr.c.

◆ READV_COUNT_BITS

#define READV_COUNT_BITS   7

◆ READV_COUNT_MASK

#define READV_COUNT_MASK   ((1 << READV_COUNT_BITS) - 1)

◆ REFCOUNT_ARRAY_ENTRIES

#define REFCOUNT_ARRAY_ENTRIES   8

Definition at line 145 of file bufmgr.c.

◆ RELS_BSEARCH_THRESHOLD

#define RELS_BSEARCH_THRESHOLD   20

Definition at line 87 of file bufmgr.c.

◆ SH_DECLARE

#define SH_DECLARE

Definition at line 140 of file bufmgr.c.

◆ SH_DEFINE

#define SH_DEFINE

Definition at line 141 of file bufmgr.c.

◆ SH_ELEMENT_TYPE

#define SH_ELEMENT_TYPE   PrivateRefCountEntry

Definition at line 134 of file bufmgr.c.

◆ SH_EQUAL

#define SH_EQUAL (   tb,
  a,
  b 
)    ((a) == (b))

Definition at line 138 of file bufmgr.c.

◆ SH_HASH_KEY

#define SH_HASH_KEY (   tb,
  key 
)    murmurhash32((uint32) (key))

Definition at line 137 of file bufmgr.c.

◆ SH_KEY

#define SH_KEY   buffer

Definition at line 136 of file bufmgr.c.

◆ SH_KEY_TYPE

#define SH_KEY_TYPE   Buffer

Definition at line 135 of file bufmgr.c.

◆ SH_PREFIX

#define SH_PREFIX   refcount

Definition at line 133 of file bufmgr.c.

◆ SH_SCOPE

#define SH_SCOPE   static inline

Definition at line 139 of file bufmgr.c.

◆ ST_COMPARE [1/2]

#define ST_COMPARE (   a,
  b 
)    ckpt_buforder_comparator(a, b)

Definition at line 3447 of file bufmgr.c.

◆ ST_COMPARE [2/2]

#define ST_COMPARE (   a,
  b 
)    buffertag_comparator(&a->tag, &b->tag)

Definition at line 3447 of file bufmgr.c.

◆ ST_DEFINE [1/2]

#define ST_DEFINE

Definition at line 3449 of file bufmgr.c.

◆ ST_DEFINE [2/2]

#define ST_DEFINE

Definition at line 3449 of file bufmgr.c.

◆ ST_ELEMENT_TYPE [1/2]

#define ST_ELEMENT_TYPE   CkptSortItem

Definition at line 3446 of file bufmgr.c.

◆ ST_ELEMENT_TYPE [2/2]

#define ST_ELEMENT_TYPE   PendingWriteback

Definition at line 3446 of file bufmgr.c.

◆ ST_SCOPE [1/2]

#define ST_SCOPE   static

Definition at line 3448 of file bufmgr.c.

◆ ST_SCOPE [2/2]

#define ST_SCOPE   static

Definition at line 3448 of file bufmgr.c.

◆ ST_SORT [1/2]

Definition at line 3445 of file bufmgr.c.

◆ ST_SORT [2/2]

Definition at line 3445 of file bufmgr.c.

Typedef Documentation

◆ CkptTsStatus

◆ PrivateRefCountData

◆ PrivateRefCountEntry

◆ SMgrSortArray

Function Documentation

◆ AbortBufferIO()

static void AbortBufferIO ( Buffer  buffer)
static

Definition at line 7203 of file bufmgr.c.

7204{
7205 BufferDesc *buf_hdr = GetBufferDescriptor(buffer - 1);
7207
7210
7211 if (!(buf_state & BM_VALID))
7212 {
7215 }
7216 else
7217 {
7220
7221 /* Issue notice if this is not the first failure... */
7222 if (buf_state & BM_IO_ERROR)
7223 {
7224 /* Buffer is pinned, so we can read tag without spinlock */
7227 errmsg("could not write block %u of %s",
7228 buf_hdr->tag.blockNum,
7230 BufTagGetForkNum(&buf_hdr->tag)).str),
7231 errdetail("Multiple failures --- write error might be permanent.")));
7232 }
7233 }
7234
7235 TerminateBufferIO(buf_hdr, false, BM_IO_ERROR, false, false);
7236}
#define BM_TAG_VALID
static ForkNumber BufTagGetForkNum(const BufferTag *tag)
static void UnlockBufHdr(BufferDesc *desc)
#define BM_DIRTY
#define BM_IO_IN_PROGRESS
static RelFileLocator BufTagGetRelFileLocator(const BufferTag *tag)
#define BM_VALID
#define BM_IO_ERROR
static BufferDesc * GetBufferDescriptor(uint32 id)
uint64 LockBufHdr(BufferDesc *desc)
Definition bufmgr.c:7301
void TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint64 set_flag_bits, bool forget_owner, bool release_aio)
Definition bufmgr.c:7141
#define Assert(condition)
Definition c.h:945
uint64_t uint64
Definition c.h:619
int errcode(int sqlerrcode)
Definition elog.c:874
int errdetail(const char *fmt,...) pg_attribute_printf(1
#define WARNING
Definition elog.h:36
#define ereport(elevel,...)
Definition elog.h:150
static char * errmsg
#define relpathperm(rlocator, forknum)
Definition relpath.h:146

References Assert, BM_DIRTY, BM_IO_ERROR, BM_IO_IN_PROGRESS, BM_TAG_VALID, BM_VALID, PrivateRefCountEntry::buffer, BufTagGetForkNum(), BufTagGetRelFileLocator(), ereport, errcode(), errdetail(), errmsg, fb(), GetBufferDescriptor(), LockBufHdr(), relpathperm, TerminateBufferIO(), UnlockBufHdr(), and WARNING.

Referenced by ResOwnerReleaseBufferIO().

◆ AsyncReadBuffers()

static bool AsyncReadBuffers ( ReadBuffersOperation * operation,
int * nblocks_progress 
)
static

Definition at line 1874 of file bufmgr.c.

1875{
1876 Buffer *buffers = &operation->buffers[0];
1877 int flags = operation->flags;
1878 BlockNumber blocknum = operation->blocknum;
1879 ForkNumber forknum = operation->forknum;
1880 char persistence = operation->persistence;
1881 int16 nblocks_done = operation->nblocks_done;
1882 Buffer *io_buffers = &operation->buffers[nblocks_done];
1883 int io_buffers_len = 0;
1885 uint32 ioh_flags = 0;
1889 bool did_start_io;
1890
1891 /*
1892 * When this IO is executed synchronously, either because the caller will
1893 * immediately block waiting for the IO or because IOMETHOD_SYNC is used,
1894 * the AIO subsystem needs to know.
1895 */
1896 if (flags & READ_BUFFERS_SYNCHRONOUSLY)
1898
1899 if (persistence == RELPERSISTENCE_TEMP)
1900 {
1904 }
1905 else
1906 {
1909 }
1910
1911 /*
1912 * If zero_damaged_pages is enabled, add the READ_BUFFERS_ZERO_ON_ERROR
1913 * flag. The reason for that is that, hopefully, zero_damaged_pages isn't
1914 * set globally, but on a per-session basis. The completion callback,
1915 * which may be run in other processes, e.g. in IO workers, may have a
1916 * different value of the zero_damaged_pages GUC.
1917 *
1918 * XXX: We probably should eventually use a different flag for
1919 * zero_damaged_pages, so we can report different log levels / error codes
1920 * for zero_damaged_pages and ZERO_ON_ERROR.
1921 */
1924
1925 /*
1926 * For the same reason as with zero_damaged_pages we need to use this
1927 * backend's ignore_checksum_failure value.
1928 */
1931
1932
1933 /*
1934 * To be allowed to report stats in the local completion callback we need
1935 * to prepare to report stats now. This ensures we can safely report the
1936 * checksum failure even in a critical section.
1937 */
1939
1940 /*
1941 * Get IO handle before ReadBuffersCanStartIO(), as pgaio_io_acquire()
1942 * might block, which we don't want after setting IO_IN_PROGRESS.
1943 *
1944 * If we need to wait for IO before we can get a handle, submit
1945 * already-staged IO first, so that other backends don't need to wait.
1946 * There wouldn't be a deadlock risk, as pgaio_io_acquire() just needs to
1947 * wait for already submitted IO, which doesn't require additional locks,
1948 * but it could still cause undesirable waits.
1949 *
1950 * A secondary benefit is that this would allow us to measure the time in
1951 * pgaio_io_acquire() without causing undue timer overhead in the common,
1952 * non-blocking, case. However, currently the pgstats infrastructure
1953 * doesn't really allow that, as it a) asserts that an operation can't
1954 * have time without operations b) doesn't have an API to report
1955 * "accumulated" time.
1956 */
1958 if (unlikely(!ioh))
1959 {
1961
1963 }
1964
1965 /*
1966 * Check if we can start IO on the first to-be-read buffer.
1967 *
1968 * If an I/O is already in progress in another backend, we want to wait
1969 * for the outcome: either done, or something went wrong and we will
1970 * retry.
1971 */
1972 if (!ReadBuffersCanStartIO(buffers[nblocks_done], false))
1973 {
1974 /*
1975 * Someone else has already completed this block, we're done.
1976 *
1977 * When IO is necessary, ->nblocks_done is updated in
1978 * ProcessReadBuffersResult(), but that is not called if no IO is
1979 * necessary. Thus update here.
1980 */
1981 operation->nblocks_done += 1;
1982 *nblocks_progress = 1;
1983
1985 pgaio_wref_clear(&operation->io_wref);
1986 did_start_io = false;
1987
1988 /*
1989 * Report and track this as a 'hit' for this backend, even though it
1990 * must have started out as a miss in PinBufferForBlock(). The other
1991 * backend will track this as a 'read'.
1992 */
1993 TRACE_POSTGRESQL_BUFFER_READ_DONE(forknum, blocknum + operation->nblocks_done,
1994 operation->smgr->smgr_rlocator.locator.spcOid,
1995 operation->smgr->smgr_rlocator.locator.dbOid,
1996 operation->smgr->smgr_rlocator.locator.relNumber,
1997 operation->smgr->smgr_rlocator.backend,
1998 true);
1999
2000 if (persistence == RELPERSISTENCE_TEMP)
2002 else
2004
2005 if (operation->rel)
2006 pgstat_count_buffer_hit(operation->rel);
2007
2009
2010 if (VacuumCostActive)
2012 }
2013 else
2014 {
2016
2017 /* We found a buffer that we need to read in. */
2018 Assert(io_buffers[0] == buffers[nblocks_done]);
2019 io_pages[0] = BufferGetBlock(buffers[nblocks_done]);
2020 io_buffers_len = 1;
2021
2022 /*
2023 * How many neighboring-on-disk blocks can we scatter-read into other
2024 * buffers at the same time? In this case we don't wait if we see an
2025 * I/O already in progress. We already set BM_IO_IN_PROGRESS for the
2026 * head block, so we should get on with that I/O as soon as possible.
2027 */
2028 for (int i = nblocks_done + 1; i < operation->nblocks; i++)
2029 {
2030 if (!ReadBuffersCanStartIO(buffers[i], true))
2031 break;
2032 /* Must be consecutive block numbers. */
2033 Assert(BufferGetBlockNumber(buffers[i - 1]) ==
2034 BufferGetBlockNumber(buffers[i]) - 1);
2035 Assert(io_buffers[io_buffers_len] == buffers[i]);
2036
2037 io_pages[io_buffers_len++] = BufferGetBlock(buffers[i]);
2038 }
2039
2040 /* get a reference to wait for in WaitReadBuffers() */
2041 pgaio_io_get_wref(ioh, &operation->io_wref);
2042
2043 /* provide the list of buffers to the completion callbacks */
2045
2047 persistence == RELPERSISTENCE_TEMP ?
2050 flags);
2051
2053
2054 /* ---
2055 * Even though we're trying to issue IO asynchronously, track the time
2056 * in smgrstartreadv():
2057 * - if io_method == IOMETHOD_SYNC, we will always perform the IO
2058 * immediately
2059 * - the io method might not support the IO (e.g. worker IO for a temp
2060 * table)
2061 * ---
2062 */
2064 smgrstartreadv(ioh, operation->smgr, forknum,
2065 blocknum + nblocks_done,
2069
2070 if (persistence == RELPERSISTENCE_TEMP)
2072 else
2074
2075 /*
2076 * Track vacuum cost when issuing IO, not after waiting for it.
2077 * Otherwise we could end up issuing a lot of IO in a short timespan,
2078 * despite a low cost limit.
2079 */
2080 if (VacuumCostActive)
2082
2084 did_start_io = true;
2085 }
2086
2087 return did_start_io;
2088}
PgAioHandle * pgaio_io_acquire(struct ResourceOwnerData *resowner, PgAioReturn *ret)
Definition aio.c:162
void pgaio_wref_clear(PgAioWaitRef *iow)
Definition aio.c:964
void pgaio_io_get_wref(PgAioHandle *ioh, PgAioWaitRef *iow)
Definition aio.c:366
void pgaio_io_set_flag(PgAioHandle *ioh, PgAioHandleFlags flag)
Definition aio.c:330
void pgaio_submit_staged(void)
Definition aio.c:1123
void pgaio_io_release(PgAioHandle *ioh)
Definition aio.c:240
PgAioHandle * pgaio_io_acquire_nb(struct ResourceOwnerData *resowner, PgAioReturn *ret)
Definition aio.c:188
@ PGAIO_HCB_LOCAL_BUFFER_READV
Definition aio.h:200
@ PGAIO_HCB_SHARED_BUFFER_READV
Definition aio.h:198
@ PGAIO_HF_SYNCHRONOUS
Definition aio.h:70
@ PGAIO_HF_REFERENCES_LOCAL
Definition aio.h:60
void pgaio_io_set_handle_data_32(PgAioHandle *ioh, uint32 *data, uint8 len)
void pgaio_io_register_callbacks(PgAioHandle *ioh, PgAioHandleCallbackID cb_id, uint8 cb_data)
uint32 BlockNumber
Definition block.h:31
int Buffer
Definition buf.h:23
bool track_io_timing
Definition bufmgr.c:192
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition bufmgr.c:4357
static bool ReadBuffersCanStartIO(Buffer buffer, bool nowait)
Definition bufmgr.c:1674
bool zero_damaged_pages
Definition bufmgr.c:189
#define READ_BUFFERS_ZERO_ON_ERROR
Definition bufmgr.h:122
static Block BufferGetBlock(Buffer buffer)
Definition bufmgr.h:437
#define MAX_IO_COMBINE_LIMIT
Definition bufmgr.h:173
#define READ_BUFFERS_IGNORE_CHECKSUM_FAILURES
Definition bufmgr.h:126
#define READ_BUFFERS_SYNCHRONOUSLY
Definition bufmgr.h:128
bool ignore_checksum_failure
Definition bufpage.c:27
int16_t int16
Definition c.h:613
#define unlikely(x)
Definition c.h:432
uint32_t uint32
Definition c.h:618
IOContext IOContextForStrategy(BufferAccessStrategy strategy)
Definition freelist.c:747
int VacuumCostPageMiss
Definition globals.c:152
bool VacuumCostActive
Definition globals.c:158
int VacuumCostBalance
Definition globals.c:157
int VacuumCostPageHit
Definition globals.c:151
BufferUsage pgBufferUsage
Definition instrument.c:20
int i
Definition isn.c:77
IOObject
Definition pgstat.h:280
@ IOOBJECT_RELATION
Definition pgstat.h:281
@ IOOBJECT_TEMP_RELATION
Definition pgstat.h:282
IOContext
Definition pgstat.h:289
@ IOCONTEXT_NORMAL
Definition pgstat.h:293
@ IOOP_READ
Definition pgstat.h:319
@ IOOP_HIT
Definition pgstat.h:313
#define pgstat_count_buffer_hit(rel)
Definition pgstat.h:751
void pgstat_prepare_report_checksum_failure(Oid dboid)
instr_time pgstat_prepare_io_time(bool track_io_guc)
Definition pgstat_io.c:91
void pgstat_count_io_op(IOObject io_object, IOContext io_context, IOOp io_op, uint32 cnt, uint64 bytes)
Definition pgstat_io.c:68
void pgstat_count_io_op_time(IOObject io_object, IOContext io_context, IOOp io_op, instr_time start_time, uint32 cnt, uint64 bytes)
Definition pgstat_io.c:122
ForkNumber
Definition relpath.h:56
ResourceOwner CurrentResourceOwner
Definition resowner.c:173
void smgrstartreadv(PgAioHandle *ioh, SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, void **buffers, BlockNumber nblocks)
Definition smgr.c:753
int64 local_blks_hit
Definition instrument.h:30
int64 shared_blks_read
Definition instrument.h:27
int64 local_blks_read
Definition instrument.h:31
int64 shared_blks_hit
Definition instrument.h:26
ForkNumber forknum
Definition bufmgr.h:137
PgAioWaitRef io_wref
Definition bufmgr.h:150
SMgrRelation smgr
Definition bufmgr.h:135
BufferAccessStrategy strategy
Definition bufmgr.h:138
BlockNumber blocknum
Definition bufmgr.h:146
PgAioReturn io_return
Definition bufmgr.h:151
RelFileLocator locator
RelFileNumber relNumber
RelFileLocatorBackend smgr_rlocator
Definition smgr.h:38

References Assert, RelFileLocatorBackend::backend, ReadBuffersOperation::blocknum, BufferGetBlock(), BufferGetBlockNumber(), ReadBuffersOperation::buffers, CurrentResourceOwner, RelFileLocator::dbOid, fb(), ReadBuffersOperation::flags, ReadBuffersOperation::forknum, i, ignore_checksum_failure, ReadBuffersOperation::io_return, ReadBuffersOperation::io_wref, IOCONTEXT_NORMAL, IOContextForStrategy(), IOOBJECT_RELATION, IOOBJECT_TEMP_RELATION, IOOP_HIT, IOOP_READ, BufferUsage::local_blks_hit, BufferUsage::local_blks_read, RelFileLocatorBackend::locator, MAX_IO_COMBINE_LIMIT, ReadBuffersOperation::nblocks, ReadBuffersOperation::nblocks_done, ReadBuffersOperation::persistence, PGAIO_HCB_LOCAL_BUFFER_READV, PGAIO_HCB_SHARED_BUFFER_READV, PGAIO_HF_REFERENCES_LOCAL, PGAIO_HF_SYNCHRONOUS, pgaio_io_acquire(), pgaio_io_acquire_nb(), pgaio_io_get_wref(), pgaio_io_register_callbacks(), pgaio_io_release(), pgaio_io_set_flag(), pgaio_io_set_handle_data_32(), pgaio_submit_staged(), pgaio_wref_clear(), pgBufferUsage, pgstat_count_buffer_hit, pgstat_count_io_op(), pgstat_count_io_op_time(), pgstat_prepare_io_time(), pgstat_prepare_report_checksum_failure(), READ_BUFFERS_IGNORE_CHECKSUM_FAILURES, READ_BUFFERS_SYNCHRONOUSLY, READ_BUFFERS_ZERO_ON_ERROR, ReadBuffersCanStartIO(), ReadBuffersOperation::rel, RelFileLocator::relNumber, BufferUsage::shared_blks_hit, BufferUsage::shared_blks_read, ReadBuffersOperation::smgr, SMgrRelationData::smgr_rlocator, smgrstartreadv(), RelFileLocator::spcOid, ReadBuffersOperation::strategy, track_io_timing, unlikely, VacuumCostActive, VacuumCostBalance, VacuumCostPageHit, VacuumCostPageMiss, and zero_damaged_pages.

Referenced by StartReadBuffersImpl(), and WaitReadBuffers().

◆ AtEOXact_Buffers()

void AtEOXact_Buffers ( bool  isCommit)

Definition at line 4110 of file bufmgr.c.

4111{
4113
4115
4117}
static void CheckForBufferLeaks(void)
Definition bufmgr.c:4174
static int32 PrivateRefCountOverflowed
Definition bufmgr.c:266
void AtEOXact_LocalBuffers(bool isCommit)
Definition localbuf.c:1003

References Assert, AtEOXact_LocalBuffers(), CheckForBufferLeaks(), fb(), and PrivateRefCountOverflowed.

Referenced by AbortTransaction(), AutoVacLauncherMain(), BackgroundWriterMain(), CheckpointerMain(), CommitTransaction(), PrepareTransaction(), and WalWriterMain().

◆ AtProcExit_Buffers()

static void AtProcExit_Buffers ( int  code,
Datum  arg 
)
static

Definition at line 4156 of file bufmgr.c.

4157{
4158 UnlockBuffers();
4159
4161
4162 /* localbuf.c needs a chance too */
4164}
void UnlockBuffers(void)
Definition bufmgr.c:5719
void AtProcExit_LocalBuffers(void)
Definition localbuf.c:1014

References AtProcExit_LocalBuffers(), CheckForBufferLeaks(), and UnlockBuffers().

Referenced by InitBufferManagerAccess().

◆ BgBufferSync()

bool BgBufferSync ( WritebackContext wb_context)

Definition at line 3742 of file bufmgr.c.

3743{
3744 /* info obtained from freelist.c */
3745 int strategy_buf_id;
3748
3749 /*
3750 * Information saved between calls so we can determine the strategy
3751 * point's advance rate and avoid scanning already-cleaned buffers.
3752 */
3753 static bool saved_info_valid = false;
3754 static int prev_strategy_buf_id;
3756 static int next_to_clean;
3757 static uint32 next_passes;
3758
3759 /* Moving averages of allocation rate and clean-buffer density */
3760 static float smoothed_alloc = 0;
3761 static float smoothed_density = 10.0;
3762
3763 /* Potentially these could be tunables, but for now, not */
3764 float smoothing_samples = 16;
3765 float scan_whole_pool_milliseconds = 120000.0;
3766
3767 /* Used to compute how far we scan ahead */
3768 long strategy_delta;
3769 int bufs_to_lap;
3770 int bufs_ahead;
3771 float scans_per_alloc;
3774 int min_scan_buffers;
3775
3776 /* Variables for the scanning loop proper */
3777 int num_to_scan;
3778 int num_written;
3779 int reusable_buffers;
3780
3781 /* Variables for final smoothed_density update */
3782 long new_strategy_delta;
3784
3785 /*
3786 * Find out where the clock-sweep currently is, and how many buffer
3787 * allocations have happened since our last call.
3788 */
3790
3791 /* Report buffer alloc counts to pgstat */
3793
3794 /*
3795 * If we're not running the LRU scan, just stop after doing the stats
3796 * stuff. We mark the saved state invalid so that we can recover sanely
3797 * if LRU scan is turned back on later.
3798 */
3799 if (bgwriter_lru_maxpages <= 0)
3800 {
3801 saved_info_valid = false;
3802 return true;
3803 }
3804
3805 /*
3806 * Compute strategy_delta = how many buffers have been scanned by the
3807 * clock-sweep since last time. If first time through, assume none. Then
3808 * see if we are still ahead of the clock-sweep, and if so, how many
3809 * buffers we could scan before we'd catch up with it and "lap" it. Note:
3810 * weird-looking coding of xxx_passes comparisons are to avoid bogus
3811 * behavior when the passes counts wrap around.
3812 */
3813 if (saved_info_valid)
3814 {
3816
3819
3820 Assert(strategy_delta >= 0);
3821
3822 if ((int32) (next_passes - strategy_passes) > 0)
3823 {
3824 /* we're one pass ahead of the strategy point */
3826#ifdef BGW_DEBUG
3827 elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
3831#endif
3832 }
3833 else if (next_passes == strategy_passes &&
3835 {
3836 /* on same pass, but ahead or at least not behind */
3838#ifdef BGW_DEBUG
3839 elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
3843#endif
3844 }
3845 else
3846 {
3847 /*
3848 * We're behind, so skip forward to the strategy point and start
3849 * cleaning from there.
3850 */
3851#ifdef BGW_DEBUG
3852 elog(DEBUG2, "bgwriter behind: bgw %u-%u strategy %u-%u delta=%ld",
3856#endif
3860 }
3861 }
3862 else
3863 {
3864 /*
3865 * Initializing at startup or after LRU scanning had been off. Always
3866 * start at the strategy point.
3867 */
3868#ifdef BGW_DEBUG
3869 elog(DEBUG2, "bgwriter initializing: strategy %u-%u",
3871#endif
3872 strategy_delta = 0;
3876 }
3877
3878 /* Update saved info for next time */
3881 saved_info_valid = true;
3882
3883 /*
3884 * Compute how many buffers had to be scanned for each new allocation, ie,
3885 * 1/density of reusable buffers, and track a moving average of that.
3886 *
3887 * If the strategy point didn't move, we don't update the density estimate
3888 */
3889 if (strategy_delta > 0 && recent_alloc > 0)
3890 {
3894 }
3895
3896 /*
3897 * Estimate how many reusable buffers there are between the current
3898 * strategy point and where we've scanned ahead to, based on the smoothed
3899 * density estimate.
3900 */
3903
3904 /*
3905 * Track a moving average of recent buffer allocations. Here, rather than
3906 * a true average we want a fast-attack, slow-decline behavior: we
3907 * immediately follow any increase.
3908 */
3909 if (smoothed_alloc <= (float) recent_alloc)
3911 else
3914
3915 /* Scale the estimate by a GUC to allow more aggressive tuning. */
3917
3918 /*
3919 * If recent_alloc remains at zero for many cycles, smoothed_alloc will
3920 * eventually underflow to zero, and the underflows produce annoying
3921 * kernel warnings on some platforms. Once upcoming_alloc_est has gone to
3922 * zero, there's no point in tracking smaller and smaller values of
3923 * smoothed_alloc, so just reset it to exactly zero to avoid this
3924 * syndrome. It will pop back up as soon as recent_alloc increases.
3925 */
3926 if (upcoming_alloc_est == 0)
3927 smoothed_alloc = 0;
3928
3929 /*
3930 * Even in cases where there's been little or no buffer allocation
3931 * activity, we want to make a small amount of progress through the buffer
3932 * cache so that as many reusable buffers as possible are clean after an
3933 * idle period.
3934 *
3935 * (scan_whole_pool_milliseconds / BgWriterDelay) computes how many times
3936 * the BGW will be called during the scan_whole_pool time; slice the
3937 * buffer pool into that many sections.
3938 */
3940
3942 {
3943#ifdef BGW_DEBUG
3944 elog(DEBUG2, "bgwriter: alloc_est=%d too small, using min=%d + reusable_est=%d",
3946#endif
3948 }
3949
3950 /*
3951 * Now write out dirty reusable buffers, working forward from the
3952 * next_to_clean point, until we have lapped the strategy scan, or cleaned
3953 * enough buffers to match our estimate of the next cycle's allocation
3954 * requirements, or hit the bgwriter_lru_maxpages limit.
3955 */
3956
3957 num_to_scan = bufs_to_lap;
3958 num_written = 0;
3960
3961 /* Execute the LRU scan */
3962 while (num_to_scan > 0 && reusable_buffers < upcoming_alloc_est)
3963 {
3965 wb_context);
3966
3967 if (++next_to_clean >= NBuffers)
3968 {
3969 next_to_clean = 0;
3970 next_passes++;
3971 }
3972 num_to_scan--;
3973
3974 if (sync_state & BUF_WRITTEN)
3975 {
3978 {
3980 break;
3981 }
3982 }
3983 else if (sync_state & BUF_REUSABLE)
3985 }
3986
3988
3989#ifdef BGW_DEBUG
3990 elog(DEBUG1, "bgwriter: recent_alloc=%u smoothed=%.2f delta=%ld ahead=%d density=%.2f reusable_est=%d upcoming_est=%d scanned=%d wrote=%d reusable=%d",
3993 bufs_to_lap - num_to_scan,
3996#endif
3997
3998 /*
3999 * Consider the above scan as being like a new allocation scan.
4000 * Characterize its density and update the smoothed one based on it. This
4001 * effectively halves the moving average period in cases where both the
4002 * strategy and the background writer are doing some useful scanning,
4003 * which is helpful because a long memory isn't as desirable on the
4004 * density estimates.
4005 */
4006 new_strategy_delta = bufs_to_lap - num_to_scan;
4008 if (new_strategy_delta > 0 && new_recent_alloc > 0)
4009 {
4013
4014#ifdef BGW_DEBUG
4015 elog(DEBUG2, "bgwriter: cleaner density alloc=%u scan=%ld density=%.2f new smoothed=%.2f",
4018#endif
4019 }
4020
4021 /* Return true if OK to hibernate */
4022 return (bufs_to_lap == 0 && recent_alloc == 0);
4023}
int BgWriterDelay
Definition bgwriter.c:59
#define BUF_REUSABLE
Definition bufmgr.c:85
double bgwriter_lru_multiplier
Definition bufmgr.c:191
static int SyncOneBuffer(int buf_id, bool skip_recently_used, WritebackContext *wb_context)
Definition bufmgr.c:4040
int bgwriter_lru_maxpages
Definition bufmgr.c:190
#define BUF_WRITTEN
Definition bufmgr.c:84
int32_t int32
Definition c.h:614
#define DEBUG2
Definition elog.h:29
#define DEBUG1
Definition elog.h:30
#define elog(elevel,...)
Definition elog.h:226
int StrategySyncStart(uint32 *complete_passes, uint32 *num_buf_alloc)
Definition freelist.c:321
int NBuffers
Definition globals.c:142
PgStat_BgWriterStats PendingBgWriterStats
PgStat_Counter buf_written_clean
Definition pgstat.h:246
PgStat_Counter maxwritten_clean
Definition pgstat.h:247
PgStat_Counter buf_alloc
Definition pgstat.h:248

References Assert, bgwriter_lru_maxpages, bgwriter_lru_multiplier, BgWriterDelay, PgStat_BgWriterStats::buf_alloc, BUF_REUSABLE, BUF_WRITTEN, PgStat_BgWriterStats::buf_written_clean, DEBUG1, DEBUG2, elog, fb(), PgStat_BgWriterStats::maxwritten_clean, NBuffers, PendingBgWriterStats, StrategySyncStart(), and SyncOneBuffer().

Referenced by BackgroundWriterMain().

◆ buffer_readv_complete()

static pg_attribute_always_inline PgAioResult buffer_readv_complete ( PgAioHandle ioh,
PgAioResult  prior_result,
uint8  cb_data,
bool  is_temp 
)
static

Definition at line 8452 of file bufmgr.c.

8454{
8455 PgAioResult result = prior_result;
8460 uint8 error_count = 0;
8461 uint8 zeroed_count = 0;
8462 uint8 ignored_count = 0;
8464 uint64 *io_data;
8465 uint8 handle_data_len;
8466
8467 if (is_temp)
8468 {
8469 Assert(td->smgr.is_temp);
8471 }
8472 else
8473 Assert(!td->smgr.is_temp);
8474
8475 /*
8476 * Iterate over all the buffers affected by this IO and call the
8477 * per-buffer completion function for each buffer.
8478 */
8479 io_data = pgaio_io_get_handle_data(ioh, &handle_data_len);
8480 for (uint8 buf_off = 0; buf_off < handle_data_len; buf_off++)
8481 {
8483 bool failed;
8484 bool failed_verification = false;
8485 bool failed_checksum = false;
8486 bool zeroed_buffer = false;
8487 bool ignored_checksum = false;
8488
8490
8491 /*
8492 * If the entire I/O failed on a lower-level, each buffer needs to be
8493 * marked as failed. In case of a partial read, the first few buffers
8494 * may be ok.
8495 */
8496 failed =
8498 || prior_result.result <= buf_off;
8499
8500 buffer_readv_complete_one(td, buf_off, buf, cb_data, failed, is_temp,
8504 &zeroed_buffer);
8505
8506 /*
8507 * Track information about the number of different kinds of error
8508 * conditions across all pages, as there can be multiple pages failing
8509 * verification as part of one IO.
8510 */
8513 if (zeroed_buffer && zeroed_count++ == 0)
8515 if (ignored_checksum && ignored_count++ == 0)
8517 if (failed_checksum)
8519 }
8520
8521 /*
8522 * If the smgr read succeeded [partially] and page verification failed for
8523 * some of the pages, adjust the IO's result state appropriately.
8524 */
8525 if (prior_result.status != PGAIO_RS_ERROR &&
8526 (error_count > 0 || ignored_count > 0 || zeroed_count > 0))
8527 {
8528 buffer_readv_encode_error(&result, is_temp,
8529 zeroed_count > 0, ignored_count > 0,
8533 pgaio_result_report(result, td, DEBUG1);
8534 }
8535
8536 /*
8537 * For shared relations this reporting is done in
8538 * shared_buffer_readv_complete_local().
8539 */
8540 if (is_temp && checkfail_count > 0)
8543
8544 return result;
8545}
ProcNumber pgaio_io_get_owner(PgAioHandle *ioh)
Definition aio.c:355
uint64 * pgaio_io_get_handle_data(PgAioHandle *ioh, uint8 *len)
void pgaio_result_report(PgAioResult result, const PgAioTargetData *target_data, int elevel)
PgAioTargetData * pgaio_io_get_target_data(PgAioHandle *ioh)
Definition aio_target.c:73
@ PGAIO_RS_ERROR
Definition aio_types.h:84
static pg_attribute_always_inline void buffer_readv_complete_one(PgAioTargetData *td, uint8 buf_off, Buffer buffer, uint8 flags, bool failed, bool is_temp, bool *buffer_invalid, bool *failed_checksum, bool *ignored_checksum, bool *zeroed_buffer)
Definition bufmgr.c:8308
static void buffer_readv_encode_error(PgAioResult *result, bool is_temp, bool zeroed_any, bool ignored_any, uint8 error_count, uint8 zeroed_count, uint8 checkfail_count, uint8 first_error_off, uint8 first_zeroed_off, uint8 first_ignored_off)
Definition bufmgr.c:8213
uint8_t uint8
Definition c.h:616
ProcNumber MyProcNumber
Definition globals.c:90
static char buf[DEFAULT_XLOG_SEG_SIZE]
void pgstat_report_checksum_failures_in_db(Oid dboid, int failurecount)
RelFileLocator rlocator
Definition aio_types.h:65
struct PgAioTargetData::@128 smgr

References Assert, buf, buffer_readv_complete_one(), buffer_readv_encode_error(), BufferIsValid(), RelFileLocator::dbOid, DEBUG1, fb(), PgAioTargetData::is_temp, MyProcNumber, pgaio_io_get_handle_data(), pgaio_io_get_owner(), pgaio_io_get_target_data(), pgaio_result_report(), PGAIO_RS_ERROR, pgstat_report_checksum_failures_in_db(), PgAioTargetData::rlocator, and PgAioTargetData::smgr.

Referenced by local_buffer_readv_complete(), and shared_buffer_readv_complete().

◆ buffer_readv_complete_one()

static pg_attribute_always_inline void buffer_readv_complete_one ( PgAioTargetData td,
uint8  buf_off,
Buffer  buffer,
uint8  flags,
bool  failed,
bool  is_temp,
bool buffer_invalid,
bool failed_checksum,
bool ignored_checksum,
bool zeroed_buffer 
)
static

Definition at line 8308 of file bufmgr.c.

8314{
8315 BufferDesc *buf_hdr = is_temp ?
8316 GetLocalBufferDescriptor(-buffer - 1)
8317 : GetBufferDescriptor(buffer - 1);
8318 BufferTag tag = buf_hdr->tag;
8319 char *bufdata = BufferGetBlock(buffer);
8321 int piv_flags;
8322
8323 /* check that the buffer is in the expected state for a read */
8324#ifdef USE_ASSERT_CHECKING
8325 {
8327
8330 /* temp buffers don't use BM_IO_IN_PROGRESS */
8331 if (!is_temp)
8334 }
8335#endif
8336
8337 *buffer_invalid = false;
8338 *failed_checksum = false;
8339 *ignored_checksum = false;
8340 *zeroed_buffer = false;
8341
8342 /*
8343 * We ask PageIsVerified() to only log the message about checksum errors,
8344 * as the completion might be run in any backend (or IO workers). We will
8345 * report checksum errors in buffer_readv_report().
8346 */
8348
8349 /* the local zero_damaged_pages may differ from the definer's */
8352
8353 /* Check for garbage data. */
8354 if (!failed)
8355 {
8356 /*
8357 * If the buffer is not currently pinned by this backend, e.g. because
8358 * we're completing this IO after an error, the buffer data will have
8359 * been marked as inaccessible when the buffer was unpinned. The AIO
8360 * subsystem holds a pin, but that doesn't prevent the buffer from
8361 * having been marked as inaccessible. The completion might also be
8362 * executed in a different process.
8363 */
8364#ifdef USE_VALGRIND
8365 if (!BufferIsPinned(buffer))
8367#endif
8368
8369 if (!PageIsVerified((Page) bufdata, tag.blockNum, piv_flags,
8371 {
8372 if (flags & READ_BUFFERS_ZERO_ON_ERROR)
8373 {
8374 memset(bufdata, 0, BLCKSZ);
8375 *zeroed_buffer = true;
8376 }
8377 else
8378 {
8379 *buffer_invalid = true;
8380 /* mark buffer as having failed */
8381 failed = true;
8382 }
8383 }
8384 else if (*failed_checksum)
8385 *ignored_checksum = true;
8386
8387 /* undo what we did above */
8388#ifdef USE_VALGRIND
8389 if (!BufferIsPinned(buffer))
8391#endif
8392
8393 /*
8394 * Immediately log a message about the invalid page, but only to the
8395 * server log. The reason to do so immediately is that this may be
8396 * executed in a different backend than the one that originated the
8397 * request. The reason to do so immediately is that the originator
8398 * might not process the query result immediately (because it is busy
8399 * doing another part of query processing) or at all (e.g. if it was
8400 * cancelled or errored out due to another IO also failing). The
8401 * definer of the IO will emit an ERROR or WARNING when processing the
8402 * IO's results
8403 *
8404 * To avoid duplicating the code to emit these log messages, we reuse
8405 * buffer_readv_report().
8406 */
8408 {
8409 PgAioResult result_one = {0};
8410
8415 *zeroed_buffer ? 1 : 0,
8416 *failed_checksum ? 1 : 0,
8419 }
8420 }
8421
8422 /* Terminate I/O and set BM_VALID. */
8423 set_flag_bits = failed ? BM_IO_ERROR : BM_VALID;
8424 if (is_temp)
8426 else
8427 TerminateBufferIO(buf_hdr, false, set_flag_bits, false, true);
8428
8429 /*
8430 * Call the BUFFER_READ_DONE tracepoint in the callback, even though the
8431 * callback may not be executed in the same backend that called
8432 * BUFFER_READ_START. The alternative would be to defer calling the
8433 * tracepoint to a later point (e.g. the local completion callback for
8434 * shared buffer reads), which seems even less helpful.
8435 */
8437 tag.blockNum,
8438 tag.spcOid,
8439 tag.dbOid,
8440 tag.relNumber,
8442 false);
8443}
static uint64 pg_atomic_read_u64(volatile pg_atomic_uint64 *ptr)
Definition atomics.h:467
static BufferDesc * GetLocalBufferDescriptor(uint32 id)
#define BufferIsPinned(bufnum)
Definition bufmgr.c:599
bool PageIsVerified(PageData *page, BlockNumber blkno, int flags, bool *checksum_failure_p)
Definition bufpage.c:94
#define PIV_LOG_LOG
Definition bufpage.h:501
PageData * Page
Definition bufpage.h:81
#define PIV_IGNORE_CHECKSUM_FAILURE
Definition bufpage.h:502
#define LOG_SERVER_ONLY
Definition elog.h:32
#define false
void TerminateLocalBufferIO(BufferDesc *bufHdr, bool clear_dirty, uint64 set_flag_bits, bool release_aio)
Definition localbuf.c:562
#define VALGRIND_MAKE_MEM_DEFINED(addr, size)
Definition memdebug.h:26
#define VALGRIND_MAKE_MEM_NOACCESS(addr, size)
Definition memdebug.h:27
#define INVALID_PROC_NUMBER
Definition procnumber.h:26
BlockNumber blockNum
RelFileNumber relNumber
ForkNumber forkNum

References Assert, buftag::blockNum, BM_DIRTY, BM_IO_ERROR, BM_IO_IN_PROGRESS, BM_TAG_VALID, BM_VALID, PrivateRefCountEntry::buffer, buffer_readv_encode_error(), BufferGetBlock(), BufferIsPinned, buftag::dbOid, fb(), buftag::forkNum, GetBufferDescriptor(), GetLocalBufferDescriptor(), INVALID_PROC_NUMBER, LOG_SERVER_ONLY, MyProcNumber, PageIsVerified(), pg_atomic_read_u64(), pgaio_result_report(), PIV_IGNORE_CHECKSUM_FAILURE, PIV_LOG_LOG, READ_BUFFERS_IGNORE_CHECKSUM_FAILURES, READ_BUFFERS_ZERO_ON_ERROR, buftag::relNumber, buftag::spcOid, TerminateBufferIO(), TerminateLocalBufferIO(), VALGRIND_MAKE_MEM_DEFINED, and VALGRIND_MAKE_MEM_NOACCESS.

Referenced by buffer_readv_complete().

◆ buffer_readv_decode_error()

static void buffer_readv_decode_error ( PgAioResult  result,
bool zeroed_any,
bool ignored_any,
uint8 zeroed_or_error_count,
uint8 checkfail_count,
uint8 first_off 
)
inlinestatic

Definition at line 8171 of file bufmgr.c.

8177{
8178 uint32 rem_error = result.error_data;
8179
8180 /* see static asserts in buffer_readv_encode_error */
8181#define READV_COUNT_BITS 7
8182#define READV_COUNT_MASK ((1 << READV_COUNT_BITS) - 1)
8183
8184 *zeroed_any = rem_error & 1;
8185 rem_error >>= 1;
8186
8187 *ignored_any = rem_error & 1;
8188 rem_error >>= 1;
8189
8192
8195
8198}
#define READV_COUNT_BITS
#define READV_COUNT_MASK
uint32 error_data
Definition aio_types.h:111

References PgAioResult::error_data, fb(), READV_COUNT_BITS, and READV_COUNT_MASK.

Referenced by buffer_readv_encode_error(), buffer_readv_report(), and shared_buffer_readv_complete_local().

◆ buffer_readv_encode_error()

static void buffer_readv_encode_error ( PgAioResult result,
bool  is_temp,
bool  zeroed_any,
bool  ignored_any,
uint8  error_count,
uint8  zeroed_count,
uint8  checkfail_count,
uint8  first_error_off,
uint8  first_zeroed_off,
uint8  first_ignored_off 
)
inlinestatic

Definition at line 8213 of file bufmgr.c.

8223{
8224
8225 uint8 shift = 0;
8229
8231 "PG_IOV_MAX is bigger than reserved space for error data");
8233 "PGAIO_RESULT_ERROR_BITS is insufficient for buffer_readv");
8234
8235 /*
8236 * We only have space to encode one offset - but luckily that's good
8237 * enough. If there is an error, the error is the interesting offset, same
8238 * with a zeroed buffer vs an ignored buffer.
8239 */
8240 if (error_count > 0)
8242 else if (zeroed_count > 0)
8244 else
8246
8247 Assert(!zeroed_any || error_count == 0);
8248
8249 result->error_data = 0;
8250
8251 result->error_data |= zeroed_any << shift;
8252 shift += 1;
8253
8254 result->error_data |= ignored_any << shift;
8255 shift += 1;
8256
8257 result->error_data |= ((uint32) zeroed_or_error_count) << shift;
8258 shift += READV_COUNT_BITS;
8259
8260 result->error_data |= ((uint32) checkfail_count) << shift;
8261 shift += READV_COUNT_BITS;
8262
8263 result->error_data |= ((uint32) first_off) << shift;
8264 shift += READV_COUNT_BITS;
8265
8266 result->id = is_temp ? PGAIO_HCB_LOCAL_BUFFER_READV :
8268
8269 if (error_count > 0)
8270 result->status = PGAIO_RS_ERROR;
8271 else
8272 result->status = PGAIO_RS_WARNING;
8273
8274 /*
8275 * The encoding is complicated enough to warrant cross-checking it against
8276 * the decode function.
8277 */
8278#ifdef USE_ASSERT_CHECKING
8279 {
8280 bool zeroed_any_2,
8285
8290 &first_off_2);
8296 }
8297#endif
8298
8299#undef READV_COUNT_BITS
8300#undef READV_COUNT_MASK
8301}
#define PGAIO_RESULT_ERROR_BITS
Definition aio_types.h:98
@ PGAIO_RS_WARNING
Definition aio_types.h:83
static void buffer_readv_decode_error(PgAioResult result, bool *zeroed_any, bool *ignored_any, uint8 *zeroed_or_error_count, uint8 *checkfail_count, uint8 *first_off)
Definition bufmgr.c:8171
#define StaticAssertDecl(condition, errmessage)
Definition c.h:1010
#define PG_IOV_MAX
Definition pg_iovec.h:47
uint32 status
Definition aio_types.h:108
uint32 id
Definition aio_types.h:105

References Assert, buffer_readv_decode_error(), PgAioResult::error_data, fb(), PgAioResult::id, PG_IOV_MAX, PGAIO_HCB_LOCAL_BUFFER_READV, PGAIO_HCB_SHARED_BUFFER_READV, PGAIO_RESULT_ERROR_BITS, PGAIO_RS_ERROR, PGAIO_RS_WARNING, READV_COUNT_BITS, StaticAssertDecl, and PgAioResult::status.

Referenced by buffer_readv_complete(), and buffer_readv_complete_one().

◆ buffer_readv_report()

static void buffer_readv_report ( PgAioResult  result,
const PgAioTargetData td,
int  elevel 
)
static

Definition at line 8555 of file bufmgr.c.

8557{
8558 int nblocks = td->smgr.nblocks;
8559 BlockNumber first = td->smgr.blockNum;
8560 BlockNumber last = first + nblocks - 1;
8563 RelPathStr rpath =
8565 bool zeroed_any,
8569 first_off;
8571 const char *msg_one,
8572 *msg_mult,
8573 *det_mult,
8574 *hint_mult;
8575
8579 &first_off);
8580
8581 /*
8582 * Treat a read that had both zeroed buffers *and* ignored checksums as a
8583 * special case, it's too irregular to be emitted the same way as the
8584 * other cases.
8585 */
8586 if (zeroed_any && ignored_any)
8587 {
8589 Assert(nblocks > 1); /* same block can't be both zeroed and ignored */
8590 Assert(result.status != PGAIO_RS_ERROR);
8592
8593 ereport(elevel,
8595 errmsg("zeroing %u page(s) and ignoring %u checksum failure(s) among blocks %u..%u of relation \"%s\"",
8596 affected_count, checkfail_count, first, last, rpath.str),
8597 affected_count > 1 ?
8598 errdetail("Block %u held the first zeroed page.",
8599 first + first_off) : 0,
8600 errhint_plural("See server log for details about the other %d invalid block.",
8601 "See server log for details about the other %d invalid blocks.",
8604 return;
8605 }
8606
8607 /*
8608 * The other messages are highly repetitive. To avoid duplicating a long
8609 * and complicated ereport(), gather the translated format strings
8610 * separately and then do one common ereport.
8611 */
8612 if (result.status == PGAIO_RS_ERROR)
8613 {
8614 Assert(!zeroed_any); /* can't have invalid pages when zeroing them */
8616 msg_one = _("invalid page in block %u of relation \"%s\"");
8617 msg_mult = _("%u invalid pages among blocks %u..%u of relation \"%s\"");
8618 det_mult = _("Block %u held the first invalid page.");
8619 hint_mult = _("See server log for the other %u invalid block(s).");
8620 }
8621 else if (zeroed_any && !ignored_any)
8622 {
8624 msg_one = _("invalid page in block %u of relation \"%s\"; zeroing out page");
8625 msg_mult = _("zeroing out %u invalid pages among blocks %u..%u of relation \"%s\"");
8626 det_mult = _("Block %u held the first zeroed page.");
8627 hint_mult = _("See server log for the other %u zeroed block(s).");
8628 }
8629 else if (!zeroed_any && ignored_any)
8630 {
8632 msg_one = _("ignoring checksum failure in block %u of relation \"%s\"");
8633 msg_mult = _("ignoring %u checksum failures among blocks %u..%u of relation \"%s\"");
8634 det_mult = _("Block %u held the first ignored page.");
8635 hint_mult = _("See server log for the other %u ignored block(s).");
8636 }
8637 else
8639
8640 ereport(elevel,
8642 affected_count == 1 ?
8643 errmsg_internal(msg_one, first + first_off, rpath.str) :
8644 errmsg_internal(msg_mult, affected_count, first, last, rpath.str),
8647}
#define pg_unreachable()
Definition c.h:361
#define _(x)
Definition elog.c:95
int int errdetail_internal(const char *fmt,...) pg_attribute_printf(1
int int int errhint_plural(const char *fmt_singular, const char *fmt_plural, unsigned long n,...) pg_attribute_printf(1
int int errmsg_internal(const char *fmt,...) pg_attribute_printf(1
int int errhint_internal(const char *fmt,...) pg_attribute_printf(1
const char * str
#define ERRCODE_DATA_CORRUPTED
int ProcNumber
Definition procnumber.h:24
#define relpathbackend(rlocator, backend, forknum)
Definition relpath.h:141
char str[REL_PATH_STR_MAXLEN+1]
Definition relpath.h:123
BlockNumber blockNum
Definition aio_types.h:66
BlockNumber nblocks
Definition aio_types.h:67
ForkNumber forkNum
Definition aio_types.h:68

References _, Assert, PgAioTargetData::blockNum, buffer_readv_decode_error(), ereport, errcode(), ERRCODE_DATA_CORRUPTED, errdetail(), errdetail_internal(), errhint_internal(), errhint_plural(), errmsg, errmsg_internal(), fb(), PgAioTargetData::forkNum, INVALID_PROC_NUMBER, PgAioTargetData::is_temp, MyProcNumber, PgAioTargetData::nblocks, pg_unreachable, PGAIO_RS_ERROR, relpathbackend, PgAioTargetData::rlocator, PgAioTargetData::smgr, PgAioResult::status, and RelPathStr::str.

◆ buffer_stage_common()

static pg_attribute_always_inline void buffer_stage_common ( PgAioHandle ioh,
bool  is_write,
bool  is_temp 
)
static

Definition at line 8064 of file bufmgr.c.

8065{
8066 uint64 *io_data;
8067 uint8 handle_data_len;
8070
8071 io_data = pgaio_io_get_handle_data(ioh, &handle_data_len);
8072
8074
8075 /* iterate over all buffers affected by the vectored readv/writev */
8076 for (int i = 0; i < handle_data_len; i++)
8077 {
8078 Buffer buffer = (Buffer) io_data[i];
8079 BufferDesc *buf_hdr = is_temp ?
8080 GetLocalBufferDescriptor(-buffer - 1)
8081 : GetBufferDescriptor(buffer - 1);
8083
8084 /*
8085 * Check that all the buffers are actually ones that could conceivably
8086 * be done in one IO, i.e. are sequential. This is the last
8087 * buffer-aware code before IO is actually executed and confusion
8088 * about which buffers are targeted by IO can be hard to debug, making
8089 * it worth doing extra-paranoid checks.
8090 */
8091 if (i == 0)
8092 first = buf_hdr->tag;
8093 else
8094 {
8095 Assert(buf_hdr->tag.relNumber == first.relNumber);
8096 Assert(buf_hdr->tag.blockNum == first.blockNum + i);
8097 }
8098
8099 if (is_temp)
8101 else
8103
8104 /* verify the buffer is in the expected state */
8106 if (is_write)
8107 {
8110 }
8111 else
8112 {
8115 }
8116
8117 /* temp buffers don't use BM_IO_IN_PROGRESS */
8118 if (!is_temp)
8120
8122
8123 /*
8124 * Reflect that the buffer is now owned by the AIO subsystem.
8125 *
8126 * For local buffers: This can't be done just via LocalRefCount, as
8127 * one might initially think, as this backend could error out while
8128 * AIO is still in progress, releasing all the pins by the backend
8129 * itself.
8130 *
8131 * This pin is released again in TerminateBufferIO().
8132 */
8133 buf_hdr->io_wref = io_ref;
8134
8135 if (is_temp)
8136 {
8139 }
8140 else
8142
8143 /*
8144 * Ensure the content lock that prevents buffer modifications while
8145 * the buffer is being written out is not released early due to an
8146 * error.
8147 */
8148 if (is_write && !is_temp)
8149 {
8151
8152 /*
8153 * Lock is now owned by AIO subsystem.
8154 */
8155 BufferLockDisown(buffer, buf_hdr);
8156 }
8157
8158 /*
8159 * Stop tracking this buffer via the resowner - the AIO system now
8160 * keeps track.
8161 */
8162 if (!is_temp)
8164 }
8165}
static void pg_atomic_unlocked_write_u64(volatile pg_atomic_uint64 *ptr, uint64 val)
Definition atomics.h:494
#define BUF_REFCOUNT_ONE
static uint64 UnlockBufHdrExt(BufferDesc *desc, uint64 old_buf_state, uint64 set_bits, uint64 unset_bits, int refcount_change)
static void ResourceOwnerForgetBufferIO(ResourceOwner owner, Buffer buffer)
#define BUF_STATE_GET_REFCOUNT(state)
static void BufferLockDisown(Buffer buffer, BufferDesc *buf_hdr)
Definition bufmgr.c:6138
static bool BufferLockHeldByMe(BufferDesc *buf_hdr)
Definition bufmgr.c:6410
#define PG_USED_FOR_ASSERTS_ONLY
Definition c.h:243
BufferTag tag

References Assert, BM_DIRTY, BM_IO_IN_PROGRESS, BM_TAG_VALID, BM_VALID, BUF_REFCOUNT_ONE, BUF_STATE_GET_REFCOUNT, PrivateRefCountEntry::buffer, BufferLockDisown(), BufferLockHeldByMe(), CurrentResourceOwner, fb(), GetBufferDescriptor(), GetLocalBufferDescriptor(), i, LockBufHdr(), pg_atomic_read_u64(), pg_atomic_unlocked_write_u64(), PG_USED_FOR_ASSERTS_ONLY, pgaio_io_get_handle_data(), pgaio_io_get_wref(), ResourceOwnerForgetBufferIO(), and UnlockBufHdrExt().

Referenced by local_buffer_readv_stage(), and shared_buffer_readv_stage().

◆ BufferAlloc()

static pg_attribute_always_inline BufferDesc * BufferAlloc ( SMgrRelation  smgr,
char  relpersistence,
ForkNumber  forkNum,
BlockNumber  blockNum,
BufferAccessStrategy  strategy,
bool foundPtr,
IOContext  io_context 
)
inlinestatic

Definition at line 2110 of file bufmgr.c.

2114{
2115 BufferTag newTag; /* identity of requested block */
2116 uint32 newHash; /* hash value for newTag */
2117 LWLock *newPartitionLock; /* buffer partition lock for it */
2118 int existing_buf_id;
2122 uint64 set_bits = 0;
2123
2124 /* Make sure we will have room to remember the buffer pin */
2127
2128 /* create a tag so we can lookup the buffer */
2129 InitBufferTag(&newTag, &smgr->smgr_rlocator.locator, forkNum, blockNum);
2130
2131 /* determine its hash code and partition lock ID */
2134
2135 /* see if the block is in the buffer pool already */
2138 if (existing_buf_id >= 0)
2139 {
2140 BufferDesc *buf;
2141 bool valid;
2142
2143 /*
2144 * Found it. Now, pin the buffer so no one can steal it from the
2145 * buffer pool, and check to see if the correct data has been loaded
2146 * into the buffer.
2147 */
2149
2150 valid = PinBuffer(buf, strategy, false);
2151
2152 /* Can release the mapping lock as soon as we've pinned it */
2154
2155 *foundPtr = true;
2156
2157 if (!valid)
2158 {
2159 /*
2160 * We can only get here if (a) someone else is still reading in
2161 * the page, (b) a previous read attempt failed, or (c) someone
2162 * called StartReadBuffers() but not yet WaitReadBuffers().
2163 */
2164 *foundPtr = false;
2165 }
2166
2167 return buf;
2168 }
2169
2170 /*
2171 * Didn't find it in the buffer pool. We'll have to initialize a new
2172 * buffer. Remember to unlock the mapping lock while doing the work.
2173 */
2175
2176 /*
2177 * Acquire a victim buffer. Somebody else might try to do the same, we
2178 * don't hold any conflicting locks. If so we'll have to undo our work
2179 * later.
2180 */
2183
2184 /*
2185 * Try to make a hashtable entry for the buffer under its new tag. If
2186 * somebody else inserted another buffer for the tag, we'll release the
2187 * victim buffer we acquired and use the already inserted one.
2188 */
2191 if (existing_buf_id >= 0)
2192 {
2194 bool valid;
2195
2196 /*
2197 * Got a collision. Someone has already done what we were about to do.
2198 * We'll just handle this as if it were found in the buffer pool in
2199 * the first place. First, give up the buffer we were planning to
2200 * use.
2201 *
2202 * We could do this after releasing the partition lock, but then we'd
2203 * have to call ResourceOwnerEnlarge() & ReservePrivateRefCountEntry()
2204 * before acquiring the lock, for the rare case of such a collision.
2205 */
2207
2208 /* remaining code should match code at top of routine */
2209
2211
2212 valid = PinBuffer(existing_buf_hdr, strategy, false);
2213
2214 /* Can release the mapping lock as soon as we've pinned it */
2216
2217 *foundPtr = true;
2218
2219 if (!valid)
2220 {
2221 /*
2222 * We can only get here if (a) someone else is still reading in
2223 * the page, (b) a previous read attempt failed, or (c) someone
2224 * called StartReadBuffers() but not yet WaitReadBuffers().
2225 */
2226 *foundPtr = false;
2227 }
2228
2229 return existing_buf_hdr;
2230 }
2231
2232 /*
2233 * Need to lock the buffer header too in order to change its tag.
2234 */
2236
2237 /* some sanity checks while we hold the buffer header lock */
2240
2241 victim_buf_hdr->tag = newTag;
2242
2243 /*
2244 * Make sure BM_PERMANENT is set for buffers that must be written at every
2245 * checkpoint. Unlogged buffers only need to be written at shutdown
2246 * checkpoints, except for their "init" forks, which need to be treated
2247 * just like permanent relations.
2248 */
2250 if (relpersistence == RELPERSISTENCE_PERMANENT || forkNum == INIT_FORKNUM)
2252
2254 set_bits, 0, 0);
2255
2257
2258 /*
2259 * Buffer contents are currently invalid.
2260 */
2261 *foundPtr = false;
2262
2263 return victim_buf_hdr;
2264}
static void InitBufferTag(BufferTag *tag, const RelFileLocator *rlocator, ForkNumber forkNum, BlockNumber blockNum)
#define BM_PERMANENT
#define BUF_USAGECOUNT_ONE
static LWLock * BufMappingPartitionLock(uint32 hashcode)
int BufTableLookup(BufferTag *tagPtr, uint32 hashcode)
Definition buf_table.c:90
uint32 BufTableHashCode(BufferTag *tagPtr)
Definition buf_table.c:78
int BufTableInsert(BufferTag *tagPtr, uint32 hashcode, int buf_id)
Definition buf_table.c:118
static Buffer GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context)
Definition bufmgr.c:2461
static bool PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy, bool skip_if_not_valid)
Definition bufmgr.c:3188
static void ReservePrivateRefCountEntry(void)
Definition bufmgr.c:309
static void UnpinBuffer(BufferDesc *buf)
Definition bufmgr.c:3367
bool LWLockAcquire(LWLock *lock, LWLockMode mode)
Definition lwlock.c:1177
void LWLockRelease(LWLock *lock)
Definition lwlock.c:1794
@ LW_SHARED
Definition lwlock.h:113
@ LW_EXCLUSIVE
Definition lwlock.h:112
@ INIT_FORKNUM
Definition relpath.h:61
void ResourceOwnerEnlarge(ResourceOwner owner)
Definition resowner.c:449

References Assert, BM_DIRTY, BM_IO_IN_PROGRESS, BM_PERMANENT, BM_TAG_VALID, BM_VALID, buf, BUF_STATE_GET_REFCOUNT, BUF_USAGECOUNT_ONE, BufMappingPartitionLock(), BufTableHashCode(), BufTableInsert(), BufTableLookup(), CurrentResourceOwner, fb(), GetBufferDescriptor(), GetVictimBuffer(), INIT_FORKNUM, InitBufferTag(), RelFileLocatorBackend::locator, LockBufHdr(), LW_EXCLUSIVE, LW_SHARED, LWLockAcquire(), LWLockRelease(), PinBuffer(), ReservePrivateRefCountEntry(), ResourceOwnerEnlarge(), SMgrRelationData::smgr_rlocator, UnlockBufHdrExt(), and UnpinBuffer().

Referenced by PinBufferForBlock().

◆ BufferBeginSetHintBits()

bool BufferBeginSetHintBits ( Buffer  buffer)

Definition at line 6909 of file bufmgr.c.

6910{
6913
6914 if (BufferIsLocal(buffer))
6915 {
6916 /*
6917 * NB: Will need to check if there is a write in progress, once it is
6918 * possible for writes to be done asynchronously.
6919 */
6920 return true;
6921 }
6922
6923 buf_hdr = GetBufferDescriptor(buffer - 1);
6924
6926}
#define BufferIsLocal(buffer)
Definition buf.h:37
static bool SharedBufferBeginSetHintBits(Buffer buffer, BufferDesc *buf_hdr, uint64 *lockstate)
Definition bufmgr.c:6818

References PrivateRefCountEntry::buffer, BufferIsLocal, fb(), GetBufferDescriptor(), and SharedBufferBeginSetHintBits().

Referenced by _bt_check_unique(), _bt_killitems(), _hash_kill_items(), fsm_search_avail(), fsm_vacuum_page(), gistkillitems(), and SetHintBitsExt().

◆ BufferFinishSetHintBits()

void BufferFinishSetHintBits ( Buffer  buffer,
bool  mark_dirty,
bool  buffer_std 
)

Definition at line 6937 of file bufmgr.c.

6938{
6939 if (!BufferIsLocal(buffer))
6942
6943 if (mark_dirty)
6945}
bool BufferIsLockedByMeInMode(Buffer buffer, BufferLockMode mode)
Definition bufmgr.c:3003
void MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
Definition bufmgr.c:5688
@ BUFFER_LOCK_SHARE_EXCLUSIVE
Definition bufmgr.h:215
@ BUFFER_LOCK_EXCLUSIVE
Definition bufmgr.h:220

References Assert, PrivateRefCountEntry::buffer, BUFFER_LOCK_EXCLUSIVE, BUFFER_LOCK_SHARE_EXCLUSIVE, BufferIsLocal, BufferIsLockedByMeInMode(), fb(), and MarkBufferDirtyHint().

Referenced by _bt_check_unique(), _bt_killitems(), _hash_kill_items(), fsm_search_avail(), fsm_vacuum_page(), gistkillitems(), and HeapTupleSatisfiesMVCCBatch().

◆ BufferGetBlockNumber()

BlockNumber BufferGetBlockNumber ( Buffer  buffer)

Definition at line 4357 of file bufmgr.c.

4358{
4360
4361 Assert(BufferIsPinned(buffer));
4362
4363 if (BufferIsLocal(buffer))
4364 bufHdr = GetLocalBufferDescriptor(-buffer - 1);
4365 else
4366 bufHdr = GetBufferDescriptor(buffer - 1);
4367
4368 /* pinned, so OK to read tag without spinlock */
4369 return bufHdr->tag.blockNum;
4370}

References Assert, PrivateRefCountEntry::buffer, BufferIsLocal, BufferIsPinned, fb(), GetBufferDescriptor(), and GetLocalBufferDescriptor().

Referenced by _bt_binsrch_insert(), _bt_bottomupdel_pass(), _bt_check_unique(), _bt_checkpage(), _bt_delitems_delete(), _bt_delitems_vacuum(), _bt_doinsert(), _bt_finish_split(), _bt_getroot(), _bt_insert_parent(), _bt_insertonpg(), _bt_mark_page_halfdead(), _bt_moveright(), _bt_newlevel(), _bt_pagedel(), _bt_readpage(), _bt_restore_meta(), _bt_search(), _bt_simpledel_pass(), _bt_split(), _bt_unlink_halfdead_page(), _hash_addovflpage(), _hash_checkpage(), _hash_doinsert(), _hash_first(), _hash_freeovflpage(), _hash_getnewbuf(), _hash_readnext(), _hash_readpage(), _hash_splitbucket(), allocNewBuffer(), AsyncReadBuffers(), BitmapHeapScanNextBlock(), blinsert(), BloomInitMetapage(), brin_doinsert(), brin_doupdate(), brin_getinsertbuffer(), brin_initialize_empty_new_buffer(), brin_page_cleanup(), brin_xlog_insert_update(), brinbuild(), brinGetTupleForHeapBlock(), btvacuumpage(), check_index_page(), CheckReadBuffersOperation(), collect_corrupt_items(), collectMatchBitmap(), createPostingTree(), dataBeginPlaceToPageLeaf(), dataPrepareDownlink(), doPickSplit(), entryPrepareDownlink(), fill_seq_fork_with_data(), ginDeletePostingPage(), ginEntryInsert(), ginFindParents(), ginFinishSplit(), ginPlaceToPage(), ginRedoDeleteListPages(), ginRedoUpdateMetapage(), gistbufferinginserttuples(), gistbuild(), gistcheckpage(), gistdeletepage(), gistformdownlink(), gistinserttuples(), gistMemorizeAllDownlinks(), gistplacetopage(), gistRelocateBuildBuffersOnSplit(), gistScanPage(), gistvacuumpage(), hash_xlog_add_ovfl_page(), heap_delete(), heap_fetch_next_buffer(), heap_hot_search_buffer(), heap_insert(), heap_multi_insert(), heap_page_prune_opt(), heap_page_would_be_all_visible(), heap_prepare_pagescan(), heap_update(), heap_xlog_confirm(), heap_xlog_lock(), heapam_scan_analyze_next_block(), heapgettup(), heapgettup_pagemode(), index_compute_xid_horizon_for_tuples(), lazy_scan_heap(), lazy_scan_noprune(), lazy_scan_prune(), lazy_vacuum_heap_rel(), makeSublist(), moveLeafs(), 
moveRightIfItNeeded(), pgstathashindex(), prune_freeze_setup(), read_stream_start_pending_read(), ReadBufferBI(), RelationAddBlocks(), RelationCopyStorageUsingBuffer(), RelationGetBufferForTuple(), RelationPutHeapTuple(), revmap_get_buffer(), revmap_physical_extend(), ScanSourceDatabasePgClassPage(), spgAddNodeAction(), spgbuild(), spgdoinsert(), SpGistSetLastUsedPage(), spgSplitNodeAction(), spgvacuumpage(), spgWalk(), StartReadBuffersImpl(), startScanEntry(), statapprox_heap(), terminate_brin_buildstate(), vacuumLeafPage(), verify_heapam(), visibilitymap_clear(), visibilitymap_get_status(), visibilitymap_pin(), visibilitymap_pin_ok(), visibilitymap_set(), and visibilitymap_set_vmbits().

◆ BufferGetLSNAtomic()

XLogRecPtr BufferGetLSNAtomic ( Buffer  buffer)

Definition at line 4632 of file bufmgr.c.

4633{
4634 /* Make sure we've got a real buffer, and that we hold a pin on it. */
4635 Assert(BufferIsValid(buffer));
4636 Assert(BufferIsPinned(buffer));
4637
4638#ifdef PG_HAVE_8BYTE_SINGLE_COPY_ATOMICITY
4639 return PageGetLSN(BufferGetPage(buffer));
4640#else
4641 {
4642 char *page = BufferGetPage(buffer);
4644 XLogRecPtr lsn;
4645
4646 /*
4647 * If we don't need locking for correctness, fastpath out.
4648 */
4649 if (!XLogHintBitIsNeeded() || BufferIsLocal(buffer))
4650 return PageGetLSN(page);
4651
4652 bufHdr = GetBufferDescriptor(buffer - 1);
4654 lsn = PageGetLSN(page);
4656
4657 return lsn;
4658 }
4659#endif
4660}
static Page BufferGetPage(Buffer buffer)
Definition bufmgr.h:470
static XLogRecPtr PageGetLSN(const PageData *page)
Definition bufpage.h:411
#define XLogHintBitIsNeeded()
Definition xlog.h:122
uint64 XLogRecPtr
Definition xlogdefs.h:21

References Assert, PrivateRefCountEntry::buffer, BufferGetPage(), BufferIsLocal, BufferIsPinned, BufferIsValid(), fb(), GetBufferDescriptor(), LockBufHdr(), PageGetLSN(), UnlockBufHdr(), and XLogHintBitIsNeeded.

Referenced by _bt_drop_lock_and_maybe_pin(), _bt_killitems(), gistdoinsert(), gistFindPath(), gistkillitems(), gistScanPage(), and SetHintBitsExt().

◆ BufferGetTag()

void BufferGetTag ( Buffer  buffer,
RelFileLocator rlocator,
ForkNumber forknum,
BlockNumber blknum 
)

Definition at line 4378 of file bufmgr.c.

4380{
4382
4383 /* Do the same checks as BufferGetBlockNumber. */
4384 Assert(BufferIsPinned(buffer));
4385
4386 if (BufferIsLocal(buffer))
4387 bufHdr = GetLocalBufferDescriptor(-buffer - 1);
4388 else
4389 bufHdr = GetBufferDescriptor(buffer - 1);
4390
4391 /* pinned, so OK to read tag without spinlock */
4392 *rlocator = BufTagGetRelFileLocator(&bufHdr->tag);
4393 *forknum = BufTagGetForkNum(&bufHdr->tag);
4394 *blknum = bufHdr->tag.blockNum;
4395}

References Assert, PrivateRefCountEntry::buffer, BufferIsLocal, BufferIsPinned, BufTagGetForkNum(), BufTagGetRelFileLocator(), fb(), GetBufferDescriptor(), and GetLocalBufferDescriptor().

Referenced by fsm_search_avail(), ginRedoInsertEntry(), heap_inplace_update_and_unlock(), log_newpage_buffer(), ResolveCminCmaxDuringDecoding(), XLogRegisterBuffer(), and XLogSaveBufferForHint().

◆ BufferIsDirty()

bool BufferIsDirty ( Buffer  buffer)

◆ BufferIsLockedByMe()

bool BufferIsLockedByMe ( Buffer  buffer)

Definition at line 2977 of file bufmgr.c.

2978{
2980
2981 Assert(BufferIsPinned(buffer));
2982
2983 if (BufferIsLocal(buffer))
2984 {
2985 /* Content locks are not maintained for local buffers. */
2986 return true;
2987 }
2988 else
2989 {
2990 bufHdr = GetBufferDescriptor(buffer - 1);
2991 return BufferLockHeldByMe(bufHdr);
2992 }
2993}

References Assert, PrivateRefCountEntry::buffer, BufferIsLocal, BufferIsPinned, BufferLockHeldByMe(), fb(), and GetBufferDescriptor().

Referenced by FlushOneBuffer().

◆ BufferIsLockedByMeInMode()

bool BufferIsLockedByMeInMode ( Buffer  buffer,
BufferLockMode  mode 
)

Definition at line 3003 of file bufmgr.c.

3004{
3006
3007 Assert(BufferIsPinned(buffer));
3008
3009 if (BufferIsLocal(buffer))
3010 {
3011 /* Content locks are not maintained for local buffers. */
3012 return true;
3013 }
3014 else
3015 {
3016 bufHdr = GetBufferDescriptor(buffer - 1);
3018 }
3019}
static bool BufferLockHeldByMeInMode(BufferDesc *buf_hdr, BufferLockMode mode)
Definition bufmgr.c:6392
static PgChecksumMode mode

References Assert, PrivateRefCountEntry::buffer, BufferIsLocal, BufferIsPinned, BufferLockHeldByMeInMode(), fb(), GetBufferDescriptor(), and mode.

Referenced by BufferFinishSetHintBits(), BufferIsDirty(), heap_page_fix_vm_corruption(), HeapTupleSetHintBits(), IsBufferCleanupOK(), MarkBufferDirty(), visibilitymap_set(), visibilitymap_set_vmbits(), and XLogRegisterBuffer().

◆ BufferIsPermanent()

bool BufferIsPermanent ( Buffer  buffer)

Definition at line 4596 of file bufmgr.c.

4597{
4599
4600 /* Local buffers are used only for temp relations. */
4601 if (BufferIsLocal(buffer))
4602 return false;
4603
4604 /* Make sure we've got a real buffer, and that we hold a pin on it. */
4605 Assert(BufferIsValid(buffer));
4606 Assert(BufferIsPinned(buffer));
4607
4608 /*
4609 * BM_PERMANENT can't be changed while we hold a pin on the buffer, so we
4610 * need not bother with the buffer header spinlock. Even if someone else
4611 * changes the buffer header state while we're doing this, the state is
4612 * changed atomically, so we'll read the old value or the new value, but
4613 * not random garbage.
4614 */
4615 bufHdr = GetBufferDescriptor(buffer - 1);
4616 return (pg_atomic_read_u64(&bufHdr->state) & BM_PERMANENT) != 0;
4617}

References Assert, BM_PERMANENT, PrivateRefCountEntry::buffer, BufferIsLocal, BufferIsPinned, BufferIsValid(), fb(), GetBufferDescriptor(), and pg_atomic_read_u64().

Referenced by SetHintBitsExt().

◆ BufferLockAcquire()

static void BufferLockAcquire ( Buffer  buffer,
BufferDesc buf_hdr,
BufferLockMode  mode 
)
inlinestatic

Definition at line 5765 of file bufmgr.c.

5766{
5767 PrivateRefCountEntry *entry;
5768 int extraWaits = 0;
5769
5770 /*
 5771 * Get a reference to the refcount entry before we take the lock; it
 5772 * seems better to do this lookup before holding the lock.
5773 */
5774 entry = GetPrivateRefCountEntry(buffer, true);
5775
5776 /*
 5777 * We had better not already hold a lock on the buffer.
5778 */
5780
5781 /*
5782 * Lock out cancel/die interrupts until we exit the code section protected
5783 * by the content lock. This ensures that interrupts will not interfere
5784 * with manipulations of data structures in shared memory.
5785 */
5787
5788 for (;;)
5789 {
5790 uint32 wait_event = 0; /* initialized to avoid compiler warning */
5791 bool mustwait;
5792
5793 /*
5794 * Try to grab the lock the first time, we're not in the waitqueue
5795 * yet/anymore.
5796 */
5798
5799 if (likely(!mustwait))
5800 {
5801 break;
5802 }
5803
5804 /*
5805 * Ok, at this point we couldn't grab the lock on the first try. We
5806 * cannot simply queue ourselves to the end of the list and wait to be
5807 * woken up because by now the lock could long have been released.
5808 * Instead add us to the queue and try to grab the lock again. If we
5809 * succeed we need to revert the queuing and be happy, otherwise we
5810 * recheck the lock. If we still couldn't grab it, we know that the
5811 * other locker will see our queue entries when releasing since they
5812 * existed before we checked for the lock.
5813 */
5814
5815 /* add to the queue */
5817
5818 /* we're now guaranteed to be woken up if necessary */
5820
5821 /* ok, grabbed the lock the second time round, need to undo queueing */
5822 if (!mustwait)
5823 {
5825 break;
5826 }
5827
5828 switch (mode)
5829 {
5832 break;
5835 break;
5836 case BUFFER_LOCK_SHARE:
5838 break;
5839 case BUFFER_LOCK_UNLOCK:
5841
5842 }
5844
5845 /*
5846 * Wait until awakened.
5847 *
5848 * It is possible that we get awakened for a reason other than being
5849 * signaled by BufferLockWakeup(). If so, loop back and wait again.
5850 * Once we've gotten the lock, re-increment the sema by the number of
5851 * additional signals received.
5852 */
5853 for (;;)
5854 {
5857 break;
5858 extraWaits++;
5859 }
5860
5862
5863 /* Retrying, allow BufferLockRelease to release waiters again. */
5865 }
5866
5867 /* Remember that we now hold this lock */
5868 entry->data.lockmode = mode;
5869
5870 /*
5871 * Fix the process wait semaphore's count for any absorbed wakeups.
5872 */
5873 while (unlikely(extraWaits-- > 0))
5875}
static uint64 pg_atomic_fetch_and_u64(volatile pg_atomic_uint64 *ptr, uint64 and_)
Definition atomics.h:551
#define BM_LOCK_WAKE_IN_PROGRESS
static bool BufferLockAttempt(BufferDesc *buf_hdr, BufferLockMode mode)
Definition bufmgr.c:5963
static void BufferLockDequeueSelf(BufferDesc *buf_hdr)
Definition bufmgr.c:6070
static void BufferLockQueueSelf(BufferDesc *buf_hdr, BufferLockMode mode)
Definition bufmgr.c:6030
static PrivateRefCountEntry * GetPrivateRefCountEntry(Buffer buffer, bool do_move)
Definition bufmgr.c:507
@ BUFFER_LOCK_SHARE
Definition bufmgr.h:210
@ BUFFER_LOCK_UNLOCK
Definition bufmgr.h:205
#define likely(x)
Definition c.h:431
@ LW_WS_NOT_WAITING
Definition lwlock.h:30
#define HOLD_INTERRUPTS()
Definition miscadmin.h:134
void PGSemaphoreUnlock(PGSemaphore sema)
Definition posix_sema.c:335
void PGSemaphoreLock(PGSemaphore sema)
Definition posix_sema.c:315
PGPROC * MyProc
Definition proc.c:68
PGSemaphore sem
Definition proc.h:255
uint8 lwWaiting
Definition proc.h:280
BufferLockMode lockmode
Definition bufmgr.c:112
PrivateRefCountData data
Definition bufmgr.c:130
static void pgstat_report_wait_start(uint32 wait_event_info)
Definition wait_event.h:69
static void pgstat_report_wait_end(void)
Definition wait_event.h:85

References Assert, BM_LOCK_WAKE_IN_PROGRESS, PrivateRefCountEntry::buffer, BUFFER_LOCK_EXCLUSIVE, BUFFER_LOCK_SHARE, BUFFER_LOCK_SHARE_EXCLUSIVE, BUFFER_LOCK_UNLOCK, BufferLockAttempt(), BufferLockDequeueSelf(), BufferLockQueueSelf(), PrivateRefCountEntry::data, fb(), GetPrivateRefCountEntry(), HOLD_INTERRUPTS, likely, PrivateRefCountData::lockmode, LW_WS_NOT_WAITING, PGPROC::lwWaiting, mode, MyProc, pg_atomic_fetch_and_u64(), pg_unreachable, PGSemaphoreLock(), PGSemaphoreUnlock(), pgstat_report_wait_end(), pgstat_report_wait_start(), PGPROC::sem, and unlikely.

Referenced by FlushUnlockedBuffer(), LockBufferInternal(), and MarkDirtyUnpinnedBufferInternal().

◆ BufferLockAttempt()

static bool BufferLockAttempt ( BufferDesc buf_hdr,
BufferLockMode  mode 
)
inlinestatic

Definition at line 5963 of file bufmgr.c.

5964{
5966
5967 /*
 5968 * Read once outside the loop; later iterations will get the newer value
 5969 * via compare & exchange.
5970 */
5972
5973 /* loop until we've determined whether we could acquire the lock or not */
5974 while (true)
5975 {
5977 bool lock_free;
5978
5980
5982 {
5983 lock_free = (old_state & BM_LOCK_MASK) == 0;
5984 if (lock_free)
5986 }
5988 {
5990 if (lock_free)
5992 }
5993 else
5994 {
5996 if (lock_free)
5998 }
5999
6000 /*
 6001 * Attempt to swap in the state we are expecting. If we didn't see
 6002 * the lock as free, that's just the old value. If we saw it as free,
 6003 * we'll attempt to mark it acquired. The reason that we always swap
 6004 * in the value is that this doubles as a memory barrier. We could try
 6005 * to be smarter and only swap in values if we saw the lock as free,
 6006 * but benchmarks haven't shown that to be beneficial so far.
6007 *
6008 * Retry if the value changed since we last looked at it.
6009 */
6012 {
6013 if (lock_free)
6014 {
6015 /* Great! Got the lock. */
6016 return false;
6017 }
6018 else
6019 return true; /* somebody else has the lock */
6020 }
6021 }
6022
6024}
static bool pg_atomic_compare_exchange_u64(volatile pg_atomic_uint64 *ptr, uint64 *expected, uint64 newval)
Definition atomics.h:522
#define BM_LOCK_VAL_SHARED
#define BM_LOCK_VAL_EXCLUSIVE
#define BM_LOCK_MASK
#define BM_LOCK_VAL_SHARE_EXCLUSIVE

References BM_LOCK_MASK, BM_LOCK_VAL_EXCLUSIVE, BM_LOCK_VAL_SHARE_EXCLUSIVE, BM_LOCK_VAL_SHARED, BUFFER_LOCK_EXCLUSIVE, BUFFER_LOCK_SHARE_EXCLUSIVE, fb(), likely, mode, pg_atomic_compare_exchange_u64(), pg_atomic_read_u64(), and pg_unreachable.

Referenced by BufferLockAcquire(), and BufferLockConditional().

◆ BufferLockConditional()

static bool BufferLockConditional ( Buffer  buffer,
BufferDesc buf_hdr,
BufferLockMode  mode 
)
static

Definition at line 5917 of file bufmgr.c.

5918{
5919 PrivateRefCountEntry *entry = GetPrivateRefCountEntry(buffer, true);
5920 bool mustwait;
5921
5922 /*
5923 * As described above, if we're trying to lock a buffer this backend
5924 * already has locked, return false, independent of the existing and
5925 * desired lock level.
5926 */
5927 if (entry->data.lockmode != BUFFER_LOCK_UNLOCK)
5928 return false;
5929
5930 /*
5931 * Lock out cancel/die interrupts until we exit the code section protected
5932 * by the content lock. This ensures that interrupts will not interfere
5933 * with manipulations of data structures in shared memory.
5934 */
5936
5937 /* Check for the lock */
5939
5940 if (mustwait)
5941 {
5942 /* Failed to get lock, so release interrupt holdoff */
5944 }
5945 else
5946 {
5947 entry->data.lockmode = mode;
5948 }
5949
5950 return !mustwait;
5951}
#define RESUME_INTERRUPTS()
Definition miscadmin.h:136

References PrivateRefCountEntry::buffer, BUFFER_LOCK_UNLOCK, BufferLockAttempt(), PrivateRefCountEntry::data, fb(), GetPrivateRefCountEntry(), HOLD_INTERRUPTS, PrivateRefCountData::lockmode, mode, and RESUME_INTERRUPTS.

Referenced by ConditionalLockBuffer(), and GetVictimBuffer().

◆ BufferLockDequeueSelf()

static void BufferLockDequeueSelf ( BufferDesc buf_hdr)
static

Definition at line 6070 of file bufmgr.c.

6071{
6072 bool on_waitlist;
6073
6075
6077 if (on_waitlist)
6078 proclist_delete(&buf_hdr->lock_waiters, MyProcNumber, lwWaitLink);
6079
6080 if (proclist_is_empty(&buf_hdr->lock_waiters) &&
6082 {
6084 }
6085
6086 /* XXX: combine with fetch_and above? */
6088
6089 /* clear waiting state again, nice for debugging */
6090 if (on_waitlist)
6092 else
6093 {
6094 int extraWaits = 0;
6095
6096
6097 /*
6098 * Somebody else dequeued us and has or will wake us up. Deal with the
6099 * superfluous absorption of a wakeup.
6100 */
6101
6102 /*
6103 * Clear BM_LOCK_WAKE_IN_PROGRESS if somebody woke us before we
6104 * removed ourselves - they'll have set it.
6105 */
6107
6108 /*
6109 * Now wait for the scheduled wakeup, otherwise our ->lwWaiting would
6110 * get reset at some inconvenient point later. Most of the time this
6111 * will immediately return.
6112 */
6113 for (;;)
6114 {
6117 break;
6118 extraWaits++;
6119 }
6120
6121 /*
6122 * Fix the process wait semaphore's count for any absorbed wakeups.
6123 */
6124 while (extraWaits-- > 0)
6126 }
6127}
#define BM_LOCK_HAS_WAITERS
@ LW_WS_WAITING
Definition lwlock.h:31
#define proclist_delete(list, procno, link_member)
Definition proclist.h:187
static bool proclist_is_empty(const proclist_head *list)
Definition proclist.h:38

References BM_LOCK_HAS_WAITERS, BM_LOCK_WAKE_IN_PROGRESS, fb(), LockBufHdr(), LW_WS_NOT_WAITING, LW_WS_WAITING, PGPROC::lwWaiting, MyProc, MyProcNumber, pg_atomic_fetch_and_u64(), pg_atomic_read_u64(), PGSemaphoreLock(), PGSemaphoreUnlock(), proclist_delete, proclist_is_empty(), PGPROC::sem, and UnlockBufHdr().

Referenced by BufferLockAcquire().

◆ BufferLockDisown()

static void BufferLockDisown ( Buffer  buffer,
BufferDesc buf_hdr 
)
inlinestatic

Definition at line 6138 of file bufmgr.c.

6139{
6142}
static int BufferLockDisownInternal(Buffer buffer, BufferDesc *buf_hdr)
Definition bufmgr.c:6152

References PrivateRefCountEntry::buffer, BufferLockDisownInternal(), fb(), and RESUME_INTERRUPTS.

Referenced by buffer_stage_common().

◆ BufferLockDisownInternal()

static int BufferLockDisownInternal ( Buffer  buffer,
BufferDesc buf_hdr 
)
inlinestatic

Definition at line 6152 of file bufmgr.c.

6153{
6156
6157 ref = GetPrivateRefCountEntry(buffer, false);
6158 if (ref == NULL)
6159 elog(ERROR, "lock %d is not held", buffer);
6160 mode = ref->data.lockmode;
6161 ref->data.lockmode = BUFFER_LOCK_UNLOCK;
6162
6163 return mode;
6164}
BufferLockMode
Definition bufmgr.h:204
#define ERROR
Definition elog.h:39

References PrivateRefCountEntry::buffer, BUFFER_LOCK_UNLOCK, elog, ERROR, fb(), GetPrivateRefCountEntry(), and mode.

Referenced by BufferLockDisown(), and BufferLockUnlock().

◆ BufferLockHeldByMe()

static bool BufferLockHeldByMe ( BufferDesc buf_hdr)
static

Definition at line 6410 of file bufmgr.c.

6411{
6412 PrivateRefCountEntry *entry =
6414
6415 if (!entry)
6416 return false;
6417 else
6418 return entry->data.lockmode != BUFFER_LOCK_UNLOCK;
6419}
static Buffer BufferDescriptorGetBuffer(const BufferDesc *bdesc)

References BUFFER_LOCK_UNLOCK, BufferDescriptorGetBuffer(), PrivateRefCountEntry::data, fb(), GetPrivateRefCountEntry(), and PrivateRefCountData::lockmode.

Referenced by buffer_stage_common(), BufferIsLockedByMe(), and UnpinBufferNoOwner().

◆ BufferLockHeldByMeInMode()

static bool BufferLockHeldByMeInMode ( BufferDesc buf_hdr,
BufferLockMode  mode 
)
static

Definition at line 6392 of file bufmgr.c.

6393{
6394 PrivateRefCountEntry *entry =
6396
6397 if (!entry)
6398 return false;
6399 else
6400 return entry->data.lockmode == mode;
6401}

References BufferDescriptorGetBuffer(), PrivateRefCountEntry::data, fb(), GetPrivateRefCountEntry(), PrivateRefCountData::lockmode, and mode.

Referenced by BufferIsLockedByMeInMode(), FlushBuffer(), and MarkSharedBufferDirtyHint().

◆ BufferLockProcessRelease()

static void BufferLockProcessRelease ( BufferDesc buf_hdr,
BufferLockMode  mode,
uint64  lockstate 
)
static

Definition at line 6337 of file bufmgr.c.

6338{
6339 bool check_waiters = false;
6340 bool wake_exclusive = false;
6341
6342 /* nobody else can have that kind of lock */
6344
6345 /*
6346 * If we're still waiting for backends to get scheduled, don't wake them
6347 * up again. Otherwise check if we need to look through the waitqueue to
6348 * wake other backends.
6349 */
6352 {
6353 if ((lockstate & BM_LOCK_MASK) == 0)
6354 {
6355 /*
6356 * We released a lock and the lock was, in that moment, free. We
6357 * therefore can wake waiters for any kind of lock.
6358 */
6359 check_waiters = true;
6360 wake_exclusive = true;
6361 }
6363 {
6364 /*
6365 * We released the lock, but another backend still holds a lock.
6366 * We can't have released an exclusive lock, as there couldn't
6367 * have been other lock holders. If we released a share lock, no
6368 * waiters need to be woken up, as there must be other share
6369 * lockers. However, if we held a share-exclusive lock, another
6370 * backend now could acquire a share-exclusive lock.
6371 */
6372 check_waiters = true;
6373 wake_exclusive = false;
6374 }
6375 }
6376
6377 /*
6378 * As waking up waiters requires the spinlock to be acquired, only do so
6379 * if necessary.
6380 */
6381 if (check_waiters)
6383}
static void BufferLockWakeup(BufferDesc *buf_hdr, bool unlocked)
Definition bufmgr.c:6172

References Assert, BM_LOCK_HAS_WAITERS, BM_LOCK_MASK, BM_LOCK_VAL_EXCLUSIVE, BM_LOCK_WAKE_IN_PROGRESS, BUFFER_LOCK_SHARE_EXCLUSIVE, BufferLockWakeup(), fb(), and mode.

Referenced by BufferLockUnlock().

◆ BufferLockQueueSelf()

static void BufferLockQueueSelf ( BufferDesc buf_hdr,
BufferLockMode  mode 
)
static

Definition at line 6030 of file bufmgr.c.

6031{
6032 /*
6033 * If we don't have a PGPROC structure, there's no way to wait. This
6034 * should never occur, since MyProc should only be null during shared
6035 * memory initialization.
6036 */
6037 if (MyProc == NULL)
6038 elog(PANIC, "cannot wait without a PGPROC structure");
6039
6041 elog(PANIC, "queueing for lock while waiting on another one");
6042
6044
6045 /* setting the flag is protected by the spinlock */
6047
6048 /*
6049 * These are currently used both for lwlocks and buffer content locks,
6050 * which is acceptable, although not pretty, because a backend can't wait
6051 * for both types of locks at the same time.
6052 */
6055
6056 proclist_push_tail(&buf_hdr->lock_waiters, MyProcNumber, lwWaitLink);
6057
6058 /* Can release the mutex now */
6060}
static uint64 pg_atomic_fetch_or_u64(volatile pg_atomic_uint64 *ptr, uint64 or_)
Definition atomics.h:560
#define PANIC
Definition elog.h:42
#define proclist_push_tail(list, procno, link_member)
Definition proclist.h:191
uint8 lwWaitMode
Definition proc.h:281

References BM_LOCK_HAS_WAITERS, elog, fb(), LockBufHdr(), LW_WS_NOT_WAITING, LW_WS_WAITING, PGPROC::lwWaiting, PGPROC::lwWaitMode, mode, MyProc, MyProcNumber, PANIC, pg_atomic_fetch_or_u64(), proclist_push_tail, and UnlockBufHdr().

Referenced by BufferLockAcquire().

◆ BufferLockReleaseSub()

static uint64 BufferLockReleaseSub ( BufferLockMode  mode)
inlinestatic

Definition at line 6308 of file bufmgr.c.

6309{
6310 /*
6311 * Turns out that a switch() leads gcc to generate sufficiently worse code
6312 * for this to show up in profiles...
6313 */
6315 return BM_LOCK_VAL_EXCLUSIVE;
6318 else
6319 {
6321 return BM_LOCK_VAL_SHARED;
6322 }
6323
6324 return 0; /* keep compiler quiet */
6325}

References Assert, BM_LOCK_VAL_EXCLUSIVE, BM_LOCK_VAL_SHARE_EXCLUSIVE, BM_LOCK_VAL_SHARED, BUFFER_LOCK_EXCLUSIVE, BUFFER_LOCK_SHARE, BUFFER_LOCK_SHARE_EXCLUSIVE, and mode.

Referenced by BufferLockUnlock().

◆ BufferLockUnlock()

static void BufferLockUnlock ( Buffer  buffer,
BufferDesc buf_hdr 
)
static

Definition at line 5881 of file bufmgr.c.

5882{
5885 uint64 sub;
5886
5888
5889 /*
5890 * Release my hold on lock, after that it can immediately be acquired by
5891 * others, even if we still have to wakeup other waiters.
5892 */
5894
5896
5898
5899 /*
5900 * Now okay to allow cancel/die interrupts.
5901 */
5903}
static uint64 pg_atomic_sub_fetch_u64(volatile pg_atomic_uint64 *ptr, int64 sub_)
Definition atomics.h:578
static void BufferLockProcessRelease(BufferDesc *buf_hdr, BufferLockMode mode, uint64 lockstate)
Definition bufmgr.c:6337
static uint64 BufferLockReleaseSub(BufferLockMode mode)
Definition bufmgr.c:6308

References PrivateRefCountEntry::buffer, BufferLockDisownInternal(), BufferLockProcessRelease(), BufferLockReleaseSub(), fb(), mode, pg_atomic_sub_fetch_u64(), and RESUME_INTERRUPTS.

Referenced by FlushUnlockedBuffer(), MarkDirtyUnpinnedBufferInternal(), ResOwnerReleaseBuffer(), and UnlockBuffer().

◆ BufferLockWakeup()

static void BufferLockWakeup ( BufferDesc buf_hdr,
bool  unlocked 
)
static

Definition at line 6172 of file bufmgr.c.

6173{
6174 bool new_wake_in_progress = false;
6175 bool wake_share_exclusive = true;
6178
6180
6181 /* lock wait list while collecting backends to wake up */
6183
6184 proclist_foreach_modify(iter, &buf_hdr->lock_waiters, lwWaitLink)
6185 {
6186 PGPROC *waiter = GetPGProcByNumber(iter.cur);
6187
6188 /*
6189 * Already woke up a conflicting lock, so skip over this wait list
6190 * entry.
6191 */
6193 continue;
6195 continue;
6196
6197 proclist_delete(&buf_hdr->lock_waiters, iter.cur, lwWaitLink);
6198 proclist_push_tail(&wakeup, iter.cur, lwWaitLink);
6199
6200 /*
6201 * Prevent additional wakeups until retryer gets to run. Backends that
6202 * are just waiting for the lock to become free don't retry
6203 * automatically.
6204 */
6205 new_wake_in_progress = true;
6206
6207 /*
6208 * Signal that the process isn't on the wait list anymore. This allows
6209 * BufferLockDequeueSelf() to remove itself from the waitlist with a
6210 * proclist_delete(), rather than having to check if it has been
6211 * removed from the list.
6212 */
6213 Assert(waiter->lwWaiting == LW_WS_WAITING);
6215
6216 /*
6217 * Don't wakeup further waiters after waking a conflicting waiter.
6218 */
6219 if (waiter->lwWaitMode == BUFFER_LOCK_SHARE)
6220 {
6221 /*
6222 * Share locks conflict with exclusive locks.
6223 */
6224 wake_exclusive = false;
6225 }
6226 else if (waiter->lwWaitMode == BUFFER_LOCK_SHARE_EXCLUSIVE)
6227 {
6228 /*
6229 * Share-exclusive locks conflict with share-exclusive and
6230 * exclusive locks.
6231 */
6232 wake_exclusive = false;
6233 wake_share_exclusive = false;
6234 }
6235 else if (waiter->lwWaitMode == BUFFER_LOCK_EXCLUSIVE)
6236 {
6237 /*
6238 * Exclusive locks conflict with all other locks, there's no point
6239 * in waking up anybody else.
6240 */
6241 break;
6242 }
6243 }
6244
6246
6247 /* unset required flags, and release lock, in one fell swoop */
6248 {
6251
6253 while (true)
6254 {
6256
6257 /* compute desired flags */
6258
6261 else
6263
6264 if (proclist_is_empty(&buf_hdr->lock_waiters))
6266
6267 desired_state &= ~BM_LOCKED; /* release lock */
6268
6271 break;
6272 }
6273 }
6274
6275 /* Awaken any waiters I removed from the queue. */
6276 proclist_foreach_modify(iter, &wakeup, lwWaitLink)
6277 {
6278 PGPROC *waiter = GetPGProcByNumber(iter.cur);
6279
6280 proclist_delete(&wakeup, iter.cur, lwWaitLink);
6281
6282 /*
6283 * Guarantee that lwWaiting being unset only becomes visible once the
 6284 * unlink from the list has completed. Otherwise the target backend
6285 * could be woken up for other reason and enqueue for a new lock - if
6286 * that happens before the list unlink happens, the list would end up
6287 * being corrupted.
6288 *
6289 * The barrier pairs with the LockBufHdr() when enqueuing for another
6290 * lock.
6291 */
6293 waiter->lwWaiting = LW_WS_NOT_WAITING;
6294 PGSemaphoreUnlock(waiter->sem);
6295 }
6296}
#define pg_write_barrier()
Definition atomics.h:155
@ LW_WS_PENDING_WAKEUP
Definition lwlock.h:32
#define GetPGProcByNumber(n)
Definition proc.h:501
static void proclist_init(proclist_head *list)
Definition proclist.h:29
#define proclist_foreach_modify(iter, lhead, link_member)
Definition proclist.h:206
Definition proc.h:176
static TimestampTz wakeup[NUM_WALRCV_WAKEUPS]

References Assert, BM_LOCK_HAS_WAITERS, BM_LOCK_WAKE_IN_PROGRESS, BUFFER_LOCK_EXCLUSIVE, BUFFER_LOCK_SHARE, BUFFER_LOCK_SHARE_EXCLUSIVE, proclist_mutable_iter::cur, fb(), GetPGProcByNumber, LockBufHdr(), LW_WS_NOT_WAITING, LW_WS_PENDING_WAKEUP, LW_WS_WAITING, PGPROC::lwWaiting, PGPROC::lwWaitMode, pg_atomic_compare_exchange_u64(), pg_atomic_read_u64(), pg_write_barrier, PGSemaphoreUnlock(), proclist_delete, proclist_foreach_modify, proclist_init(), proclist_is_empty(), proclist_push_tail, PGPROC::sem, and wakeup.

Referenced by BufferLockProcessRelease().

◆ BufferSetHintBits16()

bool BufferSetHintBits16 ( uint16 ptr,
uint16  val,
Buffer  buffer 
)

Definition at line 6960 of file bufmgr.c.

6961{
6964#ifdef USE_ASSERT_CHECKING
6965 char *page;
6966
6967 /* verify that the address is on the page */
6968 page = BufferGetPage(buffer);
6969 Assert((char *) ptr >= page && (char *) ptr < (page + BLCKSZ));
6970#endif
6971
6972 if (BufferIsLocal(buffer))
6973 {
6974 *ptr = val;
6975
6976 MarkLocalBufferDirty(buffer);
6977
6978 return true;
6979 }
6980
6981 buf_hdr = GetBufferDescriptor(buffer - 1);
6982
6984 {
6985 *ptr = val;
6986
6988
6989 return true;
6990 }
6991
6992 return false;
6993}
static void MarkSharedBufferDirtyHint(Buffer buffer, BufferDesc *bufHdr, uint64 lockstate, bool buffer_std)
Definition bufmgr.c:5563
long val
Definition informix.c:689
void MarkLocalBufferDirty(Buffer buffer)
Definition localbuf.c:491

References Assert, PrivateRefCountEntry::buffer, BufferGetPage(), BufferIsLocal, fb(), GetBufferDescriptor(), MarkLocalBufferDirty(), MarkSharedBufferDirtyHint(), SharedBufferBeginSetHintBits(), and val.

Referenced by SetHintBitsExt().

◆ BufferSync()

static void BufferSync ( int  flags)
static

Definition at line 3463 of file bufmgr.c.

3464{
3466 int buf_id;
3467 int num_to_scan;
3468 int num_spaces;
3469 int num_processed;
3470 int num_written;
3472 Oid last_tsid;
3474 int i;
3475 uint64 mask = BM_DIRTY;
3477
3478 /*
3479 * Unless this is a shutdown checkpoint or we have been explicitly told,
3480 * we write only permanent, dirty buffers. But at shutdown or end of
3481 * recovery, we write all dirty buffers.
3482 */
3485 mask |= BM_PERMANENT;
3486
3487 /*
3488 * Loop over all buffers, and mark the ones that need to be written with
3489 * BM_CHECKPOINT_NEEDED. Count them as we go (num_to_scan), so that we
3490 * can estimate how much work needs to be done.
3491 *
3492 * This allows us to write only those pages that were dirty when the
3493 * checkpoint began, and not those that get dirtied while it proceeds.
3494 * Whenever a page with BM_CHECKPOINT_NEEDED is written out, either by us
3495 * later in this function, or by normal backends or the bgwriter cleaning
3496 * scan, the flag is cleared. Any buffer dirtied after this point won't
3497 * have the flag set.
3498 *
3499 * Note that if we fail to write some buffer, we may leave buffers with
3500 * BM_CHECKPOINT_NEEDED still set. This is OK since any such buffer would
3501 * certainly need to be written for the next checkpoint attempt, too.
3502 */
3503 num_to_scan = 0;
3504 for (buf_id = 0; buf_id < NBuffers; buf_id++)
3505 {
3507 uint64 set_bits = 0;
3508
3509 /*
3510 * Header spinlock is enough to examine BM_DIRTY, see comment in
3511 * SyncOneBuffer.
3512 */
3514
3515 if ((buf_state & mask) == mask)
3516 {
3517 CkptSortItem *item;
3518
3520
3521 item = &CkptBufferIds[num_to_scan++];
3522 item->buf_id = buf_id;
3523 item->tsId = bufHdr->tag.spcOid;
3524 item->relNumber = BufTagGetRelNumber(&bufHdr->tag);
3525 item->forkNum = BufTagGetForkNum(&bufHdr->tag);
3526 item->blockNum = bufHdr->tag.blockNum;
3527 }
3528
3530 set_bits, 0,
3531 0);
3532
3533 /* Check for barrier events in case NBuffers is large. */
3536 }
3537
3538 if (num_to_scan == 0)
3539 return; /* nothing to do */
3540
3542
3544
3545 /*
3546 * Sort buffers that need to be written to reduce the likelihood of random
3547 * IO. The sorting is also important for the implementation of balancing
3548 * writes between tablespaces. Without balancing writes we'd potentially
3549 * end up writing to the tablespaces one-by-one; possibly overloading the
3550 * underlying system.
3551 */
3553
3554 num_spaces = 0;
3555
3556 /*
3557 * Allocate progress status for each tablespace with buffers that need to
3558 * be flushed. This requires the to-be-flushed array to be sorted.
3559 */
3561 for (i = 0; i < num_to_scan; i++)
3562 {
3563 CkptTsStatus *s;
3564 Oid cur_tsid;
3565
3567
3568 /*
3569 * Grow array of per-tablespace status structs, every time a new
3570 * tablespace is found.
3571 */
3573 {
3574 Size sz;
3575
3576 num_spaces++;
3577
3578 /*
3579 * Not worth adding grow-by-power-of-2 logic here - even with a
3580 * few hundred tablespaces this should be fine.
3581 */
3582 sz = sizeof(CkptTsStatus) * num_spaces;
3583
3584 if (per_ts_stat == NULL)
3586 else
3588
3589 s = &per_ts_stat[num_spaces - 1];
3590 memset(s, 0, sizeof(*s));
3591 s->tsId = cur_tsid;
3592
3593 /*
3594 * The first buffer in this tablespace. As CkptBufferIds is sorted
3595 * by tablespace all (s->num_to_scan) buffers in this tablespace
3596 * will follow afterwards.
3597 */
3598 s->index = i;
3599
3600 /*
3601 * progress_slice will be determined once we know how many buffers
3602 * are in each tablespace, i.e. after this loop.
3603 */
3604
3606 }
3607 else
3608 {
3609 s = &per_ts_stat[num_spaces - 1];
3610 }
3611
3612 s->num_to_scan++;
3613
3614 /* Check for barrier events. */
3617 }
3618
3619 Assert(num_spaces > 0);
3620
3621 /*
3622 * Build a min-heap over the write-progress in the individual tablespaces,
3623 * and compute how large a portion of the total progress a single
3624 * processed buffer is.
3625 */
3628 NULL);
3629
3630 for (i = 0; i < num_spaces; i++)
3631 {
3633
3634 ts_stat->progress_slice = (float8) num_to_scan / ts_stat->num_to_scan;
3635
3637 }
3638
3640
3641 /*
3642 * Iterate through to-be-checkpointed buffers and write the ones (still)
3643 * marked with BM_CHECKPOINT_NEEDED. The writes are balanced between
3644 * tablespaces; otherwise the sorting would lead to only one tablespace
3645 * receiving writes at a time, making inefficient use of the hardware.
3646 */
3647 num_processed = 0;
3648 num_written = 0;
3649 while (!binaryheap_empty(ts_heap))
3650 {
3654
3655 buf_id = CkptBufferIds[ts_stat->index].buf_id;
3656 Assert(buf_id != -1);
3657
3658 bufHdr = GetBufferDescriptor(buf_id);
3659
3660 num_processed++;
3661
3662 /*
3663 * We don't need to acquire the lock here, because we're only looking
3664 * at a single bit. It's possible that someone else writes the buffer
3665 * and clears the flag right after we check, but that doesn't matter
3666 * since SyncOneBuffer will then do nothing. However, there is a
3667 * further race condition: it's conceivable that between the time we
3668 * examine the bit here and the time SyncOneBuffer acquires the lock,
3669 * someone else not only wrote the buffer but replaced it with another
3670 * page and dirtied it. In that improbable case, SyncOneBuffer will
3671 * write the buffer though we didn't need to. It doesn't seem worth
3672 * guarding against this, though.
3673 */
3675 {
3676 if (SyncOneBuffer(buf_id, false, &wb_context) & BUF_WRITTEN)
3677 {
3680 num_written++;
3681 }
3682 }
3683
3684 /*
3685 * Measure progress independent of actually having to flush the buffer
3686 * - otherwise writing becomes unbalanced.
3687 */
3688 ts_stat->progress += ts_stat->progress_slice;
3689 ts_stat->num_scanned++;
3690 ts_stat->index++;
3691
3692 /* Have all the buffers from the tablespace been processed? */
3693 if (ts_stat->num_scanned == ts_stat->num_to_scan)
3694 {
3696 }
3697 else
3698 {
3699 /* update heap with the new progress */
3701 }
3702
3703 /*
3704 * Sleep to throttle our I/O rate.
3705 *
3706 * (This will check for barrier events even if it doesn't sleep.)
3707 */
3708 CheckpointWriteDelay(flags, (double) num_processed / num_to_scan);
3709 }
3710
3711 /*
3712 * Issue all pending flushes. Only checkpointer calls BufferSync(), so
3713 * IOContext will always be IOCONTEXT_NORMAL.
3714 */
3716
3718 per_ts_stat = NULL;
3720
3721 /*
3722 * Update checkpoint statistics. As noted above, this doesn't include
3723 * buffers written by other backends or bgwriter scan.
3724 */
3726
3728}
void binaryheap_build(binaryheap *heap)
Definition binaryheap.c:136
void binaryheap_replace_first(binaryheap *heap, bh_node_type d)
Definition binaryheap.c:253
bh_node_type binaryheap_first(binaryheap *heap)
Definition binaryheap.c:175
bh_node_type binaryheap_remove_first(binaryheap *heap)
Definition binaryheap.c:190
void binaryheap_free(binaryheap *heap)
Definition binaryheap.c:73
void binaryheap_add_unordered(binaryheap *heap, bh_node_type d)
Definition binaryheap.c:114
binaryheap * binaryheap_allocate(int capacity, binaryheap_comparator compare, void *arg)
Definition binaryheap.c:37
#define binaryheap_empty(h)
Definition binaryheap.h:65
CkptSortItem * CkptBufferIds
Definition buf_init.c:26
static RelFileNumber BufTagGetRelNumber(const BufferTag *tag)
#define BM_CHECKPOINT_NEEDED
static int ts_ckpt_progress_comparator(Datum a, Datum b, void *arg)
Definition bufmgr.c:7438
int checkpoint_flush_after
Definition bufmgr.c:223
void WritebackContextInit(WritebackContext *context, int *max_pending)
Definition bufmgr.c:7461
void IssuePendingWritebacks(WritebackContext *wb_context, IOContext io_context)
Definition bufmgr.c:7523
double float8
Definition c.h:716
size_t Size
Definition c.h:691
void CheckpointWriteDelay(int flags, double progress)
volatile sig_atomic_t ProcSignalBarrierPending
Definition globals.c:40
void * repalloc(void *pointer, Size size)
Definition mcxt.c:1632
void pfree(void *pointer)
Definition mcxt.c:1616
void * palloc(Size size)
Definition mcxt.c:1387
PgStat_CheckpointerStats PendingCheckpointerStats
static Datum PointerGetDatum(const void *X)
Definition postgres.h:342
static Pointer DatumGetPointer(Datum X)
Definition postgres.h:332
#define InvalidOid
unsigned int Oid
void ProcessProcSignalBarrier(void)
Definition procsignal.c:502
int ckpt_bufs_written
Definition xlog.h:178
ForkNumber forkNum
RelFileNumber relNumber
BlockNumber blockNum
float8 progress_slice
Definition bufmgr.c:164
int num_to_scan
Definition bufmgr.c:167
PgStat_Counter buffers_written
Definition pgstat.h:270
CheckpointStatsData CheckpointStats
Definition xlog.c:213
#define CHECKPOINT_FLUSH_UNLOGGED
Definition xlog.h:154
#define CHECKPOINT_END_OF_RECOVERY
Definition xlog.h:151
#define CHECKPOINT_IS_SHUTDOWN
Definition xlog.h:150

References Assert, binaryheap_add_unordered(), binaryheap_allocate(), binaryheap_build(), binaryheap_empty, binaryheap_first(), binaryheap_free(), binaryheap_remove_first(), binaryheap_replace_first(), CkptSortItem::blockNum, BM_CHECKPOINT_NEEDED, BM_DIRTY, BM_PERMANENT, CkptSortItem::buf_id, BUF_WRITTEN, PgStat_CheckpointerStats::buffers_written, BufTagGetForkNum(), BufTagGetRelNumber(), CHECKPOINT_END_OF_RECOVERY, checkpoint_flush_after, CHECKPOINT_FLUSH_UNLOGGED, CHECKPOINT_IS_SHUTDOWN, CheckpointStats, CheckpointWriteDelay(), CheckpointStatsData::ckpt_bufs_written, CkptBufferIds, DatumGetPointer(), fb(), CkptSortItem::forkNum, GetBufferDescriptor(), i, CkptTsStatus::index, InvalidOid, IOCONTEXT_NORMAL, IssuePendingWritebacks(), LockBufHdr(), NBuffers, CkptTsStatus::num_to_scan, palloc(), PendingCheckpointerStats, pfree(), pg_atomic_read_u64(), PointerGetDatum(), ProcessProcSignalBarrier(), ProcSignalBarrierPending, CkptTsStatus::progress_slice, CkptSortItem::relNumber, repalloc(), SyncOneBuffer(), ts_ckpt_progress_comparator(), CkptTsStatus::tsId, CkptSortItem::tsId, UnlockBufHdrExt(), and WritebackContextInit().

Referenced by CheckPointBuffers().

◆ buffertag_comparator()

static int buffertag_comparator ( const BufferTag ba,
const BufferTag bb 
)
inlinestatic

Definition at line 7373 of file bufmgr.c.

7374{
7375 int ret;
7378
7381
7383
7384 if (ret != 0)
7385 return ret;
7386
7388 return -1;
7390 return 1;
7391
7392 if (ba->blockNum < bb->blockNum)
7393 return -1;
7394 if (ba->blockNum > bb->blockNum)
7395 return 1;
7396
7397 return 0;
7398}
static int rlocator_comparator(const void *p1, const void *p2)
Definition bufmgr.c:7274

References BufTagGetForkNum(), BufTagGetRelFileLocator(), fb(), and rlocator_comparator().

◆ CheckBufferIsPinnedOnce()

void CheckBufferIsPinnedOnce ( Buffer  buffer)

Definition at line 6504 of file bufmgr.c.

6505{
6506 if (BufferIsLocal(buffer))
6507 {
6508 if (LocalRefCount[-buffer - 1] != 1)
6509 elog(ERROR, "incorrect local pin count: %d",
6510 LocalRefCount[-buffer - 1]);
6511 }
6512 else
6513 {
6514 if (GetPrivateRefCount(buffer) != 1)
6515 elog(ERROR, "incorrect local pin count: %d",
6516 GetPrivateRefCount(buffer));
6517 }
6518}

References PrivateRefCountEntry::buffer, BufferIsLocal, elog, ERROR, GetPrivateRefCount(), and LocalRefCount.

Referenced by GetVictimBuffer(), lazy_scan_heap(), and LockBufferForCleanup().

◆ CheckForBufferLeaks()

static void CheckForBufferLeaks ( void  )
static

Definition at line 4174 of file bufmgr.c.

4175{
4176#ifdef USE_ASSERT_CHECKING
4177 int RefCountErrors = 0;
4179 int i;
4180 char *s;
4181
4182 /* check the array */
4183 for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
4184 {
4186 {
4187 res = &PrivateRefCountArray[i];
4188
4190 elog(WARNING, "buffer refcount leak: %s", s);
4191 pfree(s);
4192
4194 }
4195 }
4196
4197 /* if necessary search the hash */
4199 {
4200 refcount_iterator iter;
4201
4203 while ((res = refcount_iterate(PrivateRefCountHash, &iter)) != NULL)
4204 {
4206 elog(WARNING, "buffer refcount leak: %s", s);
4207 pfree(s);
4209 }
4210 }
4211
4212 Assert(RefCountErrors == 0);
4213#endif
4214}
#define InvalidBuffer
Definition buf.h:25
static Buffer PrivateRefCountArrayKeys[REFCOUNT_ARRAY_ENTRIES]
Definition bufmgr.c:263
static refcount_hash * PrivateRefCountHash
Definition bufmgr.c:265
char * DebugPrintBufferRefcount(Buffer buffer)
Definition bufmgr.c:4300
#define REFCOUNT_ARRAY_ENTRIES
Definition bufmgr.c:145
static struct PrivateRefCountEntry PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES]
Definition bufmgr.c:264

References Assert, PrivateRefCountEntry::buffer, DebugPrintBufferRefcount(), elog, fb(), i, InvalidBuffer, pfree(), PrivateRefCountArray, PrivateRefCountArrayKeys, PrivateRefCountHash, PrivateRefCountOverflowed, REFCOUNT_ARRAY_ENTRIES, and WARNING.

Referenced by AtEOXact_Buffers(), and AtProcExit_Buffers().

◆ CheckPointBuffers()

void CheckPointBuffers ( int  flags)

Definition at line 4343 of file bufmgr.c.

/*
 * CheckPointBuffers
 *		Write out dirty shared buffers as part of a checkpoint.
 *
 * 'flags' is the checkpoint request flags bitmask (CHECKPOINT_* values);
 * it is passed straight through to BufferSync(), which does all the work.
 */
4344{
4345 BufferSync(flags);
4346}
static void BufferSync(int flags)
Definition bufmgr.c:3463

References BufferSync().

Referenced by CheckPointGuts().

◆ CheckReadBuffersOperation()

static void CheckReadBuffersOperation ( ReadBuffersOperation operation,
bool  is_complete 
)
static

Definition at line 1637 of file bufmgr.c.

1638{
1639#ifdef USE_ASSERT_CHECKING
1640 Assert(operation->nblocks_done <= operation->nblocks);
1641 Assert(!is_complete || operation->nblocks == operation->nblocks_done);
1642
1643 for (int i = 0; i < operation->nblocks; i++)
1644 {
1645 Buffer buffer = operation->buffers[i];
1646 BufferDesc *buf_hdr = BufferIsLocal(buffer) ?
1647 GetLocalBufferDescriptor(-buffer - 1) :
1648 GetBufferDescriptor(buffer - 1);
1649
1650 Assert(BufferGetBlockNumber(buffer) == operation->blocknum + i);
1652
1653 if (i < operation->nblocks_done)
1655 }
1656#endif
1657}

References Assert, ReadBuffersOperation::blocknum, BM_TAG_VALID, BM_VALID, PrivateRefCountEntry::buffer, BufferGetBlockNumber(), BufferIsLocal, ReadBuffersOperation::buffers, fb(), GetBufferDescriptor(), GetLocalBufferDescriptor(), i, ReadBuffersOperation::nblocks, ReadBuffersOperation::nblocks_done, and pg_atomic_read_u64().

Referenced by StartReadBuffersImpl(), and WaitReadBuffers().

◆ ckpt_buforder_comparator()

static int ckpt_buforder_comparator ( const CkptSortItem a,
const CkptSortItem b 
)
inlinestatic

Definition at line 7407 of file bufmgr.c.

7408{
7409 /* compare tablespace */
7410 if (a->tsId < b->tsId)
7411 return -1;
7412 else if (a->tsId > b->tsId)
7413 return 1;
7414 /* compare relation */
7415 if (a->relNumber < b->relNumber)
7416 return -1;
7417 else if (a->relNumber > b->relNumber)
7418 return 1;
7419 /* compare fork */
7420 else if (a->forkNum < b->forkNum)
7421 return -1;
7422 else if (a->forkNum > b->forkNum)
7423 return 1;
7424 /* compare block number */
7425 else if (a->blockNum < b->blockNum)
7426 return -1;
7427 else if (a->blockNum > b->blockNum)
7428 return 1;
7429 /* equal page IDs are unlikely, but not impossible */
7430 return 0;
7431}
int b
Definition isn.c:74
int a
Definition isn.c:73

References a, and b.

◆ ConditionalLockBuffer()

bool ConditionalLockBuffer ( Buffer  buffer)

Definition at line 6484 of file bufmgr.c.

6485{
6486 BufferDesc *buf;
6487
6488 Assert(BufferIsPinned(buffer));
6489 if (BufferIsLocal(buffer))
6490 return true; /* act as though we got it */
6491
6492 buf = GetBufferDescriptor(buffer - 1);
6493
6495}
static bool BufferLockConditional(Buffer buffer, BufferDesc *buf_hdr, BufferLockMode mode)
Definition bufmgr.c:5917

References Assert, buf, PrivateRefCountEntry::buffer, BUFFER_LOCK_EXCLUSIVE, BufferIsLocal, BufferIsPinned, BufferLockConditional(), and GetBufferDescriptor().

Referenced by _bt_conditionallockbuf(), BloomNewBuffer(), ConditionalLockBufferForCleanup(), GinNewBuffer(), gistNewBuffer(), RelationGetBufferForTuple(), spgdoinsert(), SpGistGetBuffer(), SpGistNewBuffer(), and SpGistUpdateMetaPage().

◆ ConditionalLockBufferForCleanup()

bool ConditionalLockBufferForCleanup ( Buffer  buffer)

Definition at line 6710 of file bufmgr.c.

6711{
6714 refcount;
6715
6716 Assert(BufferIsValid(buffer));
6717
6718 /* see AIO related comment in LockBufferForCleanup() */
6719
6720 if (BufferIsLocal(buffer))
6721 {
6722 refcount = LocalRefCount[-buffer - 1];
6723 /* There should be exactly one pin */
6724 Assert(refcount > 0);
6725 if (refcount != 1)
6726 return false;
6727 /* Nobody else to wait for */
6728 return true;
6729 }
6730
6731 /* There should be exactly one local pin */
6732 refcount = GetPrivateRefCount(buffer);
6733 Assert(refcount);
6734 if (refcount != 1)
6735 return false;
6736
6737 /* Try to acquire lock */
6738 if (!ConditionalLockBuffer(buffer))
6739 return false;
6740
6741 bufHdr = GetBufferDescriptor(buffer - 1);
6744
6745 Assert(refcount > 0);
6746 if (refcount == 1)
6747 {
6748 /* Successfully acquired exclusive lock with pincount 1 */
6750 return true;
6751 }
6752
6753 /* Failed, so release the lock */
6756 return false;
6757}
bool ConditionalLockBuffer(Buffer buffer)
Definition bufmgr.c:6484
static void LockBuffer(Buffer buffer, BufferLockMode mode)
Definition bufmgr.h:332

References Assert, BUF_STATE_GET_REFCOUNT, PrivateRefCountEntry::buffer, BUFFER_LOCK_UNLOCK, BufferIsLocal, BufferIsValid(), ConditionalLockBuffer(), fb(), GetBufferDescriptor(), GetPrivateRefCount(), LocalRefCount, LockBuffer(), LockBufHdr(), and UnlockBufHdr().

Referenced by _hash_finish_split(), _hash_getbuf_with_condlock_cleanup(), heap_page_prune_opt(), and lazy_scan_heap().

◆ CreateAndCopyRelationData()

void CreateAndCopyRelationData ( RelFileLocator  src_rlocator,
RelFileLocator  dst_rlocator,
bool  permanent 
)

Definition at line 5381 of file bufmgr.c.

5383{
5384 char relpersistence;
5387
5388 /* Set the relpersistence. */
5389 relpersistence = permanent ?
5391
5394
5395 /*
5396 * Create and copy all forks of the relation. During create database we
5397 * have a separate cleanup mechanism which deletes complete database
5398 * directory. Therefore, each individual relation doesn't need to be
5399 * registered for cleanup.
5400 */
5401 RelationCreateStorage(dst_rlocator, relpersistence, false);
5402
5403 /* copy main fork. */
5405 permanent);
5406
5407 /* copy those extra forks that exist */
5408 for (ForkNumber forkNum = MAIN_FORKNUM + 1;
5409 forkNum <= MAX_FORKNUM; forkNum++)
5410 {
5411 if (smgrexists(src_rel, forkNum))
5412 {
5413 smgrcreate(dst_rel, forkNum, false);
5414
5415 /*
5416 * WAL log creation if the relation is persistent, or this is the
5417 * init fork of an unlogged relation.
5418 */
5419 if (permanent || forkNum == INIT_FORKNUM)
5420 log_smgrcreate(&dst_rlocator, forkNum);
5421
5422 /* Copy a fork's data, block by block. */
5424 permanent);
5425 }
5426 }
5427}
static void RelationCopyStorageUsingBuffer(RelFileLocator srclocator, RelFileLocator dstlocator, ForkNumber forkNum, bool permanent)
Definition bufmgr.c:5267
@ MAIN_FORKNUM
Definition relpath.h:58
#define MAX_FORKNUM
Definition relpath.h:70
SMgrRelation smgropen(RelFileLocator rlocator, ProcNumber backend)
Definition smgr.c:240
void smgrcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo)
Definition smgr.c:481
bool smgrexists(SMgrRelation reln, ForkNumber forknum)
Definition smgr.c:462
SMgrRelation RelationCreateStorage(RelFileLocator rlocator, char relpersistence, bool register_delete)
Definition storage.c:122
void log_smgrcreate(const RelFileLocator *rlocator, ForkNumber forkNum)
Definition storage.c:187

References fb(), INIT_FORKNUM, INVALID_PROC_NUMBER, log_smgrcreate(), MAIN_FORKNUM, MAX_FORKNUM, RelationCopyStorageUsingBuffer(), RelationCreateStorage(), smgrcreate(), smgrexists(), and smgropen().

Referenced by CreateDatabaseUsingWalLog().

◆ DebugPrintBufferRefcount()

char * DebugPrintBufferRefcount ( Buffer  buffer)

Definition at line 4300 of file bufmgr.c.

4301{
4302 BufferDesc *buf;
4304 char *result;
4305 ProcNumber backend;
4307
4308 Assert(BufferIsValid(buffer));
4309 if (BufferIsLocal(buffer))
4310 {
4311 buf = GetLocalBufferDescriptor(-buffer - 1);
4312 loccount = LocalRefCount[-buffer - 1];
4313 backend = MyProcNumber;
4314 }
4315 else
4316 {
4317 buf = GetBufferDescriptor(buffer - 1);
4318 loccount = GetPrivateRefCount(buffer);
4319 backend = INVALID_PROC_NUMBER;
4320 }
4321
4322 /* theoretically we should lock the bufHdr here */
4323 buf_state = pg_atomic_read_u64(&buf->state);
4324
4325 result = psprintf("[%03d] (rel=%s, blockNum=%u, flags=0x%" PRIx64 ", refcount=%u %d)",
4326 buffer,
4328 BufTagGetForkNum(&buf->tag)).str,
4329 buf->tag.blockNum, buf_state & BUF_FLAG_MASK,
4331 return result;
4332}
#define BUF_FLAG_MASK
char * psprintf(const char *fmt,...)
Definition psprintf.c:43

References Assert, buf, BUF_FLAG_MASK, BUF_STATE_GET_REFCOUNT, PrivateRefCountEntry::buffer, BufferIsLocal, BufferIsValid(), BufTagGetForkNum(), BufTagGetRelFileLocator(), fb(), GetBufferDescriptor(), GetLocalBufferDescriptor(), GetPrivateRefCount(), INVALID_PROC_NUMBER, LocalRefCount, MyProcNumber, pg_atomic_read_u64(), psprintf(), and relpathbackend.

Referenced by buffer_call_start_io(), buffer_call_terminate_io(), CheckForBufferLeaks(), CheckForLocalBufferLeaks(), and ResOwnerPrintBuffer().

◆ DropDatabaseBuffers()

void DropDatabaseBuffers ( Oid  dbid)

Definition at line 5034 of file bufmgr.c.

5035{
5036 int i;
5037
5038 /*
5039 * We needn't consider local buffers, since by assumption the target
5040 * database isn't our own.
5041 */
5042
5043 for (i = 0; i < NBuffers; i++)
5044 {
5046
5047 /*
5048 * As in DropRelationBuffers, an unlocked precheck should be safe and
5049 * saves some cycles.
5050 */
5051 if (bufHdr->tag.dbOid != dbid)
5052 continue;
5053
5055 if (bufHdr->tag.dbOid == dbid)
5056 InvalidateBuffer(bufHdr); /* releases spinlock */
5057 else
5059 }
5060}
static void InvalidateBuffer(BufferDesc *buf)
Definition bufmgr.c:2283

References fb(), GetBufferDescriptor(), i, InvalidateBuffer(), LockBufHdr(), NBuffers, and UnlockBufHdr().

Referenced by createdb_failure_callback(), dbase_redo(), dropdb(), and movedb().

◆ DropRelationBuffers()

void DropRelationBuffers ( SMgrRelation  smgr_reln,
ForkNumber forkNum,
int  nforks,
BlockNumber firstDelBlock 
)

Definition at line 4684 of file bufmgr.c.

4686{
4687 int i;
4688 int j;
4689 RelFileLocatorBackend rlocator;
4692
4693 rlocator = smgr_reln->smgr_rlocator;
4694
4695 /* If it's a local relation, it's localbuf.c's problem. */
4696 if (RelFileLocatorBackendIsTemp(rlocator))
4697 {
4698 if (rlocator.backend == MyProcNumber)
4699 DropRelationLocalBuffers(rlocator.locator, forkNum, nforks,
4701
4702 return;
4703 }
4704
4705 /*
4706 * To remove all the pages of the specified relation forks from the buffer
4707 * pool, we need to scan the entire buffer pool but we can optimize it by
4708 * finding the buffers from BufMapping table provided we know the exact
4709 * size of each fork of the relation. The exact size is required to ensure
4710 * that we don't leave any buffer for the relation being dropped as
4711 * otherwise the background writer or checkpointer can lead to a PANIC
4712 * error while flushing buffers corresponding to files that don't exist.
4713 *
4714 * To know the exact size, we rely on the size cached for each fork by us
4715 * during recovery which limits the optimization to recovery and on
4716 * standbys but we can easily extend it once we have shared cache for
4717 * relation size.
4718 *
4719 * In recovery, we cache the value returned by the first lseek(SEEK_END)
4720 * and the future writes keeps the cached value up-to-date. See
4721 * smgrextend. It is possible that the value of the first lseek is smaller
4722 * than the actual number of existing blocks in the file due to buggy
4723 * Linux kernels that might not have accounted for the recent write. But
4724 * that should be fine because there must not be any buffers after that
4725 * file size.
4726 */
4727 for (i = 0; i < nforks; i++)
4728 {
4729 /* Get the number of blocks for a relation's fork */
4731
4733 {
4735 break;
4736 }
4737
4738 /* calculate the number of blocks to be invalidated */
4740 }
4741
4742 /*
4743 * We apply the optimization iff the total number of blocks to invalidate
4744 * is below the BUF_DROP_FULL_SCAN_THRESHOLD.
4745 */
4748 {
4749 for (j = 0; j < nforks; j++)
4750 FindAndDropRelationBuffers(rlocator.locator, forkNum[j],
4752 return;
4753 }
4754
4755 for (i = 0; i < NBuffers; i++)
4756 {
4758
4759 /*
4760 * We can make this a tad faster by prechecking the buffer tag before
4761 * we attempt to lock the buffer; this saves a lot of lock
4762 * acquisitions in typical cases. It should be safe because the
4763 * caller must have AccessExclusiveLock on the relation, or some other
4764 * reason to be certain that no one is loading new pages of the rel
4765 * into the buffer pool. (Otherwise we might well miss such pages
4766 * entirely.) Therefore, while the tag might be changing while we
4767 * look at it, it can't be changing *to* a value we care about, only
4768 * *away* from such a value. So false negatives are impossible, and
4769 * false positives are safe because we'll recheck after getting the
4770 * buffer lock.
4771 *
4772 * We could check forkNum and blockNum as well as the rlocator, but
4773 * the incremental win from doing so seems small.
4774 */
4775 if (!BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator.locator))
4776 continue;
4777
4779
4780 for (j = 0; j < nforks; j++)
4781 {
4782 if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator.locator) &&
4783 BufTagGetForkNum(&bufHdr->tag) == forkNum[j] &&
4784 bufHdr->tag.blockNum >= firstDelBlock[j])
4785 {
4786 InvalidateBuffer(bufHdr); /* releases spinlock */
4787 break;
4788 }
4789 }
4790 if (j >= nforks)
4792 }
4793}
#define InvalidBlockNumber
Definition block.h:33
static bool BlockNumberIsValid(BlockNumber blockNumber)
Definition block.h:71
static bool BufTagMatchesRelFileLocator(const BufferTag *tag, const RelFileLocator *rlocator)
#define BUF_DROP_FULL_SCAN_THRESHOLD
Definition bufmgr.c:95
static void FindAndDropRelationBuffers(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber nForkBlock, BlockNumber firstDelBlock)
Definition bufmgr.c:4974
int j
Definition isn.c:78
void DropRelationLocalBuffers(RelFileLocator rlocator, ForkNumber *forkNum, int nforks, BlockNumber *firstDelBlock)
Definition localbuf.c:665
#define RelFileLocatorBackendIsTemp(rlocator)
BlockNumber smgrnblocks_cached(SMgrRelation reln, ForkNumber forknum)
Definition smgr.c:847

References RelFileLocatorBackend::backend, BlockNumberIsValid(), BUF_DROP_FULL_SCAN_THRESHOLD, BufTagGetForkNum(), BufTagMatchesRelFileLocator(), DropRelationLocalBuffers(), fb(), FindAndDropRelationBuffers(), GetBufferDescriptor(), i, InvalidateBuffer(), InvalidBlockNumber, j, RelFileLocatorBackend::locator, LockBufHdr(), MAX_FORKNUM, MyProcNumber, NBuffers, RelFileLocatorBackendIsTemp, smgrnblocks_cached(), and UnlockBufHdr().

Referenced by smgrtruncate().

◆ DropRelationsAllBuffers()

void DropRelationsAllBuffers ( SMgrRelation smgr_reln,
int  nlocators 
)

Definition at line 4804 of file bufmgr.c.

4805{
4806 int i;
4807 int n = 0;
4808 SMgrRelation *rels;
4809 BlockNumber (*block)[MAX_FORKNUM + 1];
4812 bool cached = true;
4813 bool use_bsearch;
4814
4815 if (nlocators == 0)
4816 return;
4817
4818 rels = palloc_array(SMgrRelation, nlocators); /* non-local relations */
4819
4820 /* If it's a local relation, it's localbuf.c's problem. */
4821 for (i = 0; i < nlocators; i++)
4822 {
4823 if (RelFileLocatorBackendIsTemp(smgr_reln[i]->smgr_rlocator))
4824 {
4825 if (smgr_reln[i]->smgr_rlocator.backend == MyProcNumber)
4826 DropRelationAllLocalBuffers(smgr_reln[i]->smgr_rlocator.locator);
4827 }
4828 else
4829 rels[n++] = smgr_reln[i];
4830 }
4831
4832 /*
4833 * If there are no non-local relations, then we're done. Release the
4834 * memory and return.
4835 */
4836 if (n == 0)
4837 {
4838 pfree(rels);
4839 return;
4840 }
4841
4842 /*
4843 * This is used to remember the number of blocks for all the relations
4844 * forks.
4845 */
4846 block = (BlockNumber (*)[MAX_FORKNUM + 1])
4847 palloc(sizeof(BlockNumber) * n * (MAX_FORKNUM + 1));
4848
4849 /*
4850 * We can avoid scanning the entire buffer pool if we know the exact size
4851 * of each of the given relation forks. See DropRelationBuffers.
4852 */
4853 for (i = 0; i < n && cached; i++)
4854 {
4855 for (int j = 0; j <= MAX_FORKNUM; j++)
4856 {
4857 /* Get the number of blocks for a relation's fork. */
4858 block[i][j] = smgrnblocks_cached(rels[i], j);
4859
4860 /* We need to only consider the relation forks that exists. */
4861 if (block[i][j] == InvalidBlockNumber)
4862 {
4863 if (!smgrexists(rels[i], j))
4864 continue;
4865 cached = false;
4866 break;
4867 }
4868
4869 /* calculate the total number of blocks to be invalidated */
4870 nBlocksToInvalidate += block[i][j];
4871 }
4872 }
4873
4874 /*
4875 * We apply the optimization iff the total number of blocks to invalidate
4876 * is below the BUF_DROP_FULL_SCAN_THRESHOLD.
4877 */
4879 {
4880 for (i = 0; i < n; i++)
4881 {
4882 for (int j = 0; j <= MAX_FORKNUM; j++)
4883 {
4884 /* ignore relation forks that don't exist */
4885 if (!BlockNumberIsValid(block[i][j]))
4886 continue;
4887
4888 /* drop all the buffers for a particular relation fork */
4889 FindAndDropRelationBuffers(rels[i]->smgr_rlocator.locator,
4890 j, block[i][j], 0);
4891 }
4892 }
4893
4894 pfree(block);
4895 pfree(rels);
4896 return;
4897 }
4898
4899 pfree(block);
4900 locators = palloc_array(RelFileLocator, n); /* non-local relations */
4901 for (i = 0; i < n; i++)
4902 locators[i] = rels[i]->smgr_rlocator.locator;
4903
4904 /*
4905 * For low number of relations to drop just use a simple walk through, to
4906 * save the bsearch overhead. The threshold to use is rather a guess than
4907 * an exactly determined value, as it depends on many factors (CPU and RAM
4908 * speeds, amount of shared buffers etc.).
4909 */
4911
4912 /* sort the list of rlocators if necessary */
4913 if (use_bsearch)
4915
4916 for (i = 0; i < NBuffers; i++)
4917 {
4918 RelFileLocator *rlocator = NULL;
4920
4921 /*
4922 * As in DropRelationBuffers, an unlocked precheck should be safe and
4923 * saves some cycles.
4924 */
4925
4926 if (!use_bsearch)
4927 {
4928 int j;
4929
4930 for (j = 0; j < n; j++)
4931 {
4933 {
4934 rlocator = &locators[j];
4935 break;
4936 }
4937 }
4938 }
4939 else
4940 {
4941 RelFileLocator locator;
4942
4943 locator = BufTagGetRelFileLocator(&bufHdr->tag);
4944 rlocator = bsearch(&locator,
4945 locators, n, sizeof(RelFileLocator),
4947 }
4948
4949 /* buffer doesn't belong to any of the given relfilelocators; skip it */
4950 if (rlocator == NULL)
4951 continue;
4952
4954 if (BufTagMatchesRelFileLocator(&bufHdr->tag, rlocator))
4955 InvalidateBuffer(bufHdr); /* releases spinlock */
4956 else
4958 }
4959
4960 pfree(locators);
4961 pfree(rels);
4962}
#define RELS_BSEARCH_THRESHOLD
Definition bufmgr.c:87
#define palloc_array(type, count)
Definition fe_memutils.h:76
void DropRelationAllLocalBuffers(RelFileLocator rlocator)
Definition localbuf.c:702
#define qsort(a, b, c, d)
Definition port.h:495

References BlockNumberIsValid(), BUF_DROP_FULL_SCAN_THRESHOLD, BufTagGetRelFileLocator(), BufTagMatchesRelFileLocator(), DropRelationAllLocalBuffers(), fb(), FindAndDropRelationBuffers(), GetBufferDescriptor(), i, InvalidateBuffer(), InvalidBlockNumber, j, LockBufHdr(), MAX_FORKNUM, MyProcNumber, NBuffers, palloc(), palloc_array, pfree(), qsort, RelFileLocatorBackendIsTemp, RELS_BSEARCH_THRESHOLD, rlocator_comparator(), smgrexists(), smgrnblocks_cached(), and UnlockBufHdr().

Referenced by smgrdounlinkall().

◆ EvictAllUnpinnedBuffers()

void EvictAllUnpinnedBuffers ( int32 buffers_evicted,
int32 buffers_flushed,
int32 buffers_skipped 
)

Definition at line 7765 of file bufmgr.c.

7767{
7768 *buffers_evicted = 0;
7769 *buffers_skipped = 0;
7770 *buffers_flushed = 0;
7771
7772 for (int buf = 1; buf <= NBuffers; buf++)
7773 {
7774 BufferDesc *desc = GetBufferDescriptor(buf - 1);
7776 bool buffer_flushed;
7777
7779
7781 if (!(buf_state & BM_VALID))
7782 continue;
7783
7786
7787 LockBufHdr(desc);
7788
7790 (*buffers_evicted)++;
7791 else
7792 (*buffers_skipped)++;
7793
7794 if (buffer_flushed)
7795 (*buffers_flushed)++;
7796 }
7797}
static bool EvictUnpinnedBufferInternal(BufferDesc *desc, bool *buffer_flushed)
Definition bufmgr.c:7674
#define CHECK_FOR_INTERRUPTS()
Definition miscadmin.h:123
pg_atomic_uint64 state

References BM_VALID, buf, CHECK_FOR_INTERRUPTS, CurrentResourceOwner, EvictUnpinnedBufferInternal(), fb(), GetBufferDescriptor(), LockBufHdr(), NBuffers, pg_atomic_read_u64(), ReservePrivateRefCountEntry(), ResourceOwnerEnlarge(), and BufferDesc::state.

Referenced by pg_buffercache_evict_all().

◆ EvictRelUnpinnedBuffers()

void EvictRelUnpinnedBuffers ( Relation  rel,
int32 buffers_evicted,
int32 buffers_flushed,
int32 buffers_skipped 
)

Definition at line 7815 of file bufmgr.c.

7817{
7819
7820 *buffers_skipped = 0;
7821 *buffers_evicted = 0;
7822 *buffers_flushed = 0;
7823
7824 for (int buf = 1; buf <= NBuffers; buf++)
7825 {
7826 BufferDesc *desc = GetBufferDescriptor(buf - 1);
7828 bool buffer_flushed;
7829
7831
7832 /* An unlocked precheck should be safe and saves some cycles. */
7833 if ((buf_state & BM_VALID) == 0 ||
7835 continue;
7836
7837 /* Make sure we can pin the buffer. */
7840
7841 buf_state = LockBufHdr(desc);
7842
7843 /* recheck, could have changed without the lock */
7844 if ((buf_state & BM_VALID) == 0 ||
7846 {
7847 UnlockBufHdr(desc);
7848 continue;
7849 }
7850
7852 (*buffers_evicted)++;
7853 else
7854 (*buffers_skipped)++;
7855
7856 if (buffer_flushed)
7857 (*buffers_flushed)++;
7858 }
7859}
#define RelationUsesLocalBuffers(relation)
Definition rel.h:646
RelFileLocator rd_locator
Definition rel.h:57

References Assert, BM_VALID, buf, BufTagMatchesRelFileLocator(), CHECK_FOR_INTERRUPTS, CurrentResourceOwner, EvictUnpinnedBufferInternal(), fb(), GetBufferDescriptor(), LockBufHdr(), NBuffers, pg_atomic_read_u64(), RelationData::rd_locator, RelationUsesLocalBuffers, ReservePrivateRefCountEntry(), ResourceOwnerEnlarge(), BufferDesc::state, BufferDesc::tag, and UnlockBufHdr().

Referenced by pg_buffercache_evict_relation().

◆ EvictUnpinnedBuffer()

bool EvictUnpinnedBuffer ( Buffer  buf,
bool buffer_flushed 
)

◆ EvictUnpinnedBufferInternal()

static bool EvictUnpinnedBufferInternal ( BufferDesc desc,
bool buffer_flushed 
)
static

Definition at line 7674 of file bufmgr.c.

7675{
7677 bool result;
7678
7679 *buffer_flushed = false;
7680
7683
7684 if ((buf_state & BM_VALID) == 0)
7685 {
7686 UnlockBufHdr(desc);
7687 return false;
7688 }
7689
7690 /* Check that it's not pinned already. */
7692 {
7693 UnlockBufHdr(desc);
7694 return false;
7695 }
7696
7697 PinBuffer_Locked(desc); /* releases spinlock */
7698
7699 /* If it was dirty, try to clean it once. */
7700 if (buf_state & BM_DIRTY)
7701 {
7703 *buffer_flushed = true;
7704 }
7705
7706 /* This will return false if it becomes dirty or someone else pins it. */
7707 result = InvalidateVictimBuffer(desc);
7708
7709 UnpinBuffer(desc);
7710
7711 return result;
7712}
#define BM_LOCKED
static void FlushUnlockedBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object, IOContext io_context)
Definition bufmgr.c:4545
static void PinBuffer_Locked(BufferDesc *buf)
Definition bufmgr.c:3299
static bool InvalidateVictimBuffer(BufferDesc *buf_hdr)
Definition bufmgr.c:2384

References Assert, BM_DIRTY, BM_LOCKED, BM_VALID, BUF_STATE_GET_REFCOUNT, fb(), FlushUnlockedBuffer(), InvalidateVictimBuffer(), IOCONTEXT_NORMAL, IOOBJECT_RELATION, pg_atomic_read_u64(), PinBuffer_Locked(), BufferDesc::state, UnlockBufHdr(), and UnpinBuffer().

Referenced by EvictAllUnpinnedBuffers(), EvictRelUnpinnedBuffers(), and EvictUnpinnedBuffer().

◆ ExtendBufferedRel()

Buffer ExtendBufferedRel ( BufferManagerRelation  bmr,
ForkNumber  forkNum,
BufferAccessStrategy  strategy,
uint32  flags 
)

Definition at line 974 of file bufmgr.c.

978{
979 Buffer buf;
980 uint32 extend_by = 1;
981
982 ExtendBufferedRelBy(bmr, forkNum, strategy, flags, extend_by,
983 &buf, &extend_by);
984
985 return buf;
986}
BlockNumber ExtendBufferedRelBy(BufferManagerRelation bmr, ForkNumber fork, BufferAccessStrategy strategy, uint32 flags, uint32 extend_by, Buffer *buffers, uint32 *extended_by)
Definition bufmgr.c:1006

References buf, ExtendBufferedRelBy(), and fb().

Referenced by _bt_allocbuf(), _hash_getnewbuf(), BloomNewBuffer(), brinbuild(), brinbuildempty(), fill_seq_fork_with_data(), ginbuildempty(), GinNewBuffer(), gistbuildempty(), gistNewBuffer(), ReadBuffer_common(), revmap_physical_extend(), and SpGistNewBuffer().

◆ ExtendBufferedRelBy()

BlockNumber ExtendBufferedRelBy ( BufferManagerRelation  bmr,
ForkNumber  fork,
BufferAccessStrategy  strategy,
uint32  flags,
uint32  extend_by,
Buffer buffers,
uint32 extended_by 
)

Definition at line 1006 of file bufmgr.c.

1013{
1014 Assert((bmr.rel != NULL) != (bmr.smgr != NULL));
1015 Assert(bmr.smgr == NULL || bmr.relpersistence != '\0');
1016 Assert(extend_by > 0);
1017
1018 if (bmr.relpersistence == '\0')
1019 bmr.relpersistence = bmr.rel->rd_rel->relpersistence;
1020
1021 return ExtendBufferedRelCommon(bmr, fork, strategy, flags,
1023 buffers, extended_by);
1024}
static BlockNumber ExtendBufferedRelCommon(BufferManagerRelation bmr, ForkNumber fork, BufferAccessStrategy strategy, uint32 flags, uint32 extend_by, BlockNumber extend_upto, Buffer *buffers, uint32 *extended_by)
Definition bufmgr.c:2665

References Assert, ExtendBufferedRelCommon(), fb(), and InvalidBlockNumber.

Referenced by ExtendBufferedRel(), grow_rel(), and RelationAddBlocks().

◆ ExtendBufferedRelCommon()

static BlockNumber ExtendBufferedRelCommon ( BufferManagerRelation  bmr,
ForkNumber  fork,
BufferAccessStrategy  strategy,
uint32  flags,
uint32  extend_by,
BlockNumber  extend_upto,
Buffer buffers,
uint32 extended_by 
)
static

Definition at line 2665 of file bufmgr.c.

2673{
2675
2677 BMR_GET_SMGR(bmr)->smgr_rlocator.locator.spcOid,
2678 BMR_GET_SMGR(bmr)->smgr_rlocator.locator.dbOid,
2679 BMR_GET_SMGR(bmr)->smgr_rlocator.locator.relNumber,
2680 BMR_GET_SMGR(bmr)->smgr_rlocator.backend,
2681 extend_by);
2682
2683 if (bmr.relpersistence == RELPERSISTENCE_TEMP)
2686 buffers, &extend_by);
2687 else
2688 first_block = ExtendBufferedRelShared(bmr, fork, strategy, flags,
2690 buffers, &extend_by);
2692
2694 BMR_GET_SMGR(bmr)->smgr_rlocator.locator.spcOid,
2695 BMR_GET_SMGR(bmr)->smgr_rlocator.locator.dbOid,
2696 BMR_GET_SMGR(bmr)->smgr_rlocator.locator.relNumber,
2697 BMR_GET_SMGR(bmr)->smgr_rlocator.backend,
2698 *extended_by,
2699 first_block);
2700
2701 return first_block;
2702}
static BlockNumber ExtendBufferedRelShared(BufferManagerRelation bmr, ForkNumber fork, BufferAccessStrategy strategy, uint32 flags, uint32 extend_by, BlockNumber extend_upto, Buffer *buffers, uint32 *extended_by)
Definition bufmgr.c:2709
#define BMR_GET_SMGR(bmr)
Definition bufmgr.h:118
BlockNumber ExtendBufferedRelLocal(BufferManagerRelation bmr, ForkNumber fork, uint32 flags, uint32 extend_by, BlockNumber extend_upto, Buffer *buffers, uint32 *extended_by)
Definition localbuf.c:346

References BMR_GET_SMGR, ExtendBufferedRelLocal(), ExtendBufferedRelShared(), and fb().

Referenced by ExtendBufferedRelBy(), and ExtendBufferedRelTo().

◆ ExtendBufferedRelShared()

static BlockNumber ExtendBufferedRelShared ( BufferManagerRelation  bmr,
ForkNumber  fork,
BufferAccessStrategy  strategy,
uint32  flags,
uint32  extend_by,
BlockNumber  extend_upto,
Buffer buffers,
uint32 extended_by 
)
static

Definition at line 2709 of file bufmgr.c.

2717{
2721
2723
2724 /*
2725 * Acquire victim buffers for extension without holding extension lock.
2726 * Writing out victim buffers is the most expensive part of extending the
2727 * relation, particularly when doing so requires WAL flushes. Zeroing out
2728 * the buffers is also quite expensive, so do that before holding the
2729 * extension lock as well.
2730 *
2731 * These pages are pinned by us and not valid. While we hold the pin they
2732 * can't be acquired as victim buffers by another backend.
2733 */
2734 for (uint32 i = 0; i < extend_by; i++)
2735 {
2737
2738 buffers[i] = GetVictimBuffer(strategy, io_context);
2740
2741 /* new buffers are zero-filled */
2742 MemSet(buf_block, 0, BLCKSZ);
2743 }
2744
2745 /*
2746 * Lock relation against concurrent extensions, unless requested not to.
2747 *
2748 * We use the same extension lock for all forks. That's unnecessarily
2749 * restrictive, but currently extensions for forks don't happen often
2750 * enough to make it worth locking more granularly.
2751 *
2752 * Note that another backend might have extended the relation by the time
2753 * we get the lock.
2754 */
2755 if (!(flags & EB_SKIP_EXTENSION_LOCK))
2757
2758 /*
2759 * If requested, invalidate size cache, so that smgrnblocks asks the
2760 * kernel.
2761 */
2762 if (flags & EB_CLEAR_SIZE_CACHE)
2763 BMR_GET_SMGR(bmr)->smgr_cached_nblocks[fork] = InvalidBlockNumber;
2764
2766
2767 /*
2768 * Now that we have the accurate relation size, check if the caller wants
2769 * us to extend only up to a specific size. If there were concurrent
2770 * extensions, we might have acquired too many buffers and need to release
2771 * them.
2772 */
2774 {
2776
2778 extend_by = 0;
2779 else if ((uint64) first_block + extend_by > extend_upto)
2781
2782 for (uint32 i = extend_by; i < orig_extend_by; i++)
2783 {
2784 BufferDesc *buf_hdr = GetBufferDescriptor(buffers[i] - 1);
2785
2787 }
2788
2789 if (extend_by == 0)
2790 {
2791 if (!(flags & EB_SKIP_EXTENSION_LOCK))
2794 return first_block;
2795 }
2796 }
2797
2798 /* Fail if relation is already at maximum possible length */
2800 ereport(ERROR,
2802 errmsg("cannot extend relation %s beyond %u blocks",
2803 relpath(BMR_GET_SMGR(bmr)->smgr_rlocator, fork).str,
2804 MaxBlockNumber)));
2805
2806 /*
2807 * Insert buffers into buffer table, mark as IO_IN_PROGRESS.
2808 *
2809 * This needs to happen before we extend the relation, because as soon as
2810 * we do, other backends can start to read in those pages.
2811 */
2812 for (uint32 i = 0; i < extend_by; i++)
2813 {
2814 Buffer victim_buf = buffers[i];
2816 BufferTag tag;
2817 uint32 hash;
2819 int existing_id;
2820
2821 /* in case we need to pin an existing buffer below */
2824
2825 InitBufferTag(&tag, &BMR_GET_SMGR(bmr)->smgr_rlocator.locator, fork,
2826 first_block + i);
2827 hash = BufTableHashCode(&tag);
2829
2831
2833
2834 /*
2835 * We get here only in the corner case where we are trying to extend
2836 * the relation but we found a pre-existing buffer. This can happen
2837 * because a prior attempt at extending the relation failed, and
2838 * because mdread doesn't complain about reads beyond EOF (when
2839 * zero_damaged_pages is ON) and so a previous attempt to read a block
2840 * beyond EOF could have left a "valid" zero-filled buffer.
2841 *
2842 * This has also been observed when relation was overwritten by
2843 * external process. Since the legitimate cases should always have
2844 * left a zero-filled buffer, complain if not PageIsNew.
2845 */
2846 if (existing_id >= 0)
2847 {
2850 bool valid;
2851
2852 /*
2853 * Pin the existing buffer before releasing the partition lock,
2854 * preventing it from being evicted.
2855 */
2856 valid = PinBuffer(existing_hdr, strategy, false);
2857
2860
2863
2864 if (valid && !PageIsNew((Page) buf_block))
2865 ereport(ERROR,
2866 (errmsg("unexpected data beyond EOF in block %u of relation \"%s\"",
2867 existing_hdr->tag.blockNum,
2868 relpath(BMR_GET_SMGR(bmr)->smgr_rlocator, fork).str)));
2869
2870 /*
2871 * We *must* do smgr[zero]extend before succeeding, else the page
2872 * will not be reserved by the kernel, and the next P_NEW call
2873 * will decide to return the same page. Clear the BM_VALID bit,
2874 * do StartBufferIO() and proceed.
2875 *
2876 * Loop to handle the very small possibility that someone re-sets
2877 * BM_VALID between our clearing it and StartBufferIO inspecting
2878 * it.
2879 */
2880 do
2881 {
2883 } while (!StartBufferIO(existing_hdr, true, false));
2884 }
2885 else
2886 {
2888 uint64 set_bits = 0;
2889
2891
2892 /* some sanity checks while we hold the buffer header lock */
2895
2896 victim_buf_hdr->tag = tag;
2897
2899 if (bmr.relpersistence == RELPERSISTENCE_PERMANENT || fork == INIT_FORKNUM)
2901
2903 set_bits, 0,
2904 0);
2905
2907
2908 /* XXX: could combine the locked operations in it with the above */
2909 StartBufferIO(victim_buf_hdr, true, false);
2910 }
2911 }
2912
2914
2915 /*
2916 * Note: if smgrzeroextend fails, we will end up with buffers that are
2917 * allocated but not marked BM_VALID. The next relation extension will
2918 * still select the same block number (because the relation didn't get any
2919 * longer on disk) and so future attempts to extend the relation will find
2920 * the same buffers (if they have not been recycled) but come right back
2921 * here to try smgrzeroextend again.
2922 *
2923 * We don't need to set checksum for all-zero pages.
2924 */
2926
2927 /*
2928 * Release the file-extension lock; it's now OK for someone else to extend
2929 * the relation some more.
2930 *
2931 * We remove IO_IN_PROGRESS after this, as waking up waiting backends can
2932 * take noticeable time.
2933 */
2934 if (!(flags & EB_SKIP_EXTENSION_LOCK))
2936
2938 io_start, 1, extend_by * BLCKSZ);
2939
2940 /* Set BM_VALID, terminate IO, and wake up any waiters */
2941 for (uint32 i = 0; i < extend_by; i++)
2942 {
2943 Buffer buf = buffers[i];
2945 bool lock = false;
2946
2947 if (flags & EB_LOCK_FIRST && i == 0)
2948 lock = true;
2949 else if (flags & EB_LOCK_TARGET)
2950 {
2952 if (first_block + i + 1 == extend_upto)
2953 lock = true;
2954 }
2955
2956 if (lock)
2958
2959 TerminateBufferIO(buf_hdr, false, BM_VALID, true, false);
2960 }
2961
2963
2965
2966 return first_block;
2967}
#define MaxBlockNumber
Definition block.h:35
#define BufHdrGetBlock(bufHdr)
Definition bufmgr.c:76
void LimitAdditionalPins(uint32 *additional_pins)
Definition bufmgr.c:2647
bool StartBufferIO(BufferDesc *buf, bool forInput, bool nowait)
Definition bufmgr.c:7085
void * Block
Definition bufmgr.h:26
@ EB_LOCK_TARGET
Definition bufmgr.h:93
@ EB_CLEAR_SIZE_CACHE
Definition bufmgr.h:90
@ EB_SKIP_EXTENSION_LOCK
Definition bufmgr.h:75
@ EB_LOCK_FIRST
Definition bufmgr.h:87
static bool PageIsNew(const PageData *page)
Definition bufpage.h:259
#define MemSet(start, val, len)
Definition c.h:1109
void LockRelationForExtension(Relation relation, LOCKMODE lockmode)
Definition lmgr.c:424
void UnlockRelationForExtension(Relation relation, LOCKMODE lockmode)
Definition lmgr.c:474
#define ExclusiveLock
Definition lockdefs.h:42
@ IOOP_EXTEND
Definition pgstat.h:318
static unsigned hash(unsigned *uv, int n)
Definition rege_dfa.c:715
#define relpath(rlocator, forknum)
Definition relpath.h:150
BlockNumber smgrnblocks(SMgrRelation reln, ForkNumber forknum)
Definition smgr.c:819
void smgrzeroextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, int nblocks, bool skipFsync)
Definition smgr.c:649
int64 shared_blks_written
Definition instrument.h:29

References Assert, BM_DIRTY, BM_PERMANENT, BM_TAG_VALID, BM_VALID, BMR_GET_SMGR, buf, BUF_STATE_GET_REFCOUNT, BUF_USAGECOUNT_ONE, BUFFER_LOCK_EXCLUSIVE, BufferDescriptorGetBuffer(), BufHdrGetBlock, BufMappingPartitionLock(), BufTableHashCode(), BufTableInsert(), CurrentResourceOwner, EB_CLEAR_SIZE_CACHE, EB_LOCK_FIRST, EB_LOCK_TARGET, EB_SKIP_EXTENSION_LOCK, ereport, errcode(), errmsg, ERROR, ExclusiveLock, fb(), GetBufferDescriptor(), GetVictimBuffer(), hash(), i, INIT_FORKNUM, InitBufferTag(), InvalidBlockNumber, IOContextForStrategy(), IOOBJECT_RELATION, IOOP_EXTEND, LimitAdditionalPins(), LockBuffer(), LockBufHdr(), LockRelationForExtension(), LW_EXCLUSIVE, LWLockAcquire(), LWLockRelease(), MaxBlockNumber, MemSet, PageIsNew(), pg_atomic_fetch_and_u64(), pgBufferUsage, pgstat_count_io_op_time(), pgstat_prepare_io_time(), PinBuffer(), relpath, ReservePrivateRefCountEntry(), ResourceOwnerEnlarge(), BufferUsage::shared_blks_written, smgrnblocks(), smgrzeroextend(), StartBufferIO(), str, TerminateBufferIO(), track_io_timing, UnlockBufHdrExt(), UnlockRelationForExtension(), and UnpinBuffer().

Referenced by ExtendBufferedRelCommon().

◆ ExtendBufferedRelTo()

Buffer ExtendBufferedRelTo ( BufferManagerRelation  bmr,
ForkNumber  fork,
BufferAccessStrategy  strategy,
uint32  flags,
BlockNumber  extend_to,
ReadBufferMode  mode 
)

Definition at line 1035 of file bufmgr.c.

1041{
1043 uint32 extended_by = 0;
1044 Buffer buffer = InvalidBuffer;
1045 Buffer buffers[64];
1046
1047 Assert((bmr.rel != NULL) != (bmr.smgr != NULL));
1048 Assert(bmr.smgr == NULL || bmr.relpersistence != '\0');
1050
1051 if (bmr.relpersistence == '\0')
1052 bmr.relpersistence = bmr.rel->rd_rel->relpersistence;
1053
1054 /*
1055 * If desired, create the file if it doesn't exist. If
1056 * smgr_cached_nblocks[fork] is positive then it must exist, no need for
1057 * an smgrexists call.
1058 */
1059 if ((flags & EB_CREATE_FORK_IF_NEEDED) &&
1060 (BMR_GET_SMGR(bmr)->smgr_cached_nblocks[fork] == 0 ||
1061 BMR_GET_SMGR(bmr)->smgr_cached_nblocks[fork] == InvalidBlockNumber) &&
1063 {
1065
1066 /* recheck, fork might have been created concurrently */
1069
1071 }
1072
1073 /*
1074 * If requested, invalidate size cache, so that smgrnblocks asks the
1075 * kernel.
1076 */
1077 if (flags & EB_CLEAR_SIZE_CACHE)
1078 BMR_GET_SMGR(bmr)->smgr_cached_nblocks[fork] = InvalidBlockNumber;
1079
1080 /*
1081 * Estimate how many pages we'll need to extend by. This avoids acquiring
1082 * unnecessarily many victim buffers.
1083 */
1085
1086 /*
1087 * Since no-one else can be looking at the page contents yet, there is no
1088 * difference between an exclusive lock and a cleanup-strength lock. Note
1089 * that we pass the original mode to ReadBuffer_common() below, when
1090 * falling back to reading the buffer to a concurrent relation extension.
1091 */
1093 flags |= EB_LOCK_TARGET;
1094
1095 while (current_size < extend_to)
1096 {
1097 uint32 num_pages = lengthof(buffers);
1099
1100 if ((uint64) current_size + num_pages > extend_to)
1101 num_pages = extend_to - current_size;
1102
1103 first_block = ExtendBufferedRelCommon(bmr, fork, strategy, flags,
1104 num_pages, extend_to,
1105 buffers, &extended_by);
1106
1108 Assert(num_pages != 0 || current_size >= extend_to);
1109
1110 for (uint32 i = 0; i < extended_by; i++)
1111 {
1112 if (first_block + i != extend_to - 1)
1113 ReleaseBuffer(buffers[i]);
1114 else
1115 buffer = buffers[i];
1116 }
1117 }
1118
1119 /*
1120 * It's possible that another backend concurrently extended the relation.
1121 * In that case read the buffer.
1122 *
1123 * XXX: Should we control this via a flag?
1124 */
1125 if (buffer == InvalidBuffer)
1126 {
1127 Assert(extended_by == 0);
1128 buffer = ReadBuffer_common(bmr.rel, BMR_GET_SMGR(bmr), bmr.relpersistence,
1129 fork, extend_to - 1, mode, strategy);
1130 }
1131
1132 return buffer;
1133}
static Buffer ReadBuffer_common(Relation rel, SMgrRelation smgr, char smgr_persistence, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy)
Definition bufmgr.c:1303
void ReleaseBuffer(Buffer buffer)
Definition bufmgr.c:5505
@ EB_PERFORMING_RECOVERY
Definition bufmgr.h:78
@ EB_CREATE_FORK_IF_NEEDED
Definition bufmgr.h:84
@ RBM_ZERO_AND_CLEANUP_LOCK
Definition bufmgr.h:49
@ RBM_ZERO_AND_LOCK
Definition bufmgr.h:47
#define lengthof(array)
Definition c.h:875
static int64 current_size

References Assert, BMR_GET_SMGR, PrivateRefCountEntry::buffer, current_size, EB_CLEAR_SIZE_CACHE, EB_CREATE_FORK_IF_NEEDED, EB_LOCK_TARGET, EB_PERFORMING_RECOVERY, ExclusiveLock, ExtendBufferedRelCommon(), fb(), i, InvalidBlockNumber, InvalidBuffer, lengthof, LockRelationForExtension(), mode, RBM_ZERO_AND_CLEANUP_LOCK, RBM_ZERO_AND_LOCK, ReadBuffer_common(), ReleaseBuffer(), smgrcreate(), smgrexists(), smgrnblocks(), and UnlockRelationForExtension().

Referenced by fsm_extend(), vm_extend(), and XLogReadBufferExtended().

◆ FindAndDropRelationBuffers()

static void FindAndDropRelationBuffers ( RelFileLocator  rlocator,
ForkNumber  forkNum,
BlockNumber  nForkBlock,
BlockNumber  firstDelBlock 
)
static

Definition at line 4974 of file bufmgr.c.

4977{
4978 BlockNumber curBlock;
4979
4980 for (curBlock = firstDelBlock; curBlock < nForkBlock; curBlock++)
4981 {
4982 uint32 bufHash; /* hash value for tag */
4983 BufferTag bufTag; /* identity of requested block */
4984 LWLock *bufPartitionLock; /* buffer partition lock for it */
4985 int buf_id;
4987
4988 /* create a tag so we can lookup the buffer */
4989 InitBufferTag(&bufTag, &rlocator, forkNum, curBlock);
4990
4991 /* determine its hash code and partition lock ID */
4994
4995 /* Check that it is in the buffer pool. If not, do nothing. */
4997 buf_id = BufTableLookup(&bufTag, bufHash);
4999
5000 if (buf_id < 0)
5001 continue;
5002
5003 bufHdr = GetBufferDescriptor(buf_id);
5004
5005 /*
5006 * We need to lock the buffer header and recheck if the buffer is
5007 * still associated with the same block because the buffer could be
5008 * evicted by some other backend loading blocks for a different
5009 * relation after we release lock on the BufMapping table.
5010 */
5012
5013 if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator) &&
5014 BufTagGetForkNum(&bufHdr->tag) == forkNum &&
5015 bufHdr->tag.blockNum >= firstDelBlock)
5016 InvalidateBuffer(bufHdr); /* releases spinlock */
5017 else
5019 }
5020}

References BufMappingPartitionLock(), BufTableHashCode(), BufTableLookup(), BufTagGetForkNum(), BufTagMatchesRelFileLocator(), fb(), GetBufferDescriptor(), InitBufferTag(), InvalidateBuffer(), LockBufHdr(), LW_SHARED, LWLockAcquire(), LWLockRelease(), and UnlockBufHdr().

Referenced by DropRelationBuffers(), and DropRelationsAllBuffers().

◆ FlushBuffer()

static void FlushBuffer ( BufferDesc buf,
SMgrRelation  reln,
IOObject  io_object,
IOContext  io_context 
)
static

Definition at line 4414 of file bufmgr.c.

4416{
4418 ErrorContextCallback errcallback;
4421 char *bufToWrite;
4422
4425
4426 /*
4427 * Try to start an I/O operation. If StartBufferIO returns false, then
4428 * someone else flushed the buffer before we could, so we need not do
4429 * anything.
4430 */
4431 if (!StartBufferIO(buf, false, false))
4432 return;
4433
4434 /* Setup error traceback support for ereport() */
4436 errcallback.arg = buf;
4437 errcallback.previous = error_context_stack;
4438 error_context_stack = &errcallback;
4439
4440 /* Find smgr relation for buffer */
4441 if (reln == NULL)
4443
4445 buf->tag.blockNum,
4446 reln->smgr_rlocator.locator.spcOid,
4447 reln->smgr_rlocator.locator.dbOid,
4448 reln->smgr_rlocator.locator.relNumber);
4449
4450 /*
4451 * As we hold at least a share-exclusive lock on the buffer, the LSN
4452 * cannot change during the flush (and thus can't be torn).
4453 */
4455
4456 /*
4457 * Force XLOG flush up to buffer's LSN. This implements the basic WAL
4458 * rule that log updates must hit disk before any of the data-file changes
4459 * they describe do.
4460 *
4461 * However, this rule does not apply to unlogged relations, which will be
4462 * lost after a crash anyway. Most unlogged relation pages do not bear
4463 * LSNs since we never emit WAL records for them, and therefore flushing
4464 * up through the buffer LSN would be useless, but harmless. However,
4465 * some index AMs use LSNs internally to detect concurrent page
4466 * modifications, and therefore unlogged index pages bear "fake" LSNs
4467 * generated by XLogGetFakeLSN. It is unlikely but possible that the fake
4468 * LSN counter could advance past the WAL insertion point; and if it did
4469 * happen, attempting to flush WAL through that location would fail, with
4470 * disastrous system-wide consequences. To make sure that can't happen,
4471 * skip the flush if the buffer isn't permanent.
4472 */
4473 if (pg_atomic_read_u64(&buf->state) & BM_PERMANENT)
4475
4476 /*
4477 * Now it's safe to write the buffer to disk. Note that no one else should
4478 * have been able to write it, while we were busy with log flushing,
4479 * because we got the exclusive right to perform I/O by setting the
4480 * BM_IO_IN_PROGRESS bit.
4481 */
4483
4484 /*
4485 * Update page checksum if desired. Since we have only shared lock on the
4486 * buffer, other processes might be updating hint bits in it, so we must
4487 * copy the page to private storage if we do checksumming.
4488 */
4489 bufToWrite = PageSetChecksumCopy((Page) bufBlock, buf->tag.blockNum);
4490
4492
4493 /*
4494 * bufToWrite is either the shared buffer or a copy, as appropriate.
4495 */
4497 BufTagGetForkNum(&buf->tag),
4498 buf->tag.blockNum,
4499 bufToWrite,
4500 false);
4501
4502 /*
4503 * When a strategy is in use, only flushes of dirty buffers already in the
4504 * strategy ring are counted as strategy writes (IOCONTEXT
4505 * [BULKREAD|BULKWRITE|VACUUM] IOOP_WRITE) for the purpose of IO
4506 * statistics tracking.
4507 *
4508 * If a shared buffer initially added to the ring must be flushed before
4509 * being used, this is counted as an IOCONTEXT_NORMAL IOOP_WRITE.
4510 *
4511 * If a shared buffer which was added to the ring later because the
4512 * current strategy buffer is pinned or in use or because all strategy
4513 * buffers were dirty and rejected (for BAS_BULKREAD operations only)
4514 * requires flushing, this is counted as an IOCONTEXT_NORMAL IOOP_WRITE
4515 * (from_ring will be false).
4516 *
4517 * When a strategy is not in use, the write can only be a "regular" write
4518 * of a dirty shared buffer (IOCONTEXT_NORMAL IOOP_WRITE).
4519 */
4522
4524
4525 /*
4526 * Mark the buffer as clean and end the BM_IO_IN_PROGRESS state.
4527 */
4528 TerminateBufferIO(buf, true, 0, true, false);
4529
4531 buf->tag.blockNum,
4532 reln->smgr_rlocator.locator.spcOid,
4533 reln->smgr_rlocator.locator.dbOid,
4534 reln->smgr_rlocator.locator.relNumber);
4535
4536 /* Pop the error context stack */
4537 error_context_stack = errcallback.previous;
4538}
#define BufferGetLSN(bufHdr)
Definition bufmgr.c:77
static void shared_buffer_write_error_callback(void *arg)
Definition bufmgr.c:7242
char * PageSetChecksumCopy(Page page, BlockNumber blkno)
Definition bufpage.c:1509
ErrorContextCallback * error_context_stack
Definition elog.c:99
@ IOOP_WRITE
Definition pgstat.h:320
static void smgrwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const void *buffer, bool skipFsync)
Definition smgr.h:131
struct ErrorContextCallback * previous
Definition elog.h:297
void(* callback)(void *arg)
Definition elog.h:298
void XLogFlush(XLogRecPtr record)
Definition xlog.c:2767

References ErrorContextCallback::arg, Assert, BM_PERMANENT, buf, BUFFER_LOCK_EXCLUSIVE, BUFFER_LOCK_SHARE_EXCLUSIVE, BufferGetLSN, BufferLockHeldByMeInMode(), BufHdrGetBlock, BufTagGetForkNum(), BufTagGetRelFileLocator(), ErrorContextCallback::callback, error_context_stack, fb(), INVALID_PROC_NUMBER, IOOBJECT_RELATION, IOOP_WRITE, PageSetChecksumCopy(), pg_atomic_read_u64(), pgBufferUsage, pgstat_count_io_op_time(), pgstat_prepare_io_time(), ErrorContextCallback::previous, BufferUsage::shared_blks_written, shared_buffer_write_error_callback(), smgropen(), smgrwrite(), StartBufferIO(), TerminateBufferIO(), track_io_timing, and XLogFlush().

Referenced by FlushOneBuffer(), FlushUnlockedBuffer(), and GetVictimBuffer().

◆ FlushDatabaseBuffers()

void FlushDatabaseBuffers ( Oid  dbid)

Definition at line 5445 of file bufmgr.c.

5446{
5447 int i;
5449
5450 for (i = 0; i < NBuffers; i++)
5451 {
5453
5455
5456 /*
5457 * As in DropRelationBuffers, an unlocked precheck should be safe and
5458 * saves some cycles.
5459 */
5460 if (bufHdr->tag.dbOid != dbid)
5461 continue;
5462
5463 /* Make sure we can handle the pin */
5466
5468 if (bufHdr->tag.dbOid == dbid &&
5470 {
5474 }
5475 else
5477 }
5478}

References BM_DIRTY, BM_VALID, CurrentResourceOwner, fb(), FlushUnlockedBuffer(), GetBufferDescriptor(), i, IOCONTEXT_NORMAL, IOOBJECT_RELATION, LockBufHdr(), NBuffers, PinBuffer_Locked(), ReservePrivateRefCountEntry(), ResourceOwnerEnlarge(), UnlockBufHdr(), and UnpinBuffer().

Referenced by dbase_redo().

◆ FlushOneBuffer()

void FlushOneBuffer ( Buffer  buffer)

Definition at line 5485 of file bufmgr.c.

5486{
5488
5489 /* currently not needed, but no fundamental reason not to support */
5490 Assert(!BufferIsLocal(buffer));
5491
5492 Assert(BufferIsPinned(buffer));
5493
5494 bufHdr = GetBufferDescriptor(buffer - 1);
5495
5496 Assert(BufferIsLockedByMe(buffer));
5497
5499}
static void FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object, IOContext io_context)
Definition bufmgr.c:4414
bool BufferIsLockedByMe(Buffer buffer)
Definition bufmgr.c:2977

References Assert, PrivateRefCountEntry::buffer, BufferIsLocal, BufferIsLockedByMe(), BufferIsPinned, fb(), FlushBuffer(), GetBufferDescriptor(), IOCONTEXT_NORMAL, and IOOBJECT_RELATION.

Referenced by hash_xlog_init_bitmap_page(), hash_xlog_init_meta_page(), invalidate_rel_block(), and XLogReadBufferForRedoExtended().

◆ FlushRelationBuffers()

void FlushRelationBuffers ( Relation  rel)

Definition at line 5081 of file bufmgr.c.

5082{
5083 int i;
5085 SMgrRelation srel = RelationGetSmgr(rel);
5086
5087 if (RelationUsesLocalBuffers(rel))
5088 {
5089 for (i = 0; i < NLocBuffer; i++)
5090 {
5092
5094 if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rel->rd_locator) &&
5095 ((buf_state = pg_atomic_read_u64(&bufHdr->state)) &
5096 (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
5097 {
5098 ErrorContextCallback errcallback;
5099
5100 /* Setup error traceback support for ereport() */
5102 errcallback.arg = bufHdr;
5103 errcallback.previous = error_context_stack;
5104 error_context_stack = &errcallback;
5105
5106 /* Make sure we can handle the pin */
5109
5110 /*
5111 * Pin/unpin mostly to make valgrind work, but it also seems
5112 * like the right thing to do.
5113 */
5114 PinLocalBuffer(bufHdr, false);
5115
5116
5117 FlushLocalBuffer(bufHdr, srel);
5118
5120
5121 /* Pop the error context stack */
5122 error_context_stack = errcallback.previous;
5123 }
5124 }
5125
5126 return;
5127 }
5128
5129 for (i = 0; i < NBuffers; i++)
5130 {
5132
5134
5135 /*
5136 * As in DropRelationBuffers, an unlocked precheck should be safe and
5137 * saves some cycles.
5138 */
5140 continue;
5141
5142 /* Make sure we can handle the pin */
5145
5147 if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rel->rd_locator) &&
5149 {
5153 }
5154 else
5156 }
5157}
static void local_buffer_write_error_callback(void *arg)
Definition bufmgr.c:7258
void FlushLocalBuffer(BufferDesc *bufHdr, SMgrRelation reln)
Definition localbuf.c:183
void UnpinLocalBuffer(Buffer buffer)
Definition localbuf.c:841
bool PinLocalBuffer(BufferDesc *buf_hdr, bool adjust_usagecount)
Definition localbuf.c:805
int NLocBuffer
Definition localbuf.c:45
static SMgrRelation RelationGetSmgr(Relation rel)
Definition rel.h:576

References ErrorContextCallback::arg, BM_DIRTY, BM_VALID, BufferDescriptorGetBuffer(), BufTagMatchesRelFileLocator(), ErrorContextCallback::callback, CurrentResourceOwner, error_context_stack, fb(), FlushLocalBuffer(), FlushUnlockedBuffer(), GetBufferDescriptor(), GetLocalBufferDescriptor(), i, IOCONTEXT_NORMAL, IOOBJECT_RELATION, local_buffer_write_error_callback(), LockBufHdr(), NBuffers, NLocBuffer, pg_atomic_read_u64(), PinBuffer_Locked(), PinLocalBuffer(), ErrorContextCallback::previous, RelationData::rd_locator, RelationGetSmgr(), RelationUsesLocalBuffers, ReservePrivateRefCountEntry(), ResourceOwnerEnlarge(), UnlockBufHdr(), UnpinBuffer(), and UnpinLocalBuffer().

Referenced by fill_seq_with_data(), heapam_relation_copy_data(), and index_copy_data().

◆ FlushRelationsAllBuffers()

void FlushRelationsAllBuffers ( SMgrRelation smgrs,
int  nrels 
)

Definition at line 5169 of file bufmgr.c.

5170{
5171 int i;
5173 bool use_bsearch;
5174
5175 if (nrels == 0)
5176 return;
5177
5178 /* fill-in array for qsort */
5180
5181 for (i = 0; i < nrels; i++)
5182 {
5183 Assert(!RelFileLocatorBackendIsTemp(smgrs[i]->smgr_rlocator));
5184
5185 srels[i].rlocator = smgrs[i]->smgr_rlocator.locator;
5186 srels[i].srel = smgrs[i];
5187 }
5188
5189 /*
5190 * Save the bsearch overhead for low number of relations to sync. See
5191 * DropRelationsAllBuffers for details.
5192 */
5194
5195 /* sort the list of SMgrRelations if necessary */
5196 if (use_bsearch)
5197 qsort(srels, nrels, sizeof(SMgrSortArray), rlocator_comparator);
5198
5199 for (i = 0; i < NBuffers; i++)
5200 {
5204
5205 /*
5206 * As in DropRelationBuffers, an unlocked precheck should be safe and
5207 * saves some cycles.
5208 */
5209
5210 if (!use_bsearch)
5211 {
5212 int j;
5213
5214 for (j = 0; j < nrels; j++)
5215 {
5216 if (BufTagMatchesRelFileLocator(&bufHdr->tag, &srels[j].rlocator))
5217 {
5218 srelent = &srels[j];
5219 break;
5220 }
5221 }
5222 }
5223 else
5224 {
5225 RelFileLocator rlocator;
5226
5227 rlocator = BufTagGetRelFileLocator(&bufHdr->tag);
5228 srelent = bsearch(&rlocator,
5229 srels, nrels, sizeof(SMgrSortArray),
5231 }
5232
5233 /* buffer doesn't belong to any of the given relfilelocators; skip it */
5234 if (srelent == NULL)
5235 continue;
5236
5237 /* Make sure we can handle the pin */
5240
5242 if (BufTagMatchesRelFileLocator(&bufHdr->tag, &srelent->rlocator) &&
5244 {
5248 }
5249 else
5251 }
5252
5253 pfree(srels);
5254}

References Assert, BM_DIRTY, BM_VALID, BufTagGetRelFileLocator(), BufTagMatchesRelFileLocator(), CurrentResourceOwner, fb(), FlushUnlockedBuffer(), GetBufferDescriptor(), i, IOCONTEXT_NORMAL, IOOBJECT_RELATION, j, LockBufHdr(), NBuffers, palloc_array, pfree(), PinBuffer_Locked(), qsort, RelFileLocatorBackendIsTemp, RELS_BSEARCH_THRESHOLD, ReservePrivateRefCountEntry(), ResourceOwnerEnlarge(), rlocator_comparator(), UnlockBufHdr(), and UnpinBuffer().

Referenced by smgrdosyncall().

◆ FlushUnlockedBuffer()

static void FlushUnlockedBuffer ( BufferDesc buf,
SMgrRelation  reln,
IOObject  io_object,
IOContext  io_context 
)
static

Definition at line 4545 of file bufmgr.c.

4547{
4549
4552 BufferLockUnlock(buffer, buf);
4553}
static void BufferLockAcquire(Buffer buffer, BufferDesc *buf_hdr, BufferLockMode mode)
Definition bufmgr.c:5765
static void BufferLockUnlock(Buffer buffer, BufferDesc *buf_hdr)
Definition bufmgr.c:5881

References buf, PrivateRefCountEntry::buffer, BUFFER_LOCK_SHARE_EXCLUSIVE, BufferDescriptorGetBuffer(), BufferLockAcquire(), BufferLockUnlock(), fb(), FlushBuffer(), IOCONTEXT_NORMAL, and IOOBJECT_RELATION.

Referenced by EvictUnpinnedBufferInternal(), FlushDatabaseBuffers(), FlushRelationBuffers(), FlushRelationsAllBuffers(), and SyncOneBuffer().

◆ ForgetPrivateRefCountEntry()

static void ForgetPrivateRefCountEntry ( PrivateRefCountEntry ref)
static

Definition at line 565 of file bufmgr.c.

566{
567 Assert(ref->data.refcount == 0);
568 Assert(ref->data.lockmode == BUFFER_LOCK_UNLOCK);
569
570 if (ref >= &PrivateRefCountArray[0] &&
572 {
573 ref->buffer = InvalidBuffer;
575
576
577 /*
578 * Mark the just used entry as reserved - in many scenarios that
579 * allows us to avoid ever having to search the array/hash for free
580 * entries.
581 */
583 }
584 else
585 {
589 }
590}
static int ReservedRefCountSlot
Definition bufmgr.c:268

References Assert, BUFFER_LOCK_UNLOCK, fb(), InvalidBuffer, PrivateRefCountArray, PrivateRefCountArrayKeys, PrivateRefCountHash, PrivateRefCountOverflowed, REFCOUNT_ARRAY_ENTRIES, and ReservedRefCountSlot.

Referenced by UnpinBufferNoOwner().

◆ GetAdditionalPinLimit()

uint32 GetAdditionalPinLimit ( void  )

Definition at line 2621 of file bufmgr.c.

2622{
2624
2625 /*
2626 * We get the number of "overflowed" pins for free, but don't know the
2627 * number of pins in PrivateRefCountArray. The cost of calculating that
2628 * exactly doesn't seem worth it, so just assume the max.
2629 */
2631
2632 /* Is this backend already holding more than its fair share? */
2634 return 0;
2635
2637}
static uint32 MaxProportionalPins
Definition bufmgr.c:271

References fb(), MaxProportionalPins, PrivateRefCountOverflowed, and REFCOUNT_ARRAY_ENTRIES.

Referenced by LimitAdditionalPins(), and read_stream_start_pending_read().

◆ GetPinLimit()

uint32 GetPinLimit ( void  )

Definition at line 2609 of file bufmgr.c.

2610{
2611 return MaxProportionalPins;
2612}

References MaxProportionalPins.

Referenced by GetAccessStrategy(), and read_stream_begin_impl().

◆ GetPrivateRefCount()

static int32 GetPrivateRefCount ( Buffer  buffer)
inlinestatic

Definition at line 542 of file bufmgr.c.

543{
545
546 Assert(BufferIsValid(buffer));
547 Assert(!BufferIsLocal(buffer));
548
549 /*
550 * Not moving the entry - that's ok for the current users, but we might
551 * want to change this one day.
552 */
553 ref = GetPrivateRefCountEntry(buffer, false);
554
555 if (ref == NULL)
556 return 0;
557 return ref->data.refcount;
558}

References Assert, PrivateRefCountEntry::buffer, BufferIsLocal, BufferIsValid(), fb(), and GetPrivateRefCountEntry().

Referenced by CheckBufferIsPinnedOnce(), ConditionalLockBufferForCleanup(), DebugPrintBufferRefcount(), HoldingBufferPinThatDelaysRecovery(), InvalidateBuffer(), InvalidateVictimBuffer(), IsBufferCleanupOK(), and MarkSharedBufferDirtyHint().

◆ GetPrivateRefCountEntry()

static PrivateRefCountEntry * GetPrivateRefCountEntry ( Buffer  buffer,
bool  do_move 
)
inlinestatic

Definition at line 507 of file bufmgr.c.

508{
509 Assert(BufferIsValid(buffer));
510 Assert(!BufferIsLocal(buffer));
511
512 /*
513 * It's very common to look up the same buffer repeatedly. To make that
514 * fast, we have a one-entry cache.
515 *
516 * In contrast to the loop in GetPrivateRefCountEntrySlow(), here it is
517 * faster to check PrivateRefCountArray[].buffer, as in the case of a hit
518 * fewer addresses are computed and fewer cachelines are accessed. Whereas
519 * in GetPrivateRefCountEntrySlow()'s case, checking
520 * PrivateRefCountArrayKeys saves a lot of memory accesses.
521 */
522 if (likely(PrivateRefCountEntryLast != -1) &&
524 {
526 }
527
528 /*
529 * The code for the cached lookup is small enough to be worth inlining
530 * into the caller. In the miss case however, that empirically doesn't
531 * seem worth it.
532 */
533 return GetPrivateRefCountEntrySlow(buffer, do_move);
534}
static pg_noinline PrivateRefCountEntry * GetPrivateRefCountEntrySlow(Buffer buffer, bool do_move)
Definition bufmgr.c:419
static int PrivateRefCountEntryLast
Definition bufmgr.c:269

References Assert, PrivateRefCountEntry::buffer, BufferIsLocal, BufferIsValid(), fb(), GetPrivateRefCountEntrySlow(), likely, PrivateRefCountArray, and PrivateRefCountEntryLast.

Referenced by BufferLockAcquire(), BufferLockConditional(), BufferLockDisownInternal(), BufferLockHeldByMe(), BufferLockHeldByMeInMode(), GetPrivateRefCount(), IncrBufferRefCount(), PinBuffer(), PinBuffer_Locked(), ResOwnerReleaseBuffer(), SharedBufferBeginSetHintBits(), and UnpinBufferNoOwner().

◆ GetPrivateRefCountEntrySlow()

static pg_noinline PrivateRefCountEntry * GetPrivateRefCountEntrySlow ( Buffer  buffer,
bool  do_move 
)
static

Definition at line 419 of file bufmgr.c.

420{
422 int match = -1;
423 int i;
424
425 /*
426 * First search for references in the array, that'll be sufficient in the
427 * majority of cases.
428 */
429 for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
430 {
431 if (PrivateRefCountArrayKeys[i] == buffer)
432 {
433 match = i;
434 /* see ReservePrivateRefCountEntry() for why we don't return */
435 }
436 }
437
438 if (likely(match != -1))
439 {
440 /* update cache for the next lookup */
442
443 return &PrivateRefCountArray[match];
444 }
445
446 /*
447 * By here we know that the buffer, if already pinned, isn't residing in
448 * the array.
449 *
450 * Only look up the buffer in the hashtable if we've previously overflowed
451 * into it.
452 */
454 return NULL;
455
457
458 if (res == NULL)
459 return NULL;
460 else if (!do_move)
461 {
462 /* caller doesn't want us to move the hash entry into the array */
463 return res;
464 }
465 else
466 {
467 /* move buffer from hashtable into the free array slot */
470
471 /* Save data and delete from hashtable while res is still valid */
472 data = res->data;
476
477 /* Ensure there's a free array slot */
479
480 /* Use up the reserved slot */
484 Assert(free->buffer == InvalidBuffer);
485
486 /* and fill it */
487 free->buffer = buffer;
488 free->data = data;
490 /* update cache for the next lookup */
492
494
495 return free;
496 }
497}
const void * data
#define free(a)

References Assert, PrivateRefCountEntry::buffer, PrivateRefCountEntry::data, data, fb(), free, i, InvalidBuffer, likely, PrivateRefCountArray, PrivateRefCountArrayKeys, PrivateRefCountEntryLast, PrivateRefCountHash, PrivateRefCountOverflowed, REFCOUNT_ARRAY_ENTRIES, ReservedRefCountSlot, and ReservePrivateRefCountEntry().

Referenced by GetPrivateRefCountEntry().

◆ GetVictimBuffer()

static Buffer GetVictimBuffer ( BufferAccessStrategy  strategy,
IOContext  io_context 
)
static

Definition at line 2461 of file bufmgr.c.

2462{
2464 Buffer buf;
2466 bool from_ring;
2467
2468 /*
2469 * Ensure, before we pin a victim buffer, that there's a free refcount
2470 * entry and resource owner slot for the pin.
2471 */
2474
2475 /* we return here if a prospective victim buffer gets used concurrently */
2476again:
2477
2478 /*
2479 * Select a victim buffer. The buffer is returned pinned and owned by
2480 * this backend.
2481 */
2484
2485 /*
2486 * We shouldn't have any other pins for this buffer.
2487 */
2489
2490 /*
2491 * If the buffer was dirty, try to write it out. There is a race
2492 * condition here, another backend could dirty the buffer between
2493 * StrategyGetBuffer() checking that it is not in use and invalidating the
2494 * buffer below. That's addressed by InvalidateVictimBuffer() verifying
2495 * that the buffer is not dirty.
2496 */
2497 if (buf_state & BM_DIRTY)
2498 {
2501
2502 /*
2503 * We need a share-exclusive lock on the buffer contents to write it
2504 * out (else we might write invalid data, eg because someone else is
2505 * compacting the page contents while we write). We must use a
2506 * conditional lock acquisition here to avoid deadlock. Even though
2507 * the buffer was not pinned (and therefore surely not locked) when
2508 * StrategyGetBuffer returned it, someone else could have pinned and
2509 * (share-)exclusive-locked it by the time we get here. If we try to
2510 * get the lock unconditionally, we'd block waiting for them; if they
2511 * later block waiting for us, deadlock ensues. (This has been
2512 * observed to happen when two backends are both trying to split btree
2513 * index pages, and the second one just happens to be trying to split
2514 * the page the first one got from StrategyGetBuffer.)
2515 */
2517 {
2518 /*
2519 * Someone else has locked the buffer, so give it up and loop back
2520 * to get another one.
2521 */
2523 goto again;
2524 }
2525
2526 /*
2527 * If using a nondefault strategy, and this victim came from the
2528 * strategy ring, let the strategy decide whether to reject it when
2529 * reusing it would require a WAL flush. This only applies to
2530 * permanent buffers; unlogged buffers can have fake LSNs, so
2531 * XLogNeedsFlush() is not meaningful for them.
2532 *
2533 * We need to hold the content lock in at least share-exclusive mode
2534 * to safely inspect the page LSN, so this couldn't have been done
2535 * inside StrategyGetBuffer().
2536 */
2537 if (strategy && from_ring &&
2541 {
2544 goto again;
2545 }
2546
2547 /* OK, do the I/O */
2550
2552 &buf_hdr->tag);
2553 }
2554
2555
2556 if (buf_state & BM_VALID)
2557 {
2558 /*
2559 * When a BufferAccessStrategy is in use, blocks evicted from shared
2560 * buffers are counted as IOOP_EVICT in the corresponding context
2561 * (e.g. IOCONTEXT_BULKWRITE). Shared buffers are evicted by a
2562 * strategy in two cases: 1) while initially claiming buffers for the
2563 * strategy ring 2) to replace an existing strategy ring buffer
2564 * because it is pinned or in use and cannot be reused.
2565 *
2566 * Blocks evicted from buffers already in the strategy ring are
2567 * counted as IOOP_REUSE in the corresponding strategy context.
2568 *
2569 * At this point, we can accurately count evictions and reuses,
2570 * because we have successfully claimed the valid buffer. Previously,
2571 * we may have been forced to release the buffer due to concurrent
2572 * pinners or erroring out.
2573 */
2575 from_ring ? IOOP_REUSE : IOOP_EVICT, 1, 0);
2576 }
2577
2578 /*
2579 * If the buffer has an entry in the buffer mapping table, delete it. This
2580 * can fail because another backend could have pinned or dirtied the
2581 * buffer.
2582 */
2584 {
2586 goto again;
2587 }
2588
2589 /* a final set of sanity checks */
2590#ifdef USE_ASSERT_CHECKING
2592
2595
2597#endif
2598
2599 return buf;
2600}
WritebackContext BackendWritebackContext
Definition buf_init.c:25
void CheckBufferIsPinnedOnce(Buffer buffer)
Definition bufmgr.c:6504
void ScheduleBufferTagForWriteback(WritebackContext *wb_context, IOContext io_context, BufferTag *tag)
Definition bufmgr.c:7473
BufferDesc * StrategyGetBuffer(BufferAccessStrategy strategy, uint64 *buf_state, bool *from_ring)
Definition freelist.c:174
bool StrategyRejectBuffer(BufferAccessStrategy strategy, BufferDesc *buf, bool from_ring)
Definition freelist.c:787
@ IOOP_EVICT
Definition pgstat.h:311
@ IOOP_REUSE
Definition pgstat.h:314
bool XLogNeedsFlush(XLogRecPtr record)
Definition xlog.c:3129

References Assert, BackendWritebackContext, BM_DIRTY, BM_PERMANENT, BM_TAG_VALID, BM_VALID, buf, BUF_STATE_GET_REFCOUNT, BUFFER_LOCK_SHARE_EXCLUSIVE, BUFFER_LOCK_UNLOCK, BufferDescriptorGetBuffer(), BufferGetLSN, BufferLockConditional(), CheckBufferIsPinnedOnce(), CurrentResourceOwner, fb(), FlushBuffer(), InvalidateVictimBuffer(), IOOBJECT_RELATION, IOOP_EVICT, IOOP_REUSE, LockBuffer(), pg_atomic_read_u64(), pgstat_count_io_op(), ReservePrivateRefCountEntry(), ResourceOwnerEnlarge(), ScheduleBufferTagForWriteback(), StrategyGetBuffer(), StrategyRejectBuffer(), UnpinBuffer(), and XLogNeedsFlush().

Referenced by BufferAlloc(), and ExtendBufferedRelShared().

◆ HoldingBufferPinThatDelaysRecovery()

bool HoldingBufferPinThatDelaysRecovery ( void  )

Definition at line 6684 of file bufmgr.c.

6685{
6687
6688 /*
6689 * If we get woken slowly then it's possible that the Startup process was
6690 * already woken by other backends before we got here. Also possible that
6691 * we get here by multiple interrupts or interrupts at inappropriate
6692 * times, so make sure we do nothing if the bufid is not set.
6693 */
6694 if (bufid < 0)
6695 return false;
6696
6697 if (GetPrivateRefCount(bufid + 1) > 0)
6698 return true;
6699
6700 return false;
6701}
int GetStartupBufferPinWaitBufId(void)
Definition proc.c:759

References fb(), GetPrivateRefCount(), and GetStartupBufferPinWaitBufId().

Referenced by CheckRecoveryConflictDeadlock(), and ProcessRecoveryConflictInterrupt().

◆ IncrBufferRefCount()

void IncrBufferRefCount ( Buffer  buffer)

Definition at line 5537 of file bufmgr.c.

5538{
5539 Assert(BufferIsPinned(buffer));
5541 if (BufferIsLocal(buffer))
5542 LocalRefCount[-buffer - 1]++;
5543 else
5544 {
5546
5547 ref = GetPrivateRefCountEntry(buffer, true);
5548 Assert(ref != NULL);
5549 ref->data.refcount++;
5550 }
5552}
static void ResourceOwnerRememberBuffer(ResourceOwner owner, Buffer buffer)

References Assert, PrivateRefCountEntry::buffer, BufferIsLocal, BufferIsPinned, CurrentResourceOwner, fb(), GetPrivateRefCountEntry(), LocalRefCount, ResourceOwnerEnlarge(), and ResourceOwnerRememberBuffer().

Referenced by _bt_steppage(), btrestrpos(), entryLoadMoreItems(), ReadBufferBI(), RelationAddBlocks(), scanPostingTree(), startScanEntry(), and tts_buffer_heap_store_tuple().

◆ InitBufferManagerAccess()

void InitBufferManagerAccess ( void  )

Definition at line 4127 of file bufmgr.c.

4128{
4129 /*
4130 * An advisory limit on the number of pins each backend should hold, based
4131 * on shared_buffers and the maximum number of connections possible.
4132 * That's very pessimistic, but outside toy-sized shared_buffers it should
4133 * allow plenty of pins. LimitAdditionalPins() and
4134 * GetAdditionalPinLimit() can be used to check the remaining balance.
4135 */
4137
4140
4142
4143 /*
4144 * AtProcExit_Buffers needs LWLock access, and thereby has to be called at
4145 * the corresponding phase of backend shutdown.
4146 */
4147 Assert(MyProc != NULL);
4149}
static void AtProcExit_Buffers(int code, Datum arg)
Definition bufmgr.c:4156
int MaxBackends
Definition globals.c:146
void on_shmem_exit(pg_on_exit_callback function, Datum arg)
Definition ipc.c:372
MemoryContext CurrentMemoryContext
Definition mcxt.c:160
#define NUM_AUXILIARY_PROCS
Definition proc.h:524

References Assert, AtProcExit_Buffers(), CurrentMemoryContext, fb(), MaxBackends, MaxProportionalPins, MyProc, NBuffers, NUM_AUXILIARY_PROCS, on_shmem_exit(), PrivateRefCountArray, PrivateRefCountArrayKeys, and PrivateRefCountHash.

Referenced by BaseInit().

◆ InvalidateBuffer()

static void InvalidateBuffer ( BufferDesc buf)
static

Definition at line 2283 of file bufmgr.c.

2284{
2286 uint32 oldHash; /* hash value for oldTag */
2287 LWLock *oldPartitionLock; /* buffer partition lock for it */
2290
2291 /* Save the original buffer tag before dropping the spinlock */
2292 oldTag = buf->tag;
2293
2295
2296 /*
2297 * Need to compute the old tag's hashcode and partition lock ID. XXX is it
2298 * worth storing the hashcode in BufferDesc so we need not recompute it
2299 * here? Probably not.
2300 */
2303
2304retry:
2305
2306 /*
2307 * Acquire exclusive mapping lock in preparation for changing the buffer's
2308 * association.
2309 */
2311
2312 /* Re-lock the buffer header */
2314
2315 /* If it's changed while we were waiting for lock, do nothing */
2316 if (!BufferTagsEqual(&buf->tag, &oldTag))
2317 {
2320 return;
2321 }
2322
2323 /*
2324 * We assume the reason for it to be pinned is that either we were
2325 * asynchronously reading the page in before erroring out or someone else
2326 * is flushing the page out. Wait for the IO to finish. (This could be
2327 * an infinite loop if the refcount is messed up... it would be nice to
2328 * time out after a while, but there seems no way to be sure how many loops
2329 * may be needed. Note that if the other guy has pinned the buffer but
2330 * not yet done StartBufferIO, WaitIO will fall through and we'll
2331 * effectively be busy-looping here.)
2332 */
2334 {
2337 /* safety check: should definitely not be our *own* pin */
2339 elog(ERROR, "buffer is pinned in InvalidateBuffer");
2340 WaitIO(buf);
2341 goto retry;
2342 }
2343
2344 /*
2345 * An invalidated buffer should not have any backends waiting to lock the
2346 * buffer, therefore BM_LOCK_WAKE_IN_PROGRESS should not be set.
2347 */
2349
2350 /*
2351 * Clear out the buffer's tag and flags. We must do this to ensure that
2352 * linear scans of the buffer array don't think the buffer is valid.
2353 */
2355 ClearBufferTag(&buf->tag);
2356
2358 0,
2360 0);
2361
2362 /*
2363 * Remove the buffer from the lookup hashtable, if it was in there.
2364 */
2365 if (oldFlags & BM_TAG_VALID)
2367
2368 /*
2369 * Done with mapping lock.
2370 */
2372}
#define BUF_USAGECOUNT_MASK
static bool BufferTagsEqual(const BufferTag *tag1, const BufferTag *tag2)
static void ClearBufferTag(BufferTag *tag)
void BufTableDelete(BufferTag *tagPtr, uint32 hashcode)
Definition buf_table.c:148
static void WaitIO(BufferDesc *buf)
Definition bufmgr.c:7006

References Assert, BM_LOCK_WAKE_IN_PROGRESS, BM_TAG_VALID, buf, BUF_FLAG_MASK, BUF_STATE_GET_REFCOUNT, BUF_USAGECOUNT_MASK, BufferDescriptorGetBuffer(), BufferTagsEqual(), BufMappingPartitionLock(), BufTableDelete(), BufTableHashCode(), ClearBufferTag(), elog, ERROR, fb(), GetPrivateRefCount(), LockBufHdr(), LW_EXCLUSIVE, LWLockAcquire(), LWLockRelease(), UnlockBufHdr(), UnlockBufHdrExt(), and WaitIO().

Referenced by DropDatabaseBuffers(), DropRelationBuffers(), DropRelationsAllBuffers(), and FindAndDropRelationBuffers().

◆ InvalidateVictimBuffer()

static bool InvalidateVictimBuffer ( BufferDesc buf_hdr)
static

Definition at line 2384 of file bufmgr.c.

2385{
2387 uint32 hash;
2389 BufferTag tag;
2390
2392
2393 /* have buffer pinned, so it's safe to read tag without lock */
2394 tag = buf_hdr->tag;
2395
2396 hash = BufTableHashCode(&tag);
2398
2400
2401 /* lock the buffer header */
2403
2404 /*
2405 * We have the buffer pinned, so nobody else should have been able to unset
2406 * this concurrently.
2407 */
2410 Assert(BufferTagsEqual(&buf_hdr->tag, &tag));
2411
2412 /*
2413 * If somebody else pinned the buffer since, or even worse, dirtied it,
2414 * give up on this buffer: It's clearly in use.
2415 */
2417 {
2419
2422
2423 return false;
2424 }
2425
2426 /*
2427 * An invalidated buffer should not have any backends waiting to lock the
2428 * buffer, therefore BM_LOCK_WAKE_IN_PROGRESS should not be set.
2429 */
2431
2432 /*
2433 * Clear out the buffer's tag and flags and usagecount. This is not
2434 * strictly required, as BM_TAG_VALID/BM_VALID needs to be checked before
2435 * doing anything with the buffer. But currently it's beneficial, as the
2436 * cheaper pre-check for several linear scans of shared buffers use the
2437 * tag (see e.g. FlushDatabaseBuffers()).
2438 */
2439 ClearBufferTag(&buf_hdr->tag);
2441 0,
2443 0);
2444
2446
2447 /* finally delete buffer from the buffer mapping table */
2448 BufTableDelete(&tag, hash);
2449
2451
2456
2457 return true;
2458}

References Assert, BM_DIRTY, BM_LOCK_WAKE_IN_PROGRESS, BM_TAG_VALID, BM_VALID, BUF_FLAG_MASK, BUF_STATE_GET_REFCOUNT, BUF_USAGECOUNT_MASK, BufferDescriptorGetBuffer(), BufferTagsEqual(), BufMappingPartitionLock(), BufTableDelete(), BufTableHashCode(), ClearBufferTag(), fb(), GetPrivateRefCount(), hash(), LockBufHdr(), LW_EXCLUSIVE, LWLockAcquire(), LWLockRelease(), pg_atomic_read_u64(), UnlockBufHdr(), and UnlockBufHdrExt().

Referenced by EvictUnpinnedBufferInternal(), and GetVictimBuffer().

◆ IsBufferCleanupOK()

bool IsBufferCleanupOK ( Buffer  buffer)

Definition at line 6768 of file bufmgr.c.

6769{
6772
6773 Assert(BufferIsValid(buffer));
6774
6775 /* see AIO related comment in LockBufferForCleanup() */
6776
6777 if (BufferIsLocal(buffer))
6778 {
6779 /* There should be exactly one pin */
6780 if (LocalRefCount[-buffer - 1] != 1)
6781 return false;
6782 /* Nobody else to wait for */
6783 return true;
6784 }
6785
6786 /* There should be exactly one local pin */
6787 if (GetPrivateRefCount(buffer) != 1)
6788 return false;
6789
6790 bufHdr = GetBufferDescriptor(buffer - 1);
6791
6792 /* caller must hold exclusive lock on buffer */
6794
6796
6799 {
6800 /* pincount is OK. */
6802 return true;
6803 }
6804
6806 return false;
6807}

References Assert, BUF_STATE_GET_REFCOUNT, PrivateRefCountEntry::buffer, BUFFER_LOCK_EXCLUSIVE, BufferIsLocal, BufferIsLockedByMeInMode(), BufferIsValid(), fb(), GetBufferDescriptor(), GetPrivateRefCount(), LocalRefCount, LockBufHdr(), and UnlockBufHdr().

Referenced by _hash_doinsert(), _hash_expandtable(), _hash_splitbucket(), and hashbucketcleanup().

◆ IssuePendingWritebacks()

void IssuePendingWritebacks ( WritebackContext wb_context,
IOContext  io_context 
)

Definition at line 7523 of file bufmgr.c.

7524{
7526 int i;
7527
7528 if (wb_context->nr_pending == 0)
7529 return;
7530
7531 /*
7532 * Executing the writes in-order can make them a lot faster, and allows to
7533 * merge writeback requests to consecutive blocks into larger writebacks.
7534 */
7535 sort_pending_writebacks(wb_context->pending_writebacks,
7536 wb_context->nr_pending);
7537
7539
7540 /*
7541 * Coalesce neighbouring writes, but nothing else. For that we iterate
7542 * through the now-sorted array of pending flushes, and look forward to
7543 * find all neighbouring (or identical) writes.
7544 */
7545 for (i = 0; i < wb_context->nr_pending; i++)
7546 {
7550 int ahead;
7551 BufferTag tag;
7553 Size nblocks = 1;
7554
7555 cur = &wb_context->pending_writebacks[i];
7556 tag = cur->tag;
7558
7559 /*
7560 * Peek ahead, into following writeback requests, to see if they can
7561 * be combined with the current one.
7562 */
7563 for (ahead = 0; i + ahead + 1 < wb_context->nr_pending; ahead++)
7564 {
7565
7566 next = &wb_context->pending_writebacks[i + ahead + 1];
7567
7568 /* different file, stop */
7570 BufTagGetRelFileLocator(&next->tag)) ||
7571 BufTagGetForkNum(&cur->tag) != BufTagGetForkNum(&next->tag))
7572 break;
7573
7574 /* ok, block queued twice, skip */
7575 if (cur->tag.blockNum == next->tag.blockNum)
7576 continue;
7577
7578 /* only merge consecutive writes */
7579 if (cur->tag.blockNum + 1 != next->tag.blockNum)
7580 break;
7581
7582 nblocks++;
7583 cur = next;
7584 }
7585
7586 i += ahead;
7587
7588 /* and finally tell the kernel to write the data to storage */
7590 smgrwriteback(reln, BufTagGetForkNum(&tag), tag.blockNum, nblocks);
7591 }
7592
7593 /*
7594 * Assume that writeback requests are only issued for buffers containing
7595 * blocks of permanent relations.
7596 */
7598 IOOP_WRITEBACK, io_start, wb_context->nr_pending, 0);
7599
7600 wb_context->nr_pending = 0;
7601}
static int32 next
Definition blutils.c:225
struct cursor * cur
Definition ecpg.c:29
@ IOOP_WRITEBACK
Definition pgstat.h:315
#define RelFileLocatorEquals(locator1, locator2)
void smgrwriteback(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, BlockNumber nblocks)
Definition smgr.c:805

References buftag::blockNum, BufTagGetForkNum(), BufTagGetRelFileLocator(), cur, fb(), i, INVALID_PROC_NUMBER, IOOBJECT_RELATION, IOOP_WRITEBACK, next, pgstat_count_io_op_time(), pgstat_prepare_io_time(), RelFileLocatorEquals, smgropen(), smgrwriteback(), and track_io_timing.

Referenced by BufferSync(), and ScheduleBufferTagForWriteback().

◆ LimitAdditionalPins()

void LimitAdditionalPins ( uint32 additional_pins)

Definition at line 2647 of file bufmgr.c.

2648{
2649 uint32 limit;
2650
2651 if (*additional_pins <= 1)
2652 return;
2653
2654 limit = GetAdditionalPinLimit();
2655 limit = Max(limit, 1);
2656 if (limit < *additional_pins)
2657 *additional_pins = limit;
2658}
uint32 GetAdditionalPinLimit(void)
Definition bufmgr.c:2621
#define Max(x, y)
Definition c.h:1087

References fb(), GetAdditionalPinLimit(), and Max.

Referenced by ExtendBufferedRelShared().

◆ local_buffer_readv_complete()

static PgAioResult local_buffer_readv_complete ( PgAioHandle ioh,
PgAioResult  prior_result,
uint8  cb_data 
)
static

Definition at line 8707 of file bufmgr.c.

8709{
8711}
static pg_attribute_always_inline PgAioResult buffer_readv_complete(PgAioHandle *ioh, PgAioResult prior_result, uint8 cb_data, bool is_temp)
Definition bufmgr.c:8452

References buffer_readv_complete(), and fb().

◆ local_buffer_readv_stage()

static void local_buffer_readv_stage ( PgAioHandle ioh,
uint8  cb_data 
)
static

Definition at line 8701 of file bufmgr.c.

8702{
8703 buffer_stage_common(ioh, false, true);
8704}
static pg_attribute_always_inline void buffer_stage_common(PgAioHandle *ioh, bool is_write, bool is_temp)
Definition bufmgr.c:8064

References buffer_stage_common(), and fb().

◆ local_buffer_write_error_callback()

static void local_buffer_write_error_callback ( void arg)
static

Definition at line 7258 of file bufmgr.c.

7259{
7261
7262 if (bufHdr != NULL)
7263 errcontext("writing block %u of relation \"%s\"",
7264 bufHdr->tag.blockNum,
7267 BufTagGetForkNum(&bufHdr->tag)).str);
7268}
Datum arg
Definition elog.c:1322
#define errcontext
Definition elog.h:198

References arg, BufTagGetForkNum(), BufTagGetRelFileLocator(), errcontext, fb(), MyProcNumber, and relpathbackend.

Referenced by FlushRelationBuffers().

◆ LockBufferForCleanup()

void LockBufferForCleanup ( Buffer  buffer)

Definition at line 6537 of file bufmgr.c.

6538{
6540 TimestampTz waitStart = 0;
6541 bool waiting = false;
6542 bool logged_recovery_conflict = false;
6543
6544 Assert(BufferIsPinned(buffer));
6546
6548
6549 /*
6550 * We do not yet need to be worried about in-progress AIOs holding a pin,
6551 * as we, so far, only support doing reads via AIO and this function can
6552 * only be called once the buffer is valid (i.e. no read can be in
6553 * flight).
6554 */
6555
6556 /* Nobody else to wait for */
6557 if (BufferIsLocal(buffer))
6558 return;
6559
6560 bufHdr = GetBufferDescriptor(buffer - 1);
6561
6562 for (;;)
6563 {
6565 uint64 unset_bits = 0;
6566
6567 /* Try to acquire lock */
6570
6573 {
6574 /* Successfully acquired exclusive lock with pincount 1 */
6576
6577 /*
6578 * Emit the log message if recovery conflict on buffer pin was
6579 * resolved but the startup process waited longer than
6580 * deadlock_timeout for it.
6581 */
6584 waitStart, GetCurrentTimestamp(),
6585 NULL, false);
6586
6587 if (waiting)
6588 {
6589 /* reset ps display to remove the suffix if we added one */
6591 waiting = false;
6592 }
6593 return;
6594 }
6595 /* Failed, so mark myself as waiting for pincount 1 */
6597 {
6600 elog(ERROR, "multiple backends attempting to wait for pincount 1");
6601 }
6602 bufHdr->wait_backend_pgprocno = MyProcNumber;
6606 0);
6608
6609 /* Wait to be signaled by UnpinBuffer() */
6610 if (InHotStandby)
6611 {
6612 if (!waiting)
6613 {
6614 /* adjust the process title to indicate that it's waiting */
6615 set_ps_display_suffix("waiting");
6616 waiting = true;
6617 }
6618
6619 /*
6620 * Emit the log message if the startup process is waiting longer
6621 * than deadlock_timeout for recovery conflict on buffer pin.
6622 *
6623 * Skip this if first time through because the startup process has
6624 * not started waiting yet in this case. So, the wait start
6625 * timestamp is set after this logic.
6626 */
6627 if (waitStart != 0 && !logged_recovery_conflict)
6628 {
6630
6631 if (TimestampDifferenceExceeds(waitStart, now,
6633 {
6635 waitStart, now, NULL, true);
6637 }
6638 }
6639
6640 /*
6641 * Set the wait start timestamp if logging is enabled and first
6642 * time through.
6643 */
6644 if (log_recovery_conflict_waits && waitStart == 0)
6645 waitStart = GetCurrentTimestamp();
6646
6647 /* Publish the bufid that Startup process waits on */
6648 SetStartupBufferPinWaitBufId(buffer - 1);
6649 /* Set alarm and then wait to be signaled by UnpinBuffer() */
6651 /* Reset the published bufid */
6653 }
6654 else
6656
6657 /*
6658 * Remove flag marking us as waiter. Normally this will not be set
6659 * anymore, but ProcWaitForSignal() can return for other signals as
6660 * well. We take care to only reset the flag if we're the waiter, as
6661 * theoretically another backend could have started waiting. That's
6662 * impossible with the current usages due to table level locking, but
6663 * better be safe.
6664 */
6666 if ((buf_state & BM_PIN_COUNT_WAITER) != 0 &&
6667 bufHdr->wait_backend_pgprocno == MyProcNumber)
6669
6671 0, unset_bits,
6672 0);
6673
6675 /* Loop back and try again */
6676 }
6677}
bool TimestampDifferenceExceeds(TimestampTz start_time, TimestampTz stop_time, int msec)
Definition timestamp.c:1775
TimestampTz GetCurrentTimestamp(void)
Definition timestamp.c:1639
Datum now(PG_FUNCTION_ARGS)
Definition timestamp.c:1603
#define BM_PIN_COUNT_WAITER
static BufferDesc * PinCountWaitBuf
Definition bufmgr.c:228
int64 TimestampTz
Definition timestamp.h:39
void set_ps_display_remove_suffix(void)
Definition ps_status.c:439
void set_ps_display_suffix(const char *suffix)
Definition ps_status.c:387
int DeadlockTimeout
Definition proc.c:59
void SetStartupBufferPinWaitBufId(int bufid)
Definition proc.c:747
void ProcWaitForSignal(uint32 wait_event_info)
Definition proc.c:2002
void ResolveRecoveryConflictWithBufferPin(void)
Definition standby.c:794
bool log_recovery_conflict_waits
Definition standby.c:43
void LogRecoveryConflict(RecoveryConflictReason reason, TimestampTz wait_start, TimestampTz now, VirtualTransactionId *wait_list, bool still_waiting)
Definition standby.c:275
@ RECOVERY_CONFLICT_BUFFERPIN
Definition standby.h:46
static volatile sig_atomic_t waiting
#define InHotStandby
Definition xlogutils.h:60

References Assert, BM_PIN_COUNT_WAITER, BUF_STATE_GET_REFCOUNT, PrivateRefCountEntry::buffer, BUFFER_LOCK_EXCLUSIVE, BUFFER_LOCK_UNLOCK, BufferIsLocal, BufferIsPinned, CheckBufferIsPinnedOnce(), DeadlockTimeout, elog, ERROR, fb(), GetBufferDescriptor(), GetCurrentTimestamp(), InHotStandby, LockBuffer(), LockBufHdr(), log_recovery_conflict_waits, LogRecoveryConflict(), MyProcNumber, now(), PinCountWaitBuf, ProcWaitForSignal(), RECOVERY_CONFLICT_BUFFERPIN, ResolveRecoveryConflictWithBufferPin(), set_ps_display_remove_suffix(), set_ps_display_suffix(), SetStartupBufferPinWaitBufId(), TimestampDifferenceExceeds(), UnlockBufHdr(), UnlockBufHdrExt(), and waiting.

Referenced by _bt_upgradelockbufcleanup(), ginVacuumPostingTree(), hashbulkdelete(), heap_force_common(), lazy_scan_heap(), XLogReadBufferForRedoExtended(), and ZeroAndLockBuffer().

◆ LockBufferInternal()

void LockBufferInternal ( Buffer  buffer,
BufferLockMode  mode 
)

Definition at line 6441 of file bufmgr.c.

6442{
6444
6445 /*
6446 * We can't wait if we haven't got a PGPROC. This should only occur
6447 * during bootstrap or shared memory initialization. Put an Assert here
6448 * to catch unsafe coding practices.
6449 */
6451
6452 /* handled in LockBuffer() wrapper */
6454
6455 Assert(BufferIsPinned(buffer));
6456 if (BufferIsLocal(buffer))
6457 return; /* local buffers need no lock */
6458
6459 buf_hdr = GetBufferDescriptor(buffer - 1);
6460
6461 /*
6462 * Test the most frequent lock modes first. While a switch (mode) would be
6463 * nice, at least gcc generates considerably worse code for it.
6464 *
6465 * Call BufferLockAcquire() with a constant argument for mode, to generate
6466 * more efficient code for the different lock modes.
6467 */
6468 if (mode == BUFFER_LOCK_SHARE)
6470 else if (mode == BUFFER_LOCK_EXCLUSIVE)
6474 else
6475 elog(ERROR, "unrecognized buffer lock mode: %d", mode);
6476}
bool IsUnderPostmaster
Definition globals.c:120

References Assert, PrivateRefCountEntry::buffer, BUFFER_LOCK_EXCLUSIVE, BUFFER_LOCK_SHARE, BUFFER_LOCK_SHARE_EXCLUSIVE, BUFFER_LOCK_UNLOCK, BufferIsLocal, BufferIsPinned, BufferLockAcquire(), elog, ERROR, fb(), GetBufferDescriptor(), IsUnderPostmaster, mode, and MyProc.

Referenced by LockBuffer().

◆ LockBufHdr()

uint64 LockBufHdr ( BufferDesc desc)

Definition at line 7301 of file bufmgr.c.

7302{
7304
7306
7307 while (true)
7308 {
7309 /*
7310 * Always try once to acquire the lock directly, without setting up
7311 * the spin-delay infrastructure. The work necessary for that shows up
7312 * in profiles and is rarely necessary.
7313 */
7315 if (likely(!(old_buf_state & BM_LOCKED)))
7316 break; /* got lock */
7317
7318 /* and then spin without atomic operations until lock is released */
7319 {
7321
7323
7324 while (old_buf_state & BM_LOCKED)
7325 {
7328 }
7330 }
7331
7332 /*
7333 * Retry. The lock might obviously already be re-acquired by the time
7334 * we're attempting to get it again.
7335 */
7336 }
7337
7338 return old_buf_state | BM_LOCKED;
7339}
void perform_spin_delay(SpinDelayStatus *status)
Definition s_lock.c:126
void finish_spin_delay(SpinDelayStatus *status)
Definition s_lock.c:186
#define init_local_spin_delay(status)
Definition s_lock.h:749

References Assert, BM_LOCKED, BufferDescriptorGetBuffer(), BufferIsLocal, fb(), finish_spin_delay(), init_local_spin_delay, likely, perform_spin_delay(), pg_atomic_fetch_or_u64(), pg_atomic_read_u64(), and BufferDesc::state.

Referenced by AbortBufferIO(), apw_dump_now(), buffer_stage_common(), BufferAlloc(), BufferGetLSNAtomic(), BufferLockDequeueSelf(), BufferLockQueueSelf(), BufferLockWakeup(), BufferSync(), ConditionalLockBufferForCleanup(), create_toy_buffer(), DropDatabaseBuffers(), DropRelationBuffers(), DropRelationsAllBuffers(), EvictAllUnpinnedBuffers(), EvictRelUnpinnedBuffers(), EvictUnpinnedBuffer(), ExtendBufferedRelShared(), FindAndDropRelationBuffers(), FlushDatabaseBuffers(), FlushRelationBuffers(), FlushRelationsAllBuffers(), InvalidateBuffer(), InvalidateVictimBuffer(), IsBufferCleanupOK(), LockBufferForCleanup(), MarkDirtyAllUnpinnedBuffers(), MarkDirtyRelUnpinnedBuffers(), MarkDirtyUnpinnedBuffer(), MarkSharedBufferDirtyHint(), pg_buffercache_os_pages_internal(), pg_buffercache_pages(), StartBufferIO(), SyncOneBuffer(), TerminateBufferIO(), UnlockBuffers(), WaitIO(), and WakePinCountWaiter().

◆ MarkBufferDirty()

void MarkBufferDirty ( Buffer  buffer)

Definition at line 3063 of file bufmgr.c.

3064{
3068
3069 if (!BufferIsValid(buffer))
3070 elog(ERROR, "bad buffer ID: %d", buffer);
3071
3072 if (BufferIsLocal(buffer))
3073 {
3074 MarkLocalBufferDirty(buffer);
3075 return;
3076 }
3077
3078 bufHdr = GetBufferDescriptor(buffer - 1);
3079
3080 Assert(BufferIsPinned(buffer));
3082
3083 /*
3084 * NB: We have to wait for the buffer header spinlock to be not held, as
3085 * TerminateBufferIO() relies on the spinlock.
3086 */
3088 for (;;)
3089 {
3092
3094
3097
3099 buf_state))
3100 break;
3101 }
3102
3103 /*
3104 * If the buffer was not dirty already, do vacuum accounting.
3105 */
3106 if (!(old_buf_state & BM_DIRTY))
3107 {
3109 if (VacuumCostActive)
3111 }
3112}
pg_noinline uint64 WaitBufHdrUnlocked(BufferDesc *buf)
Definition bufmgr.c:7349
int VacuumCostPageDirty
Definition globals.c:153
int64 shared_blks_dirtied
Definition instrument.h:28

References Assert, BM_DIRTY, BM_LOCKED, BUF_STATE_GET_REFCOUNT, PrivateRefCountEntry::buffer, BUFFER_LOCK_EXCLUSIVE, BufferIsLocal, BufferIsLockedByMeInMode(), BufferIsPinned, BufferIsValid(), elog, ERROR, fb(), GetBufferDescriptor(), MarkLocalBufferDirty(), pg_atomic_compare_exchange_u64(), pg_atomic_read_u64(), pgBufferUsage, BufferUsage::shared_blks_dirtied, VacuumCostActive, VacuumCostBalance, VacuumCostPageDirty, and WaitBufHdrUnlocked().

Referenced by _bt_clear_incomplete_split(), _bt_dedup_pass(), _bt_delitems_delete(), _bt_delitems_vacuum(), _bt_getroot(), _bt_insertonpg(), _bt_mark_page_halfdead(), _bt_newlevel(), _bt_restore_meta(), _bt_set_cleanup_info(), _bt_split(), _bt_unlink_halfdead_page(), _hash_addovflpage(), _hash_doinsert(), _hash_expandtable(), _hash_freeovflpage(), _hash_init(), _hash_splitbucket(), _hash_squeezebucket(), _hash_vacuum_one_page(), addLeafTuple(), brin_doinsert(), brin_doupdate(), brin_initialize_empty_new_buffer(), brin_xlog_createidx(), brin_xlog_desummarize_page(), brin_xlog_insert_update(), brin_xlog_revmap_extend(), brin_xlog_samepage_update(), brin_xlog_update(), brinbuild(), brinbuildempty(), brinRevmapDesummarizeRange(), btree_xlog_dedup(), btree_xlog_delete(), btree_xlog_insert(), btree_xlog_mark_page_halfdead(), btree_xlog_newroot(), btree_xlog_split(), btree_xlog_unlink_page(), btree_xlog_vacuum(), createPostingTree(), dataExecPlaceToPageInternal(), dataExecPlaceToPageLeaf(), doPickSplit(), entryExecPlaceToPage(), fill_seq_fork_with_data(), FreeSpaceMapPrepareTruncateRel(), generic_redo(), GenericXLogFinish(), ginbuild(), ginbuildempty(), ginbulkdelete(), ginDeletePostingPage(), ginHeapTupleFastInsert(), ginPlaceToPage(), ginRedoClearIncompleteSplit(), ginRedoCreatePTree(), ginRedoDeleteListPages(), ginRedoDeletePage(), ginRedoInsert(), ginRedoInsertListPage(), ginRedoUpdateMetapage(), ginRedoVacuumDataLeafPage(), ginUpdateStats(), ginVacuumPostingTreeLeaf(), gistbuild(), gistbuildempty(), gistdeletepage(), gistplacetopage(), gistprunepage(), gistRedoClearFollowRight(), gistRedoDeleteRecord(), gistRedoPageDelete(), gistRedoPageSplitRecord(), gistRedoPageUpdateRecord(), gistvacuumpage(), hash_xlog_add_ovfl_page(), hash_xlog_delete(), hash_xlog_init_bitmap_page(), hash_xlog_init_meta_page(), hash_xlog_insert(), hash_xlog_move_page_contents(), hash_xlog_split_allocate_page(), hash_xlog_split_cleanup(), hash_xlog_split_complete(), hash_xlog_squeeze_page(), 
hash_xlog_update_meta_page(), hash_xlog_vacuum_one_page(), hashbucketcleanup(), hashbulkdelete(), heap_abort_speculative(), heap_delete(), heap_finish_speculative(), heap_force_common(), heap_inplace_update_and_unlock(), heap_insert(), heap_lock_tuple(), heap_lock_updated_tuple_rec(), heap_multi_insert(), heap_page_prune_and_freeze(), heap_update(), heap_xlog_confirm(), heap_xlog_delete(), heap_xlog_inplace(), heap_xlog_insert(), heap_xlog_lock(), heap_xlog_lock_updated(), heap_xlog_multi_insert(), heap_xlog_prune_freeze(), heap_xlog_update(), heap_xlog_visible(), lazy_scan_new_or_empty(), lazy_scan_prune(), lazy_vacuum_heap_page(), log_newpage_range(), MarkDirtyUnpinnedBufferInternal(), moveLeafs(), nextval_internal(), RelationAddBlocks(), RelationCopyStorageUsingBuffer(), RelationGetBufferForTuple(), revmap_physical_extend(), saveNodeLink(), seq_redo(), SetSequence(), shiftList(), spgAddNodeAction(), spgbuild(), SpGistUpdateMetaPage(), spgRedoAddLeaf(), spgRedoAddNode(), spgRedoMoveLeafs(), spgRedoPickSplit(), spgRedoSplitTuple(), spgRedoVacuumLeaf(), spgRedoVacuumRedirect(), spgRedoVacuumRoot(), spgSplitNodeAction(), vacuumLeafPage(), vacuumLeafRoot(), vacuumRedirectAndPlaceholder(), visibilitymap_clear(), visibilitymap_prepare_truncate(), visibilitymap_set(), visibilitymap_set_vmbits(), writeListPage(), and XLogReadBufferForRedoExtended().

◆ MarkBufferDirtyHint()

◆ MarkDirtyAllUnpinnedBuffers()

void MarkDirtyAllUnpinnedBuffers ( int32 buffers_dirtied,
int32 buffers_already_dirty,
int32 buffers_skipped 
)

Definition at line 8015 of file bufmgr.c.

8018{
8019 *buffers_dirtied = 0;
8021 *buffers_skipped = 0;
8022
8023 for (int buf = 1; buf <= NBuffers; buf++)
8024 {
8025 BufferDesc *desc = GetBufferDescriptor(buf - 1);
8028
8030
8032 if (!(buf_state & BM_VALID))
8033 continue;
8034
8037
8038 LockBufHdr(desc);
8039
8041 (*buffers_dirtied)++;
8042 else if (buffer_already_dirty)
8043 (*buffers_already_dirty)++;
8044 else
8045 (*buffers_skipped)++;
8046 }
8047}
static bool MarkDirtyUnpinnedBufferInternal(Buffer buf, BufferDesc *desc, bool *buffer_already_dirty)
Definition bufmgr.c:7866

References BM_VALID, buf, CHECK_FOR_INTERRUPTS, CurrentResourceOwner, fb(), GetBufferDescriptor(), LockBufHdr(), MarkDirtyUnpinnedBufferInternal(), NBuffers, pg_atomic_read_u64(), ReservePrivateRefCountEntry(), ResourceOwnerEnlarge(), and BufferDesc::state.

Referenced by pg_buffercache_mark_dirty_all().

◆ MarkDirtyRelUnpinnedBuffers()

void MarkDirtyRelUnpinnedBuffers ( Relation  rel,
int32 buffers_dirtied,
int32 buffers_already_dirty,
int32 buffers_skipped 
)

Definition at line 7958 of file bufmgr.c.

7962{
7964
7965 *buffers_dirtied = 0;
7967 *buffers_skipped = 0;
7968
7969 for (int buf = 1; buf <= NBuffers; buf++)
7970 {
7971 BufferDesc *desc = GetBufferDescriptor(buf - 1);
7974
7976
7977 /* An unlocked precheck should be safe and saves some cycles. */
7978 if ((buf_state & BM_VALID) == 0 ||
7980 continue;
7981
7982 /* Make sure we can pin the buffer. */
7985
7986 buf_state = LockBufHdr(desc);
7987
7988 /* recheck, could have changed without the lock */
7989 if ((buf_state & BM_VALID) == 0 ||
7991 {
7992 UnlockBufHdr(desc);
7993 continue;
7994 }
7995
7997 (*buffers_dirtied)++;
7998 else if (buffer_already_dirty)
7999 (*buffers_already_dirty)++;
8000 else
8001 (*buffers_skipped)++;
8002 }
8003}

References Assert, BM_VALID, buf, BufTagMatchesRelFileLocator(), CHECK_FOR_INTERRUPTS, CurrentResourceOwner, fb(), GetBufferDescriptor(), LockBufHdr(), MarkDirtyUnpinnedBufferInternal(), NBuffers, pg_atomic_read_u64(), RelationData::rd_locator, RelationUsesLocalBuffers, ReservePrivateRefCountEntry(), ResourceOwnerEnlarge(), BufferDesc::state, BufferDesc::tag, and UnlockBufHdr().

Referenced by pg_buffercache_mark_dirty_relation().

◆ MarkDirtyUnpinnedBuffer()

bool MarkDirtyUnpinnedBuffer ( Buffer  buf,
bool buffer_already_dirty 
)

Definition at line 7922 of file bufmgr.c.

7923{
7924 BufferDesc *desc;
7925 bool buffer_dirtied = false;
7926
7928
7929 /* Make sure we can pin the buffer. */
7932
7933 desc = GetBufferDescriptor(buf - 1);
7934 LockBufHdr(desc);
7935
 7937 /* Both cannot be true at the same time */
7939
7940 return buffer_dirtied;
7941}

References Assert, buf, BufferIsLocal, CurrentResourceOwner, fb(), GetBufferDescriptor(), LockBufHdr(), MarkDirtyUnpinnedBufferInternal(), ReservePrivateRefCountEntry(), and ResourceOwnerEnlarge().

Referenced by pg_buffercache_mark_dirty().

◆ MarkDirtyUnpinnedBufferInternal()

static bool MarkDirtyUnpinnedBufferInternal ( Buffer  buf,
BufferDesc desc,
bool buffer_already_dirty 
)
static

Definition at line 7866 of file bufmgr.c.

7868{
7870 bool result = false;
7871
7872 *buffer_already_dirty = false;
7873
7876
7877 if ((buf_state & BM_VALID) == 0)
7878 {
7879 UnlockBufHdr(desc);
7880 return false;
7881 }
7882
7883 /* Check that it's not pinned already. */
7885 {
7886 UnlockBufHdr(desc);
7887 return false;
7888 }
7889
7890 /* Pin the buffer and then release the buffer spinlock */
7891 PinBuffer_Locked(desc);
7892
7893 /* If it was not already dirty, mark it as dirty. */
7894 if (!(buf_state & BM_DIRTY))
7895 {
7898 result = true;
7899 BufferLockUnlock(buf, desc);
7900 }
7901 else
7902 *buffer_already_dirty = true;
7903
7904 UnpinBuffer(desc);
7905
7906 return result;
7907}
void MarkBufferDirty(Buffer buffer)
Definition bufmgr.c:3063

References Assert, BM_DIRTY, BM_LOCKED, BM_VALID, buf, BUF_STATE_GET_REFCOUNT, BUFFER_LOCK_EXCLUSIVE, BufferLockAcquire(), BufferLockUnlock(), fb(), MarkBufferDirty(), pg_atomic_read_u64(), PinBuffer_Locked(), BufferDesc::state, UnlockBufHdr(), and UnpinBuffer().

Referenced by MarkDirtyAllUnpinnedBuffers(), MarkDirtyRelUnpinnedBuffers(), and MarkDirtyUnpinnedBuffer().

◆ MarkSharedBufferDirtyHint()

static void MarkSharedBufferDirtyHint ( Buffer  buffer,
BufferDesc bufHdr,
uint64  lockstate,
bool  buffer_std 
)
inlinestatic

Definition at line 5563 of file bufmgr.c.

5565{
5566 Page page = BufferGetPage(buffer);
5567
5568 Assert(GetPrivateRefCount(buffer) > 0);
5569
5570 /* here, either share-exclusive or exclusive lock is OK */
5573
5574 /*
5575 * This routine might get called many times on the same page, if we are
5576 * making the first scan after commit of an xact that added/deleted many
5577 * tuples. So, be as quick as we can if the buffer is already dirty.
5578 *
5579 * As we are holding (at least) a share-exclusive lock, nobody could have
5580 * cleaned or dirtied the page concurrently, so we can just rely on the
5581 * previously fetched value here without any danger of races.
5582 */
5583 if (unlikely(!(lockstate & BM_DIRTY)))
5584 {
5586 bool wal_log = false;
5588
5589 /*
5590 * If we need to protect hint bit updates from torn writes, WAL-log a
5591 * full page image of the page. This full page image is only necessary
5592 * if the hint bit update is the first change to the page since the
5593 * last checkpoint.
5594 *
5595 * We don't check full_page_writes here because that logic is included
5596 * when we call XLogInsert() since the value changes dynamically.
5597 */
5599 {
5600 /*
5601 * If we must not write WAL, due to a relfilelocator-specific
5602 * condition or being in recovery, don't dirty the page. We can
5603 * set the hint, just not dirty the page as a result so the hint
5604 * is lost when we evict the page or shutdown.
5605 *
5606 * See src/backend/storage/page/README for longer discussion.
5607 */
5608 if (RecoveryInProgress() ||
5610 return;
5611
5612 wal_log = true;
5613 }
5614
5615 /*
5616 * We must mark the page dirty before we emit the WAL record, as per
5617 * the usual rules, to ensure that BufferSync()/SyncOneBuffer() try to
5618 * flush the buffer, even if we haven't inserted the WAL record yet.
5619 * As we hold at least a share-exclusive lock, checkpoints will wait
5620 * for this backend to be done with the buffer before continuing. If
5621 * we did it the other way round, a checkpoint could start between
5622 * writing the WAL record and marking the buffer dirty.
5623 */
5625
5626 /*
5627 * It should not be possible for the buffer to already be dirty, see
5628 * comment above.
5629 */
5633 BM_DIRTY,
5634 0, 0);
5635
5636 /*
5637 * If the block is already dirty because we either made a change or
5638 * set a hint already, then we don't need to write a full page image.
5639 * Note that aggressive cleaning of blocks dirtied by hint bit setting
5640 * would increase the call rate. Bulk setting of hint bits would
5641 * reduce the call rate...
5642 */
5643 if (wal_log)
5644 lsn = XLogSaveBufferForHint(buffer, buffer_std);
5645
5646 if (XLogRecPtrIsValid(lsn))
5647 {
5648 /*
5649 * Set the page LSN if we wrote a backup block. To allow backends
5650 * that only hold a share lock on the buffer to read the LSN in a
5651 * tear-free manner, we set the page LSN while holding the buffer
5652 * header lock. This allows any reader of an LSN who holds only a
5653 * share lock to also obtain a buffer header lock before using
5654 * PageGetLSN() to read the LSN in a tear free way. This is done
5655 * in BufferGetLSNAtomic().
5656 *
5657 * If checksums are enabled, you might think we should reset the
5658 * checksum here. That will happen when the page is written
5659 * sometime later in this checkpoint cycle.
5660 */
5662 PageSetLSN(page, lsn);
5664 }
5665
5667 if (VacuumCostActive)
5669 }
5670}
static void PageSetLSN(Page page, XLogRecPtr lsn)
Definition bufpage.h:417
bool RelFileLocatorSkippingWAL(RelFileLocator rlocator)
Definition storage.c:573
bool RecoveryInProgress(void)
Definition xlog.c:6444
#define XLogRecPtrIsValid(r)
Definition xlogdefs.h:29
#define InvalidXLogRecPtr
Definition xlogdefs.h:28
XLogRecPtr XLogSaveBufferForHint(Buffer buffer, bool buffer_std)

References Assert, BM_DIRTY, BM_PERMANENT, BUF_STATE_GET_REFCOUNT, PrivateRefCountEntry::buffer, BUFFER_LOCK_EXCLUSIVE, BUFFER_LOCK_SHARE_EXCLUSIVE, BufferGetPage(), BufferLockHeldByMeInMode(), BufTagGetRelFileLocator(), fb(), GetPrivateRefCount(), InvalidXLogRecPtr, LockBufHdr(), PageSetLSN(), pgBufferUsage, RecoveryInProgress(), RelFileLocatorSkippingWAL(), BufferUsage::shared_blks_dirtied, unlikely, UnlockBufHdr(), UnlockBufHdrExt(), VacuumCostActive, VacuumCostBalance, VacuumCostPageDirty, XLogHintBitIsNeeded, XLogRecPtrIsValid, and XLogSaveBufferForHint().

Referenced by BufferSetHintBits16(), and MarkBufferDirtyHint().

◆ NewPrivateRefCountEntry()

static PrivateRefCountEntry * NewPrivateRefCountEntry ( Buffer  buffer)
static

Definition at line 388 of file bufmgr.c.

389{
391
392 /* only allowed to be called when a reservation has been made */
394
395 /* use up the reserved entry */
397
398 /* and fill it */
400 res->buffer = buffer;
401 res->data.refcount = 0;
403
404 /* update cache for the next lookup */
406
408
409 return res;
410}

References Assert, PrivateRefCountEntry::buffer, BUFFER_LOCK_UNLOCK, PrivateRefCountEntry::data, PrivateRefCountData::lockmode, PrivateRefCountArray, PrivateRefCountArrayKeys, PrivateRefCountEntryLast, PrivateRefCountData::refcount, and ReservedRefCountSlot.

Referenced by TrackNewBufferPin().

◆ PinBuffer()

static bool PinBuffer ( BufferDesc buf,
BufferAccessStrategy  strategy,
bool  skip_if_not_valid 
)
static

Definition at line 3188 of file bufmgr.c.

3190{
3192 bool result;
3194
3197
3198 ref = GetPrivateRefCountEntry(b, true);
3199
3200 if (ref == NULL)
3201 {
3204
3206 for (;;)
3207 {
3209 return false;
3210
3211 /*
3212 * We're not allowed to increase the refcount while the buffer
3213 * header spinlock is held. Wait for the lock to be released.
3214 */
3217
3219
3220 /* increase refcount */
3222
3223 if (strategy == NULL)
3224 {
3225 /* Default case: increase usagecount unless already max. */
3228 }
3229 else
3230 {
3231 /*
3232 * Ring buffers shouldn't evict others from pool. Thus we
3233 * don't make usagecount more than 1.
3234 */
3237 }
3238
3240 buf_state))
3241 {
3242 result = (buf_state & BM_VALID) != 0;
3243
3245 break;
3246 }
3247 }
3248 }
3249 else
3250 {
3251 /*
3252 * If we previously pinned the buffer, it is likely to be valid, but
3253 * it may not be if StartReadBuffers() was called and
3254 * WaitReadBuffers() hasn't been called yet. We'll check by loading
3255 * the flags without locking. This is racy, but it's OK to return
3256 * false spuriously: when WaitReadBuffers() calls StartBufferIO(),
3257 * it'll see that it's now valid.
3258 *
3259 * Note: We deliberately avoid a Valgrind client request here.
3260 * Individual access methods can optionally superimpose buffer page
3261 * client requests on top of our client requests to enforce that
3262 * buffers are only accessed while locked (and pinned). It's possible
3263 * that the buffer page is legitimately non-accessible here. We
3264 * cannot meddle with that.
3265 */
3266 result = (pg_atomic_read_u64(&buf->state) & BM_VALID) != 0;
3267
3268 Assert(ref->data.refcount > 0);
3269 ref->data.refcount++;
3271 }
3272
3273 return result;
3274}
#define BM_MAX_USAGE_COUNT
#define BUF_STATE_GET_USAGECOUNT(state)
void TrackNewBufferPin(Buffer buf)
Definition bufmgr.c:3423

References Assert, b, BM_LOCKED, BM_MAX_USAGE_COUNT, BM_VALID, buf, BUF_REFCOUNT_ONE, BUF_STATE_GET_USAGECOUNT, BUF_USAGECOUNT_ONE, BufferDescriptorGetBuffer(), BufferIsLocal, CurrentResourceOwner, fb(), GetPrivateRefCountEntry(), pg_atomic_compare_exchange_u64(), pg_atomic_read_u64(), ReservedRefCountSlot, ResourceOwnerRememberBuffer(), TrackNewBufferPin(), unlikely, and WaitBufHdrUnlocked().

Referenced by BufferAlloc(), ExtendBufferedRelShared(), and ReadRecentBuffer().

◆ PinBuffer_Locked()

static void PinBuffer_Locked ( BufferDesc buf)
static

Definition at line 3299 of file bufmgr.c.

3300{
3302
3303 /*
 3304 * As explained, we don't expect any preexisting pins. That allows us to
 3305 * manipulate the PrivateRefCount after releasing the spinlock.
3306 */
3308
3309 /*
3310 * Since we hold the buffer spinlock, we can update the buffer state and
3311 * release the lock in one operation.
3312 */
3314
3316 0, 0, 1);
3317
3319}

References Assert, buf, BufferDescriptorGetBuffer(), fb(), GetPrivateRefCountEntry(), pg_atomic_read_u64(), TrackNewBufferPin(), and UnlockBufHdrExt().

Referenced by EvictUnpinnedBufferInternal(), FlushDatabaseBuffers(), FlushRelationBuffers(), FlushRelationsAllBuffers(), MarkDirtyUnpinnedBufferInternal(), and SyncOneBuffer().

◆ PinBufferForBlock()

static pg_attribute_always_inline Buffer PinBufferForBlock ( Relation  rel,
SMgrRelation  smgr,
char  persistence,
ForkNumber  forkNum,
BlockNumber  blockNum,
BufferAccessStrategy  strategy,
bool foundPtr 
)
static

Definition at line 1220 of file bufmgr.c.

1227{
1231
1232 Assert(blockNum != P_NEW);
1233
1234 /* Persistence should be set before */
1235 Assert((persistence == RELPERSISTENCE_TEMP ||
1236 persistence == RELPERSISTENCE_PERMANENT ||
1237 persistence == RELPERSISTENCE_UNLOGGED));
1238
1239 if (persistence == RELPERSISTENCE_TEMP)
1240 {
1243 }
1244 else
1245 {
1246 io_context = IOContextForStrategy(strategy);
1248 }
1249
1250 TRACE_POSTGRESQL_BUFFER_READ_START(forkNum, blockNum,
1254 smgr->smgr_rlocator.backend);
1255
1256 if (persistence == RELPERSISTENCE_TEMP)
1257 {
1258 bufHdr = LocalBufferAlloc(smgr, forkNum, blockNum, foundPtr);
1259 if (*foundPtr)
1261 }
1262 else
1263 {
1264 bufHdr = BufferAlloc(smgr, persistence, forkNum, blockNum,
1265 strategy, foundPtr, io_context);
1266 if (*foundPtr)
1268 }
1269 if (rel)
1270 {
1271 /*
1272 * While pgBufferUsage's "read" counter isn't bumped unless we reach
1273 * WaitReadBuffers() (so, not for hits, and not for buffers that are
1274 * zeroed instead), the per-relation stats always count them.
1275 */
1277 if (*foundPtr)
1279 }
1280 if (*foundPtr)
1281 {
1283 if (VacuumCostActive)
1285
1286 TRACE_POSTGRESQL_BUFFER_READ_DONE(forkNum, blockNum,
1290 smgr->smgr_rlocator.backend,
1291 true);
1292 }
1293
1295}
static BufferDesc * BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, BlockNumber blockNum, BufferAccessStrategy strategy, bool *foundPtr, IOContext io_context)
Definition bufmgr.c:2110
#define P_NEW
Definition bufmgr.h:198
BufferDesc * LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum, bool *foundPtr)
Definition localbuf.c:119
#define pgstat_count_buffer_read(rel)
Definition pgstat.h:746

References Assert, RelFileLocatorBackend::backend, BufferAlloc(), BufferDescriptorGetBuffer(), RelFileLocator::dbOid, fb(), IOCONTEXT_NORMAL, IOContextForStrategy(), IOOBJECT_RELATION, IOOBJECT_TEMP_RELATION, IOOP_HIT, BufferUsage::local_blks_hit, LocalBufferAlloc(), RelFileLocatorBackend::locator, P_NEW, pgBufferUsage, pgstat_count_buffer_hit, pgstat_count_buffer_read, pgstat_count_io_op(), RelFileLocator::relNumber, BufferUsage::shared_blks_hit, SMgrRelationData::smgr_rlocator, RelFileLocator::spcOid, VacuumCostActive, VacuumCostBalance, and VacuumCostPageHit.

Referenced by ReadBuffer_common(), and StartReadBuffersImpl().

◆ PrefetchBuffer()

PrefetchBufferResult PrefetchBuffer ( Relation  reln,
ForkNumber  forkNum,
BlockNumber  blockNum 
)

Definition at line 782 of file bufmgr.c.

783{
785 Assert(BlockNumberIsValid(blockNum));
786
788 {
789 /* see comments in ReadBufferExtended */
793 errmsg("cannot access temporary tables of other sessions")));
794
795 /* pass it off to localbuf.c */
796 return PrefetchLocalBuffer(RelationGetSmgr(reln), forkNum, blockNum);
797 }
798 else
799 {
800 /* pass it to the shared buffer version */
801 return PrefetchSharedBuffer(RelationGetSmgr(reln), forkNum, blockNum);
802 }
803}
PrefetchBufferResult PrefetchSharedBuffer(SMgrRelation smgr_reln, ForkNumber forkNum, BlockNumber blockNum)
Definition bufmgr.c:692
PrefetchBufferResult PrefetchLocalBuffer(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum)
Definition localbuf.c:72
#define RELATION_IS_OTHER_TEMP(relation)
Definition rel.h:667
#define RelationIsValid(relation)
Definition rel.h:489

References Assert, BlockNumberIsValid(), ereport, errcode(), errmsg, ERROR, fb(), PrefetchLocalBuffer(), PrefetchSharedBuffer(), RELATION_IS_OTHER_TEMP, RelationGetSmgr(), RelationIsValid, and RelationUsesLocalBuffers.

Referenced by count_nondeletable_pages(), invalidate_rel_block(), and pg_prewarm().

◆ PrefetchSharedBuffer()

PrefetchBufferResult PrefetchSharedBuffer ( SMgrRelation  smgr_reln,
ForkNumber  forkNum,
BlockNumber  blockNum 
)

Definition at line 692 of file bufmgr.c.

695{
696 PrefetchBufferResult result = {InvalidBuffer, false};
697 BufferTag newTag; /* identity of requested block */
698 uint32 newHash; /* hash value for newTag */
699 LWLock *newPartitionLock; /* buffer partition lock for it */
700 int buf_id;
701
702 Assert(BlockNumberIsValid(blockNum));
703
704 /* create a tag so we can lookup the buffer */
705 InitBufferTag(&newTag, &smgr_reln->smgr_rlocator.locator,
706 forkNum, blockNum);
707
708 /* determine its hash code and partition lock ID */
711
712 /* see if the block is in the buffer pool already */
714 buf_id = BufTableLookup(&newTag, newHash);
716
717 /* If not in buffers, initiate prefetch */
718 if (buf_id < 0)
719 {
720#ifdef USE_PREFETCH
721 /*
722 * Try to initiate an asynchronous read. This returns false in
723 * recovery if the relation file doesn't exist.
724 */
725 if ((io_direct_flags & IO_DIRECT_DATA) == 0 &&
726 smgrprefetch(smgr_reln, forkNum, blockNum, 1))
727 {
728 result.initiated_io = true;
729 }
730#endif /* USE_PREFETCH */
731 }
732 else
733 {
734 /*
735 * Report the buffer it was in at that time. The caller may be able
736 * to avoid a buffer table lookup, but it's not pinned and it must be
737 * rechecked!
738 */
739 result.recent_buffer = buf_id + 1;
740 }
741
742 /*
743 * If the block *is* in buffers, we do nothing. This is not really ideal:
744 * the block might be just about to be evicted, which would be stupid
745 * since we know we are going to need it soon. But the only easy answer
746 * is to bump the usage_count, which does not seem like a great solution:
747 * when the caller does ultimately touch the block, usage_count would get
748 * bumped again, resulting in too much favoritism for blocks that are
749 * involved in a prefetch sequence. A real fix would involve some
750 * additional per-buffer state, and it's not clear that there's enough of
751 * a problem to justify that.
752 */
753
754 return result;
755}
int io_direct_flags
Definition fd.c:172
#define IO_DIRECT_DATA
Definition fd.h:54
bool smgrprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, int nblocks)
Definition smgr.c:678
Buffer recent_buffer
Definition bufmgr.h:61

References Assert, BlockNumberIsValid(), BufMappingPartitionLock(), BufTableHashCode(), BufTableLookup(), fb(), InitBufferTag(), PrefetchBufferResult::initiated_io, InvalidBuffer, IO_DIRECT_DATA, io_direct_flags, LW_SHARED, LWLockAcquire(), LWLockRelease(), PrefetchBufferResult::recent_buffer, and smgrprefetch().

Referenced by PrefetchBuffer(), and XLogPrefetcherNextBlock().

◆ ProcessReadBuffersResult()

static void ProcessReadBuffersResult ( ReadBuffersOperation operation)
static

Definition at line 1703 of file bufmgr.c.

1704{
1705 PgAioReturn *aio_ret = &operation->io_return;
1707 int newly_read_blocks = 0;
1708
1709 Assert(pgaio_wref_valid(&operation->io_wref));
1710 Assert(aio_ret->result.status != PGAIO_RS_UNKNOWN);
1711
1712 /*
1713 * SMGR reports the number of blocks successfully read as the result of
1714 * the IO operation. Thus we can simply add that to ->nblocks_done.
1715 */
1716
1717 if (likely(rs != PGAIO_RS_ERROR))
1718 newly_read_blocks = aio_ret->result.result;
1719
1720 if (rs == PGAIO_RS_ERROR || rs == PGAIO_RS_WARNING)
1721 pgaio_result_report(aio_ret->result, &aio_ret->target_data,
1722 rs == PGAIO_RS_ERROR ? ERROR : WARNING);
1723 else if (aio_ret->result.status == PGAIO_RS_PARTIAL)
1724 {
1725 /*
1726 * We'll retry, so we just emit a debug message to the server log (or
1727 * not even that in prod scenarios).
1728 */
1729 pgaio_result_report(aio_ret->result, &aio_ret->target_data, DEBUG1);
1730 elog(DEBUG3, "partial read, will retry");
1731 }
1732
1735
1736 operation->nblocks_done += newly_read_blocks;
1737
1738 Assert(operation->nblocks_done <= operation->nblocks);
1739}
bool pgaio_wref_valid(PgAioWaitRef *iow)
Definition aio.c:971
PgAioResultStatus
Definition aio_types.h:79
@ PGAIO_RS_UNKNOWN
Definition aio_types.h:80
@ PGAIO_RS_PARTIAL
Definition aio_types.h:82
#define DEBUG3
Definition elog.h:28
PgAioResult result
Definition aio_types.h:132

References Assert, DEBUG1, DEBUG3, elog, ERROR, fb(), ReadBuffersOperation::io_return, ReadBuffersOperation::io_wref, likely, MAX_IO_COMBINE_LIMIT, ReadBuffersOperation::nblocks, ReadBuffersOperation::nblocks_done, pgaio_result_report(), PGAIO_RS_ERROR, PGAIO_RS_PARTIAL, PGAIO_RS_UNKNOWN, PGAIO_RS_WARNING, pgaio_wref_valid(), PgAioReturn::result, PgAioResult::status, and WARNING.

Referenced by WaitReadBuffers().

◆ ReadBuffer()

Buffer ReadBuffer ( Relation  reln,
BlockNumber  blockNum 
)

Definition at line 874 of file bufmgr.c.

875{
877}
Buffer ReadBufferExtended(Relation reln, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy)
Definition bufmgr.c:921
@ RBM_NORMAL
Definition bufmgr.h:46

References fb(), MAIN_FORKNUM, RBM_NORMAL, and ReadBufferExtended().

Referenced by _bt_allocbuf(), _bt_getbuf(), _bt_search_insert(), _hash_getbuf(), _hash_getbuf_with_condlock_cleanup(), blbulkdelete(), blinsert(), BloomNewBuffer(), brin_getinsertbuffer(), brinGetStats(), brinGetTupleForHeapBlock(), brinRevmapDesummarizeRange(), brinRevmapInitialize(), bt_metap(), bt_multi_page_stats(), bt_page_items_internal(), bt_page_stats_internal(), ginFindLeafPage(), ginFindParents(), ginGetStats(), ginHeapTupleFastInsert(), ginInsertCleanup(), GinNewBuffer(), ginStepRight(), ginUpdateStats(), gistBufferingFindCorrectParent(), gistbufferinginserttuples(), gistdoinsert(), gistFindCorrectParent(), gistFindPath(), gistfixsplit(), gistGetMaxLevel(), gistkillitems(), gistNewBuffer(), gistProcessItup(), gistScanPage(), heap_abort_speculative(), heap_delete(), heap_fetch(), heap_finish_speculative(), heap_force_common(), heap_get_latest_tid(), heap_index_delete_tuples(), heap_lock_tuple(), heap_update(), initBloomState(), pg_visibility(), pgstatginindex_internal(), read_seq_tuple(), RelationGetBufferForTuple(), ReleaseAndReadBuffer(), revmap_get_buffer(), revmap_physical_extend(), scanGetCandidate(), scanPendingInsert(), shiftList(), spgdoinsert(), spgGetCache(), SpGistGetBuffer(), SpGistNewBuffer(), SpGistUpdateMetaPage(), and spgWalk().

◆ ReadBuffer_common()

static pg_attribute_always_inline Buffer ReadBuffer_common ( Relation  rel,
SMgrRelation  smgr,
char  smgr_persistence,
ForkNumber  forkNum,
BlockNumber  blockNum,
ReadBufferMode  mode,
BufferAccessStrategy  strategy 
)
static

Definition at line 1303 of file bufmgr.c.

1307{
1308 ReadBuffersOperation operation;
1309 Buffer buffer;
1310 int flags;
1311 char persistence;
1312
1313 /*
1314 * Backward compatibility path, most code should use ExtendBufferedRel()
1315 * instead, as acquiring the extension lock inside ExtendBufferedRel()
1316 * scales a lot better.
1317 */
1318 if (unlikely(blockNum == P_NEW))
1319 {
1321
1322 /*
1323 * Since no-one else can be looking at the page contents yet, there is
1324 * no difference between an exclusive lock and a cleanup-strength
1325 * lock.
1326 */
1328 flags |= EB_LOCK_FIRST;
1329
1330 return ExtendBufferedRel(BMR_REL(rel), forkNum, strategy, flags);
1331 }
1332
1333 if (rel)
1334 persistence = rel->rd_rel->relpersistence;
1335 else
1336 persistence = smgr_persistence;
1337
1340 {
1341 bool found;
1342
1343 buffer = PinBufferForBlock(rel, smgr, persistence,
1344 forkNum, blockNum, strategy, &found);
1345 ZeroAndLockBuffer(buffer, mode, found);
1346 return buffer;
1347 }
1348
1349 /*
1350 * Signal that we are going to immediately wait. If we're immediately
1351 * waiting, there is no benefit in actually executing the IO
1352 * asynchronously, it would just add dispatch overhead.
1353 */
1355 if (mode == RBM_ZERO_ON_ERROR)
1357 operation.smgr = smgr;
1358 operation.rel = rel;
1359 operation.persistence = persistence;
1360 operation.forknum = forkNum;
1361 operation.strategy = strategy;
1362 if (StartReadBuffer(&operation,
1363 &buffer,
1364 blockNum,
1365 flags))
1366 WaitReadBuffers(&operation);
1367
1368 return buffer;
1369}
Buffer ExtendBufferedRel(BufferManagerRelation bmr, ForkNumber forkNum, BufferAccessStrategy strategy, uint32 flags)
Definition bufmgr.c:974
static void ZeroAndLockBuffer(Buffer buffer, ReadBufferMode mode, bool already_valid)
Definition bufmgr.c:1141
static pg_attribute_always_inline Buffer PinBufferForBlock(Relation rel, SMgrRelation smgr, char persistence, ForkNumber forkNum, BlockNumber blockNum, BufferAccessStrategy strategy, bool *foundPtr)
Definition bufmgr.c:1220
void WaitReadBuffers(ReadBuffersOperation *operation)
Definition bufmgr.c:1742
bool StartReadBuffer(ReadBuffersOperation *operation, Buffer *buffer, BlockNumber blocknum, int flags)
Definition bufmgr.c:1618
@ RBM_ZERO_ON_ERROR
Definition bufmgr.h:51
#define BMR_REL(p_rel)
Definition bufmgr.h:114
Form_pg_class rd_rel
Definition rel.h:111

References BMR_REL, PrivateRefCountEntry::buffer, EB_LOCK_FIRST, EB_SKIP_EXTENSION_LOCK, ExtendBufferedRel(), fb(), ReadBuffersOperation::forknum, mode, P_NEW, ReadBuffersOperation::persistence, PinBufferForBlock(), RBM_ZERO_AND_CLEANUP_LOCK, RBM_ZERO_AND_LOCK, RBM_ZERO_ON_ERROR, RelationData::rd_rel, READ_BUFFERS_SYNCHRONOUSLY, READ_BUFFERS_ZERO_ON_ERROR, ReadBuffersOperation::rel, ReadBuffersOperation::smgr, StartReadBuffer(), ReadBuffersOperation::strategy, unlikely, WaitReadBuffers(), and ZeroAndLockBuffer().

Referenced by ExtendBufferedRelTo(), ReadBufferExtended(), and ReadBufferWithoutRelcache().

◆ ReadBufferExtended()

Buffer ReadBufferExtended ( Relation  reln,
ForkNumber  forkNum,
BlockNumber  blockNum,
ReadBufferMode  mode,
BufferAccessStrategy  strategy 
)
inline

Definition at line 921 of file bufmgr.c.

923{
924 Buffer buf;
925
926 /*
927 * Reject attempts to read non-local temporary relations; we would be
928 * likely to get wrong data since we have no visibility into the owning
929 * session's local buffers.
930 */
934 errmsg("cannot access temporary tables of other sessions")));
935
936 /*
937 * Read the buffer, and update pgstat counters to reflect a cache hit or
938 * miss.
939 */
941 forkNum, blockNum, mode, strategy);
942
943 return buf;
944}

References buf, ereport, errcode(), errmsg, ERROR, fb(), mode, ReadBuffer_common(), RELATION_IS_OTHER_TEMP, and RelationGetSmgr().

Referenced by _hash_getbuf_with_strategy(), _hash_getinitbuf(), _hash_getnewbuf(), BloomInitMetapage(), bt_recheck_sibling_links(), btvacuumpage(), count_nondeletable_pages(), create_toy_buffer(), fsm_readbuf(), get_raw_page_internal(), gin_check_parent_keys_consistency(), gin_check_posting_tree_parent_keys_consistency(), gin_refind_parent(), ginbulkdelete(), ginScanPostingTreeToDelete(), ginVacuumPostingTree(), ginVacuumPostingTreeLeaves(), gistvacuum_delete_empty_pages(), gistvacuumpage(), heapam_scan_sample_next_block(), log_newpage_range(), modify_rel_block(), palloc_btree_page(), pgstat_btree_page(), pgstat_gist_page(), pgstat_hash_page(), pgstat_heap(), pgstatindex_impl(), ReadBuffer(), ReadBufferBI(), spgprocesspending(), and vm_readbuf().

◆ ReadBuffersCanStartIO()

static bool ReadBuffersCanStartIO ( Buffer  buffer,
bool  nowait 
)
inlinestatic

Definition at line 1674 of file bufmgr.c.

1675{
1676 /*
1677 * If this backend currently has staged IO, we need to submit the pending
1678 * IO before waiting for the right to issue IO, to avoid the potential for
1679 * deadlocks (and, more commonly, unnecessary delays for other backends).
1680 */
1681 if (!nowait && pgaio_have_staged())
1682 {
1683 if (ReadBuffersCanStartIOOnce(buffer, true))
1684 return true;
1685
1686 /*
1687 * Unfortunately StartBufferIO() returning false doesn't allow to
1688 * distinguish between the buffer already being valid and IO already
1689 * being in progress. Since IO already being in progress is quite
1690 * rare, this approach seems fine.
1691 */
1693 }
1694
1695 return ReadBuffersCanStartIOOnce(buffer, nowait);
1696}
bool pgaio_have_staged(void)
Definition aio.c:1107
static bool ReadBuffersCanStartIOOnce(Buffer buffer, bool nowait)
Definition bufmgr.c:1661

References PrivateRefCountEntry::buffer, pgaio_have_staged(), pgaio_submit_staged(), and ReadBuffersCanStartIOOnce().

Referenced by AsyncReadBuffers().

◆ ReadBuffersCanStartIOOnce()

static bool ReadBuffersCanStartIOOnce ( Buffer  buffer,
bool  nowait 
)
inlinestatic

Definition at line 1661 of file bufmgr.c.

/*
 * Make exactly one attempt to mark the buffer as IO-in-progress for a read.
 * Local buffers have negative Buffer ids (-buffer - 1 indexes the local
 * descriptor array); shared buffers index the shared array with buffer - 1.
 * The second argument (forInput) is true in both calls since this is a read.
 * NOTE(review): per the caller's comment, a false return does not distinguish
 * "already valid" from "IO already in progress".
 */
1662{
1663 if (BufferIsLocal(buffer))
1664 return StartLocalBufferIO(GetLocalBufferDescriptor(-buffer - 1),
1665 true, nowait);
1666 else
1667 return StartBufferIO(GetBufferDescriptor(buffer - 1), true, nowait);
1668}
bool StartLocalBufferIO(BufferDesc *bufHdr, bool forInput, bool nowait)
Definition localbuf.c:523

References PrivateRefCountEntry::buffer, BufferIsLocal, GetBufferDescriptor(), GetLocalBufferDescriptor(), StartBufferIO(), and StartLocalBufferIO().

Referenced by ReadBuffersCanStartIO().

◆ ReadBufferWithoutRelcache()

Buffer ReadBufferWithoutRelcache ( RelFileLocator  rlocator,
ForkNumber  forkNum,
BlockNumber  blockNum,
ReadBufferMode  mode,
BufferAccessStrategy  strategy,
bool  permanent 
)

Definition at line 958 of file bufmgr.c.

961{
962 SMgrRelation smgr = smgropen(rlocator, INVALID_PROC_NUMBER);
963
964 return ReadBuffer_common(NULL, smgr,
966 forkNum, blockNum,
967 mode, strategy);
968}

References fb(), INVALID_PROC_NUMBER, mode, ReadBuffer_common(), and smgropen().

Referenced by RelationCopyStorageUsingBuffer(), ScanSourceDatabasePgClass(), and XLogReadBufferExtended().

◆ ReadRecentBuffer()

bool ReadRecentBuffer ( RelFileLocator  rlocator,
ForkNumber  forkNum,
BlockNumber  blockNum,
Buffer  recent_buffer 
)

Definition at line 813 of file bufmgr.c.

815{
817 BufferTag tag;
819
820 Assert(BufferIsValid(recent_buffer));
821
824 InitBufferTag(&tag, &rlocator, forkNum, blockNum);
825
826 if (BufferIsLocal(recent_buffer))
827 {
828 int b = -recent_buffer - 1;
829
832
833 /* Is it still valid and holding the right tag? */
834 if ((buf_state & BM_VALID) && BufferTagsEqual(&tag, &bufHdr->tag))
835 {
836 PinLocalBuffer(bufHdr, true);
837
839
840 return true;
841 }
842 }
843 else
844 {
845 bufHdr = GetBufferDescriptor(recent_buffer - 1);
846
847 /*
848 * Is it still valid and holding the right tag? We do an unlocked tag
849 * comparison first, to make it unlikely that we'll increment the
850 * usage counter of the wrong buffer, if someone calls us with a very
851 * out of date recent_buffer. Then we'll check it again if we get the
852 * pin.
853 */
854 if (BufferTagsEqual(&tag, &bufHdr->tag) &&
855 PinBuffer(bufHdr, NULL, true))
856 {
857 if (BufferTagsEqual(&tag, &bufHdr->tag))
858 {
860 return true;
861 }
863 }
864 }
865
866 return false;
867}

References Assert, b, BM_VALID, BufferIsLocal, BufferIsValid(), BufferTagsEqual(), CurrentResourceOwner, fb(), GetBufferDescriptor(), GetLocalBufferDescriptor(), InitBufferTag(), BufferUsage::local_blks_hit, pg_atomic_read_u64(), pgBufferUsage, PinBuffer(), PinLocalBuffer(), ReservePrivateRefCountEntry(), ResourceOwnerEnlarge(), BufferUsage::shared_blks_hit, and UnpinBuffer().

Referenced by invalidate_rel_block(), and XLogReadBufferExtended().

◆ RelationCopyStorageUsingBuffer()

static void RelationCopyStorageUsingBuffer ( RelFileLocator  srclocator,
RelFileLocator  dstlocator,
ForkNumber  forkNum,
bool  permanent 
)
static

Definition at line 5267 of file bufmgr.c.

5270{
5271 Buffer srcBuf;
5272 Buffer dstBuf;
5273 Page srcPage;
5274 Page dstPage;
5275 bool use_wal;
5276 BlockNumber nblocks;
5277 BlockNumber blkno;
5284
5285 /*
5286 * In general, we want to write WAL whenever wal_level > 'minimal', but we
5287 * can skip it when copying any fork of an unlogged relation other than
5288 * the init fork.
5289 */
5290 use_wal = XLogIsNeeded() && (permanent || forkNum == INIT_FORKNUM);
5291
5292 /* Get number of blocks in the source relation. */
5294 forkNum);
5295
5296 /* Nothing to copy; just return. */
5297 if (nblocks == 0)
5298 return;
5299
5300 /*
5301 * Bulk extend the destination relation of the same size as the source
5302 * relation before starting to copy block by block.
5303 */
5304 memset(buf.data, 0, BLCKSZ);
5305 smgrextend(smgropen(dstlocator, INVALID_PROC_NUMBER), forkNum, nblocks - 1,
5306 buf.data, true);
5307
5308 /* This is a bulk operation, so use buffer access strategies. */
5311
5312 /* Initialize streaming read */
5313 p.current_blocknum = 0;
5314 p.last_exclusive = nblocks;
5316
5317 /*
5318 * It is safe to use batchmode as block_range_read_stream_cb takes no
5319 * locks.
5320 */
5324 src_smgr,
5326 forkNum,
5328 &p,
5329 0);
5330
5331 /* Iterate over each block of the source relation file. */
5332 for (blkno = 0; blkno < nblocks; blkno++)
5333 {
5335
5336 /* Read block from source relation. */
5340
5344 permanent);
5346
5348
5349 /* Copy page data from the source to the destination. */
5352
5353 /* WAL-log the copied page. */
5354 if (use_wal)
5356
5358
5361 }
5364
5367}
void UnlockReleaseBuffer(Buffer buffer)
Definition bufmgr.c:5522
Buffer ReadBufferWithoutRelcache(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy, bool permanent)
Definition bufmgr.c:958
@ BAS_BULKREAD
Definition bufmgr.h:37
@ BAS_BULKWRITE
Definition bufmgr.h:39
BufferAccessStrategy GetAccessStrategy(BufferAccessStrategyType btype)
Definition freelist.c:461
void FreeAccessStrategy(BufferAccessStrategy strategy)
Definition freelist.c:643
#define START_CRIT_SECTION()
Definition miscadmin.h:150
#define END_CRIT_SECTION()
Definition miscadmin.h:152
ReadStream * read_stream_begin_smgr_relation(int flags, BufferAccessStrategy strategy, SMgrRelation smgr, char smgr_persistence, ForkNumber forknum, ReadStreamBlockNumberCB callback, void *callback_private_data, size_t per_buffer_data_size)
Buffer read_stream_next_buffer(ReadStream *stream, void **per_buffer_data)
void read_stream_end(ReadStream *stream)
BlockNumber block_range_read_stream_cb(ReadStream *stream, void *callback_private_data, void *per_buffer_data)
#define READ_STREAM_USE_BATCHING
Definition read_stream.h:64
#define READ_STREAM_FULL
Definition read_stream.h:43
void smgrextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const void *buffer, bool skipFsync)
Definition smgr.c:620
#define XLogIsNeeded()
Definition xlog.h:111
XLogRecPtr log_newpage_buffer(Buffer buffer, bool page_std)

References Assert, BAS_BULKREAD, BAS_BULKWRITE, block_range_read_stream_cb(), buf, BUFFER_LOCK_SHARE, BufferGetBlockNumber(), BufferGetPage(), CHECK_FOR_INTERRUPTS, BlockRangeReadStreamPrivate::current_blocknum, END_CRIT_SECTION, fb(), FreeAccessStrategy(), GetAccessStrategy(), INIT_FORKNUM, INVALID_PROC_NUMBER, InvalidBuffer, BlockRangeReadStreamPrivate::last_exclusive, LockBuffer(), log_newpage_buffer(), MarkBufferDirty(), RBM_ZERO_AND_LOCK, read_stream_begin_smgr_relation(), read_stream_end(), READ_STREAM_FULL, read_stream_next_buffer(), READ_STREAM_USE_BATCHING, ReadBufferWithoutRelcache(), smgrextend(), smgrnblocks(), smgropen(), START_CRIT_SECTION, UnlockReleaseBuffer(), and XLogIsNeeded.

Referenced by CreateAndCopyRelationData().

◆ RelationGetNumberOfBlocksInFork()

BlockNumber RelationGetNumberOfBlocksInFork ( Relation  relation,
ForkNumber  forkNum 
)

Definition at line 4564 of file bufmgr.c.

4565{
4566 if (RELKIND_HAS_TABLE_AM(relation->rd_rel->relkind))
4567 {
4568 /*
4569 * Not every table AM uses BLCKSZ wide fixed size blocks. Therefore
4570 * tableam returns the size in bytes - but for the purpose of this
4571 * routine, we want the number of blocks. Therefore divide, rounding
4572 * up.
4573 */
4575
4576 szbytes = table_relation_size(relation, forkNum);
4577
4578 return (szbytes + (BLCKSZ - 1)) / BLCKSZ;
4579 }
4580 else if (RELKIND_HAS_STORAGE(relation->rd_rel->relkind))
4581 {
4582 return smgrnblocks(RelationGetSmgr(relation), forkNum);
4583 }
4584 else
4585 Assert(false);
4586
4587 return 0; /* keep compiler quiet */
4588}
static uint64 table_relation_size(Relation rel, ForkNumber forkNumber)
Definition tableam.h:1858

References Assert, fb(), RelationData::rd_rel, RelationGetSmgr(), smgrnblocks(), and table_relation_size().

Referenced by _hash_getnewbuf(), _hash_init(), autoprewarm_database_main(), get_raw_page_internal(), and pg_prewarm().

◆ ReleaseAndReadBuffer()

Buffer ReleaseAndReadBuffer ( Buffer  buffer,
Relation  relation,
BlockNumber  blockNum 
)

Definition at line 3128 of file bufmgr.c.

3131{
3132 ForkNumber forkNum = MAIN_FORKNUM;
3134
3135 if (BufferIsValid(buffer))
3136 {
3137 Assert(BufferIsPinned(buffer));
3138 if (BufferIsLocal(buffer))
3139 {
3140 bufHdr = GetLocalBufferDescriptor(-buffer - 1);
3141 if (bufHdr->tag.blockNum == blockNum &&
3142 BufTagMatchesRelFileLocator(&bufHdr->tag, &relation->rd_locator) &&
3143 BufTagGetForkNum(&bufHdr->tag) == forkNum)
3144 return buffer;
3145 UnpinLocalBuffer(buffer);
3146 }
3147 else
3148 {
3149 bufHdr = GetBufferDescriptor(buffer - 1);
3150 /* we have pin, so it's ok to examine tag without spinlock */
3151 if (bufHdr->tag.blockNum == blockNum &&
3152 BufTagMatchesRelFileLocator(&bufHdr->tag, &relation->rd_locator) &&
3153 BufTagGetForkNum(&bufHdr->tag) == forkNum)
3154 return buffer;
3156 }
3157 }
3158
3159 return ReadBuffer(relation, blockNum);
3160}
Buffer ReadBuffer(Relation reln, BlockNumber blockNum)
Definition bufmgr.c:874

References Assert, PrivateRefCountEntry::buffer, BufferIsLocal, BufferIsPinned, BufferIsValid(), BufTagGetForkNum(), BufTagMatchesRelFileLocator(), fb(), GetBufferDescriptor(), GetLocalBufferDescriptor(), MAIN_FORKNUM, RelationData::rd_locator, ReadBuffer(), UnpinBuffer(), and UnpinLocalBuffer().

Referenced by _bt_relandgetbuf(), ginFindLeafPage(), and heapam_index_fetch_tuple().

◆ ReleaseBuffer()

void ReleaseBuffer ( Buffer  buffer)

Definition at line 5505 of file bufmgr.c.

/*
 * ReleaseBuffer -- release (unpin) a buffer previously pinned by this backend.
 *
 * An out-of-range/invalid buffer ID is a hard error.  Otherwise dispatch to
 * the local- or shared-buffer unpin routine based on the sign of the id.
 */
5506{
5507 if (!BufferIsValid(buffer))
5508 elog(ERROR, "bad buffer ID: %d", buffer);
5509
5510 if (BufferIsLocal(buffer))
5511 UnpinLocalBuffer(buffer);
5512 else
5513 UnpinBuffer(GetBufferDescriptor(buffer - 1));
5514}

References PrivateRefCountEntry::buffer, BufferIsLocal, BufferIsValid(), elog, ERROR, GetBufferDescriptor(), UnpinBuffer(), and UnpinLocalBuffer().

Referenced by _bt_allocbuf(), _bt_pagedel(), _bt_relbuf(), _bt_search_insert(), _bt_unlink_halfdead_page(), _hash_dropbuf(), _hash_getbuf_with_condlock_cleanup(), autoprewarm_database_main(), BitmapHeapScanNextBlock(), blinsert(), BloomNewBuffer(), brin_getinsertbuffer(), brin_vacuum_scan(), bringetbitmap(), brinGetTupleForHeapBlock(), brininsert(), brinRevmapTerminate(), brinsummarize(), buffer_create_toy(), collect_corrupt_items(), collect_visibility_data(), entryLoadMoreItems(), ExecEndIndexOnlyScan(), ExtendBufferedRelTo(), FreeBulkInsertState(), freeGinBtreeStack(), fsm_search(), fsm_vacuum_page(), get_actual_variable_endpoint(), get_raw_page_internal(), GetRecordedFreeSpace(), gin_check_parent_keys_consistency(), gin_check_posting_tree_parent_keys_consistency(), ginFindParents(), ginFinishSplit(), ginFreeScanKeys(), ginInsertCleanup(), GinNewBuffer(), gistdoinsert(), gistFindCorrectParent(), gistNewBuffer(), gistvacuum_delete_empty_pages(), grow_rel(), heap_abort_speculative(), heap_delete(), heap_endscan(), heap_fetch(), heap_fetch_next_buffer(), heap_force_common(), heap_insert(), heap_lock_tuple(), heap_lock_updated_tuple_rec(), heap_multi_insert(), heap_rescan(), heap_update(), heap_vac_scan_next_block(), heap_xlog_delete(), heap_xlog_insert(), heap_xlog_lock(), heap_xlog_lock_updated(), heap_xlog_multi_insert(), heap_xlog_update(), heap_xlog_visible(), heapam_index_fetch_reset(), heapam_scan_sample_next_block(), heapam_tuple_lock(), heapgettup(), heapgettup_pagemode(), invalidate_rel_block(), lazy_scan_heap(), lazy_vacuum_heap_rel(), modify_rel_block(), pg_prewarm(), pg_visibility(), pg_visibility_map(), pgstatindex_impl(), read_rel_block_ll(), read_stream_reset(), ReadBufferBI(), RelationAddBlocks(), RelationGetBufferForTuple(), ReleaseBulkInsertStatePin(), revmap_get_buffer(), spgdoinsert(), SpGistGetBuffer(), SpGistNewBuffer(), SpGistUpdateMetaPage(), statapprox_heap(), summarize_range(), terminate_brin_buildstate(), tts_buffer_heap_clear(), 
tts_buffer_heap_materialize(), tts_buffer_heap_store_tuple(), UnlockReleaseBuffer(), verify_heapam(), visibilitymap_count(), visibilitymap_get_status(), visibilitymap_pin(), and XLogReadBufferExtended().

◆ ReservePrivateRefCountEntry()

static void ReservePrivateRefCountEntry ( void  )
static

Definition at line 309 of file bufmgr.c.

310{
311 /* Already reserved (or freed), nothing to do */
312 if (ReservedRefCountSlot != -1)
313 return;
314
315 /*
316 * First search for a free entry in the array, that'll be sufficient in the
317 * majority of cases.
318 */
319 {
320 int i;
321
322 for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
323 {
325 {
327
328 /*
329 * We could return immediately, but iterating till the end of
330 * the array allows compiler-autovectorization.
331 */
332 }
333 }
334
335 if (ReservedRefCountSlot != -1)
336 return;
337 }
338
339 /*
340 * No luck. All array entries are full. Move one array entry into the hash
341 * table.
342 */
343 {
344 /*
345 * Move entry from the current clock position in the array into the
346 * hashtable. Use that slot.
347 */
348 int victim_slot;
351 bool found;
352
353 /* select victim slot */
357
358 /* Better be used, otherwise we shouldn't get here. */
362
363 /* enter victim array entry into hashtable */
366 &found);
367 Assert(!found);
368 /* move data from the entry in the array to the hash entry */
369 hashent->data = victim_entry->data;
370
371 /* clear the now free array slot */
373 victim_entry->buffer = InvalidBuffer;
374
375 /* clear the whole data member, just for future proofing */
376 memset(&victim_entry->data, 0, sizeof(victim_entry->data));
377 victim_entry->data.refcount = 0;
378 victim_entry->data.lockmode = BUFFER_LOCK_UNLOCK;
379
381 }
382}
static uint32 PrivateRefCountClock
Definition bufmgr.c:267

References Assert, PrivateRefCountEntry::buffer, BUFFER_LOCK_UNLOCK, fb(), i, InvalidBuffer, PrivateRefCountArray, PrivateRefCountArrayKeys, PrivateRefCountClock, PrivateRefCountHash, PrivateRefCountOverflowed, REFCOUNT_ARRAY_ENTRIES, and ReservedRefCountSlot.

Referenced by BufferAlloc(), EvictAllUnpinnedBuffers(), EvictRelUnpinnedBuffers(), EvictUnpinnedBuffer(), ExtendBufferedRelShared(), FlushDatabaseBuffers(), FlushRelationBuffers(), FlushRelationsAllBuffers(), GetPrivateRefCountEntrySlow(), GetVictimBuffer(), MarkDirtyAllUnpinnedBuffers(), MarkDirtyRelUnpinnedBuffers(), MarkDirtyUnpinnedBuffer(), ReadRecentBuffer(), and SyncOneBuffer().

◆ ResOwnerPrintBuffer()

static char * ResOwnerPrintBuffer ( Datum  res)
static

Definition at line 7664 of file bufmgr.c.

7665{
7667}
static int32 DatumGetInt32(Datum X)
Definition postgres.h:202

References DatumGetInt32(), and DebugPrintBufferRefcount().

◆ ResOwnerPrintBufferIO()

static char * ResOwnerPrintBufferIO ( Datum  res)
static

Definition at line 7614 of file bufmgr.c.

/*
 * Resource-owner debug callback: build a human-readable description of a
 * buffer-IO resource that was still registered at owner release time.
 * Returns a palloc'd string (psprintf).
 */
7615{
7616 Buffer buffer = DatumGetInt32(res);
7617
7618 return psprintf("lost track of buffer IO on buffer %d", buffer);
7619}

References PrivateRefCountEntry::buffer, DatumGetInt32(), and psprintf().

◆ ResOwnerReleaseBuffer()

static void ResOwnerReleaseBuffer ( Datum  res)
static

Definition at line 7628 of file bufmgr.c.

7629{
7630 Buffer buffer = DatumGetInt32(res);
7631
7632 /* Like ReleaseBuffer, but don't call ResourceOwnerForgetBuffer */
7633 if (!BufferIsValid(buffer))
7634 elog(ERROR, "bad buffer ID: %d", buffer);
7635
7636 if (BufferIsLocal(buffer))
7638 else
7639 {
7641
7642 ref = GetPrivateRefCountEntry(buffer, false);
7643
7644 /* not having a private refcount would imply resowner corruption */
7645 Assert(ref != NULL);
7646
7647 /*
7648 * If the buffer was locked at the time of the resowner release,
7649 * release the lock now. This should only happen after errors.
7650 */
7651 if (ref->data.lockmode != BUFFER_LOCK_UNLOCK)
7652 {
7653 BufferDesc *buf = GetBufferDescriptor(buffer - 1);
7654
7655 HOLD_INTERRUPTS(); /* match the upcoming RESUME_INTERRUPTS */
7656 BufferLockUnlock(buffer, buf);
7657 }
7658
7660 }
7661}
static void UnpinBufferNoOwner(BufferDesc *buf)
Definition bufmgr.c:3376
void UnpinLocalBufferNoOwner(Buffer buffer)
Definition localbuf.c:848

References Assert, buf, PrivateRefCountEntry::buffer, BUFFER_LOCK_UNLOCK, BufferIsLocal, BufferIsValid(), BufferLockUnlock(), DatumGetInt32(), elog, ERROR, fb(), GetBufferDescriptor(), GetPrivateRefCountEntry(), HOLD_INTERRUPTS, UnpinBufferNoOwner(), and UnpinLocalBufferNoOwner().

◆ ResOwnerReleaseBufferIO()

static void ResOwnerReleaseBufferIO ( Datum  res)
static

Definition at line 7606 of file bufmgr.c.

/*
 * Resource-owner release callback for a buffer-IO resource: the Datum is the
 * Buffer id; abort the in-progress IO on that buffer.
 */
7607{
7608 Buffer buffer = DatumGetInt32(res);
7609
7610 AbortBufferIO(buffer);
7611}
static void AbortBufferIO(Buffer buffer)
Definition bufmgr.c:7203

References AbortBufferIO(), PrivateRefCountEntry::buffer, and DatumGetInt32().

◆ rlocator_comparator()

static int rlocator_comparator ( const void * p1,
const void * p2 
)
static

Definition at line 7274 of file bufmgr.c.

/*
 * qsort()-style three-way comparator for RelFileLocator values.
 * Sort keys in order of significance: relNumber, then dbOid, then spcOid.
 * Returns -1/0/+1.
 */
7275{
7276 RelFileLocator n1 = *(const RelFileLocator *) p1;
7277 RelFileLocator n2 = *(const RelFileLocator *) p2;
7278
7279 if (n1.relNumber < n2.relNumber)
7280 return -1;
7281 else if (n1.relNumber > n2.relNumber)
7282 return 1;
7283
7284 if (n1.dbOid < n2.dbOid)
7285 return -1;
7286 else if (n1.dbOid > n2.dbOid)
7287 return 1;
7288
7289 if (n1.spcOid < n2.spcOid)
7290 return -1;
7291 else if (n1.spcOid > n2.spcOid)
7292 return 1;
7293 else
7294 return 0;
7295}

References fb().

Referenced by buffertag_comparator(), DropRelationsAllBuffers(), and FlushRelationsAllBuffers().

◆ ScheduleBufferTagForWriteback()

void ScheduleBufferTagForWriteback ( WritebackContext * wb_context,
IOContext  io_context,
BufferTag * tag 
)

Definition at line 7473 of file bufmgr.c.

7475{
7476 PendingWriteback *pending;
7477
7478 /*
7479 * As pg_flush_data() doesn't do anything with fsync disabled, there's no
7480 * point in tracking in that case.
7481 */
7483 !enableFsync)
7484 return;
7485
7486 /*
7487 * Add buffer to the pending writeback array, unless writeback control is
7488 * disabled.
7489 */
7490 if (*wb_context->max_pending > 0)
7491 {
7493
7494 pending = &wb_context->pending_writebacks[wb_context->nr_pending++];
7495
7496 pending->tag = *tag;
7497 }
7498
7499 /*
7500 * Perform pending flushes if the writeback limit is exceeded. This
7501 * includes the case where previously an item has been added, but control
7502 * is now disabled.
7503 */
7504 if (wb_context->nr_pending >= *wb_context->max_pending)
7506}
bool enableFsync
Definition globals.c:129
#define WRITEBACK_MAX_PENDING_FLUSHES

References Assert, enableFsync, fb(), IO_DIRECT_DATA, io_direct_flags, IssuePendingWritebacks(), PendingWriteback::tag, and WRITEBACK_MAX_PENDING_FLUSHES.

Referenced by GetVictimBuffer(), and SyncOneBuffer().

◆ shared_buffer_readv_complete()

static PgAioResult shared_buffer_readv_complete ( PgAioHandle * ioh,
PgAioResult  prior_result,
uint8  cb_data 
)
static

Definition at line 8656 of file bufmgr.c.

8658{
8660}

References buffer_readv_complete(), and fb().

◆ shared_buffer_readv_complete_local()

static PgAioResult shared_buffer_readv_complete_local ( PgAioHandle * ioh,
PgAioResult  prior_result,
uint8  cb_data 
)
static

◆ shared_buffer_readv_stage()

static void shared_buffer_readv_stage ( PgAioHandle * ioh,
uint8  cb_data 
)
static

Definition at line 8650 of file bufmgr.c.

/*
 * AIO stage callback for shared-buffer vectored reads: delegate to
 * buffer_stage_common() with both boolean flags false.
 * NOTE(review): flag meanings are defined by buffer_stage_common(), which is
 * outside this view — presumably (is_write=false, is_temp=false); confirm.
 */
8651{
8652 buffer_stage_common(ioh, false, false);
8653}

References buffer_stage_common(), and fb().

◆ shared_buffer_write_error_callback()

static void shared_buffer_write_error_callback ( void * arg)
static

Definition at line 7242 of file bufmgr.c.

7243{
7245
7246 /* Buffer is pinned, so we can read the tag without locking the spinlock */
7247 if (bufHdr != NULL)
7248 errcontext("writing block %u of relation \"%s\"",
7249 bufHdr->tag.blockNum,
7251 BufTagGetForkNum(&bufHdr->tag)).str);
7252}

References arg, BufTagGetForkNum(), BufTagGetRelFileLocator(), errcontext, fb(), and relpathperm.

Referenced by FlushBuffer().

◆ SharedBufferBeginSetHintBits()

static bool SharedBufferBeginSetHintBits ( Buffer  buffer,
BufferDesc * buf_hdr,
uint64 * lockstate 
)
inlinestatic

Definition at line 6818 of file bufmgr.c.

6819{
6823
6824 ref = GetPrivateRefCountEntry(buffer, true);
6825
6826 if (ref == NULL)
6827 elog(ERROR, "buffer is not pinned");
6828
6829 mode = ref->data.lockmode;
6830 if (mode == BUFFER_LOCK_UNLOCK)
6831 elog(ERROR, "buffer is not locked");
6832
6833 /* we're done if we are already holding a sufficient lock level */
6835 {
6837 return true;
6838 }
6839
6840 /*
6841 * We are only holding a share lock right now, try to upgrade it to
6842 * SHARE_EXCLUSIVE.
6843 */
6845
6847 while (true)
6848 {
6850
6852
6853 /*
6854 * Can't upgrade if somebody else holds the lock in exclusive or
6855 * share-exclusive mode.
6856 */
6858 {
6859 return false;
6860 }
6861
6862 /* currently held lock state */
6864
6865 /* new lock level */
6867
6870 {
6871 ref->data.lockmode = BUFFER_LOCK_SHARE_EXCLUSIVE;
6873
6874 return true;
6875 }
6876 }
6877}

References Assert, BM_LOCK_VAL_EXCLUSIVE, BM_LOCK_VAL_SHARE_EXCLUSIVE, BM_LOCK_VAL_SHARED, PrivateRefCountEntry::buffer, BUFFER_LOCK_EXCLUSIVE, BUFFER_LOCK_SHARE, BUFFER_LOCK_SHARE_EXCLUSIVE, BUFFER_LOCK_UNLOCK, elog, ERROR, fb(), GetPrivateRefCountEntry(), likely, mode, pg_atomic_compare_exchange_u64(), pg_atomic_read_u64(), and unlikely.

Referenced by BufferBeginSetHintBits(), and BufferSetHintBits16().

◆ StartBufferIO()

bool StartBufferIO ( BufferDesc * buf,
bool  forInput,
bool  nowait 
)

Definition at line 7085 of file bufmgr.c.

7086{
7088
7090
7091 for (;;)
7092 {
7094
7096 break;
7098 if (nowait)
7099 return false;
7100 WaitIO(buf);
7101 }
7102
7103 /* Once we get here, there is definitely no I/O active on this buffer */
7104
7105 /* Check if someone else already did the I/O */
7106 if (forInput ? (buf_state & BM_VALID) : !(buf_state & BM_DIRTY))
7107 {
7109 return false;
7110 }
7111
7114 0);
7115
7118
7119 return true;
7120}
static void ResourceOwnerRememberBufferIO(ResourceOwner owner, Buffer buffer)

References BM_DIRTY, BM_IO_IN_PROGRESS, BM_VALID, buf, BufferDescriptorGetBuffer(), CurrentResourceOwner, fb(), LockBufHdr(), ResourceOwnerEnlarge(), ResourceOwnerRememberBufferIO(), UnlockBufHdr(), UnlockBufHdrExt(), and WaitIO().

Referenced by buffer_call_start_io(), ExtendBufferedRelShared(), FlushBuffer(), read_rel_block_ll(), ReadBuffersCanStartIOOnce(), and ZeroAndLockBuffer().

◆ StartReadBuffer()

bool StartReadBuffer ( ReadBuffersOperation * operation,
Buffer * buffer,
BlockNumber  blocknum,
int  flags 
)

Definition at line 1618 of file bufmgr.c.

/*
 * Single-block convenience wrapper around StartReadBuffersImpl(), with
 * buffer forwarding disabled.  Returns true when IO was started, in which
 * case the caller is expected to call WaitReadBuffers() on the operation
 * (see ReadBuffer_common()).
 */
1622{
1623 int nblocks = 1;
1624 bool result;
1625
1626 result = StartReadBuffersImpl(operation, buffer, blocknum, &nblocks, flags,
1627 false /* single block, no forwarding */ );
1628 Assert(nblocks == 1); /* single block can't be short */
1629
1630 return result;
1631}
static pg_attribute_always_inline bool StartReadBuffersImpl(ReadBuffersOperation *operation, Buffer *buffers, BlockNumber blockNum, int *nblocks, int flags, bool allow_forwarding)
Definition bufmgr.c:1372

References Assert, PrivateRefCountEntry::buffer, and StartReadBuffersImpl().

Referenced by read_stream_next_buffer(), and ReadBuffer_common().

◆ StartReadBuffers()

bool StartReadBuffers ( ReadBuffersOperation * operation,
Buffer * buffers,
BlockNumber  blockNum,
int * nblocks,
int  flags 
)

Definition at line 1599 of file bufmgr.c.

/*
 * Multi-block variant: unlike StartReadBuffer(), this allows forwarded
 * (already-pinned) buffers left over from a previously split operation;
 * see StartReadBuffersImpl() for the details.
 */
1604{
1605 return StartReadBuffersImpl(operation, buffers, blockNum, nblocks, flags,
1606 true /* expect forwarded buffers */ );
1607}

References StartReadBuffersImpl().

Referenced by read_stream_start_pending_read().

◆ StartReadBuffersImpl()

static pg_attribute_always_inline bool StartReadBuffersImpl ( ReadBuffersOperation * operation,
Buffer * buffers,
BlockNumber  blockNum,
int * nblocks,
int  flags,
bool  allow_forwarding 
)
static

Definition at line 1372 of file bufmgr.c.

1378{
1379 int actual_nblocks = *nblocks;
1380 int maxcombine = 0;
1381 bool did_start_io;
1382
1383 Assert(*nblocks == 1 || allow_forwarding);
1384 Assert(*nblocks > 0);
1385 Assert(*nblocks <= MAX_IO_COMBINE_LIMIT);
1386
1387 for (int i = 0; i < actual_nblocks; ++i)
1388 {
1389 bool found;
1390
1391 if (allow_forwarding && buffers[i] != InvalidBuffer)
1392 {
1394
1395 /*
1396 * This is a buffer that was pinned by an earlier call to
1397 * StartReadBuffers(), but couldn't be handled in one operation at
1398 * that time. The operation was split, and the caller has passed
1399 * an already pinned buffer back to us to handle the rest of the
1400 * operation. It must continue at the expected block number.
1401 */
1402 Assert(BufferGetBlockNumber(buffers[i]) == blockNum + i);
1403
1404 /*
1405 * It might be an already valid buffer (a hit) that followed the
1406 * final contiguous block of an earlier I/O (a miss) marking the
1407 * end of it, or a buffer that some other backend has since made
1408 * valid by performing the I/O for us, in which case we can handle
1409 * it as a hit now. It is safe to check for a BM_VALID flag with
1410 * a relaxed load, because we got a fresh view of it while pinning
1411 * it in the previous call.
1412 *
1413 * On the other hand if we don't see BM_VALID yet, it must be an
1414 * I/O that was split by the previous call and we need to try to
1415 * start a new I/O from this block. We're also racing against any
1416 * other backend that might start the I/O or even manage to mark
1417 * it BM_VALID after this check, but StartBufferIO() will handle
1418 * those cases.
1419 */
1420 if (BufferIsLocal(buffers[i]))
1421 bufHdr = GetLocalBufferDescriptor(-buffers[i] - 1);
1422 else
1423 bufHdr = GetBufferDescriptor(buffers[i] - 1);
1425 found = pg_atomic_read_u64(&bufHdr->state) & BM_VALID;
1426 }
1427 else
1428 {
1429 buffers[i] = PinBufferForBlock(operation->rel,
1430 operation->smgr,
1431 operation->persistence,
1432 operation->forknum,
1433 blockNum + i,
1434 operation->strategy,
1435 &found);
1436 }
1437
1438 if (found)
1439 {
1440 /*
1441 * We have a hit. If it's the first block in the requested range,
1442 * we can return it immediately and report that WaitReadBuffers()
1443 * does not need to be called. If the initial value of *nblocks
1444 * was larger, the caller will have to call again for the rest.
1445 */
1446 if (i == 0)
1447 {
1448 *nblocks = 1;
1449
1450#ifdef USE_ASSERT_CHECKING
1451
1452 /*
1453 * Initialize enough of ReadBuffersOperation to make
1454 * CheckReadBuffersOperation() work. Outside of assertions
1455 * that's not necessary when no IO is issued.
1456 */
1457 operation->buffers = buffers;
1458 operation->blocknum = blockNum;
1459 operation->nblocks = 1;
1460 operation->nblocks_done = 1;
1461 CheckReadBuffersOperation(operation, true);
1462#endif
1463 return false;
1464 }
1465
1466 /*
1467 * Otherwise we already have an I/O to perform, but this block
1468 * can't be included as it is already valid. Split the I/O here.
1469 * There may or may not be more blocks requiring I/O after this
1470 * one, we haven't checked, but they can't be contiguous with this
1471 * one in the way. We'll leave this buffer pinned, forwarding it
1472 * to the next call, avoiding the need to unpin it here and re-pin
1473 * it in the next call.
1474 */
1475 actual_nblocks = i;
1476 break;
1477 }
1478 else
1479 {
1480 /*
1481 * Check how many blocks we can cover with the same IO. The smgr
1482 * implementation might e.g. be limited due to a segment boundary.
1483 */
1484 if (i == 0 && actual_nblocks > 1)
1485 {
1486 maxcombine = smgrmaxcombine(operation->smgr,
1487 operation->forknum,
1488 blockNum);
1490 {
1491 elog(DEBUG2, "limiting nblocks at %u from %u to %u",
1492 blockNum, actual_nblocks, maxcombine);
1494 }
1495 }
1496 }
1497 }
1498 *nblocks = actual_nblocks;
1499
1500 /* Populate information needed for I/O. */
1501 operation->buffers = buffers;
1502 operation->blocknum = blockNum;
1503 operation->flags = flags;
1504 operation->nblocks = actual_nblocks;
1505 operation->nblocks_done = 0;
1506 pgaio_wref_clear(&operation->io_wref);
1507
1508 /*
1509 * When using AIO, start the IO in the background. If not, issue prefetch
1510 * requests if desired by the caller.
1511 *
1512 * The reason we have a dedicated path for IOMETHOD_SYNC here is to
1513 * de-risk the introduction of AIO somewhat. It's a large architectural
1514 * change, with lots of chances for unanticipated performance effects.
1515 *
1516 * Use of IOMETHOD_SYNC already leads to not actually performing IO
1517 * asynchronously, but without the check here we'd execute IO earlier than
1518 * we used to. Eventually this IOMETHOD_SYNC specific path should go away.
1519 */
1520 if (io_method != IOMETHOD_SYNC)
1521 {
1522 /*
1523 * Try to start IO asynchronously. It's possible that no IO needs to
1524 * be started, if another backend already performed the IO.
1525 *
1526 * Note that if an IO is started, it might not cover the entire
1527 * requested range, e.g. because an intermediary block has been read
1528 * in by another backend. In that case any "trailing" buffers we
1529 * already pinned above will be "forwarded" by read_stream.c to the
1530 * next call to StartReadBuffers().
1531 *
1532 * This is signalled to the caller by decrementing *nblocks *and*
1533 * reducing operation->nblocks. The latter is done here, but not below
1534 * WaitReadBuffers(), as in WaitReadBuffers() we can't "shorten" the
1535 * overall read size anymore, we need to retry until done in its
1536 * entirety or until failed.
1537 */
1538 did_start_io = AsyncReadBuffers(operation, nblocks);
1539
1540 operation->nblocks = *nblocks;
1541 }
1542 else
1543 {
1544 operation->flags |= READ_BUFFERS_SYNCHRONOUSLY;
1545
1546 if (flags & READ_BUFFERS_ISSUE_ADVICE)
1547 {
1548 /*
1549 * In theory we should only do this if PinBufferForBlock() had to
1550 * allocate new buffers above. That way, if two calls to
1551 * StartReadBuffers() were made for the same blocks before
1552 * WaitReadBuffers(), only the first would issue the advice.
1553 * That'd be a better simulation of true asynchronous I/O, which
1554 * would only start the I/O once, but isn't done here for
1555 * simplicity.
1556 */
1557 smgrprefetch(operation->smgr,
1558 operation->forknum,
1559 blockNum,
1561 }
1562
1563 /*
1564 * Indicate that WaitReadBuffers() should be called. WaitReadBuffers()
1565 * will initiate the necessary IO.
1566 */
1567 did_start_io = true;
1568 }
1569
1571
1572 return did_start_io;
1573}
int io_method
Definition aio.c:74
@ IOMETHOD_SYNC
Definition aio.h:34
static void CheckReadBuffersOperation(ReadBuffersOperation *operation, bool is_complete)
Definition bufmgr.c:1637
static bool AsyncReadBuffers(ReadBuffersOperation *operation, int *nblocks_progress)
Definition bufmgr.c:1874
#define READ_BUFFERS_ISSUE_ADVICE
Definition bufmgr.h:124
uint32 smgrmaxcombine(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
Definition smgr.c:697

References Assert, AsyncReadBuffers(), ReadBuffersOperation::blocknum, BM_TAG_VALID, BM_VALID, BufferGetBlockNumber(), BufferIsLocal, ReadBuffersOperation::buffers, CheckReadBuffersOperation(), DEBUG2, elog, fb(), ReadBuffersOperation::flags, ReadBuffersOperation::forknum, GetBufferDescriptor(), GetLocalBufferDescriptor(), i, InvalidBuffer, io_method, ReadBuffersOperation::io_wref, IOMETHOD_SYNC, MAX_IO_COMBINE_LIMIT, ReadBuffersOperation::nblocks, ReadBuffersOperation::nblocks_done, ReadBuffersOperation::persistence, pg_atomic_read_u64(), pgaio_wref_clear(), PinBufferForBlock(), READ_BUFFERS_ISSUE_ADVICE, READ_BUFFERS_SYNCHRONOUSLY, ReadBuffersOperation::rel, ReadBuffersOperation::smgr, smgrmaxcombine(), smgrprefetch(), ReadBuffersOperation::strategy, and unlikely.

Referenced by StartReadBuffer(), and StartReadBuffers().

◆ SyncOneBuffer()

static int SyncOneBuffer ( int  buf_id,
bool  skip_recently_used,
WritebackContext wb_context 
)
static

Definition at line 4040 of file bufmgr.c.

4041{
4043 int result = 0;
4045 BufferTag tag;
4046
4047 /* Make sure we can handle the pin */
4050
4051 /*
4052 * Check whether buffer needs writing.
4053 *
4054 * We can make this check without taking the buffer content lock so long
4055 * as we mark pages dirty in access methods *before* logging changes with
4056 * XLogInsert(): if someone marks the buffer dirty just after our check we
4057 * don't worry because our checkpoint.redo points before log record for
4058 * upcoming changes and so we are not required to write such dirty buffer.
4059 */
4061
4064 {
4065 result |= BUF_REUSABLE;
4066 }
4067 else if (skip_recently_used)
4068 {
4069 /* Caller told us not to write recently-used buffers */
4071 return result;
4072 }
4073
4074 if (!(buf_state & BM_VALID) || !(buf_state & BM_DIRTY))
4075 {
4076 /* It's clean, so nothing to do */
4078 return result;
4079 }
4080
4081 /*
4082 * Pin it, share-exclusive-lock it, write it. (FlushBuffer will do
4083 * nothing if the buffer is clean by the time we've locked it.)
4084 */
4086
4088
4089 tag = bufHdr->tag;
4090
4092
4093 /*
4094 * SyncOneBuffer() is only called by checkpointer and bgwriter, so
4095 * IOContext will always be IOCONTEXT_NORMAL.
4096 */
4098
4099 return result | BUF_WRITTEN;
4100}

References BM_DIRTY, BM_VALID, BUF_REUSABLE, BUF_STATE_GET_REFCOUNT, BUF_STATE_GET_USAGECOUNT, BUF_WRITTEN, CurrentResourceOwner, fb(), FlushUnlockedBuffer(), GetBufferDescriptor(), IOCONTEXT_NORMAL, IOOBJECT_RELATION, LockBufHdr(), PinBuffer_Locked(), ReservePrivateRefCountEntry(), ResourceOwnerEnlarge(), ScheduleBufferTagForWriteback(), UnlockBufHdr(), and UnpinBuffer().

Referenced by BgBufferSync(), and BufferSync().

◆ TerminateBufferIO()

void TerminateBufferIO ( BufferDesc buf,
bool  clear_dirty,
uint64  set_flag_bits,
bool  forget_owner,
bool  release_aio 
)

Definition at line 7141 of file bufmgr.c.

7143{
7146 int refcount_change = 0;
7147
7149
7152
7153 /* Clear earlier errors, if this IO failed, it'll be marked again */
7155
7156 if (clear_dirty)
7158
7159 if (release_aio)
7160 {
7161 /* release ownership by the AIO subsystem */
7163 refcount_change = -1;
7164 pgaio_wref_clear(&buf->io_wref);
7165 }
7166
7170
7171 if (forget_owner)
7174
7176
7177 /*
7178 * Support LockBufferForCleanup()
7179 *
7180 * We may have just released the last pin other than the waiter's. In most
7181 * cases, this backend holds another pin on the buffer. But, if, for
7182 * example, this backend is completing an IO issued by another backend, it
7183 * may be time to wake the waiter.
7184 */
7187}
static ConditionVariable * BufferDescriptorGetIOCV(const BufferDesc *bdesc)
static void WakePinCountWaiter(BufferDesc *buf)
Definition bufmgr.c:3331
void ConditionVariableBroadcast(ConditionVariable *cv)

References Assert, BM_CHECKPOINT_NEEDED, BM_DIRTY, BM_IO_ERROR, BM_IO_IN_PROGRESS, BM_PIN_COUNT_WAITER, buf, BUF_STATE_GET_REFCOUNT, BufferDescriptorGetBuffer(), BufferDescriptorGetIOCV(), ConditionVariableBroadcast(), CurrentResourceOwner, fb(), LockBufHdr(), pgaio_wref_clear(), ResourceOwnerForgetBufferIO(), UnlockBufHdrExt(), and WakePinCountWaiter().

Referenced by AbortBufferIO(), buffer_call_terminate_io(), buffer_readv_complete_one(), ExtendBufferedRelShared(), FlushBuffer(), and ZeroAndLockBuffer().

◆ TrackNewBufferPin()

void TrackNewBufferPin ( Buffer  buf)
inline

Definition at line 3423 of file bufmgr.c.

3424{
3426
3428 ref->data.refcount++;
3429
3431
3432 /*
3433 * This is the first pin for this page by this backend, mark its page as
3434 * defined to valgrind. While the page contents might not actually be
3435 * valid yet, we don't currently guarantee that such pages are marked
3436 * undefined or non-accessible.
3437 *
3438 * It's not necessarily the prettiest to do this here, but otherwise we'd
3439 * need this block of code in multiple places.
3440 */
3442 BLCKSZ);
3443}
static PrivateRefCountEntry * NewPrivateRefCountEntry(Buffer buffer)
Definition bufmgr.c:388

References buf, BufHdrGetBlock, CurrentResourceOwner, fb(), GetBufferDescriptor(), NewPrivateRefCountEntry(), ResourceOwnerRememberBuffer(), and VALGRIND_MAKE_MEM_DEFINED.

Referenced by GetBufferFromRing(), PinBuffer(), PinBuffer_Locked(), and StrategyGetBuffer().

◆ ts_ckpt_progress_comparator()

static int ts_ckpt_progress_comparator ( Datum  a,
Datum  b,
void arg 
)
static

Definition at line 7438 of file bufmgr.c.

7439{
7442
7443 /* we want a min-heap, so return 1 when a < b */
7444 if (sa->progress < sb->progress)
7445 return 1;
7446 else if (sa->progress == sb->progress)
7447 return 0;
7448 else
7449 return -1;
7450}

References a, b, DatumGetPointer(), and fb().

Referenced by BufferSync().

◆ UnlockBuffer()

void UnlockBuffer ( Buffer  buffer)

Definition at line 6425 of file bufmgr.c.

6426{
6428
6429 Assert(BufferIsPinned(buffer));
6430 if (BufferIsLocal(buffer))
6431 return; /* local buffers need no lock */
6432
6433 buf_hdr = GetBufferDescriptor(buffer - 1);
6434 BufferLockUnlock(buffer, buf_hdr);
6435}

References Assert, PrivateRefCountEntry::buffer, BufferIsLocal, BufferIsPinned, BufferLockUnlock(), fb(), and GetBufferDescriptor().

Referenced by LockBuffer().

◆ UnlockBuffers()

void UnlockBuffers ( void  )

Definition at line 5719 of file bufmgr.c.

5720{
5722
5723 if (buf)
5724 {
5726 uint64 unset_bits = 0;
5727
5729
5730 /*
5731 * Don't complain if flag bit not set; it could have been reset but we
5732 * got a cancel/die interrupt before getting the signal.
5733 */
5734 if ((buf_state & BM_PIN_COUNT_WAITER) != 0 &&
5735 buf->wait_backend_pgprocno == MyProcNumber)
5737
5739 0, unset_bits,
5740 0);
5741
5743 }
5744}

References BM_PIN_COUNT_WAITER, buf, fb(), LockBufHdr(), MyProcNumber, PinCountWaitBuf, and UnlockBufHdrExt().

Referenced by AbortSubTransaction(), AbortTransaction(), AtProcExit_Buffers(), AutoVacLauncherMain(), BackgroundWriterMain(), CheckpointerMain(), and WalWriterMain().

◆ UnlockReleaseBuffer()

void UnlockReleaseBuffer ( Buffer  buffer)

Definition at line 5522 of file bufmgr.c.

5523{
5525 ReleaseBuffer(buffer);
5526}

References PrivateRefCountEntry::buffer, BUFFER_LOCK_UNLOCK, LockBuffer(), and ReleaseBuffer().

Referenced by _bt_clear_incomplete_split(), _bt_restore_meta(), _hash_relbuf(), allocNewBuffer(), AlterSequence(), blbulkdelete(), blgetbitmap(), blinsert(), BloomInitMetapage(), blvacuumcleanup(), brin_doinsert(), brin_doupdate(), brin_evacuate_page(), brin_getinsertbuffer(), brin_xlog_createidx(), brin_xlog_desummarize_page(), brin_xlog_insert_update(), brin_xlog_revmap_extend(), brin_xlog_samepage_update(), brin_xlog_update(), brinbuild(), brinbuildempty(), brinGetStats(), brinRevmapDesummarizeRange(), bt_metap(), bt_multi_page_stats(), bt_page_items_internal(), bt_page_stats_internal(), bt_recheck_sibling_links(), btree_xlog_dedup(), btree_xlog_delete(), btree_xlog_insert(), btree_xlog_mark_page_halfdead(), btree_xlog_newroot(), btree_xlog_split(), btree_xlog_unlink_page(), btree_xlog_vacuum(), collect_corrupt_items(), collect_visibility_data(), count_nondeletable_pages(), createPostingTree(), doPickSplit(), entryLoadMoreItems(), fill_seq_fork_with_data(), flushCachedPage(), FreeSpaceMapPrepareTruncateRel(), fsm_search(), fsm_set_and_search(), generic_redo(), gin_refind_parent(), ginbuild(), ginbuildempty(), ginbulkdelete(), ginGetStats(), ginHeapTupleFastInsert(), ginInsertCleanup(), ginPlaceToPage(), ginRedoClearIncompleteSplit(), ginRedoCreatePTree(), ginRedoDeleteListPages(), ginRedoDeletePage(), ginRedoInsert(), ginRedoInsertListPage(), ginRedoSplit(), ginRedoUpdateMetapage(), ginRedoVacuumDataLeafPage(), ginRedoVacuumPage(), ginScanPostingTreeToDelete(), ginStepRight(), ginUpdateStats(), ginvacuumcleanup(), ginVacuumPostingTree(), ginVacuumPostingTreeLeaves(), gistbufferinginserttuples(), gistbuild(), gistbuildempty(), gistdoinsert(), gistFindCorrectParent(), gistFindPath(), gistGetMaxLevel(), gistinserttuples(), gistkillitems(), gistplacetopage(), gistProcessItup(), gistRedoClearFollowRight(), gistRedoDeleteRecord(), gistRedoPageDelete(), gistRedoPageSplitRecord(), gistRedoPageUpdateRecord(), gistScanPage(), gistvacuum_delete_empty_pages(), 
gistvacuumpage(), hash_xlog_add_ovfl_page(), hash_xlog_delete(), hash_xlog_init_bitmap_page(), hash_xlog_init_meta_page(), hash_xlog_insert(), hash_xlog_move_page_contents(), hash_xlog_split_allocate_page(), hash_xlog_split_cleanup(), hash_xlog_split_complete(), hash_xlog_split_page(), hash_xlog_squeeze_page(), hash_xlog_update_meta_page(), hash_xlog_vacuum_one_page(), heap_delete(), heap_finish_speculative(), heap_force_common(), heap_get_latest_tid(), heap_index_delete_tuples(), heap_insert(), heap_lock_updated_tuple_rec(), heap_multi_insert(), heap_update(), heap_xlog_confirm(), heap_xlog_delete(), heap_xlog_inplace(), heap_xlog_insert(), heap_xlog_lock(), heap_xlog_lock_updated(), heap_xlog_multi_insert(), heap_xlog_prune_freeze(), heap_xlog_update(), heap_xlog_visible(), heapam_scan_analyze_next_tuple(), initBloomState(), lazy_scan_heap(), lazy_scan_new_or_empty(), lazy_vacuum_heap_rel(), log_newpage_range(), moveLeafs(), nextval_internal(), palloc_btree_page(), pg_get_sequence_data(), pg_sequence_last_value(), pg_visibility(), pgstat_gist_page(), pgstat_heap(), pgstatginindex_internal(), pgstathashindex(), pgstatindex_impl(), RelationCopyStorageUsingBuffer(), RelationGetBufferForTuple(), ResetSequence(), revmap_physical_extend(), scanGetCandidate(), scanPendingInsert(), scanPostingTree(), ScanSourceDatabasePgClass(), seq_redo(), SequenceChangePersistence(), SetSequence(), shiftList(), spgAddNodeAction(), spgbuild(), spgdoinsert(), spgGetCache(), SpGistGetBuffer(), SpGistUpdateMetaPage(), spgMatchNodeAction(), spgprocesspending(), spgRedoAddLeaf(), spgRedoAddNode(), spgRedoMoveLeafs(), spgRedoPickSplit(), spgRedoSplitTuple(), spgRedoVacuumLeaf(), spgRedoVacuumRedirect(), spgRedoVacuumRoot(), spgSplitNodeAction(), spgvacuumpage(), spgWalk(), statapprox_heap(), verify_heapam(), verifyBackupPageConsistency(), visibilitymap_prepare_truncate(), writeListPage(), xlog_redo(), and XLogRecordPageWithFreeSpace().

◆ UnpinBuffer()

◆ UnpinBufferNoOwner()

static void UnpinBufferNoOwner ( BufferDesc buf)
static

Definition at line 3376 of file bufmgr.c.

3377{
3380
3382
3383 /* not moving as we're likely deleting it soon anyway */
3384 ref = GetPrivateRefCountEntry(b, false);
3385 Assert(ref != NULL);
3386 Assert(ref->data.refcount > 0);
3387 ref->data.refcount--;
3388 if (ref->data.refcount == 0)
3389 {
3391
3392 /*
3393 * Mark buffer non-accessible to Valgrind.
3394 *
3395 * Note that the buffer may have already been marked non-accessible
3396 * within access method code that enforces that buffers are only
3397 * accessed while a buffer lock is held.
3398 */
3400
3401 /*
3402 * I'd better not still hold the buffer content lock. Can't use
3403 * BufferIsLockedByMe(), as that asserts the buffer is pinned.
3404 */
3406
3407 /* decrement the shared reference count */
3409
3410 /* Support LockBufferForCleanup() */
3413
3415 }
3416}
static uint64 pg_atomic_fetch_sub_u64(volatile pg_atomic_uint64 *ptr, int64 sub_)
Definition atomics.h:541
static void ForgetPrivateRefCountEntry(PrivateRefCountEntry *ref)
Definition bufmgr.c:565

References Assert, b, BM_PIN_COUNT_WAITER, buf, BUF_REFCOUNT_ONE, BufferDescriptorGetBuffer(), BufferIsLocal, BufferLockHeldByMe(), BufHdrGetBlock, fb(), ForgetPrivateRefCountEntry(), GetPrivateRefCountEntry(), pg_atomic_fetch_sub_u64(), VALGRIND_MAKE_MEM_NOACCESS, and WakePinCountWaiter().

Referenced by ResOwnerReleaseBuffer(), and UnpinBuffer().

◆ WaitBufHdrUnlocked()

pg_noinline uint64 WaitBufHdrUnlocked ( BufferDesc buf)

◆ WaitIO()

static void WaitIO ( BufferDesc buf)
static

Definition at line 7006 of file bufmgr.c.

7007{
7009
7011 for (;;)
7012 {
7015
7016 /*
7017 * It may not be necessary to acquire the spinlock to check the flag
7018 * here, but since this test is essential for correctness, we'd better
7019 * play it safe.
7020 */
7022
7023 /*
7024 * Copy the wait reference while holding the spinlock. This prevents
7025 * a concurrent TerminateBufferIO() in another backend from clearing
7026 * the wref while it's being read.
7027 */
7028 iow = buf->io_wref;
7030
7031 /* no IO in progress, we don't need to wait */
7033 break;
7034
7035 /*
7036 * The buffer has asynchronous IO in progress, wait for it to
7037 * complete.
7038 */
7039 if (pgaio_wref_valid(&iow))
7040 {
7042
7043 /*
7044 * The AIO subsystem internally uses condition variables and thus
7045 * might remove this backend from the BufferDesc's CV. While that
7046 * wouldn't cause a correctness issue (the first CV sleep just
7047 * immediately returns if not already registered), it seems worth
7048 * avoiding unnecessary loop iterations, given that we take care
7049 * to do so at the start of the function.
7050 */
7052 continue;
7053 }
7054
7055 /* wait on BufferDesc->cv, e.g. for concurrent synchronous IO */
7057 }
7059}
void pgaio_wref_wait(PgAioWaitRef *iow)
Definition aio.c:991
bool ConditionVariableCancelSleep(void)
void ConditionVariablePrepareToSleep(ConditionVariable *cv)
void ConditionVariableSleep(ConditionVariable *cv, uint32 wait_event_info)

References BM_IO_IN_PROGRESS, buf, BufferDescriptorGetIOCV(), ConditionVariableCancelSleep(), ConditionVariablePrepareToSleep(), ConditionVariableSleep(), fb(), LockBufHdr(), pgaio_wref_valid(), pgaio_wref_wait(), and UnlockBufHdr().

Referenced by InvalidateBuffer(), and StartBufferIO().

◆ WaitReadBuffers()

void WaitReadBuffers ( ReadBuffersOperation operation)

Definition at line 1742 of file bufmgr.c.

1743{
1744 PgAioReturn *aio_ret = &operation->io_return;
1747
1748 if (operation->persistence == RELPERSISTENCE_TEMP)
1749 {
1752 }
1753 else
1754 {
1757 }
1758
1759 /*
1760 * If we get here without an IO operation having been issued, the
1761 * io_method == IOMETHOD_SYNC path must have been used. Otherwise the
1762 * caller should not have called WaitReadBuffers().
1763 *
1764 * In the case of IOMETHOD_SYNC, we start - as we used to before the
1765 * introduction of AIO - the IO in WaitReadBuffers(). This is done as part
1766 * of the retry logic below, no extra code is required.
1767 *
1768 * This path is expected to eventually go away.
1769 */
1770 if (!pgaio_wref_valid(&operation->io_wref) && io_method != IOMETHOD_SYNC)
1771 elog(ERROR, "waiting for read operation that didn't read");
1772
1773 /*
1774 * To handle partial reads, and IOMETHOD_SYNC, we re-issue IO until we're
1775 * done. We may need multiple retries, not just because we could get
1776 * multiple partial reads, but also because some of the remaining
1777 * to-be-read buffers may have been read in by other backends, limiting
1778 * the IO size.
1779 */
1780 while (true)
1781 {
1783
1784 CheckReadBuffersOperation(operation, false);
1785
1786 /*
1787 * If there is an IO associated with the operation, we may need to
1788 * wait for it.
1789 */
1790 if (pgaio_wref_valid(&operation->io_wref))
1791 {
1792 /*
1793 * Track the time spent waiting for the IO to complete. As
1794 * tracking a wait even if we don't actually need to wait
1795 *
1796 * a) is not cheap, due to the timestamping overhead
1797 *
1798 * b) reports some time as waiting, even if we never waited
1799 *
1800 * we first check if we already know the IO is complete.
1801 */
1802 if (aio_ret->result.status == PGAIO_RS_UNKNOWN &&
1803 !pgaio_wref_check_done(&operation->io_wref))
1804 {
1806
1807 pgaio_wref_wait(&operation->io_wref);
1808
1809 /*
1810 * The IO operation itself was already counted earlier, in
1811 * AsyncReadBuffers(), this just accounts for the wait time.
1812 */
1814 io_start, 0, 0);
1815 }
1816 else
1817 {
1818 Assert(pgaio_wref_check_done(&operation->io_wref));
1819 }
1820
1821 /*
1822 * We now are sure the IO completed. Check the results. This
1823 * includes reporting on errors if there were any.
1824 */
1825 ProcessReadBuffersResult(operation);
1826 }
1827
1828 /*
1829 * Most of the time, the one IO we already started, will read in
1830 * everything. But we need to deal with partial reads and buffers not
1831 * needing IO anymore.
1832 */
1833 if (operation->nblocks_done == operation->nblocks)
1834 break;
1835
1837
1838 /*
1839 * This may only complete the IO partially, either because some
1840 * buffers were already valid, or because of a partial read.
1841 *
1842 * NB: In contrast to after the AsyncReadBuffers() call in
1843 * StartReadBuffers(), we do *not* reduce
1844 * ReadBuffersOperation->nblocks here, callers expect the full
1845 * operation to be completed at this point (as more operations may
1846 * have been queued).
1847 */
1849 }
1850
1851 CheckReadBuffersOperation(operation, true);
1852
1853 /* NB: READ_DONE tracepoint was already executed in completion callback */
1854}
bool pgaio_wref_check_done(PgAioWaitRef *iow)
Definition aio.c:1005
static void ProcessReadBuffersResult(ReadBuffersOperation *operation)
Definition bufmgr.c:1703

References Assert, AsyncReadBuffers(), CHECK_FOR_INTERRUPTS, CheckReadBuffersOperation(), elog, ERROR, fb(), io_method, ReadBuffersOperation::io_return, ReadBuffersOperation::io_wref, IOCONTEXT_NORMAL, IOContextForStrategy(), IOMETHOD_SYNC, IOOBJECT_RELATION, IOOBJECT_TEMP_RELATION, IOOP_READ, ReadBuffersOperation::nblocks, ReadBuffersOperation::nblocks_done, ReadBuffersOperation::persistence, PGAIO_RS_UNKNOWN, pgaio_wref_check_done(), pgaio_wref_valid(), pgaio_wref_wait(), pgstat_count_io_op_time(), pgstat_prepare_io_time(), ProcessReadBuffersResult(), ReadBuffersOperation::strategy, and track_io_timing.

Referenced by read_stream_next_buffer(), and ReadBuffer_common().

◆ WakePinCountWaiter()

static void WakePinCountWaiter ( BufferDesc buf)
static

Definition at line 3331 of file bufmgr.c.

3332{
3333 /*
3334 * Acquire the buffer header lock, re-check that there's a waiter. Another
3335 * backend could have unpinned this buffer, and already woken up the
3336 * waiter.
3337 *
3338 * There's no danger of the buffer being replaced after we unpinned it
3339 * above, as it's pinned by the waiter. The waiter removes
3340 * BM_PIN_COUNT_WAITER if it stops waiting for a reason other than this
3341 * backend waking it up.
3342 */
3344
3347 {
3348 /* we just released the last pin other than the waiter's */
3349 int wait_backend_pgprocno = buf->wait_backend_pgprocno;
3350
3353 0);
3354 ProcSendSignal(wait_backend_pgprocno);
3355 }
3356 else
3358}
void ProcSendSignal(ProcNumber procNumber)
Definition proc.c:2014

References BM_PIN_COUNT_WAITER, buf, BUF_STATE_GET_REFCOUNT, fb(), LockBufHdr(), ProcSendSignal(), UnlockBufHdr(), and UnlockBufHdrExt().

Referenced by TerminateBufferIO(), and UnpinBufferNoOwner().

◆ WritebackContextInit()

void WritebackContextInit ( WritebackContext context,
int max_pending 
)

Definition at line 7461 of file bufmgr.c.

/*
 * Initialize a WritebackContext: no writebacks are pending yet.
 *
 * Note that the *pointer* to max_pending is stored, not its current value;
 * presumably so later changes to the underlying variable take effect
 * without re-initialization -- NOTE(review): confirm against callers.
 */
7462{
7463 Assert(*max_pending <= WRITEBACK_MAX_PENDING_FLUSHES);
7464
7465 context->max_pending = max_pending;
7466 context->nr_pending = 0;
7467}

References Assert, WritebackContext::max_pending, WritebackContext::nr_pending, and WRITEBACK_MAX_PENDING_FLUSHES.

Referenced by BackgroundWriterMain(), BufferManagerShmemInit(), and BufferSync().

◆ ZeroAndLockBuffer()

static void ZeroAndLockBuffer ( Buffer  buffer,
ReadBufferMode  mode,
bool  already_valid 
)
static

Definition at line 1141 of file bufmgr.c.

1142{
1144 bool need_to_zero;
1145 bool isLocalBuf = BufferIsLocal(buffer);
1146
1148
1149 if (already_valid)
1150 {
1151 /*
1152 * If the caller already knew the buffer was valid, we can skip some
1153 * header interaction. The caller just wants to lock the buffer.
1154 */
1155 need_to_zero = false;
1156 }
1157 else if (isLocalBuf)
1158 {
1159 /* Simple case for non-shared buffers. */
1160 bufHdr = GetLocalBufferDescriptor(-buffer - 1);
1161 need_to_zero = StartLocalBufferIO(bufHdr, true, false);
1162 }
1163 else
1164 {
1165 /*
1166 * Take BM_IO_IN_PROGRESS, or discover that BM_VALID has been set
1167 * concurrently. Even though we aren't doing I/O, that ensures that
1168 * we don't zero a page that someone else has pinned. An exclusive
1169 * content lock wouldn't be enough, because readers are allowed to
1170 * drop the content lock after determining that a tuple is visible
1171 * (see buffer access rules in README).
1172 */
1173 bufHdr = GetBufferDescriptor(buffer - 1);
1174 need_to_zero = StartBufferIO(bufHdr, true, false);
1175 }
1176
1177 if (need_to_zero)
1178 {
1179 memset(BufferGetPage(buffer), 0, BLCKSZ);
1180
1181 /*
1182 * Grab the buffer content lock before marking the page as valid, to
1183 * make sure that no other backend sees the zeroed page before the
1184 * caller has had a chance to initialize it.
1185 *
1186 * Since no-one else can be looking at the page contents yet, there is
1187 * no difference between an exclusive lock and a cleanup-strength
1188 * lock. (Note that we cannot use LockBuffer() or
1189 * LockBufferForCleanup() here, because they assert that the buffer is
1190 * already valid.)
1191 */
1192 if (!isLocalBuf)
1194
1195 /* Set BM_VALID, terminate IO, and wake up any waiters */
1196 if (isLocalBuf)
1197 TerminateLocalBufferIO(bufHdr, false, BM_VALID, false);
1198 else
1199 TerminateBufferIO(bufHdr, false, BM_VALID, true, false);
1200 }
1201 else if (!isLocalBuf)
1202 {
1203 /*
1204 * The buffer is valid, so we can't zero it. The caller still expects
1205 * the page to be locked on return.
1206 */
1207 if (mode == RBM_ZERO_AND_LOCK)
1209 else
1210 LockBufferForCleanup(buffer);
1211 }
1212}
void LockBufferForCleanup(Buffer buffer)
Definition bufmgr.c:6537

References Assert, BM_VALID, PrivateRefCountEntry::buffer, BUFFER_LOCK_EXCLUSIVE, BufferGetPage(), BufferIsLocal, fb(), GetBufferDescriptor(), GetLocalBufferDescriptor(), LockBuffer(), LockBufferForCleanup(), mode, RBM_ZERO_AND_CLEANUP_LOCK, RBM_ZERO_AND_LOCK, StartBufferIO(), StartLocalBufferIO(), TerminateBufferIO(), and TerminateLocalBufferIO().

Referenced by ReadBuffer_common().

Variable Documentation

◆ aio_local_buffer_readv_cb

const PgAioHandleCallbacks aio_local_buffer_readv_cb
Initial value:
= {
.complete_local = local_buffer_readv_complete,
}
static PgAioResult local_buffer_readv_complete(PgAioHandle *ioh, PgAioResult prior_result, uint8 cb_data)
Definition bufmgr.c:8707
static void local_buffer_readv_stage(PgAioHandle *ioh, uint8 cb_data)
Definition bufmgr.c:8701
static void buffer_readv_report(PgAioResult result, const PgAioTargetData *td, int elevel)
Definition bufmgr.c:8555

Definition at line 8723 of file bufmgr.c.

/*
 * AIO callback set for reads into local (backend-private) buffers.
 */
8723 {
8724 .stage = local_buffer_readv_stage,
8725
8726 /*
8727 * Note that this, in contrast to the shared_buffers case, uses
8728 * complete_local, as only the issuing backend has access to the required
8729 * datastructures. This is important in case the IO completion may be
8730 * consumed incidentally by another backend.
8731 */
8732 .complete_local = local_buffer_readv_complete,
8733 .report = buffer_readv_report,
8734};

◆ aio_shared_buffer_readv_cb

const PgAioHandleCallbacks aio_shared_buffer_readv_cb
Initial value:
= {
.complete_shared = shared_buffer_readv_complete,
}
static PgAioResult shared_buffer_readv_complete_local(PgAioHandle *ioh, PgAioResult prior_result, uint8 cb_data)
Definition bufmgr.c:8670
static void shared_buffer_readv_stage(PgAioHandle *ioh, uint8 cb_data)
Definition bufmgr.c:8650
static PgAioResult shared_buffer_readv_complete(PgAioHandle *ioh, PgAioResult prior_result, uint8 cb_data)
Definition bufmgr.c:8656

Definition at line 8714 of file bufmgr.c.

8714 {
8716 .complete_shared = shared_buffer_readv_complete,
8717 /* need a local callback to report checksum failures */
8718 .complete_local = shared_buffer_readv_complete_local,
8719 .report = buffer_readv_report,
8720};

◆ backend_flush_after

int backend_flush_after = DEFAULT_BACKEND_FLUSH_AFTER

Definition at line 225 of file bufmgr.c.

Referenced by BufferManagerShmemInit().

◆ bgwriter_flush_after

int bgwriter_flush_after = DEFAULT_BGWRITER_FLUSH_AFTER

Definition at line 224 of file bufmgr.c.

Referenced by BackgroundWriterMain().

◆ bgwriter_lru_maxpages

int bgwriter_lru_maxpages = 100

Definition at line 190 of file bufmgr.c.

Referenced by BgBufferSync().

◆ bgwriter_lru_multiplier

double bgwriter_lru_multiplier = 2.0

Definition at line 191 of file bufmgr.c.

Referenced by BgBufferSync().

◆ buffer_io_resowner_desc

const ResourceOwnerDesc buffer_io_resowner_desc
Initial value:
=
{
 .name = "buffer io",
 .release_phase = RESOURCE_RELEASE_BEFORE_LOCKS,
 .release_priority = RELEASE_PRIO_BUFFER_IOS,
 .ReleaseResource = ResOwnerReleaseBufferIO,
 .DebugPrint = ResOwnerPrintBufferIO
}
static void ResOwnerReleaseBufferIO(Datum res)
Definition bufmgr.c:7606
static char * ResOwnerPrintBufferIO(Datum res)
Definition bufmgr.c:7614
#define RELEASE_PRIO_BUFFER_IOS
Definition resowner.h:62
@ RESOURCE_RELEASE_BEFORE_LOCKS
Definition resowner.h:54

Definition at line 285 of file bufmgr.c.

286{
287 .name = "buffer io",
288 .release_phase = RESOURCE_RELEASE_BEFORE_LOCKS,
289 .release_priority = RELEASE_PRIO_BUFFER_IOS,
290 .ReleaseResource = ResOwnerReleaseBufferIO,
291 .DebugPrint = ResOwnerPrintBufferIO
292};

Referenced by ResourceOwnerForgetBufferIO(), and ResourceOwnerRememberBufferIO().

◆ buffer_resowner_desc

const ResourceOwnerDesc buffer_resowner_desc
Initial value:
=
{
 .name = "buffer",
 .release_phase = RESOURCE_RELEASE_BEFORE_LOCKS,
 .release_priority = RELEASE_PRIO_BUFFER_PINS,
 .ReleaseResource = ResOwnerReleaseBuffer,
 .DebugPrint = ResOwnerPrintBuffer
}
static void ResOwnerReleaseBuffer(Datum res)
Definition bufmgr.c:7628
static char * ResOwnerPrintBuffer(Datum res)
Definition bufmgr.c:7664
#define RELEASE_PRIO_BUFFER_PINS
Definition resowner.h:63

Definition at line 294 of file bufmgr.c.

295{
296 .name = "buffer",
297 .release_phase = RESOURCE_RELEASE_BEFORE_LOCKS,
298 .release_priority = RELEASE_PRIO_BUFFER_PINS,
299 .ReleaseResource = ResOwnerReleaseBuffer,
300 .DebugPrint = ResOwnerPrintBuffer
301};

Referenced by ResourceOwnerForgetBuffer(), and ResourceOwnerRememberBuffer().

◆ checkpoint_flush_after

int checkpoint_flush_after = DEFAULT_CHECKPOINT_FLUSH_AFTER

Definition at line 223 of file bufmgr.c.

Referenced by BufferSync().

◆ effective_io_concurrency

◆ io_combine_limit

◆ io_combine_limit_guc

int io_combine_limit_guc = DEFAULT_IO_COMBINE_LIMIT

Definition at line 216 of file bufmgr.c.

Referenced by assign_io_max_combine_limit().

◆ io_max_combine_limit

◆ maintenance_io_concurrency

◆ MaxProportionalPins

uint32 MaxProportionalPins
static

Definition at line 271 of file bufmgr.c.

Referenced by GetAdditionalPinLimit(), GetPinLimit(), and InitBufferManagerAccess().

◆ PinCountWaitBuf

BufferDesc* PinCountWaitBuf = NULL
static

Definition at line 228 of file bufmgr.c.

Referenced by LockBufferForCleanup(), and UnlockBuffers().

◆ PrivateRefCountArray

◆ PrivateRefCountArrayKeys

◆ PrivateRefCountClock

uint32 PrivateRefCountClock = 0
static

Definition at line 267 of file bufmgr.c.

Referenced by ReservePrivateRefCountEntry().

◆ PrivateRefCountEntryLast

int PrivateRefCountEntryLast = -1
static

◆ PrivateRefCountHash

◆ PrivateRefCountOverflowed

◆ ReservedRefCountSlot

int ReservedRefCountSlot = -1
static

◆ track_io_timing

◆ zero_damaged_pages

bool zero_damaged_pages = false

Definition at line 189 of file bufmgr.c.

Referenced by AsyncReadBuffers(), mdreadv(), and read_rel_block_ll().