PostgreSQL Source Code git master
Loading...
Searching...
No Matches
bufmgr.c File Reference
#include "postgres.h"
#include <sys/file.h>
#include <unistd.h>
#include "access/tableam.h"
#include "access/xloginsert.h"
#include "access/xlogutils.h"
#include "catalog/storage.h"
#include "catalog/storage_xlog.h"
#include "common/hashfn.h"
#include "executor/instrument.h"
#include "lib/binaryheap.h"
#include "miscadmin.h"
#include "pg_trace.h"
#include "pgstat.h"
#include "postmaster/bgwriter.h"
#include "storage/aio.h"
#include "storage/buf_internals.h"
#include "storage/bufmgr.h"
#include "storage/fd.h"
#include "storage/ipc.h"
#include "storage/lmgr.h"
#include "storage/proc.h"
#include "storage/proclist.h"
#include "storage/procsignal.h"
#include "storage/read_stream.h"
#include "storage/smgr.h"
#include "storage/standby.h"
#include "utils/memdebug.h"
#include "utils/ps_status.h"
#include "utils/rel.h"
#include "utils/resowner.h"
#include "utils/timestamp.h"
#include "utils/wait_event.h"
#include "lib/simplehash.h"
#include "lib/sort_template.h"
Include dependency graph for bufmgr.c:

Go to the source code of this file.

Data Structures

struct  PrivateRefCountData
 
struct  PrivateRefCountEntry
 
struct  CkptTsStatus
 
struct  SMgrSortArray
 

Macros

#define BufHdrGetBlock(bufHdr)   ((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ))
 
#define BufferGetLSN(bufHdr)   (PageGetLSN(BufHdrGetBlock(bufHdr)))
 
#define LocalBufHdrGetBlock(bufHdr)    LocalBufferBlockPointers[-((bufHdr)->buf_id + 2)]
 
#define BUF_WRITTEN   0x01
 
#define BUF_REUSABLE   0x02
 
#define RELS_BSEARCH_THRESHOLD   20
 
#define BUF_DROP_FULL_SCAN_THRESHOLD   (uint64) (NBuffers / 32)
 
#define SH_PREFIX   refcount
 
#define SH_ELEMENT_TYPE   PrivateRefCountEntry
 
#define SH_KEY_TYPE   Buffer
 
#define SH_KEY   buffer
 
#define SH_HASH_KEY(tb, key)   murmurhash32((uint32) (key))
 
#define SH_EQUAL(tb, a, b)   ((a) == (b))
 
#define SH_SCOPE   static inline
 
#define SH_DECLARE
 
#define SH_DEFINE
 
#define REFCOUNT_ARRAY_ENTRIES   8
 
#define BufferIsPinned(bufnum)
 
#define ST_SORT   sort_checkpoint_bufferids
 
#define ST_ELEMENT_TYPE   CkptSortItem
 
#define ST_COMPARE(a, b)   ckpt_buforder_comparator(a, b)
 
#define ST_SCOPE   static
 
#define ST_DEFINE
 
#define ST_SORT   sort_pending_writebacks
 
#define ST_ELEMENT_TYPE   PendingWriteback
 
#define ST_COMPARE(a, b)   buffertag_comparator(&a->tag, &b->tag)
 
#define ST_SCOPE   static
 
#define ST_DEFINE
 
#define READV_COUNT_BITS   7
 
#define READV_COUNT_MASK   ((1 << READV_COUNT_BITS) - 1)
 

Typedefs

typedef struct PrivateRefCountData PrivateRefCountData
 
typedef struct PrivateRefCountEntry PrivateRefCountEntry
 
typedef struct CkptTsStatus CkptTsStatus
 
typedef struct SMgrSortArray SMgrSortArray
 

Functions

static void ReservePrivateRefCountEntry (void)
 
static PrivateRefCountEntry * NewPrivateRefCountEntry (Buffer buffer)
 
static PrivateRefCountEntry * GetPrivateRefCountEntry (Buffer buffer, bool do_move)
 
static int32 GetPrivateRefCount (Buffer buffer)
 
static void ForgetPrivateRefCountEntry (PrivateRefCountEntry *ref)
 
static void ResOwnerReleaseBufferIO (Datum res)
 
static char * ResOwnerPrintBufferIO (Datum res)
 
static void ResOwnerReleaseBuffer (Datum res)
 
static char * ResOwnerPrintBuffer (Datum res)
 
static pg_noinline PrivateRefCountEntry * GetPrivateRefCountEntrySlow (Buffer buffer, bool do_move)
 
static Buffer ReadBuffer_common (Relation rel, SMgrRelation smgr, char smgr_persistence, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy)
 
static BlockNumber ExtendBufferedRelCommon (BufferManagerRelation bmr, ForkNumber fork, BufferAccessStrategy strategy, uint32 flags, uint32 extend_by, BlockNumber extend_upto, Buffer *buffers, uint32 *extended_by)
 
static BlockNumber ExtendBufferedRelShared (BufferManagerRelation bmr, ForkNumber fork, BufferAccessStrategy strategy, uint32 flags, uint32 extend_by, BlockNumber extend_upto, Buffer *buffers, uint32 *extended_by)
 
static bool PinBuffer (BufferDesc *buf, BufferAccessStrategy strategy, bool skip_if_not_valid)
 
static void PinBuffer_Locked (BufferDesc *buf)
 
static void UnpinBuffer (BufferDesc *buf)
 
static void UnpinBufferNoOwner (BufferDesc *buf)
 
static void BufferSync (int flags)
 
static int SyncOneBuffer (int buf_id, bool skip_recently_used, WritebackContext *wb_context)
 
static void WaitIO (BufferDesc *buf)
 
static void AbortBufferIO (Buffer buffer)
 
static void shared_buffer_write_error_callback (void *arg)
 
static void local_buffer_write_error_callback (void *arg)
 
static BufferDesc * BufferAlloc (SMgrRelation smgr, char relpersistence, ForkNumber forkNum, BlockNumber blockNum, BufferAccessStrategy strategy, bool *foundPtr, IOContext io_context)
 
static bool AsyncReadBuffers (ReadBuffersOperation *operation, int *nblocks_progress)
 
static void CheckReadBuffersOperation (ReadBuffersOperation *operation, bool is_complete)
 
static pg_attribute_always_inline void TrackBufferHit (IOObject io_object, IOContext io_context, Relation rel, char persistence, SMgrRelation smgr, ForkNumber forknum, BlockNumber blocknum)
 
static Buffer GetVictimBuffer (BufferAccessStrategy strategy, IOContext io_context)
 
static void FlushUnlockedBuffer (BufferDesc *buf, SMgrRelation reln, IOObject io_object, IOContext io_context)
 
static void FlushBuffer (BufferDesc *buf, SMgrRelation reln, IOObject io_object, IOContext io_context)
 
static void FindAndDropRelationBuffers (RelFileLocator rlocator, ForkNumber forkNum, BlockNumber nForkBlock, BlockNumber firstDelBlock)
 
static void RelationCopyStorageUsingBuffer (RelFileLocator srclocator, RelFileLocator dstlocator, ForkNumber forkNum, bool permanent)
 
static void AtProcExit_Buffers (int code, Datum arg)
 
static void CheckForBufferLeaks (void)
 
static int rlocator_comparator (const void *p1, const void *p2)
 
static int buffertag_comparator (const BufferTag *ba, const BufferTag *bb)
 
static int ckpt_buforder_comparator (const CkptSortItem *a, const CkptSortItem *b)
 
static int ts_ckpt_progress_comparator (Datum a, Datum b, void *arg)
 
static void BufferLockAcquire (Buffer buffer, BufferDesc *buf_hdr, BufferLockMode mode)
 
static void BufferLockUnlock (Buffer buffer, BufferDesc *buf_hdr)
 
static bool BufferLockConditional (Buffer buffer, BufferDesc *buf_hdr, BufferLockMode mode)
 
static bool BufferLockHeldByMeInMode (BufferDesc *buf_hdr, BufferLockMode mode)
 
static bool BufferLockHeldByMe (BufferDesc *buf_hdr)
 
static void BufferLockDisown (Buffer buffer, BufferDesc *buf_hdr)
 
static int BufferLockDisownInternal (Buffer buffer, BufferDesc *buf_hdr)
 
static bool BufferLockAttempt (BufferDesc *buf_hdr, BufferLockMode mode)
 
static void BufferLockQueueSelf (BufferDesc *buf_hdr, BufferLockMode mode)
 
static void BufferLockDequeueSelf (BufferDesc *buf_hdr)
 
static void BufferLockWakeup (BufferDesc *buf_hdr, bool wake_exclusive)
 
static void BufferLockProcessRelease (BufferDesc *buf_hdr, BufferLockMode mode, uint64 lockstate)
 
static uint64 BufferLockReleaseSub (BufferLockMode mode)
 
PrefetchBufferResult PrefetchSharedBuffer (SMgrRelation smgr_reln, ForkNumber forkNum, BlockNumber blockNum)
 
PrefetchBufferResult PrefetchBuffer (Relation reln, ForkNumber forkNum, BlockNumber blockNum)
 
bool ReadRecentBuffer (RelFileLocator rlocator, ForkNumber forkNum, BlockNumber blockNum, Buffer recent_buffer)
 
Buffer ReadBuffer (Relation reln, BlockNumber blockNum)
 
Buffer ReadBufferExtended (Relation reln, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy)
 
Buffer ReadBufferWithoutRelcache (RelFileLocator rlocator, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy, bool permanent)
 
Buffer ExtendBufferedRel (BufferManagerRelation bmr, ForkNumber forkNum, BufferAccessStrategy strategy, uint32 flags)
 
BlockNumber ExtendBufferedRelBy (BufferManagerRelation bmr, ForkNumber fork, BufferAccessStrategy strategy, uint32 flags, uint32 extend_by, Buffer *buffers, uint32 *extended_by)
 
Buffer ExtendBufferedRelTo (BufferManagerRelation bmr, ForkNumber fork, BufferAccessStrategy strategy, uint32 flags, BlockNumber extend_to, ReadBufferMode mode)
 
static void ZeroAndLockBuffer (Buffer buffer, ReadBufferMode mode, bool already_valid)
 
static pg_attribute_always_inline Buffer PinBufferForBlock (Relation rel, SMgrRelation smgr, char persistence, ForkNumber forkNum, BlockNumber blockNum, BufferAccessStrategy strategy, IOObject io_object, IOContext io_context, bool *foundPtr)
 
static pg_attribute_always_inline bool StartReadBuffersImpl (ReadBuffersOperation *operation, Buffer *buffers, BlockNumber blockNum, int *nblocks, int flags, bool allow_forwarding)
 
bool StartReadBuffers (ReadBuffersOperation *operation, Buffer *buffers, BlockNumber blockNum, int *nblocks, int flags)
 
bool StartReadBuffer (ReadBuffersOperation *operation, Buffer *buffer, BlockNumber blocknum, int flags)
 
static void ProcessReadBuffersResult (ReadBuffersOperation *operation)
 
bool WaitReadBuffers (ReadBuffersOperation *operation)
 
static void InvalidateBuffer (BufferDesc *buf)
 
static bool InvalidateVictimBuffer (BufferDesc *buf_hdr)
 
uint32 GetPinLimit (void)
 
uint32 GetAdditionalPinLimit (void)
 
void LimitAdditionalPins (uint32 *additional_pins)
 
bool BufferIsLockedByMe (Buffer buffer)
 
bool BufferIsLockedByMeInMode (Buffer buffer, BufferLockMode mode)
 
bool BufferIsDirty (Buffer buffer)
 
void MarkBufferDirty (Buffer buffer)
 
Buffer ReleaseAndReadBuffer (Buffer buffer, Relation relation, BlockNumber blockNum)
 
static void WakePinCountWaiter (BufferDesc *buf)
 
void TrackNewBufferPin (Buffer buf)
 
bool BgBufferSync (WritebackContext *wb_context)
 
void AtEOXact_Buffers (bool isCommit)
 
void InitBufferManagerAccess (void)
 
char * DebugPrintBufferRefcount (Buffer buffer)
 
void CheckPointBuffers (int flags)
 
BlockNumber BufferGetBlockNumber (Buffer buffer)
 
void BufferGetTag (Buffer buffer, RelFileLocator *rlocator, ForkNumber *forknum, BlockNumber *blknum)
 
BlockNumber RelationGetNumberOfBlocksInFork (Relation relation, ForkNumber forkNum)
 
bool BufferIsPermanent (Buffer buffer)
 
XLogRecPtr BufferGetLSNAtomic (Buffer buffer)
 
void DropRelationBuffers (SMgrRelation smgr_reln, ForkNumber *forkNum, int nforks, BlockNumber *firstDelBlock)
 
void DropRelationsAllBuffers (SMgrRelation *smgr_reln, int nlocators)
 
void DropDatabaseBuffers (Oid dbid)
 
void FlushRelationBuffers (Relation rel)
 
void FlushRelationsAllBuffers (SMgrRelation *smgrs, int nrels)
 
void CreateAndCopyRelationData (RelFileLocator src_rlocator, RelFileLocator dst_rlocator, bool permanent)
 
void FlushDatabaseBuffers (Oid dbid)
 
void FlushOneBuffer (Buffer buffer)
 
void ReleaseBuffer (Buffer buffer)
 
void UnlockReleaseBuffer (Buffer buffer)
 
void IncrBufferRefCount (Buffer buffer)
 
static void MarkSharedBufferDirtyHint (Buffer buffer, BufferDesc *bufHdr, uint64 lockstate, bool buffer_std)
 
void MarkBufferDirtyHint (Buffer buffer, bool buffer_std)
 
void UnlockBuffers (void)
 
void UnlockBuffer (Buffer buffer)
 
void LockBufferInternal (Buffer buffer, BufferLockMode mode)
 
bool ConditionalLockBuffer (Buffer buffer)
 
void CheckBufferIsPinnedOnce (Buffer buffer)
 
void LockBufferForCleanup (Buffer buffer)
 
bool HoldingBufferPinThatDelaysRecovery (void)
 
bool ConditionalLockBufferForCleanup (Buffer buffer)
 
bool IsBufferCleanupOK (Buffer buffer)
 
static bool SharedBufferBeginSetHintBits (Buffer buffer, BufferDesc *buf_hdr, uint64 *lockstate)
 
bool BufferBeginSetHintBits (Buffer buffer)
 
void BufferFinishSetHintBits (Buffer buffer, bool mark_dirty, bool buffer_std)
 
bool BufferSetHintBits16 (uint16 *ptr, uint16 val, Buffer buffer)
 
StartBufferIOResult StartSharedBufferIO (BufferDesc *buf, bool forInput, bool wait, PgAioWaitRef *io_wref)
 
StartBufferIOResult StartBufferIO (Buffer buffer, bool forInput, bool wait, PgAioWaitRef *io_wref)
 
void TerminateBufferIO (BufferDesc *buf, bool clear_dirty, uint64 set_flag_bits, bool forget_owner, bool release_aio)
 
uint64 LockBufHdr (BufferDesc *desc)
 
pg_noinline uint64 WaitBufHdrUnlocked (BufferDesc *buf)
 
void WritebackContextInit (WritebackContext *context, int *max_pending)
 
void ScheduleBufferTagForWriteback (WritebackContext *wb_context, IOContext io_context, BufferTag *tag)
 
void IssuePendingWritebacks (WritebackContext *wb_context, IOContext io_context)
 
static bool EvictUnpinnedBufferInternal (BufferDesc *desc, bool *buffer_flushed)
 
bool EvictUnpinnedBuffer (Buffer buf, bool *buffer_flushed)
 
void EvictAllUnpinnedBuffers (int32 *buffers_evicted, int32 *buffers_flushed, int32 *buffers_skipped)
 
void EvictRelUnpinnedBuffers (Relation rel, int32 *buffers_evicted, int32 *buffers_flushed, int32 *buffers_skipped)
 
static bool MarkDirtyUnpinnedBufferInternal (Buffer buf, BufferDesc *desc, bool *buffer_already_dirty)
 
bool MarkDirtyUnpinnedBuffer (Buffer buf, bool *buffer_already_dirty)
 
void MarkDirtyRelUnpinnedBuffers (Relation rel, int32 *buffers_dirtied, int32 *buffers_already_dirty, int32 *buffers_skipped)
 
void MarkDirtyAllUnpinnedBuffers (int32 *buffers_dirtied, int32 *buffers_already_dirty, int32 *buffers_skipped)
 
static pg_attribute_always_inline void buffer_stage_common (PgAioHandle *ioh, bool is_write, bool is_temp)
 
static void buffer_readv_decode_error (PgAioResult result, bool *zeroed_any, bool *ignored_any, uint8 *zeroed_or_error_count, uint8 *checkfail_count, uint8 *first_off)
 
static void buffer_readv_encode_error (PgAioResult *result, bool is_temp, bool zeroed_any, bool ignored_any, uint8 error_count, uint8 zeroed_count, uint8 checkfail_count, uint8 first_error_off, uint8 first_zeroed_off, uint8 first_ignored_off)
 
static pg_attribute_always_inline void buffer_readv_complete_one (PgAioTargetData *td, uint8 buf_off, Buffer buffer, uint8 flags, bool failed, bool is_temp, bool *buffer_invalid, bool *failed_checksum, bool *ignored_checksum, bool *zeroed_buffer)
 
static pg_attribute_always_inline PgAioResult buffer_readv_complete (PgAioHandle *ioh, PgAioResult prior_result, uint8 cb_data, bool is_temp)
 
static void buffer_readv_report (PgAioResult result, const PgAioTargetData *td, int elevel)
 
static void shared_buffer_readv_stage (PgAioHandle *ioh, uint8 cb_data)
 
static PgAioResult shared_buffer_readv_complete (PgAioHandle *ioh, PgAioResult prior_result, uint8 cb_data)
 
static PgAioResult shared_buffer_readv_complete_local (PgAioHandle *ioh, PgAioResult prior_result, uint8 cb_data)
 
static void local_buffer_readv_stage (PgAioHandle *ioh, uint8 cb_data)
 
static PgAioResult local_buffer_readv_complete (PgAioHandle *ioh, PgAioResult prior_result, uint8 cb_data)
 

Variables

bool zero_damaged_pages = false
 
int bgwriter_lru_maxpages = 100
 
double bgwriter_lru_multiplier = 2.0
 
bool track_io_timing = false
 
int effective_io_concurrency = DEFAULT_EFFECTIVE_IO_CONCURRENCY
 
int maintenance_io_concurrency = DEFAULT_MAINTENANCE_IO_CONCURRENCY
 
int io_combine_limit = DEFAULT_IO_COMBINE_LIMIT
 
int io_combine_limit_guc = DEFAULT_IO_COMBINE_LIMIT
 
int io_max_combine_limit = DEFAULT_IO_COMBINE_LIMIT
 
int checkpoint_flush_after = DEFAULT_CHECKPOINT_FLUSH_AFTER
 
int bgwriter_flush_after = DEFAULT_BGWRITER_FLUSH_AFTER
 
int backend_flush_after = DEFAULT_BACKEND_FLUSH_AFTER
 
static BufferDesc * PinCountWaitBuf = NULL
 
static Buffer PrivateRefCountArrayKeys [REFCOUNT_ARRAY_ENTRIES]
 
static struct PrivateRefCountEntry PrivateRefCountArray [REFCOUNT_ARRAY_ENTRIES]
 
static refcount_hash * PrivateRefCountHash = NULL
 
static int32 PrivateRefCountOverflowed = 0
 
static uint32 PrivateRefCountClock = 0
 
static int ReservedRefCountSlot = -1
 
static int PrivateRefCountEntryLast = -1
 
static uint32 MaxProportionalPins
 
const ResourceOwnerDesc buffer_io_resowner_desc
 
const ResourceOwnerDesc buffer_resowner_desc
 
const PgAioHandleCallbacks aio_shared_buffer_readv_cb
 
const PgAioHandleCallbacks aio_local_buffer_readv_cb
 

Macro Definition Documentation

◆ BUF_DROP_FULL_SCAN_THRESHOLD

#define BUF_DROP_FULL_SCAN_THRESHOLD   (uint64) (NBuffers / 32)

Definition at line 95 of file bufmgr.c.

◆ BUF_REUSABLE

#define BUF_REUSABLE   0x02

Definition at line 85 of file bufmgr.c.

◆ BUF_WRITTEN

#define BUF_WRITTEN   0x01

Definition at line 84 of file bufmgr.c.

◆ BufferGetLSN

#define BufferGetLSN (   bufHdr)    (PageGetLSN(BufHdrGetBlock(bufHdr)))

Definition at line 77 of file bufmgr.c.

◆ BufferIsPinned

#define BufferIsPinned (   bufnum)
Value:
( \
    !BufferIsValid(bufnum) ? \
        false \
    : \
        BufferIsLocal(bufnum) ? \
            (LocalRefCount[-(bufnum) - 1] > 0) \
        : \
    (GetPrivateRefCount(bufnum) > 0) \
)
static int32 GetPrivateRefCount(Buffer buffer)
Definition bufmgr.c:542
static bool BufferIsValid(Buffer bufnum)
Definition bufmgr.h:419
int32 * LocalRefCount
Definition localbuf.c:49
static int fb(int x)

Definition at line 599 of file bufmgr.c.

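599#define BufferIsPinned(bufnum) \
600( \
601 !BufferIsValid(bufnum) ? \
602 false \
603 : \
604 BufferIsLocal(bufnum) ? \
605 (LocalRefCount[-(bufnum) - 1] > 0) \
606 : \
607 (GetPrivateRefCount(bufnum) > 0) \
608)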

◆ BufHdrGetBlock

#define BufHdrGetBlock (   bufHdr)    ((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ))

Definition at line 76 of file bufmgr.c.

◆ LocalBufHdrGetBlock

#define LocalBufHdrGetBlock (   bufHdr)     LocalBufferBlockPointers[-((bufHdr)->buf_id + 2)]

Definition at line 80 of file bufmgr.c.

◆ READV_COUNT_BITS

#define READV_COUNT_BITS   7

◆ READV_COUNT_MASK

#define READV_COUNT_MASK   ((1 << READV_COUNT_BITS) - 1)

◆ REFCOUNT_ARRAY_ENTRIES

#define REFCOUNT_ARRAY_ENTRIES   8

Definition at line 145 of file bufmgr.c.

◆ RELS_BSEARCH_THRESHOLD

#define RELS_BSEARCH_THRESHOLD   20

Definition at line 87 of file bufmgr.c.

◆ SH_DECLARE

#define SH_DECLARE

Definition at line 140 of file bufmgr.c.

◆ SH_DEFINE

#define SH_DEFINE

Definition at line 141 of file bufmgr.c.

◆ SH_ELEMENT_TYPE

#define SH_ELEMENT_TYPE   PrivateRefCountEntry

Definition at line 134 of file bufmgr.c.

◆ SH_EQUAL

#define SH_EQUAL (   tb,
  a,
  b 
)    ((a) == (b))

Definition at line 138 of file bufmgr.c.

◆ SH_HASH_KEY

#define SH_HASH_KEY (   tb,
  key 
)    murmurhash32((uint32) (key))

Definition at line 137 of file bufmgr.c.

◆ SH_KEY

#define SH_KEY   buffer

Definition at line 136 of file bufmgr.c.

◆ SH_KEY_TYPE

#define SH_KEY_TYPE   Buffer

Definition at line 135 of file bufmgr.c.

◆ SH_PREFIX

#define SH_PREFIX   refcount

Definition at line 133 of file bufmgr.c.

◆ SH_SCOPE

#define SH_SCOPE   static inline

Definition at line 139 of file bufmgr.c.

◆ ST_COMPARE [1/2]

#define ST_COMPARE (   a,
  b 
)    ckpt_buforder_comparator(a, b)

Definition at line 3545 of file bufmgr.c.

◆ ST_COMPARE [2/2]

#define ST_COMPARE (   a,
  b 
)    buffertag_comparator(&a->tag, &b->tag)

Definition at line 3545 of file bufmgr.c.

◆ ST_DEFINE [1/2]

#define ST_DEFINE

Definition at line 3547 of file bufmgr.c.

◆ ST_DEFINE [2/2]

#define ST_DEFINE

Definition at line 3547 of file bufmgr.c.

◆ ST_ELEMENT_TYPE [1/2]

#define ST_ELEMENT_TYPE   CkptSortItem

Definition at line 3544 of file bufmgr.c.

◆ ST_ELEMENT_TYPE [2/2]

#define ST_ELEMENT_TYPE   PendingWriteback

Definition at line 3544 of file bufmgr.c.

◆ ST_SCOPE [1/2]

#define ST_SCOPE   static

Definition at line 3546 of file bufmgr.c.

◆ ST_SCOPE [2/2]

#define ST_SCOPE   static

Definition at line 3546 of file bufmgr.c.

◆ ST_SORT [1/2]

#define ST_SORT   sort_checkpoint_bufferids

Definition at line 3543 of file bufmgr.c.

◆ ST_SORT [2/2]

#define ST_SORT   sort_pending_writebacks

Definition at line 3543 of file bufmgr.c.

Typedef Documentation

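◆ CkptTsStatus

typedef struct CkptTsStatus CkptTsStatus

◆ PrivateRefCountData

typedef struct PrivateRefCountData PrivateRefCountData

◆ PrivateRefCountEntry

typedef struct PrivateRefCountEntry PrivateRefCountEntry

◆ SMgrSortArray

typedef struct SMgrSortArray SMgrSortArray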

Function Documentation

◆ AbortBufferIO()

static void AbortBufferIO ( Buffer  buffer)
static

Definition at line 7429 of file bufmgr.c.

7430{
7431 BufferDesc *buf_hdr = GetBufferDescriptor(buffer - 1);
7433
7436
7437 if (!(buf_state & BM_VALID))
7438 {
7441 }
7442 else
7443 {
7446
7447 /* Issue notice if this is not the first failure... */
7448 if (buf_state & BM_IO_ERROR)
7449 {
7450 /* Buffer is pinned, so we can read tag without spinlock */
7453 errmsg("could not write block %u of %s",
7454 buf_hdr->tag.blockNum,
7456 BufTagGetForkNum(&buf_hdr->tag)).str),
7457 errdetail("Multiple failures --- write error might be permanent.")));
7458 }
7459 }
7460
7461 TerminateBufferIO(buf_hdr, false, BM_IO_ERROR, false, false);
7462}
#define BM_TAG_VALID
static ForkNumber BufTagGetForkNum(const BufferTag *tag)
static void UnlockBufHdr(BufferDesc *desc)
#define BM_DIRTY
#define BM_IO_IN_PROGRESS
static RelFileLocator BufTagGetRelFileLocator(const BufferTag *tag)
#define BM_VALID
#define BM_IO_ERROR
static BufferDesc * GetBufferDescriptor(uint32 id)
uint64 LockBufHdr(BufferDesc *desc)
Definition bufmgr.c:7527
void TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint64 set_flag_bits, bool forget_owner, bool release_aio)
Definition bufmgr.c:7367
#define Assert(condition)
Definition c.h:943
uint64_t uint64
Definition c.h:625
int errcode(int sqlerrcode)
Definition elog.c:875
int errdetail(const char *fmt,...) pg_attribute_printf(1
#define WARNING
Definition elog.h:37
#define ereport(elevel,...)
Definition elog.h:152
static char * errmsg
#define relpathperm(rlocator, forknum)
Definition relpath.h:146

References Assert, BM_DIRTY, BM_IO_ERROR, BM_IO_IN_PROGRESS, BM_TAG_VALID, BM_VALID, PrivateRefCountEntry::buffer, BufTagGetForkNum(), BufTagGetRelFileLocator(), ereport, errcode(), errdetail(), errmsg, fb(), GetBufferDescriptor(), LockBufHdr(), relpathperm, TerminateBufferIO(), UnlockBufHdr(), and WARNING.

Referenced by ResOwnerReleaseBufferIO().

◆ AsyncReadBuffers()

static bool AsyncReadBuffers ( ReadBuffersOperation *  operation,
int *  nblocks_progress 
)
static

Definition at line 1938 of file bufmgr.c.

1939{
1940 Buffer *buffers = &operation->buffers[0];
1941 int flags = operation->flags;
1942 ForkNumber forknum = operation->forknum;
1943 char persistence = operation->persistence;
1944 int16 nblocks_done = operation->nblocks_done;
1945 BlockNumber blocknum = operation->blocknum + nblocks_done;
1946 Buffer *io_buffers = &operation->buffers[nblocks_done];
1947 int io_buffers_len = 0;
1949 uint32 ioh_flags = 0;
1954 StartBufferIOResult status;
1955
1956 if (persistence == RELPERSISTENCE_TEMP)
1957 {
1960 }
1961 else
1962 {
1965 }
1966
1967 /*
1968 * When this IO is executed synchronously, either because the caller will
1969 * immediately block waiting for the IO or because IOMETHOD_SYNC is used,
1970 * the AIO subsystem needs to know.
1971 */
1972 if (flags & READ_BUFFERS_SYNCHRONOUSLY)
1974
1975 if (persistence == RELPERSISTENCE_TEMP)
1977
1978 /*
1979 * If zero_damaged_pages is enabled, add the READ_BUFFERS_ZERO_ON_ERROR
1980 * flag. The reason for that is that, hopefully, zero_damaged_pages isn't
1981 * set globally, but on a per-session basis. The completion callback,
1982 * which may be run in other processes, e.g. in IO workers, may have a
1983 * different value of the zero_damaged_pages GUC.
1984 *
1985 * XXX: We probably should eventually use a different flag for
1986 * zero_damaged_pages, so we can report different log levels / error codes
1987 * for zero_damaged_pages and ZERO_ON_ERROR.
1988 */
1991
1992 /*
1993 * For the same reason as with zero_damaged_pages we need to use this
1994 * backend's ignore_checksum_failure value.
1995 */
1998
1999
2000 /*
2001 * To be allowed to report stats in the local completion callback we need
2002 * to prepare to report stats now. This ensures we can safely report the
2003 * checksum failure even in a critical section.
2004 */
2005 pgstat_prepare_report_checksum_failure(operation->smgr->smgr_rlocator.locator.dbOid);
2006
2007 /*
2008 * We must get an IO handle before StartBufferIO(), as pgaio_io_acquire()
2009 * might block, which we don't want after setting IO_IN_PROGRESS. If we
2010 * don't need to do the IO, we'll release the handle.
2011 *
2012 * If we need to wait for IO before we can get a handle, submit
2013 * already-staged IO first, so that other backends don't need to wait.
2014 * There wouldn't be a deadlock risk, as pgaio_io_acquire() just needs to
2015 * wait for already submitted IO, which doesn't require additional locks,
2016 * but it could still cause undesirable waits.
2017 *
2018 * A secondary benefit is that this would allow us to measure the time in
2019 * pgaio_io_acquire() without causing undue timer overhead in the common,
2020 * non-blocking, case. However, currently the pgstats infrastructure
2021 * doesn't really allow that, as it a) asserts that an operation can't
2022 * have time without operations b) doesn't have an API to report
2023 * "accumulated" time.
2024 */
2026 if (unlikely(!ioh))
2027 {
2030 }
2031
2032 operation->foreign_io = false;
2033 pgaio_wref_clear(&operation->io_wref);
2034
2035 /*
2036 * Try to start IO on the first buffer in a new run of blocks. If AIO is
2037 * in progress, be it in this backend or another backend, we just
2038 * associate the wait reference with the operation and wait in
2039 * WaitReadBuffers(). This turns out to be important for performance in
2040 * two workloads:
2041 *
2042 * 1) A read stream that has to read the same block multiple times within
2043 * the readahead distance. This can happen e.g. for the table accesses of
2044 * an index scan.
2045 *
2046 * 2) Concurrent scans by multiple backends on the same relation.
2047 *
2048 * If we were to synchronously wait for the in-progress IO, we'd not be
2049 * able to keep enough I/O in flight.
2050 *
2051 * If we do find there is ongoing I/O for the buffer, we set up a 1-block
2052 * ReadBuffersOperation that WaitReadBuffers then can wait on.
2053 *
2054 * It's possible that another backend has started IO on the buffer but not
2055 * yet set its wait reference. In this case, we have no choice but to wait
2056 * for either the wait reference to be valid or the IO to be done.
2057 */
2058 status = StartBufferIO(buffers[nblocks_done], true, true,
2059 &operation->io_wref);
2060 if (status != BUFFER_IO_READY_FOR_IO)
2061 {
2063 *nblocks_progress = 1;
2064 if (status == BUFFER_IO_ALREADY_DONE)
2065 {
2066 /*
2067 * Someone has already completed this block, we're done.
2068 *
2069 * When IO is necessary, ->nblocks_done is updated in
2070 * ProcessReadBuffersResult(), but that is not called if no IO is
2071 * necessary. Thus update here.
2072 */
2073 operation->nblocks_done += 1;
2074 Assert(operation->nblocks_done <= operation->nblocks);
2075
2076 Assert(!pgaio_wref_valid(&operation->io_wref));
2077
2078 /*
2079 * Report and track this as a 'hit' for this backend, even though
2080 * it must have started out as a miss in PinBufferForBlock(). The
2081 * other backend will track this as a 'read'.
2082 */
2084 operation->rel, operation->persistence,
2085 operation->smgr, operation->forknum,
2086 blocknum);
2087 return false;
2088 }
2089
2090 /* The IO is already in-progress */
2091 Assert(status == BUFFER_IO_IN_PROGRESS);
2092 Assert(pgaio_wref_valid(&operation->io_wref));
2093 operation->foreign_io = true;
2094
2095 return true;
2096 }
2097
2098 Assert(io_buffers[0] == buffers[nblocks_done]);
2099 io_pages[0] = BufferGetBlock(buffers[nblocks_done]);
2100 io_buffers_len = 1;
2101
2102 /*
2103 * NB: As little code as possible should be added between the
2104 * StartBufferIO() above, the further StartBufferIO()s below and the
2105 * smgrstartreadv(), as some of the buffers are now marked as
2106 * IO_IN_PROGRESS and will thus cause other backends to wait.
2107 */
2108
2109 /*
2110 * How many neighboring-on-disk blocks can we scatter-read into other
2111 * buffers at the same time? In this case we don't wait if we see an I/O
2112 * already in progress (see comment above).
2113 */
2114 for (int i = nblocks_done + 1; i < operation->nblocks; i++)
2115 {
2116 /* Must be consecutive block numbers. */
2117 Assert(BufferGetBlockNumber(buffers[i - 1]) ==
2118 BufferGetBlockNumber(buffers[i]) - 1);
2119
2120 status = StartBufferIO(buffers[i], true, false, NULL);
2121 if (status != BUFFER_IO_READY_FOR_IO)
2122 break;
2123
2124 Assert(io_buffers[io_buffers_len] == buffers[i]);
2125
2126 io_pages[io_buffers_len++] = BufferGetBlock(buffers[i]);
2127 }
2128
2129 /* get a reference to wait for in WaitReadBuffers() */
2130 pgaio_io_get_wref(ioh, &operation->io_wref);
2131
2132 /* provide the list of buffers to the completion callbacks */
2134
2136 persistence == RELPERSISTENCE_TEMP ?
2139 flags);
2140
2142
2143 /* ---
2144 * Even though we're trying to issue IO asynchronously, track the time
2145 * in smgrstartreadv():
2146 * - if io_method == IOMETHOD_SYNC, we will always perform the IO
2147 * immediately
2148 * - the io method might not support the IO (e.g. worker IO for a temp
2149 * table)
2150 * ---
2151 */
2153 smgrstartreadv(ioh, operation->smgr, forknum,
2154 blocknum,
2158
2159 if (persistence == RELPERSISTENCE_TEMP)
2161 else
2163
2164 /*
2165 * Track vacuum cost when issuing IO, not after waiting for it. Otherwise
2166 * we could end up issuing a lot of IO in a short timespan, despite a low
2167 * cost limit.
2168 */
2169 if (VacuumCostActive)
2171
2173
2174 return true;
2175}
bool pgaio_wref_valid(PgAioWaitRef *iow)
Definition aio.c:971
PgAioHandle * pgaio_io_acquire(struct ResourceOwnerData *resowner, PgAioReturn *ret)
Definition aio.c:162
void pgaio_wref_clear(PgAioWaitRef *iow)
Definition aio.c:964
void pgaio_io_get_wref(PgAioHandle *ioh, PgAioWaitRef *iow)
Definition aio.c:366
void pgaio_io_set_flag(PgAioHandle *ioh, PgAioHandleFlags flag)
Definition aio.c:330
void pgaio_submit_staged(void)
Definition aio.c:1133
void pgaio_io_release(PgAioHandle *ioh)
Definition aio.c:240
PgAioHandle * pgaio_io_acquire_nb(struct ResourceOwnerData *resowner, PgAioReturn *ret)
Definition aio.c:188
@ PGAIO_HCB_LOCAL_BUFFER_READV
Definition aio.h:200
@ PGAIO_HCB_SHARED_BUFFER_READV
Definition aio.h:198
@ PGAIO_HF_SYNCHRONOUS
Definition aio.h:70
@ PGAIO_HF_REFERENCES_LOCAL
Definition aio.h:60
void pgaio_io_set_handle_data_32(PgAioHandle *ioh, uint32 *data, uint8 len)
void pgaio_io_register_callbacks(PgAioHandle *ioh, PgAioHandleCallbackID cb_id, uint8 cb_data)
uint32 BlockNumber
Definition block.h:31
int Buffer
Definition buf.h:23
StartBufferIOResult
@ BUFFER_IO_IN_PROGRESS
@ BUFFER_IO_ALREADY_DONE
@ BUFFER_IO_READY_FOR_IO
bool track_io_timing
Definition bufmgr.c:192
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition bufmgr.c:4455
bool zero_damaged_pages
Definition bufmgr.c:189
static pg_attribute_always_inline void TrackBufferHit(IOObject io_object, IOContext io_context, Relation rel, char persistence, SMgrRelation smgr, ForkNumber forknum, BlockNumber blocknum)
Definition bufmgr.c:1683
StartBufferIOResult StartBufferIO(Buffer buffer, bool forInput, bool wait, PgAioWaitRef *io_wref)
Definition bufmgr.c:7330
#define READ_BUFFERS_ZERO_ON_ERROR
Definition bufmgr.h:122
static Block BufferGetBlock(Buffer buffer)
Definition bufmgr.h:435
#define MAX_IO_COMBINE_LIMIT
Definition bufmgr.h:175
#define READ_BUFFERS_IGNORE_CHECKSUM_FAILURES
Definition bufmgr.h:126
#define READ_BUFFERS_SYNCHRONOUSLY
Definition bufmgr.h:128
bool ignore_checksum_failure
Definition bufpage.c:27
int16_t int16
Definition c.h:619
#define unlikely(x)
Definition c.h:438
uint32_t uint32
Definition c.h:624
static DataChecksumsWorkerOperation operation
IOContext IOContextForStrategy(BufferAccessStrategy strategy)
Definition freelist.c:712
int VacuumCostPageMiss
Definition globals.c:155
bool VacuumCostActive
Definition globals.c:161
int VacuumCostBalance
Definition globals.c:160
BufferUsage pgBufferUsage
Definition instrument.c:25
int i
Definition isn.c:77
IOObject
Definition pgstat.h:280
@ IOOBJECT_RELATION
Definition pgstat.h:281
@ IOOBJECT_TEMP_RELATION
Definition pgstat.h:282
IOContext
Definition pgstat.h:289
@ IOCONTEXT_NORMAL
Definition pgstat.h:293
@ IOOP_READ
Definition pgstat.h:319
void pgstat_prepare_report_checksum_failure(Oid dboid)
instr_time pgstat_prepare_io_time(bool track_io_guc)
Definition pgstat_io.c:91
void pgstat_count_io_op_time(IOObject io_object, IOContext io_context, IOOp io_op, instr_time start_time, uint32 cnt, uint64 bytes)
Definition pgstat_io.c:122
ForkNumber
Definition relpath.h:56
ResourceOwner CurrentResourceOwner
Definition resowner.c:173
void smgrstartreadv(PgAioHandle *ioh, SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, void **buffers, BlockNumber nblocks)
Definition smgr.c:753
int64 shared_blks_read
Definition instrument.h:27
int64 local_blks_read
Definition instrument.h:31

References Assert, BUFFER_IO_ALREADY_DONE, BUFFER_IO_IN_PROGRESS, BUFFER_IO_READY_FOR_IO, BufferGetBlock(), BufferGetBlockNumber(), CurrentResourceOwner, fb(), i, ignore_checksum_failure, IOCONTEXT_NORMAL, IOContextForStrategy(), IOOBJECT_RELATION, IOOBJECT_TEMP_RELATION, IOOP_READ, BufferUsage::local_blks_read, MAX_IO_COMBINE_LIMIT, operation, PGAIO_HCB_LOCAL_BUFFER_READV, PGAIO_HCB_SHARED_BUFFER_READV, PGAIO_HF_REFERENCES_LOCAL, PGAIO_HF_SYNCHRONOUS, pgaio_io_acquire(), pgaio_io_acquire_nb(), pgaio_io_get_wref(), pgaio_io_register_callbacks(), pgaio_io_release(), pgaio_io_set_flag(), pgaio_io_set_handle_data_32(), pgaio_submit_staged(), pgaio_wref_clear(), pgaio_wref_valid(), pgBufferUsage, pgstat_count_io_op_time(), pgstat_prepare_io_time(), pgstat_prepare_report_checksum_failure(), READ_BUFFERS_IGNORE_CHECKSUM_FAILURES, READ_BUFFERS_SYNCHRONOUSLY, READ_BUFFERS_ZERO_ON_ERROR, BufferUsage::shared_blks_read, smgrstartreadv(), StartBufferIO(), PrivateRefCountEntry::status, track_io_timing, TrackBufferHit(), unlikely, VacuumCostActive, VacuumCostBalance, VacuumCostPageMiss, and zero_damaged_pages.

Referenced by StartReadBuffersImpl(), and WaitReadBuffers().

◆ AtEOXact_Buffers()

void AtEOXact_Buffers ( bool  isCommit)

Definition at line 4208 of file bufmgr.c.

4209{
4211
4213
4215}
static void CheckForBufferLeaks(void)
Definition bufmgr.c:4272
static int32 PrivateRefCountOverflowed
Definition bufmgr.c:266
void AtEOXact_LocalBuffers(bool isCommit)
Definition localbuf.c:1027

References Assert, AtEOXact_LocalBuffers(), CheckForBufferLeaks(), fb(), and PrivateRefCountOverflowed.

Referenced by AbortTransaction(), AutoVacLauncherMain(), BackgroundWriterMain(), CheckpointerMain(), CommitTransaction(), PrepareTransaction(), and WalWriterMain().

◆ AtProcExit_Buffers()

static void AtProcExit_Buffers ( int  code,
Datum  arg 
)
static

Definition at line 4254 of file bufmgr.c.

4255{
4256 UnlockBuffers();
4257
4259
4260 /* localbuf.c needs a chance too */
4262}
void UnlockBuffers(void)
Definition bufmgr.c:5861
void AtProcExit_LocalBuffers(void)
Definition localbuf.c:1038

References AtProcExit_LocalBuffers(), CheckForBufferLeaks(), and UnlockBuffers().

Referenced by InitBufferManagerAccess().

◆ BgBufferSync()

bool BgBufferSync ( WritebackContext wb_context)

Definition at line 3840 of file bufmgr.c.

3841{
3842 /* info obtained from freelist.c */
3843 int strategy_buf_id;
3846
3847 /*
3848 * Information saved between calls so we can determine the strategy
3849 * point's advance rate and avoid scanning already-cleaned buffers.
3850 */
3851 static bool saved_info_valid = false;
3852 static int prev_strategy_buf_id;
3854 static int next_to_clean;
3855 static uint32 next_passes;
3856
3857 /* Moving averages of allocation rate and clean-buffer density */
3858 static float smoothed_alloc = 0;
3859 static float smoothed_density = 10.0;
3860
3861 /* Potentially these could be tunables, but for now, not */
3862 float smoothing_samples = 16;
3863 float scan_whole_pool_milliseconds = 120000.0;
3864
3865 /* Used to compute how far we scan ahead */
3866 long strategy_delta;
3867 int bufs_to_lap;
3868 int bufs_ahead;
3869 float scans_per_alloc;
3872 int min_scan_buffers;
3873
3874 /* Variables for the scanning loop proper */
3875 int num_to_scan;
3876 int num_written;
3877 int reusable_buffers;
3878
3879 /* Variables for final smoothed_density update */
3880 long new_strategy_delta;
3882
3883 /*
3884 * Find out where the clock-sweep currently is, and how many buffer
3885 * allocations have happened since our last call.
3886 */
3888
3889 /* Report buffer alloc counts to pgstat */
3891
3892 /*
3893 * If we're not running the LRU scan, just stop after doing the stats
3894 * stuff. We mark the saved state invalid so that we can recover sanely
3895 * if LRU scan is turned back on later.
3896 */
3897 if (bgwriter_lru_maxpages <= 0)
3898 {
3899 saved_info_valid = false;
3900 return true;
3901 }
3902
3903 /*
3904 * Compute strategy_delta = how many buffers have been scanned by the
3905 * clock-sweep since last time. If first time through, assume none. Then
3906 * see if we are still ahead of the clock-sweep, and if so, how many
3907 * buffers we could scan before we'd catch up with it and "lap" it. Note:
3908 * the weird-looking coding of the xxx_passes comparisons is there to
3909 * avoid bogus behavior when the pass counts wrap around.
3910 */
3911 if (saved_info_valid)
3912 {
3914
3917
3918 Assert(strategy_delta >= 0);
3919
3920 if ((int32) (next_passes - strategy_passes) > 0)
3921 {
3922 /* we're one pass ahead of the strategy point */
3924#ifdef BGW_DEBUG
3925 elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
3929#endif
3930 }
3931 else if (next_passes == strategy_passes &&
3933 {
3934 /* on same pass, but ahead or at least not behind */
3936#ifdef BGW_DEBUG
3937 elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
3941#endif
3942 }
3943 else
3944 {
3945 /*
3946 * We're behind, so skip forward to the strategy point and start
3947 * cleaning from there.
3948 */
3949#ifdef BGW_DEBUG
3950 elog(DEBUG2, "bgwriter behind: bgw %u-%u strategy %u-%u delta=%ld",
3954#endif
3958 }
3959 }
3960 else
3961 {
3962 /*
3963 * Initializing at startup or after LRU scanning had been off. Always
3964 * start at the strategy point.
3965 */
3966#ifdef BGW_DEBUG
3967 elog(DEBUG2, "bgwriter initializing: strategy %u-%u",
3969#endif
3970 strategy_delta = 0;
3974 }
3975
3976 /* Update saved info for next time */
3979 saved_info_valid = true;
3980
3981 /*
3982 * Compute how many buffers had to be scanned for each new allocation, ie,
3983 * 1/density of reusable buffers, and track a moving average of that.
3984 *
3985 * If the strategy point didn't move, we don't update the density estimate.
3986 */
3987 if (strategy_delta > 0 && recent_alloc > 0)
3988 {
3992 }
3993
3994 /*
3995 * Estimate how many reusable buffers there are between the current
3996 * strategy point and where we've scanned ahead to, based on the smoothed
3997 * density estimate.
3998 */
4001
4002 /*
4003 * Track a moving average of recent buffer allocations. Here, rather than
4004 * a true average we want a fast-attack, slow-decline behavior: we
4005 * immediately follow any increase.
4006 */
4007 if (smoothed_alloc <= (float) recent_alloc)
4009 else
4012
4013 /* Scale the estimate by a GUC to allow more aggressive tuning. */
4015
4016 /*
4017 * If recent_alloc remains at zero for many cycles, smoothed_alloc will
4018 * eventually underflow to zero, and the underflows produce annoying
4019 * kernel warnings on some platforms. Once upcoming_alloc_est has gone to
4020 * zero, there's no point in tracking smaller and smaller values of
4021 * smoothed_alloc, so just reset it to exactly zero to avoid this
4022 * syndrome. It will pop back up as soon as recent_alloc increases.
4023 */
4024 if (upcoming_alloc_est == 0)
4025 smoothed_alloc = 0;
4026
4027 /*
4028 * Even in cases where there's been little or no buffer allocation
4029 * activity, we want to make a small amount of progress through the buffer
4030 * cache so that as many reusable buffers as possible are clean after an
4031 * idle period.
4032 *
4033 * (scan_whole_pool_milliseconds / BgWriterDelay) computes how many times
4034 * the BGW will be called during the scan_whole_pool time; slice the
4035 * buffer pool into that many sections.
4036 */
4038
4040 {
4041#ifdef BGW_DEBUG
4042 elog(DEBUG2, "bgwriter: alloc_est=%d too small, using min=%d + reusable_est=%d",
4044#endif
4046 }
4047
4048 /*
4049 * Now write out dirty reusable buffers, working forward from the
4050 * next_to_clean point, until we have lapped the strategy scan, or cleaned
4051 * enough buffers to match our estimate of the next cycle's allocation
4052 * requirements, or hit the bgwriter_lru_maxpages limit.
4053 */
4054
4055 num_to_scan = bufs_to_lap;
4056 num_written = 0;
4058
4059 /* Execute the LRU scan */
4060 while (num_to_scan > 0 && reusable_buffers < upcoming_alloc_est)
4061 {
4063 wb_context);
4064
4065 if (++next_to_clean >= NBuffers)
4066 {
4067 next_to_clean = 0;
4068 next_passes++;
4069 }
4070 num_to_scan--;
4071
4072 if (sync_state & BUF_WRITTEN)
4073 {
4076 {
4078 break;
4079 }
4080 }
4081 else if (sync_state & BUF_REUSABLE)
4083 }
4084
4086
4087#ifdef BGW_DEBUG
4088 elog(DEBUG1, "bgwriter: recent_alloc=%u smoothed=%.2f delta=%ld ahead=%d density=%.2f reusable_est=%d upcoming_est=%d scanned=%d wrote=%d reusable=%d",
4091 bufs_to_lap - num_to_scan,
4094#endif
4095
4096 /*
4097 * Consider the above scan as being like a new allocation scan.
4098 * Characterize its density and update the smoothed one based on it. This
4099 * effectively halves the moving average period in cases where both the
4100 * strategy and the background writer are doing some useful scanning,
4101 * which is helpful because a long memory isn't as desirable on the
4102 * density estimates.
4103 */
4104 new_strategy_delta = bufs_to_lap - num_to_scan;
4106 if (new_strategy_delta > 0 && new_recent_alloc > 0)
4107 {
4111
4112#ifdef BGW_DEBUG
4113 elog(DEBUG2, "bgwriter: cleaner density alloc=%u scan=%ld density=%.2f new smoothed=%.2f",
4116#endif
4117 }
4118
4119 /* Return true if OK to hibernate */
4120 return (bufs_to_lap == 0 && recent_alloc == 0);
4121}
int BgWriterDelay
Definition bgwriter.c:59
#define BUF_REUSABLE
Definition bufmgr.c:85
double bgwriter_lru_multiplier
Definition bufmgr.c:191
static int SyncOneBuffer(int buf_id, bool skip_recently_used, WritebackContext *wb_context)
Definition bufmgr.c:4138
int bgwriter_lru_maxpages
Definition bufmgr.c:190
#define BUF_WRITTEN
Definition bufmgr.c:84
int32_t int32
Definition c.h:620
#define DEBUG2
Definition elog.h:30
#define DEBUG1
Definition elog.h:31
#define elog(elevel,...)
Definition elog.h:228
int StrategySyncStart(uint32 *complete_passes, uint32 *num_buf_alloc)
Definition freelist.c:331
int NBuffers
Definition globals.c:144
PgStat_BgWriterStats PendingBgWriterStats
PgStat_Counter buf_written_clean
Definition pgstat.h:246
PgStat_Counter maxwritten_clean
Definition pgstat.h:247
PgStat_Counter buf_alloc
Definition pgstat.h:248

References Assert, bgwriter_lru_maxpages, bgwriter_lru_multiplier, BgWriterDelay, PgStat_BgWriterStats::buf_alloc, BUF_REUSABLE, BUF_WRITTEN, PgStat_BgWriterStats::buf_written_clean, DEBUG1, DEBUG2, elog, fb(), PgStat_BgWriterStats::maxwritten_clean, NBuffers, PendingBgWriterStats, StrategySyncStart(), and SyncOneBuffer().

Referenced by BackgroundWriterMain().

◆ buffer_readv_complete()

static pg_attribute_always_inline PgAioResult buffer_readv_complete ( PgAioHandle ioh,
PgAioResult  prior_result,
uint8  cb_data,
bool  is_temp 
)
static

Definition at line 8685 of file bufmgr.c.

8687{
8693 uint8 error_count = 0;
8694 uint8 zeroed_count = 0;
8695 uint8 ignored_count = 0;
8697 uint64 *io_data;
8698 uint8 handle_data_len;
8699
8700 if (is_temp)
8701 {
8702 Assert(td->smgr.is_temp);
8704 }
8705 else
8706 Assert(!td->smgr.is_temp);
8707
8708 /*
8709 * Iterate over all the buffers affected by this IO and call the
8710 * per-buffer completion function for each buffer.
8711 */
8712 io_data = pgaio_io_get_handle_data(ioh, &handle_data_len);
8713 for (uint8 buf_off = 0; buf_off < handle_data_len; buf_off++)
8714 {
8716 bool failed;
8717 bool failed_verification = false;
8718 bool failed_checksum = false;
8719 bool zeroed_buffer = false;
8720 bool ignored_checksum = false;
8721
8723
8724 /*
8725 * If the entire I/O failed at a lower level, each buffer needs to be
8726 * marked as failed. In case of a partial read, the first few buffers
8727 * may be ok.
8728 */
8729 failed =
8731 || prior_result.result <= buf_off;
8732
8733 buffer_readv_complete_one(td, buf_off, buf, cb_data, failed, is_temp,
8737 &zeroed_buffer);
8738
8739 /*
8740 * Track information about the number of different kinds of error
8741 * conditions across all pages, as there can be multiple pages failing
8742 * verification as part of one IO.
8743 */
8746 if (zeroed_buffer && zeroed_count++ == 0)
8748 if (ignored_checksum && ignored_count++ == 0)
8750 if (failed_checksum)
8752 }
8753
8754 /*
8755 * If the smgr read succeeded [partially] and page verification failed for
8756 * some of the pages, adjust the IO's result state appropriately.
8757 */
8758 if (prior_result.status != PGAIO_RS_ERROR &&
8759 (error_count > 0 || ignored_count > 0 || zeroed_count > 0))
8760 {
8762 zeroed_count > 0, ignored_count > 0,
8767 }
8768
8769 /*
8770 * For shared relations this reporting is done in
8771 * shared_buffer_readv_complete_local().
8772 */
8773 if (is_temp && checkfail_count > 0)
8776
8777 return result;
8778}
ProcNumber pgaio_io_get_owner(PgAioHandle *ioh)
Definition aio.c:355
uint64 * pgaio_io_get_handle_data(PgAioHandle *ioh, uint8 *len)
void pgaio_result_report(PgAioResult result, const PgAioTargetData *target_data, int elevel)
PgAioTargetData * pgaio_io_get_target_data(PgAioHandle *ioh)
Definition aio_target.c:73
@ PGAIO_RS_ERROR
Definition aio_types.h:84
static pg_attribute_always_inline void buffer_readv_complete_one(PgAioTargetData *td, uint8 buf_off, Buffer buffer, uint8 flags, bool failed, bool is_temp, bool *buffer_invalid, bool *failed_checksum, bool *ignored_checksum, bool *zeroed_buffer)
Definition bufmgr.c:8534
static void buffer_readv_encode_error(PgAioResult *result, bool is_temp, bool zeroed_any, bool ignored_any, uint8 error_count, uint8 zeroed_count, uint8 checkfail_count, uint8 first_error_off, uint8 first_zeroed_off, uint8 first_ignored_off)
Definition bufmgr.c:8439
uint8_t uint8
Definition c.h:622
uint32 result
ProcNumber MyProcNumber
Definition globals.c:92
static char buf[DEFAULT_XLOG_SEG_SIZE]
void pgstat_report_checksum_failures_in_db(Oid dboid, int failurecount)
RelFileLocator rlocator
Definition aio_types.h:65
struct PgAioTargetData::@131 smgr

References Assert, buf, buffer_readv_complete_one(), buffer_readv_encode_error(), BufferIsValid(), RelFileLocator::dbOid, DEBUG1, fb(), PgAioTargetData::is_temp, MyProcNumber, pgaio_io_get_handle_data(), pgaio_io_get_owner(), pgaio_io_get_target_data(), pgaio_result_report(), PGAIO_RS_ERROR, pgstat_report_checksum_failures_in_db(), result, PgAioTargetData::rlocator, and PgAioTargetData::smgr.

Referenced by local_buffer_readv_complete(), and shared_buffer_readv_complete().

◆ buffer_readv_complete_one()

static pg_attribute_always_inline void buffer_readv_complete_one ( PgAioTargetData td,
uint8  buf_off,
Buffer  buffer,
uint8  flags,
bool  failed,
bool  is_temp,
bool buffer_invalid,
bool failed_checksum,
bool ignored_checksum,
bool zeroed_buffer 
)
static

Definition at line 8534 of file bufmgr.c.

8540{
8541 BufferDesc *buf_hdr = is_temp ?
8542 GetLocalBufferDescriptor(-buffer - 1)
8543 : GetBufferDescriptor(buffer - 1);
8544 BufferTag tag = buf_hdr->tag;
8545 char *bufdata = BufferGetBlock(buffer);
8547 int piv_flags;
8548
8549 /* check that the buffer is in the expected state for a read */
8550#ifdef USE_ASSERT_CHECKING
8551 {
8553
8556 /* temp buffers don't use BM_IO_IN_PROGRESS */
8557 if (!is_temp)
8560 }
8561#endif
8562
8563 *buffer_invalid = false;
8564 *failed_checksum = false;
8565 *ignored_checksum = false;
8566 *zeroed_buffer = false;
8567
8568 /*
8569 * We ask PageIsVerified() to only log the message about checksum errors,
8570 * as the completion might be run in any backend (or IO workers). We will
8571 * report checksum errors in buffer_readv_report().
8572 */
8574
8575 /* the local zero_damaged_pages may differ from the definer's */
8578
8579 /*
8580 * If the buffers are marked for zero on error, we want to log that in
8581 * case of a checksum failure.
8582 */
8583 if (flags & READ_BUFFERS_ZERO_ON_ERROR)
8585
8586 /* Check for garbage data. */
8587 if (!failed)
8588 {
8589 /*
8590 * If the buffer is not currently pinned by this backend, e.g. because
8591 * we're completing this IO after an error, the buffer data will have
8592 * been marked as inaccessible when the buffer was unpinned. The AIO
8593 * subsystem holds a pin, but that doesn't prevent the buffer from
8594 * having been marked as inaccessible. The completion might also be
8595 * executed in a different process.
8596 */
8597#ifdef USE_VALGRIND
8598 if (!BufferIsPinned(buffer))
8600#endif
8601
8602 if (!PageIsVerified((Page) bufdata, tag.blockNum, piv_flags,
8604 {
8605 if (flags & READ_BUFFERS_ZERO_ON_ERROR)
8606 {
8607 memset(bufdata, 0, BLCKSZ);
8608 *zeroed_buffer = true;
8609 }
8610 else
8611 {
8612 *buffer_invalid = true;
8613 /* mark buffer as having failed */
8614 failed = true;
8615 }
8616 }
8617 else if (*failed_checksum)
8618 *ignored_checksum = true;
8619
8620 /* undo what we did above */
8621#ifdef USE_VALGRIND
8622 if (!BufferIsPinned(buffer))
8624#endif
8625
8626 /*
8627 * Immediately log a message about the invalid page, but only to the
8628 * server log. The reason to log only to the server log is that this may
8629 * be executed in a different backend than the one that originated the
8630 * request. The reason to do so immediately is that the originator
8631 * might not process the query result immediately (because it is busy
8632 * doing another part of query processing) or at all (e.g. if it was
8633 * cancelled or errored out due to another IO also failing). The
8634 * definer of the IO will emit an ERROR or WARNING when processing the
8635 * IO's results.
8636 *
8637 * To avoid duplicating the code to emit these log messages, we reuse
8638 * buffer_readv_report().
8639 */
8641 {
8642 PgAioResult result_one = {0};
8643
8648 *zeroed_buffer ? 1 : 0,
8649 *failed_checksum ? 1 : 0,
8652 }
8653 }
8654
8655 /* Terminate I/O and set BM_VALID. */
8656 set_flag_bits = failed ? BM_IO_ERROR : BM_VALID;
8657 if (is_temp)
8659 else
8660 TerminateBufferIO(buf_hdr, false, set_flag_bits, false, true);
8661
8662 /*
8663 * Call the BUFFER_READ_DONE tracepoint in the callback, even though the
8664 * callback may not be executed in the same backend that called
8665 * BUFFER_READ_START. The alternative would be to defer calling the
8666 * tracepoint to a later point (e.g. the local completion callback for
8667 * shared buffer reads), which seems even less helpful.
8668 */
8670 tag.blockNum,
8671 tag.spcOid,
8672 tag.dbOid,
8673 tag.relNumber,
8675 false);
8676}
static uint64 pg_atomic_read_u64(volatile pg_atomic_uint64 *ptr)
Definition atomics.h:467
static BufferDesc * GetLocalBufferDescriptor(uint32 id)
#define BufferIsPinned(bufnum)
Definition bufmgr.c:599
bool PageIsVerified(PageData *page, BlockNumber blkno, int flags, bool *checksum_failure_p)
Definition bufpage.c:94
#define PIV_LOG_LOG
Definition bufpage.h:500
#define PIV_ZERO_BUFFERS_ON_ERROR
Definition bufpage.h:502
PageData * Page
Definition bufpage.h:81
#define PIV_IGNORE_CHECKSUM_FAILURE
Definition bufpage.h:501
#define LOG_SERVER_ONLY
Definition elog.h:33
#define false
void TerminateLocalBufferIO(BufferDesc *bufHdr, bool clear_dirty, uint64 set_flag_bits, bool release_aio)
Definition localbuf.c:586
#define VALGRIND_MAKE_MEM_DEFINED(addr, size)
Definition memdebug.h:26
#define VALGRIND_MAKE_MEM_NOACCESS(addr, size)
Definition memdebug.h:27
#define INVALID_PROC_NUMBER
Definition procnumber.h:26
BlockNumber blockNum
RelFileNumber relNumber
ForkNumber forkNum

References Assert, buftag::blockNum, BM_DIRTY, BM_IO_ERROR, BM_IO_IN_PROGRESS, BM_TAG_VALID, BM_VALID, PrivateRefCountEntry::buffer, buffer_readv_encode_error(), BufferGetBlock(), BufferIsPinned, buftag::dbOid, fb(), buftag::forkNum, GetBufferDescriptor(), GetLocalBufferDescriptor(), INVALID_PROC_NUMBER, LOG_SERVER_ONLY, MyProcNumber, PageIsVerified(), pg_atomic_read_u64(), pgaio_result_report(), PIV_IGNORE_CHECKSUM_FAILURE, PIV_LOG_LOG, PIV_ZERO_BUFFERS_ON_ERROR, READ_BUFFERS_IGNORE_CHECKSUM_FAILURES, READ_BUFFERS_ZERO_ON_ERROR, buftag::relNumber, buftag::spcOid, TerminateBufferIO(), TerminateLocalBufferIO(), VALGRIND_MAKE_MEM_DEFINED, and VALGRIND_MAKE_MEM_NOACCESS.

Referenced by buffer_readv_complete().

◆ buffer_readv_decode_error()

static void buffer_readv_decode_error ( PgAioResult  result,
bool zeroed_any,
bool ignored_any,
uint8 zeroed_or_error_count,
uint8 checkfail_count,
uint8 first_off 
)
inlinestatic

Definition at line 8397 of file bufmgr.c.

8403{
8404 uint32 rem_error = result.error_data;
8405
8406 /* see static asserts in buffer_readv_encode_error */
8407#define READV_COUNT_BITS 7
8408#define READV_COUNT_MASK ((1 << READV_COUNT_BITS) - 1)
8409
8410 *zeroed_any = rem_error & 1;
8411 rem_error >>= 1;
8412
8413 *ignored_any = rem_error & 1;
8414 rem_error >>= 1;
8415
8418
8421
8424}
#define READV_COUNT_BITS
#define READV_COUNT_MASK

References fb(), READV_COUNT_BITS, READV_COUNT_MASK, and result.

Referenced by buffer_readv_encode_error(), buffer_readv_report(), and shared_buffer_readv_complete_local().

◆ buffer_readv_encode_error()

static void buffer_readv_encode_error ( PgAioResult result,
bool  is_temp,
bool  zeroed_any,
bool  ignored_any,
uint8  error_count,
uint8  zeroed_count,
uint8  checkfail_count,
uint8  first_error_off,
uint8  first_zeroed_off,
uint8  first_ignored_off 
)
inlinestatic

Definition at line 8439 of file bufmgr.c.

8449{
8450
8451 uint8 shift = 0;
8455
8457 "PG_IOV_MAX is bigger than reserved space for error data");
8459 "PGAIO_RESULT_ERROR_BITS is insufficient for buffer_readv");
8460
8461 /*
8462 * We only have space to encode one offset - but luckily that's good
8463 * enough. If there is an error, the error's offset is the interesting
8464 * one; likewise, a zeroed buffer takes precedence over an ignored buffer.
8465 */
8466 if (error_count > 0)
8468 else if (zeroed_count > 0)
8470 else
8472
8473 Assert(!zeroed_any || error_count == 0);
8474
8475 result->error_data = 0;
8476
8477 result->error_data |= zeroed_any << shift;
8478 shift += 1;
8479
8480 result->error_data |= ignored_any << shift;
8481 shift += 1;
8482
8483 result->error_data |= ((uint32) zeroed_or_error_count) << shift;
8484 shift += READV_COUNT_BITS;
8485
8486 result->error_data |= ((uint32) checkfail_count) << shift;
8487 shift += READV_COUNT_BITS;
8488
8489 result->error_data |= ((uint32) first_off) << shift;
8490 shift += READV_COUNT_BITS;
8491
8492 result->id = is_temp ? PGAIO_HCB_LOCAL_BUFFER_READV :
8494
8495 if (error_count > 0)
8496 result->status = PGAIO_RS_ERROR;
8497 else
8498 result->status = PGAIO_RS_WARNING;
8499
8500 /*
8501 * The encoding is complicated enough to warrant cross-checking it against
8502 * the decode function.
8503 */
8504#ifdef USE_ASSERT_CHECKING
8505 {
8506 bool zeroed_any_2,
8511
8516 &first_off_2);
8522 }
8523#endif
8524
8525#undef READV_COUNT_BITS
8526#undef READV_COUNT_MASK
8527}
#define PGAIO_RESULT_ERROR_BITS
Definition aio_types.h:98
@ PGAIO_RS_WARNING
Definition aio_types.h:83
static void buffer_readv_decode_error(PgAioResult result, bool *zeroed_any, bool *ignored_any, uint8 *zeroed_or_error_count, uint8 *checkfail_count, uint8 *first_off)
Definition bufmgr.c:8397
#define StaticAssertDecl(condition, errmessage)
Definition c.h:1008
#define PG_IOV_MAX
Definition pg_iovec.h:47

References Assert, buffer_readv_decode_error(), fb(), PG_IOV_MAX, PGAIO_HCB_LOCAL_BUFFER_READV, PGAIO_HCB_SHARED_BUFFER_READV, PGAIO_RESULT_ERROR_BITS, PGAIO_RS_ERROR, PGAIO_RS_WARNING, READV_COUNT_BITS, result, and StaticAssertDecl.

Referenced by buffer_readv_complete(), and buffer_readv_complete_one().

◆ buffer_readv_report()

static void buffer_readv_report ( PgAioResult  result,
const PgAioTargetData td,
int  elevel 
)
static

Definition at line 8788 of file bufmgr.c.

8790{
8791 int nblocks = td->smgr.nblocks;
8792 BlockNumber first = td->smgr.blockNum;
8793 BlockNumber last = first + nblocks - 1;
8796 RelPathStr rpath =
8798 bool zeroed_any,
8802 first_off;
8804 const char *msg_one,
8805 *msg_mult,
8806 *det_mult,
8807 *hint_mult;
8808
8812 &first_off);
8813
8814 /*
8815 * Treat a read that had both zeroed buffers *and* ignored checksums as a
8816 * special case, it's too irregular to be emitted the same way as the
8817 * other cases.
8818 */
8819 if (zeroed_any && ignored_any)
8820 {
8822 Assert(nblocks > 1); /* same block can't be both zeroed and ignored */
8823 Assert(result.status != PGAIO_RS_ERROR);
8825
8826 ereport(elevel,
8828 errmsg("zeroing %u page(s) and ignoring %u checksum failure(s) among blocks %u..%u of relation \"%s\"",
8829 affected_count, checkfail_count, first, last, rpath.str),
8830 affected_count > 1 ?
8831 errdetail("Block %u held the first zeroed page.",
8832 first + first_off) : 0,
8833 errhint_plural("See server log for details about the other %d invalid block.",
8834 "See server log for details about the other %d invalid blocks.",
8837 return;
8838 }
8839
8840 /*
8841 * The other messages are highly repetitive. To avoid duplicating a long
8842 * and complicated ereport(), gather the translated format strings
8843 * separately and then do one common ereport.
8844 */
8845 if (result.status == PGAIO_RS_ERROR)
8846 {
8847 Assert(!zeroed_any); /* can't have invalid pages when zeroing them */
8849 msg_one = _("invalid page in block %u of relation \"%s\"");
8850 msg_mult = _("%u invalid pages among blocks %u..%u of relation \"%s\"");
8851 det_mult = _("Block %u held the first invalid page.");
8852 hint_mult = _("See server log for the other %u invalid block(s).");
8853 }
8854 else if (zeroed_any && !ignored_any)
8855 {
8857 msg_one = _("invalid page in block %u of relation \"%s\"; zeroing out page");
8858 msg_mult = _("zeroing out %u invalid pages among blocks %u..%u of relation \"%s\"");
8859 det_mult = _("Block %u held the first zeroed page.");
8860 hint_mult = _("See server log for the other %u zeroed block(s).");
8861 }
8862 else if (!zeroed_any && ignored_any)
8863 {
8865 msg_one = _("ignoring checksum failure in block %u of relation \"%s\"");
8866 msg_mult = _("ignoring %u checksum failures among blocks %u..%u of relation \"%s\"");
8867 det_mult = _("Block %u held the first ignored page.");
8868 hint_mult = _("See server log for the other %u ignored block(s).");
8869 }
8870 else
8872
8873 ereport(elevel,
8875 affected_count == 1 ?
8876 errmsg_internal(msg_one, first + first_off, rpath.str) :
8877 errmsg_internal(msg_mult, affected_count, first, last, rpath.str),
8880}
#define pg_unreachable()
Definition c.h:367
#define _(x)
Definition elog.c:96
int int errdetail_internal(const char *fmt,...) pg_attribute_printf(1
int int int errhint_plural(const char *fmt_singular, const char *fmt_plural, unsigned long n,...) pg_attribute_printf(1
int int errmsg_internal(const char *fmt,...) pg_attribute_printf(1
int int errhint_internal(const char *fmt,...) pg_attribute_printf(1
const char * str
#define ERRCODE_DATA_CORRUPTED
int ProcNumber
Definition procnumber.h:24
#define relpathbackend(rlocator, backend, forknum)
Definition relpath.h:141
char str[REL_PATH_STR_MAXLEN+1]
Definition relpath.h:123
BlockNumber blockNum
Definition aio_types.h:66
BlockNumber nblocks
Definition aio_types.h:67
ForkNumber forkNum
Definition aio_types.h:68

References _, Assert, PgAioTargetData::blockNum, buffer_readv_decode_error(), ereport, errcode(), ERRCODE_DATA_CORRUPTED, errdetail(), errdetail_internal(), errhint_internal(), errhint_plural(), errmsg, errmsg_internal(), fb(), PgAioTargetData::forkNum, INVALID_PROC_NUMBER, PgAioTargetData::is_temp, MyProcNumber, PgAioTargetData::nblocks, pg_unreachable, PGAIO_RS_ERROR, relpathbackend, result, PgAioTargetData::rlocator, PgAioTargetData::smgr, and RelPathStr::str.

◆ buffer_stage_common()

static pg_attribute_always_inline void buffer_stage_common ( PgAioHandle ioh,
bool  is_write,
bool  is_temp 
)
static

Definition at line 8290 of file bufmgr.c.

8291{
8292 uint64 *io_data;
8293 uint8 handle_data_len;
8296
8297 io_data = pgaio_io_get_handle_data(ioh, &handle_data_len);
8298
8300
8301 /* iterate over all buffers affected by the vectored readv/writev */
8302 for (int i = 0; i < handle_data_len; i++)
8303 {
8304 Buffer buffer = (Buffer) io_data[i];
8305 BufferDesc *buf_hdr = is_temp ?
8306 GetLocalBufferDescriptor(-buffer - 1)
8307 : GetBufferDescriptor(buffer - 1);
8309
8310 /*
8311 * Check that all the buffers are actually ones that could conceivably
8312 * be done in one IO, i.e. are sequential. This is the last
8313 * buffer-aware code before IO is actually executed and confusion
8314 * about which buffers are targeted by IO can be hard to debug, making
8315 * it worth doing extra-paranoid checks.
8316 */
8317 if (i == 0)
8318 first = buf_hdr->tag;
8319 else
8320 {
8321 Assert(buf_hdr->tag.relNumber == first.relNumber);
8322 Assert(buf_hdr->tag.blockNum == first.blockNum + i);
8323 }
8324
8325 if (is_temp)
8327 else
8329
8330 /* verify the buffer is in the expected state */
8332 if (is_write)
8333 {
8336 }
8337 else
8338 {
8341 }
8342
8343 /* temp buffers don't use BM_IO_IN_PROGRESS */
8344 if (!is_temp)
8346
8348
8349 /*
8350 * Reflect that the buffer is now owned by the AIO subsystem.
8351 *
8352 * For local buffers: This can't be done just via LocalRefCount, as
8353 * one might initially think, as this backend could error out while
8354 * AIO is still in progress, releasing all the pins by the backend
8355 * itself.
8356 *
8357 * This pin is released again in TerminateBufferIO().
8358 */
8359 buf_hdr->io_wref = io_ref;
8360
8361 if (is_temp)
8362 {
8365 }
8366 else
8368
8369 /*
8370 * Ensure the content lock that prevents buffer modifications while
8371 * the buffer is being written out is not released early due to an
8372 * error.
8373 */
8374 if (is_write && !is_temp)
8375 {
8377
8378 /*
8379 * Lock is now owned by AIO subsystem.
8380 */
8381 BufferLockDisown(buffer, buf_hdr);
8382 }
8383
8384 /*
8385 * Stop tracking this buffer via the resowner - the AIO system now
8386 * keeps track.
8387 */
8388 if (!is_temp)
8390 }
8391}
static void pg_atomic_unlocked_write_u64(volatile pg_atomic_uint64 *ptr, uint64 val)
Definition atomics.h:494
#define BUF_REFCOUNT_ONE
static uint64 UnlockBufHdrExt(BufferDesc *desc, uint64 old_buf_state, uint64 set_bits, uint64 unset_bits, int refcount_change)
static void ResourceOwnerForgetBufferIO(ResourceOwner owner, Buffer buffer)
#define BUF_STATE_GET_REFCOUNT(state)
static void BufferLockDisown(Buffer buffer, BufferDesc *buf_hdr)
Definition bufmgr.c:6280
static bool BufferLockHeldByMe(BufferDesc *buf_hdr)
Definition bufmgr.c:6552
#define PG_USED_FOR_ASSERTS_ONLY
Definition c.h:249
BufferTag tag

References Assert, BM_DIRTY, BM_IO_IN_PROGRESS, BM_TAG_VALID, BM_VALID, BUF_REFCOUNT_ONE, BUF_STATE_GET_REFCOUNT, PrivateRefCountEntry::buffer, BufferLockDisown(), BufferLockHeldByMe(), CurrentResourceOwner, fb(), GetBufferDescriptor(), GetLocalBufferDescriptor(), i, LockBufHdr(), pg_atomic_read_u64(), pg_atomic_unlocked_write_u64(), PG_USED_FOR_ASSERTS_ONLY, pgaio_io_get_handle_data(), pgaio_io_get_wref(), ResourceOwnerForgetBufferIO(), and UnlockBufHdrExt().

Referenced by local_buffer_readv_stage(), and shared_buffer_readv_stage().

◆ BufferAlloc()

static pg_attribute_always_inline BufferDesc * BufferAlloc ( SMgrRelation  smgr,
char  relpersistence,
ForkNumber  forkNum,
BlockNumber  blockNum,
BufferAccessStrategy  strategy,
bool *foundPtr,
IOContext  io_context 
)
inline static

Definition at line 2197 of file bufmgr.c.

2201{
2202 BufferTag newTag; /* identity of requested block */
2203 uint32 newHash; /* hash value for newTag */
2204 LWLock *newPartitionLock; /* buffer partition lock for it */
2205 int existing_buf_id;
2209 uint64 set_bits = 0;
2210
2211 /* Make sure we will have room to remember the buffer pin */
2214
2215 /* create a tag so we can lookup the buffer */
2216 InitBufferTag(&newTag, &smgr->smgr_rlocator.locator, forkNum, blockNum);
2217
2218 /* determine its hash code and partition lock ID */
2221
2222 /* see if the block is in the buffer pool already */
2225 if (existing_buf_id >= 0)
2226 {
2227 BufferDesc *buf;
2228 bool valid;
2229
2230 /*
2231 * Found it. Now, pin the buffer so no one can steal it from the
2232 * buffer pool, and check to see if the correct data has been loaded
2233 * into the buffer.
2234 */
2236
2237 valid = PinBuffer(buf, strategy, false);
2238
2239 /* Can release the mapping lock as soon as we've pinned it */
2241
2242 *foundPtr = true;
2243
2244 if (!valid)
2245 {
2246 /*
2247 * We can only get here if (a) someone else is still reading in
2248 * the page, (b) a previous read attempt failed, or (c) someone
2249 * called StartReadBuffers() but not yet WaitReadBuffers().
2250 */
2251 *foundPtr = false;
2252 }
2253
2254 return buf;
2255 }
2256
2257 /*
2258 * Didn't find it in the buffer pool. We'll have to initialize a new
2259 * buffer. Remember to unlock the mapping lock while doing the work.
2260 */
2262
2263 /*
2264 * Acquire a victim buffer. Somebody else might try to do the same, we
2265 * don't hold any conflicting locks. If so we'll have to undo our work
2266 * later.
2267 */
2270
2271 /*
2272 * Try to make a hashtable entry for the buffer under its new tag. If
2273 * somebody else inserted another buffer for the tag, we'll release the
2274 * victim buffer we acquired and use the already inserted one.
2275 */
2278 if (existing_buf_id >= 0)
2279 {
2281 bool valid;
2282
2283 /*
2284 * Got a collision. Someone has already done what we were about to do.
2285 * We'll just handle this as if it were found in the buffer pool in
2286 * the first place. First, give up the buffer we were planning to
2287 * use.
2288 *
2289 * We could do this after releasing the partition lock, but then we'd
2290 * have to call ResourceOwnerEnlarge() & ReservePrivateRefCountEntry()
2291 * before acquiring the lock, for the rare case of such a collision.
2292 */
2294
2295 /* remaining code should match code at top of routine */
2296
2298
2299 valid = PinBuffer(existing_buf_hdr, strategy, false);
2300
2301 /* Can release the mapping lock as soon as we've pinned it */
2303
2304 *foundPtr = true;
2305
2306 if (!valid)
2307 {
2308 /*
2309 * We can only get here if (a) someone else is still reading in
2310 * the page, (b) a previous read attempt failed, or (c) someone
2311 * called StartReadBuffers() but not yet WaitReadBuffers().
2312 */
2313 *foundPtr = false;
2314 }
2315
2316 return existing_buf_hdr;
2317 }
2318
2319 /*
2320 * Need to lock the buffer header too in order to change its tag.
2321 */
2323
2324 /* some sanity checks while we hold the buffer header lock */
2327
2328 victim_buf_hdr->tag = newTag;
2329
2330 /*
2331 * Make sure BM_PERMANENT is set for buffers that must be written at every
2332 * checkpoint. Unlogged buffers only need to be written at shutdown
2333 * checkpoints, except for their "init" forks, which need to be treated
2334 * just like permanent relations.
2335 */
2337 if (relpersistence == RELPERSISTENCE_PERMANENT || forkNum == INIT_FORKNUM)
2339
2341 set_bits, 0, 0);
2342
2344
2345 /*
2346 * Buffer contents are currently invalid.
2347 */
2348 *foundPtr = false;
2349
2350 return victim_buf_hdr;
2351}
static void InitBufferTag(BufferTag *tag, const RelFileLocator *rlocator, ForkNumber forkNum, BlockNumber blockNum)
#define BM_PERMANENT
#define BUF_USAGECOUNT_ONE
static LWLock * BufMappingPartitionLock(uint32 hashcode)
int BufTableLookup(BufferTag *tagPtr, uint32 hashcode)
Definition buf_table.c:96
uint32 BufTableHashCode(BufferTag *tagPtr)
Definition buf_table.c:84
int BufTableInsert(BufferTag *tagPtr, uint32 hashcode, int buf_id)
Definition buf_table.c:124
static Buffer GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context)
Definition bufmgr.c:2548
static bool PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy, bool skip_if_not_valid)
Definition bufmgr.c:3281
static void ReservePrivateRefCountEntry(void)
Definition bufmgr.c:309
static void UnpinBuffer(BufferDesc *buf)
Definition bufmgr.c:3465
bool LWLockAcquire(LWLock *lock, LWLockMode mode)
Definition lwlock.c:1150
void LWLockRelease(LWLock *lock)
Definition lwlock.c:1767
@ LW_SHARED
Definition lwlock.h:105
@ LW_EXCLUSIVE
Definition lwlock.h:104
@ INIT_FORKNUM
Definition relpath.h:61
void ResourceOwnerEnlarge(ResourceOwner owner)
Definition resowner.c:449
RelFileLocator locator
RelFileLocatorBackend smgr_rlocator
Definition smgr.h:38

References Assert, BM_DIRTY, BM_IO_IN_PROGRESS, BM_PERMANENT, BM_TAG_VALID, BM_VALID, buf, BUF_STATE_GET_REFCOUNT, BUF_USAGECOUNT_ONE, BufMappingPartitionLock(), BufTableHashCode(), BufTableInsert(), BufTableLookup(), CurrentResourceOwner, fb(), GetBufferDescriptor(), GetVictimBuffer(), INIT_FORKNUM, InitBufferTag(), RelFileLocatorBackend::locator, LockBufHdr(), LW_EXCLUSIVE, LW_SHARED, LWLockAcquire(), LWLockRelease(), PinBuffer(), ReservePrivateRefCountEntry(), ResourceOwnerEnlarge(), SMgrRelationData::smgr_rlocator, UnlockBufHdrExt(), and UnpinBuffer().

Referenced by PinBufferForBlock().

◆ BufferBeginSetHintBits()

bool BufferBeginSetHintBits ( Buffer  buffer)

Definition at line 7051 of file bufmgr.c.

7052{
7055
7056 if (BufferIsLocal(buffer))
7057 {
7058 /*
7059 * NB: Will need to check if there is a write in progress, once it is
7060 * possible for writes to be done asynchronously.
7061 */
7062 return true;
7063 }
7064
7065 buf_hdr = GetBufferDescriptor(buffer - 1);
7066
7068}
#define BufferIsLocal(buffer)
Definition buf.h:37
static bool SharedBufferBeginSetHintBits(Buffer buffer, BufferDesc *buf_hdr, uint64 *lockstate)
Definition bufmgr.c:6960

References PrivateRefCountEntry::buffer, BufferIsLocal, fb(), GetBufferDescriptor(), and SharedBufferBeginSetHintBits().

Referenced by _bt_check_unique(), _bt_killitems(), _hash_kill_items(), fsm_search_avail(), fsm_vacuum_page(), gistkillitems(), and SetHintBitsExt().

◆ BufferFinishSetHintBits()

void BufferFinishSetHintBits ( Buffer  buffer,
bool  mark_dirty,
bool  buffer_std 
)

Definition at line 7079 of file bufmgr.c.

7080{
7081 if (!BufferIsLocal(buffer))
7084
7085 if (mark_dirty)
7087}
bool BufferIsLockedByMeInMode(Buffer buffer, BufferLockMode mode)
Definition bufmgr.c:3096
void MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
Definition bufmgr.c:5830
@ BUFFER_LOCK_SHARE_EXCLUSIVE
Definition bufmgr.h:217
@ BUFFER_LOCK_EXCLUSIVE
Definition bufmgr.h:222

References Assert, PrivateRefCountEntry::buffer, BUFFER_LOCK_EXCLUSIVE, BUFFER_LOCK_SHARE_EXCLUSIVE, BufferIsLocal, BufferIsLockedByMeInMode(), fb(), and MarkBufferDirtyHint().

Referenced by _bt_check_unique(), _bt_killitems(), _hash_kill_items(), fsm_search_avail(), fsm_vacuum_page(), gistkillitems(), and HeapTupleSatisfiesMVCCBatch().

◆ BufferGetBlockNumber()

BlockNumber BufferGetBlockNumber ( Buffer  buffer)

Definition at line 4455 of file bufmgr.c.

4456{
4458
4459 Assert(BufferIsPinned(buffer));
4460
4461 if (BufferIsLocal(buffer))
4462 bufHdr = GetLocalBufferDescriptor(-buffer - 1);
4463 else
4464 bufHdr = GetBufferDescriptor(buffer - 1);
4465
4466 /* pinned, so OK to read tag without spinlock */
4467 return bufHdr->tag.blockNum;
4468}

References Assert, PrivateRefCountEntry::buffer, BufferIsLocal, BufferIsPinned, fb(), GetBufferDescriptor(), and GetLocalBufferDescriptor().

Referenced by _bt_binsrch_insert(), _bt_bottomupdel_pass(), _bt_check_unique(), _bt_checkpage(), _bt_delitems_delete(), _bt_delitems_vacuum(), _bt_doinsert(), _bt_finish_split(), _bt_getroot(), _bt_insert_parent(), _bt_insertonpg(), _bt_mark_page_halfdead(), _bt_moveright(), _bt_newlevel(), _bt_pagedel(), _bt_readpage(), _bt_relandgetbuf(), _bt_restore_meta(), _bt_search(), _bt_simpledel_pass(), _bt_split(), _bt_unlink_halfdead_page(), _hash_addovflpage(), _hash_checkpage(), _hash_doinsert(), _hash_first(), _hash_freeovflpage(), _hash_getnewbuf(), _hash_readnext(), _hash_readpage(), _hash_splitbucket(), allocNewBuffer(), AsyncReadBuffers(), BitmapHeapScanNextBlock(), blinsert(), BloomInitMetapage(), brin_doinsert(), brin_doupdate(), brin_getinsertbuffer(), brin_initialize_empty_new_buffer(), brin_page_cleanup(), brin_xlog_insert_update(), brinbuild(), brinGetTupleForHeapBlock(), btvacuumpage(), check_index_page(), CheckReadBuffersOperation(), collect_corrupt_items(), collectMatchBitmap(), createPostingTree(), dataBeginPlaceToPageLeaf(), dataPrepareDownlink(), doPickSplit(), entryPrepareDownlink(), fill_seq_fork_with_data(), ginDeletePostingPage(), ginEntryInsert(), ginFindParents(), ginFinishSplit(), ginPlaceToPage(), ginRedoDeleteListPages(), ginRedoUpdateMetapage(), gistbufferinginserttuples(), gistbuild(), gistcheckpage(), gistdeletepage(), gistformdownlink(), gistinserttuples(), gistMemorizeAllDownlinks(), gistplacetopage(), gistRelocateBuildBuffersOnSplit(), gistScanPage(), gistvacuumpage(), hash_xlog_add_ovfl_page(), heap_delete(), heap_fetch_next_buffer(), heap_hot_search_buffer(), heap_insert(), heap_multi_insert(), heap_page_prune_opt(), heap_page_would_be_all_visible(), heap_prepare_pagescan(), heap_update(), heap_xlog_confirm(), heap_xlog_lock(), heapam_index_fetch_tuple(), heapam_scan_analyze_next_block(), heapgettup(), heapgettup_pagemode(), index_compute_xid_horizon_for_tuples(), lazy_scan_heap(), lazy_scan_noprune(), lazy_scan_prune(), 
lazy_vacuum_heap_rel(), makeSublist(), moveLeafs(), moveRightIfItNeeded(), pgstathashindex(), prune_freeze_setup(), read_buffers(), read_stream_start_pending_read(), ReadBufferBI(), RelationAddBlocks(), RelationCopyStorageUsingBuffer(), RelationGetBufferForTuple(), RelationPutHeapTuple(), revmap_get_buffer(), revmap_physical_extend(), ScanSourceDatabasePgClassPage(), spgAddNodeAction(), spgbuild(), spgdoinsert(), SpGistSetLastUsedPage(), spgSplitNodeAction(), spgvacuumpage(), spgWalk(), StartReadBuffersImpl(), startScanEntry(), statapprox_heap(), terminate_brin_buildstate(), vacuumLeafPage(), verify_heapam(), visibilitymap_clear(), visibilitymap_get_status(), visibilitymap_pin(), visibilitymap_pin_ok(), and visibilitymap_set().

◆ BufferGetLSNAtomic()

XLogRecPtr BufferGetLSNAtomic ( Buffer  buffer)

Definition at line 4722 of file bufmgr.c.

4723{
4724 /* Make sure we've got a real buffer, and that we hold a pin on it. */
4725 Assert(BufferIsValid(buffer));
4726 Assert(BufferIsPinned(buffer));
4727
4728#ifdef PG_HAVE_8BYTE_SINGLE_COPY_ATOMICITY
4729 return PageGetLSN(BufferGetPage(buffer));
4730#else
4731 {
4732 char *page = BufferGetPage(buffer);
4734 XLogRecPtr lsn;
4735
4736 /*
4737 * If we don't need locking for correctness, fastpath out.
4738 */
4739 if (!XLogHintBitIsNeeded() || BufferIsLocal(buffer))
4740 return PageGetLSN(page);
4741
4742 bufHdr = GetBufferDescriptor(buffer - 1);
4744 lsn = PageGetLSN(page);
4746
4747 return lsn;
4748 }
4749#endif
4750}
static Page BufferGetPage(Buffer buffer)
Definition bufmgr.h:468
static XLogRecPtr PageGetLSN(const PageData *page)
Definition bufpage.h:410
#define XLogHintBitIsNeeded()
Definition xlog.h:123
uint64 XLogRecPtr
Definition xlogdefs.h:21

References Assert, PrivateRefCountEntry::buffer, BufferGetPage(), BufferIsLocal, BufferIsPinned, BufferIsValid(), fb(), GetBufferDescriptor(), LockBufHdr(), PageGetLSN(), UnlockBufHdr(), and XLogHintBitIsNeeded.

Referenced by _bt_drop_lock_and_maybe_pin(), _bt_killitems(), gistdoinsert(), gistFindPath(), gistkillitems(), gistScanPage(), and SetHintBitsExt().

◆ BufferGetTag()

void BufferGetTag ( Buffer  buffer,
RelFileLocator *rlocator,
ForkNumber *forknum,
BlockNumber *blknum 
)

Definition at line 4476 of file bufmgr.c.

4478{
4480
4481 /* Do the same checks as BufferGetBlockNumber. */
4482 Assert(BufferIsPinned(buffer));
4483
4484 if (BufferIsLocal(buffer))
4485 bufHdr = GetLocalBufferDescriptor(-buffer - 1);
4486 else
4487 bufHdr = GetBufferDescriptor(buffer - 1);
4488
4489 /* pinned, so OK to read tag without spinlock */
4490 *rlocator = BufTagGetRelFileLocator(&bufHdr->tag);
4491 *forknum = BufTagGetForkNum(&bufHdr->tag);
4492 *blknum = bufHdr->tag.blockNum;
4493}

References Assert, PrivateRefCountEntry::buffer, BufferIsLocal, BufferIsPinned, BufTagGetForkNum(), BufTagGetRelFileLocator(), fb(), GetBufferDescriptor(), and GetLocalBufferDescriptor().

Referenced by fsm_search_avail(), ginRedoInsertEntry(), heap_inplace_update_and_unlock(), log_newpage_buffer(), ResolveCminCmaxDuringDecoding(), and XLogRegisterBuffer().

◆ BufferIsDirty()

bool BufferIsDirty ( Buffer  buffer)

◆ BufferIsLockedByMe()

bool BufferIsLockedByMe ( Buffer  buffer)

Definition at line 3070 of file bufmgr.c.

3071{
3073
3074 Assert(BufferIsPinned(buffer));
3075
3076 if (BufferIsLocal(buffer))
3077 {
3078 /* Content locks are not maintained for local buffers. */
3079 return true;
3080 }
3081 else
3082 {
3083 bufHdr = GetBufferDescriptor(buffer - 1);
3084 return BufferLockHeldByMe(bufHdr);
3085 }
3086}

References Assert, PrivateRefCountEntry::buffer, BufferIsLocal, BufferIsPinned, BufferLockHeldByMe(), fb(), and GetBufferDescriptor().

Referenced by FlushOneBuffer().

◆ BufferIsLockedByMeInMode()

bool BufferIsLockedByMeInMode ( Buffer  buffer,
BufferLockMode  mode 
)

Definition at line 3096 of file bufmgr.c.

3097{
3099
3100 Assert(BufferIsPinned(buffer));
3101
3102 if (BufferIsLocal(buffer))
3103 {
3104 /* Content locks are not maintained for local buffers. */
3105 return true;
3106 }
3107 else
3108 {
3109 bufHdr = GetBufferDescriptor(buffer - 1);
3111 }
3112}
static bool BufferLockHeldByMeInMode(BufferDesc *buf_hdr, BufferLockMode mode)
Definition bufmgr.c:6534
static PgChecksumMode mode

References Assert, PrivateRefCountEntry::buffer, BufferIsLocal, BufferIsPinned, BufferLockHeldByMeInMode(), fb(), GetBufferDescriptor(), and mode.

Referenced by BufferFinishSetHintBits(), BufferIsDirty(), heap_page_fix_vm_corruption(), HeapTupleSetHintBits(), IsBufferCleanupOK(), MarkBufferDirty(), visibilitymap_set(), and XLogRegisterBuffer().

◆ BufferIsPermanent()

bool BufferIsPermanent ( Buffer  buffer)

Definition at line 4686 of file bufmgr.c.

4687{
4689
4690 /* Local buffers are used only for temp relations. */
4691 if (BufferIsLocal(buffer))
4692 return false;
4693
4694 /* Make sure we've got a real buffer, and that we hold a pin on it. */
4695 Assert(BufferIsValid(buffer));
4696 Assert(BufferIsPinned(buffer));
4697
4698 /*
4699 * BM_PERMANENT can't be changed while we hold a pin on the buffer, so we
4700 * need not bother with the buffer header spinlock. Even if someone else
4701 * changes the buffer header state while we're doing this, the state is
4702 * changed atomically, so we'll read the old value or the new value, but
4703 * not random garbage.
4704 */
4705 bufHdr = GetBufferDescriptor(buffer - 1);
4706 return (pg_atomic_read_u64(&bufHdr->state) & BM_PERMANENT) != 0;
4707}

References Assert, BM_PERMANENT, PrivateRefCountEntry::buffer, BufferIsLocal, BufferIsPinned, BufferIsValid(), fb(), GetBufferDescriptor(), and pg_atomic_read_u64().

Referenced by SetHintBitsExt().

◆ BufferLockAcquire()

static void BufferLockAcquire ( Buffer  buffer,
BufferDesc *buf_hdr,
BufferLockMode  mode 
)
inline static

Definition at line 5907 of file bufmgr.c.

5908{
5909 PrivateRefCountEntry *entry;
5910 int extraWaits = 0;
5911
5912 /*
5913 * Get a reference to the refcount entry before we acquire the lock; it
5914 * seems better to do the lookup while not yet holding the lock.
5915 */
5916 entry = GetPrivateRefCountEntry(buffer, true);
5917
5918 /*
5919 * We better not already hold a lock on the buffer.
5920 */
5922
5923 /*
5924 * Lock out cancel/die interrupts until we exit the code section protected
5925 * by the content lock. This ensures that interrupts will not interfere
5926 * with manipulations of data structures in shared memory.
5927 */
5929
5930 for (;;)
5931 {
5932 uint32 wait_event = 0; /* initialized to avoid compiler warning */
5933 bool mustwait;
5934
5935 /*
5936 * Try to grab the lock the first time, we're not in the waitqueue
5937 * yet/anymore.
5938 */
5940
5941 if (likely(!mustwait))
5942 {
5943 break;
5944 }
5945
5946 /*
5947 * Ok, at this point we couldn't grab the lock on the first try. We
5948 * cannot simply queue ourselves to the end of the list and wait to be
5949 * woken up because by now the lock could long have been released.
5950 * Instead add us to the queue and try to grab the lock again. If we
5951 * succeed we need to revert the queuing and be happy, otherwise we
5952 * recheck the lock. If we still couldn't grab it, we know that the
5953 * other locker will see our queue entries when releasing since they
5954 * existed before we checked for the lock.
5955 */
5956
5957 /* add to the queue */
5959
5960 /* we're now guaranteed to be woken up if necessary */
5962
5963 /* ok, grabbed the lock the second time round, need to undo queueing */
5964 if (!mustwait)
5965 {
5967 break;
5968 }
5969
5970 switch (mode)
5971 {
5974 break;
5977 break;
5978 case BUFFER_LOCK_SHARE:
5980 break;
5981 case BUFFER_LOCK_UNLOCK:
5983
5984 }
5986
5987 /*
5988 * Wait until awakened.
5989 *
5990 * It is possible that we get awakened for a reason other than being
5991 * signaled by BufferLockWakeup(). If so, loop back and wait again.
5992 * Once we've gotten the lock, re-increment the sema by the number of
5993 * additional signals received.
5994 */
5995 for (;;)
5996 {
5999 break;
6000 extraWaits++;
6001 }
6002
6004
6005 /* Retrying, allow BufferLockReleaseSub to release waiters again. */
6007 }
6008
6009 /* Remember that we now hold this lock */
6010 entry->data.lockmode = mode;
6011
6012 /*
6013 * Fix the process wait semaphore's count for any absorbed wakeups.
6014 */
6015 while (unlikely(extraWaits-- > 0))
6017}
static uint64 pg_atomic_fetch_and_u64(volatile pg_atomic_uint64 *ptr, uint64 and_)
Definition atomics.h:551
#define BM_LOCK_WAKE_IN_PROGRESS
static bool BufferLockAttempt(BufferDesc *buf_hdr, BufferLockMode mode)
Definition bufmgr.c:6105
static void BufferLockDequeueSelf(BufferDesc *buf_hdr)
Definition bufmgr.c:6212
static void BufferLockQueueSelf(BufferDesc *buf_hdr, BufferLockMode mode)
Definition bufmgr.c:6172
static PrivateRefCountEntry * GetPrivateRefCountEntry(Buffer buffer, bool do_move)
Definition bufmgr.c:507
@ BUFFER_LOCK_SHARE
Definition bufmgr.h:212
@ BUFFER_LOCK_UNLOCK
Definition bufmgr.h:207
#define likely(x)
Definition c.h:437
@ LW_WS_NOT_WAITING
Definition lwlock.h:30
#define HOLD_INTERRUPTS()
Definition miscadmin.h:136
void PGSemaphoreUnlock(PGSemaphore sema)
Definition posix_sema.c:333
void PGSemaphoreLock(PGSemaphore sema)
Definition posix_sema.c:313
PGPROC * MyProc
Definition proc.c:71
PGSemaphore sem
Definition proc.h:258
uint8 lwWaiting
Definition proc.h:283
BufferLockMode lockmode
Definition bufmgr.c:112
PrivateRefCountData data
Definition bufmgr.c:130
static void pgstat_report_wait_start(uint32 wait_event_info)
Definition wait_event.h:67
static void pgstat_report_wait_end(void)
Definition wait_event.h:83

References Assert, BM_LOCK_WAKE_IN_PROGRESS, PrivateRefCountEntry::buffer, BUFFER_LOCK_EXCLUSIVE, BUFFER_LOCK_SHARE, BUFFER_LOCK_SHARE_EXCLUSIVE, BUFFER_LOCK_UNLOCK, BufferLockAttempt(), BufferLockDequeueSelf(), BufferLockQueueSelf(), PrivateRefCountEntry::data, fb(), GetPrivateRefCountEntry(), HOLD_INTERRUPTS, likely, PrivateRefCountData::lockmode, LW_WS_NOT_WAITING, PGPROC::lwWaiting, mode, MyProc, pg_atomic_fetch_and_u64(), pg_unreachable, PGSemaphoreLock(), PGSemaphoreUnlock(), pgstat_report_wait_end(), pgstat_report_wait_start(), PGPROC::sem, and unlikely.

Referenced by FlushUnlockedBuffer(), LockBufferInternal(), and MarkDirtyUnpinnedBufferInternal().

◆ BufferLockAttempt()

static bool BufferLockAttempt ( BufferDesc *buf_hdr,
BufferLockMode  mode 
)
inline static

Definition at line 6105 of file bufmgr.c.

6106{
6108
6109 /*
6110 * Read once outside the loop, later iterations will get the newer value
6111 * via compare & exchange.
6112 */
6114
6115 /* loop until we've determined whether we could acquire the lock or not */
6116 while (true)
6117 {
6119 bool lock_free;
6120
6122
6124 {
6125 lock_free = (old_state & BM_LOCK_MASK) == 0;
6126 if (lock_free)
6128 }
6130 {
6132 if (lock_free)
6134 }
6135 else
6136 {
6138 if (lock_free)
6140 }
6141
6142 /*
6143 * Attempt to swap in the state we are expecting. If we didn't see
6144 * lock to be free, that's just the old value. If we saw it as free,
6145 * we'll attempt to mark it acquired. The reason that we always swap
6146 * in the value is that this doubles as a memory barrier. We could try
6147 * to be smarter and only swap in values if we saw the lock as free,
6148 * but benchmarks haven't shown it as beneficial so far.
6149 *
6150 * Retry if the value changed since we last looked at it.
6151 */
6154 {
6155 if (lock_free)
6156 {
6157 /* Great! Got the lock. */
6158 return false;
6159 }
6160 else
6161 return true; /* somebody else has the lock */
6162 }
6163 }
6164
6166}
static bool pg_atomic_compare_exchange_u64(volatile pg_atomic_uint64 *ptr, uint64 *expected, uint64 newval)
Definition atomics.h:522
#define BM_LOCK_VAL_SHARED
#define BM_LOCK_VAL_EXCLUSIVE
#define BM_LOCK_MASK
#define BM_LOCK_VAL_SHARE_EXCLUSIVE

References BM_LOCK_MASK, BM_LOCK_VAL_EXCLUSIVE, BM_LOCK_VAL_SHARE_EXCLUSIVE, BM_LOCK_VAL_SHARED, BUFFER_LOCK_EXCLUSIVE, BUFFER_LOCK_SHARE_EXCLUSIVE, fb(), likely, mode, pg_atomic_compare_exchange_u64(), pg_atomic_read_u64(), and pg_unreachable.

Referenced by BufferLockAcquire(), and BufferLockConditional().

◆ BufferLockConditional()

static bool BufferLockConditional ( Buffer  buffer,
BufferDesc *buf_hdr,
BufferLockMode  mode 
)
static

Definition at line 6059 of file bufmgr.c.

6060{
6061 PrivateRefCountEntry *entry = GetPrivateRefCountEntry(buffer, true);
6062 bool mustwait;
6063
6064 /*
6065 * As described above, if we're trying to lock a buffer this backend
6066 * already has locked, return false, independent of the existing and
6067 * desired lock level.
6068 */
6069 if (entry->data.lockmode != BUFFER_LOCK_UNLOCK)
6070 return false;
6071
6072 /*
6073 * Lock out cancel/die interrupts until we exit the code section protected
6074 * by the content lock. This ensures that interrupts will not interfere
6075 * with manipulations of data structures in shared memory.
6076 */
6078
6079 /* Check for the lock */
6081
6082 if (mustwait)
6083 {
6084 /* Failed to get lock, so release interrupt holdoff */
6086 }
6087 else
6088 {
6089 entry->data.lockmode = mode;
6090 }
6091
6092 return !mustwait;
6093}
#define RESUME_INTERRUPTS()
Definition miscadmin.h:138

References PrivateRefCountEntry::buffer, BUFFER_LOCK_UNLOCK, BufferLockAttempt(), PrivateRefCountEntry::data, fb(), GetPrivateRefCountEntry(), HOLD_INTERRUPTS, PrivateRefCountData::lockmode, mode, and RESUME_INTERRUPTS.

Referenced by ConditionalLockBuffer(), and GetVictimBuffer().

◆ BufferLockDequeueSelf()

static void BufferLockDequeueSelf ( BufferDesc *buf_hdr)
static

Definition at line 6212 of file bufmgr.c.

6213{
6214 bool on_waitlist;
6215
6217
6219 if (on_waitlist)
6220 proclist_delete(&buf_hdr->lock_waiters, MyProcNumber, lwWaitLink);
6221
6222 if (proclist_is_empty(&buf_hdr->lock_waiters) &&
6224 {
6226 }
6227
6228 /* XXX: combine with fetch_and above? */
6230
6231 /* clear waiting state again, nice for debugging */
6232 if (on_waitlist)
6234 else
6235 {
6236 int extraWaits = 0;
6237
6238
6239 /*
6240 * Somebody else dequeued us and has or will wake us up. Deal with the
6241 * superfluous absorption of a wakeup.
6242 */
6243
6244 /*
6245 * Clear BM_LOCK_WAKE_IN_PROGRESS if somebody woke us before we
6246 * removed ourselves - they'll have set it.
6247 */
6249
6250 /*
6251 * Now wait for the scheduled wakeup, otherwise our ->lwWaiting would
6252 * get reset at some inconvenient point later. Most of the time this
6253 * will immediately return.
6254 */
6255 for (;;)
6256 {
6259 break;
6260 extraWaits++;
6261 }
6262
6263 /*
6264 * Fix the process wait semaphore's count for any absorbed wakeups.
6265 */
6266 while (extraWaits-- > 0)
6268 }
6269}
#define BM_LOCK_HAS_WAITERS
@ LW_WS_WAITING
Definition lwlock.h:31
#define proclist_delete(list, procno, link_member)
Definition proclist.h:187
static bool proclist_is_empty(const proclist_head *list)
Definition proclist.h:38

References BM_LOCK_HAS_WAITERS, BM_LOCK_WAKE_IN_PROGRESS, fb(), LockBufHdr(), LW_WS_NOT_WAITING, LW_WS_WAITING, PGPROC::lwWaiting, MyProc, MyProcNumber, pg_atomic_fetch_and_u64(), pg_atomic_read_u64(), PGSemaphoreLock(), PGSemaphoreUnlock(), proclist_delete, proclist_is_empty(), PGPROC::sem, and UnlockBufHdr().

Referenced by BufferLockAcquire().

◆ BufferLockDisown()

static void BufferLockDisown ( Buffer  buffer,
BufferDesc *buf_hdr 
)
inline static

Definition at line 6280 of file bufmgr.c.

6281{
6284}
static int BufferLockDisownInternal(Buffer buffer, BufferDesc *buf_hdr)
Definition bufmgr.c:6294

References PrivateRefCountEntry::buffer, BufferLockDisownInternal(), fb(), and RESUME_INTERRUPTS.

Referenced by buffer_stage_common().

◆ BufferLockDisownInternal()

static int BufferLockDisownInternal ( Buffer  buffer,
BufferDesc *buf_hdr 
)
inline static

Definition at line 6294 of file bufmgr.c.

6295{
6298
6299 ref = GetPrivateRefCountEntry(buffer, false);
6300 if (ref == NULL)
6301 elog(ERROR, "lock %d is not held", buffer);
6302 mode = ref->data.lockmode;
6303 ref->data.lockmode = BUFFER_LOCK_UNLOCK;
6304
6305 return mode;
6306}
BufferLockMode
Definition bufmgr.h:206
#define ERROR
Definition elog.h:40

References PrivateRefCountEntry::buffer, BUFFER_LOCK_UNLOCK, elog, ERROR, fb(), GetPrivateRefCountEntry(), and mode.

Referenced by BufferLockDisown(), BufferLockUnlock(), and UnlockReleaseBuffer().

◆ BufferLockHeldByMe()

static bool BufferLockHeldByMe ( BufferDesc *buf_hdr)
static

Definition at line 6552 of file bufmgr.c.

6553{
6554 PrivateRefCountEntry *entry =
6556
6557 if (!entry)
6558 return false;
6559 else
6560 return entry->data.lockmode != BUFFER_LOCK_UNLOCK;
6561}
static Buffer BufferDescriptorGetBuffer(const BufferDesc *bdesc)

References BUFFER_LOCK_UNLOCK, BufferDescriptorGetBuffer(), PrivateRefCountEntry::data, fb(), GetPrivateRefCountEntry(), and PrivateRefCountData::lockmode.

Referenced by buffer_stage_common(), BufferIsLockedByMe(), and UnpinBufferNoOwner().

◆ BufferLockHeldByMeInMode()

static bool BufferLockHeldByMeInMode ( BufferDesc *buf_hdr,
BufferLockMode  mode 
)
static

Definition at line 6534 of file bufmgr.c.

6535{
6536 PrivateRefCountEntry *entry =
6538
6539 if (!entry)
6540 return false;
6541 else
6542 return entry->data.lockmode == mode;
6543}

References BufferDescriptorGetBuffer(), PrivateRefCountEntry::data, fb(), GetPrivateRefCountEntry(), PrivateRefCountData::lockmode, and mode.

Referenced by BufferIsLockedByMeInMode(), FlushBuffer(), and MarkSharedBufferDirtyHint().

◆ BufferLockProcessRelease()

static void BufferLockProcessRelease ( BufferDesc *buf_hdr,
BufferLockMode  mode,
uint64  lockstate 
)
static

Definition at line 6479 of file bufmgr.c.

6480{
6481 bool check_waiters = false;
6482 bool wake_exclusive = false;
6483
6484 /* nobody else can have that kind of lock */
6486
6487 /*
6488 * If we're still waiting for backends to get scheduled, don't wake them
6489 * up again. Otherwise check if we need to look through the waitqueue to
6490 * wake other backends.
6491 */
6494 {
6495 if ((lockstate & BM_LOCK_MASK) == 0)
6496 {
6497 /*
6498 * We released a lock and the lock was, in that moment, free. We
6499 * therefore can wake waiters for any kind of lock.
6500 */
6501 check_waiters = true;
6502 wake_exclusive = true;
6503 }
6505 {
6506 /*
6507 * We released the lock, but another backend still holds a lock.
6508 * We can't have released an exclusive lock, as there couldn't
6509 * have been other lock holders. If we released a share lock, no
6510 * waiters need to be woken up, as there must be other share
6511 * lockers. However, if we held a share-exclusive lock, another
6512 * backend now could acquire a share-exclusive lock.
6513 */
6514 check_waiters = true;
6515 wake_exclusive = false;
6516 }
6517 }
6518
6519 /*
6520 * As waking up waiters requires the spinlock to be acquired, only do so
6521 * if necessary.
6522 */
6523 if (check_waiters)
6525}
static void BufferLockWakeup(BufferDesc *buf_hdr, bool wake_exclusive)
Definition bufmgr.c:6314

References Assert, BM_LOCK_HAS_WAITERS, BM_LOCK_MASK, BM_LOCK_VAL_EXCLUSIVE, BM_LOCK_WAKE_IN_PROGRESS, BUFFER_LOCK_SHARE_EXCLUSIVE, BufferLockWakeup(), fb(), and mode.

Referenced by BufferLockUnlock(), and UnlockReleaseBuffer().

◆ BufferLockQueueSelf()

static void BufferLockQueueSelf ( BufferDesc *buf_hdr,
BufferLockMode  mode 
)
static

Definition at line 6172 of file bufmgr.c.

6173{
6174 /*
6175 * If we don't have a PGPROC structure, there's no way to wait. This
6176 * should never occur, since MyProc should only be null during shared
6177 * memory initialization.
6178 */
6179 if (MyProc == NULL)
6180 elog(PANIC, "cannot wait without a PGPROC structure");
6181
6183 elog(PANIC, "queueing for lock while waiting on another one");
6184
6186
6187 /* setting the flag is protected by the spinlock */
6189
6190 /*
6191 * These are currently used both for lwlocks and buffer content locks,
6192 * which is acceptable, although not pretty, because a backend can't wait
6193 * for both types of locks at the same time.
6194 */
6197
6198 proclist_push_tail(&buf_hdr->lock_waiters, MyProcNumber, lwWaitLink);
6199
6200 /* Can release the mutex now */
6202}
static uint64 pg_atomic_fetch_or_u64(volatile pg_atomic_uint64 *ptr, uint64 or_)
Definition atomics.h:560
#define PANIC
Definition elog.h:44
#define proclist_push_tail(list, procno, link_member)
Definition proclist.h:191
uint8 lwWaitMode
Definition proc.h:284

References BM_LOCK_HAS_WAITERS, elog, fb(), LockBufHdr(), LW_WS_NOT_WAITING, LW_WS_WAITING, PGPROC::lwWaiting, PGPROC::lwWaitMode, mode, MyProc, MyProcNumber, PANIC, pg_atomic_fetch_or_u64(), proclist_push_tail, and UnlockBufHdr().

Referenced by BufferLockAcquire().

◆ BufferLockReleaseSub()

static uint64 BufferLockReleaseSub ( BufferLockMode  mode)
inline static

Definition at line 6450 of file bufmgr.c.

6451{
6452 /*
6453 * Turns out that a switch() leads gcc to generate sufficiently worse code
6454 * for this to show up in profiles...
6455 */
6457 return BM_LOCK_VAL_EXCLUSIVE;
6460 else
6461 {
6463 return BM_LOCK_VAL_SHARED;
6464 }
6465
6466 return 0; /* keep compiler quiet */
6467}

References Assert, BM_LOCK_VAL_EXCLUSIVE, BM_LOCK_VAL_SHARE_EXCLUSIVE, BM_LOCK_VAL_SHARED, BUFFER_LOCK_EXCLUSIVE, BUFFER_LOCK_SHARE, BUFFER_LOCK_SHARE_EXCLUSIVE, and mode.

Referenced by BufferLockUnlock(), and UnlockReleaseBuffer().

◆ BufferLockUnlock()

static void BufferLockUnlock ( Buffer  buffer,
BufferDesc *buf_hdr 
)
static

Definition at line 6023 of file bufmgr.c.

6024{
6027 uint64 sub;
6028
6030
6031 /*
6032 * Release my hold on lock, after that it can immediately be acquired by
6033 * others, even if we still have to wakeup other waiters.
6034 */
6036
6038
6040
6041 /*
6042 * Now okay to allow cancel/die interrupts.
6043 */
6045}
static uint64 pg_atomic_sub_fetch_u64(volatile pg_atomic_uint64 *ptr, int64 sub_)
Definition atomics.h:578
static void BufferLockProcessRelease(BufferDesc *buf_hdr, BufferLockMode mode, uint64 lockstate)
Definition bufmgr.c:6479
static uint64 BufferLockReleaseSub(BufferLockMode mode)
Definition bufmgr.c:6450

References PrivateRefCountEntry::buffer, BufferLockDisownInternal(), BufferLockProcessRelease(), BufferLockReleaseSub(), fb(), mode, pg_atomic_sub_fetch_u64(), and RESUME_INTERRUPTS.

Referenced by FlushUnlockedBuffer(), MarkDirtyUnpinnedBufferInternal(), ResOwnerReleaseBuffer(), and UnlockBuffer().

◆ BufferLockWakeup()

static void BufferLockWakeup ( BufferDesc buf_hdr,
bool  wake_exclusive 
)
static

Definition at line 6314 of file bufmgr.c.

6315{
6316 bool new_wake_in_progress = false;
6317 bool wake_share_exclusive = true;
6320
6322
6323 /* lock wait list while collecting backends to wake up */
6325
6326 proclist_foreach_modify(iter, &buf_hdr->lock_waiters, lwWaitLink)
6327 {
6328 PGPROC *waiter = GetPGProcByNumber(iter.cur);
6329
6330 /*
6331 * Already woke up a conflicting lock, so skip over this wait list
6332 * entry.
6333 */
6335 continue;
6337 continue;
6338
6339 proclist_delete(&buf_hdr->lock_waiters, iter.cur, lwWaitLink);
6340 proclist_push_tail(&wakeup, iter.cur, lwWaitLink);
6341
6342 /*
6343 * Prevent additional wakeups until retryer gets to run. Backends that
6344 * are just waiting for the lock to become free don't retry
6345 * automatically.
6346 */
6347 new_wake_in_progress = true;
6348
6349 /*
6350 * Signal that the process isn't on the wait list anymore. This allows
6351 * BufferLockDequeueSelf() to remove itself from the waitlist with a
6352 * proclist_delete(), rather than having to check if it has been
6353 * removed from the list.
6354 */
6355 Assert(waiter->lwWaiting == LW_WS_WAITING);
6357
6358 /*
6359 * Don't wake up further waiters after waking a conflicting waiter.
6360 */
6361 if (waiter->lwWaitMode == BUFFER_LOCK_SHARE)
6362 {
6363 /*
6364 * Share locks conflict with exclusive locks.
6365 */
6366 wake_exclusive = false;
6367 }
6368 else if (waiter->lwWaitMode == BUFFER_LOCK_SHARE_EXCLUSIVE)
6369 {
6370 /*
6371 * Share-exclusive locks conflict with share-exclusive and
6372 * exclusive locks.
6373 */
6374 wake_exclusive = false;
6375 wake_share_exclusive = false;
6376 }
6377 else if (waiter->lwWaitMode == BUFFER_LOCK_EXCLUSIVE)
6378 {
6379 /*
6380 * Exclusive locks conflict with all other locks, there's no point
6381 * in waking up anybody else.
6382 */
6383 break;
6384 }
6385 }
6386
6388
6389 /* unset required flags, and release lock, in one fell swoop */
6390 {
6393
6395 while (true)
6396 {
6398
6399 /* compute desired flags */
6400
6403 else
6405
6406 if (proclist_is_empty(&buf_hdr->lock_waiters))
6408
6409 desired_state &= ~BM_LOCKED; /* release lock */
6410
6413 break;
6414 }
6415 }
6416
6417 /* Awaken any waiters I removed from the queue. */
6418 proclist_foreach_modify(iter, &wakeup, lwWaitLink)
6419 {
6420 PGPROC *waiter = GetPGProcByNumber(iter.cur);
6421
6422 proclist_delete(&wakeup, iter.cur, lwWaitLink);
6423
6424 /*
6425 * Guarantee that lwWaiting being unset only becomes visible once the
6426 * unlink from the list has completed. Otherwise the target backend
6427 * could be woken up for another reason and enqueue for a new lock - if
6428 * that happens before the list unlink happens, the list would end up
6429 * being corrupted.
6430 *
6431 * The barrier pairs with the LockBufHdr() when enqueuing for another
6432 * lock.
6433 */
6435 waiter->lwWaiting = LW_WS_NOT_WAITING;
6436 PGSemaphoreUnlock(waiter->sem);
6437 }
6438}
#define pg_write_barrier()
Definition atomics.h:155
@ LW_WS_PENDING_WAKEUP
Definition lwlock.h:32
#define GetPGProcByNumber(n)
Definition proc.h:504
static void proclist_init(proclist_head *list)
Definition proclist.h:29
#define proclist_foreach_modify(iter, lhead, link_member)
Definition proclist.h:206
Definition proc.h:179
static TimestampTz wakeup[NUM_WALRCV_WAKEUPS]

References Assert, BM_LOCK_HAS_WAITERS, BM_LOCK_WAKE_IN_PROGRESS, BUFFER_LOCK_EXCLUSIVE, BUFFER_LOCK_SHARE, BUFFER_LOCK_SHARE_EXCLUSIVE, proclist_mutable_iter::cur, fb(), GetPGProcByNumber, LockBufHdr(), LW_WS_NOT_WAITING, LW_WS_PENDING_WAKEUP, LW_WS_WAITING, PGPROC::lwWaiting, PGPROC::lwWaitMode, pg_atomic_compare_exchange_u64(), pg_atomic_read_u64(), pg_write_barrier, PGSemaphoreUnlock(), proclist_delete, proclist_foreach_modify, proclist_init(), proclist_is_empty(), proclist_push_tail, PGPROC::sem, and wakeup.

Referenced by BufferLockProcessRelease().

◆ BufferSetHintBits16()

bool BufferSetHintBits16 ( uint16 ptr,
uint16  val,
Buffer  buffer 
)

Definition at line 7102 of file bufmgr.c.

7103{
7106#ifdef USE_ASSERT_CHECKING
7107 char *page;
7108
7109 /* verify that the address is on the page */
7110 page = BufferGetPage(buffer);
7111 Assert((char *) ptr >= page && (char *) ptr < (page + BLCKSZ));
7112#endif
7113
7114 if (BufferIsLocal(buffer))
7115 {
7116 *ptr = val;
7117
7118 MarkLocalBufferDirty(buffer);
7119
7120 return true;
7121 }
7122
7123 buf_hdr = GetBufferDescriptor(buffer - 1);
7124
7126 {
7127 *ptr = val;
7128
7130
7131 return true;
7132 }
7133
7134 return false;
7135}
static void MarkSharedBufferDirtyHint(Buffer buffer, BufferDesc *bufHdr, uint64 lockstate, bool buffer_std)
Definition bufmgr.c:5705
long val
Definition informix.c:689
void MarkLocalBufferDirty(Buffer buffer)
Definition localbuf.c:500

References Assert, PrivateRefCountEntry::buffer, BufferGetPage(), BufferIsLocal, fb(), GetBufferDescriptor(), MarkLocalBufferDirty(), MarkSharedBufferDirtyHint(), SharedBufferBeginSetHintBits(), and val.

Referenced by SetHintBitsExt().

◆ BufferSync()

static void BufferSync ( int  flags)
static

Definition at line 3561 of file bufmgr.c.

3562{
3564 int buf_id;
3565 int num_to_scan;
3566 int num_spaces;
3567 int num_processed;
3568 int num_written;
3570 Oid last_tsid;
3572 int i;
3573 uint64 mask = BM_DIRTY;
3575
3576 /*
3577 * Unless this is a shutdown checkpoint or we have been explicitly told,
3578 * we write only permanent, dirty buffers. But at shutdown or end of
3579 * recovery, we write all dirty buffers.
3580 */
3583 mask |= BM_PERMANENT;
3584
3585 /*
3586 * Loop over all buffers, and mark the ones that need to be written with
3587 * BM_CHECKPOINT_NEEDED. Count them as we go (num_to_scan), so that we
3588 * can estimate how much work needs to be done.
3589 *
3590 * This allows us to write only those pages that were dirty when the
3591 * checkpoint began, and not those that get dirtied while it proceeds.
3592 * Whenever a page with BM_CHECKPOINT_NEEDED is written out, either by us
3593 * later in this function, or by normal backends or the bgwriter cleaning
3594 * scan, the flag is cleared. Any buffer dirtied after this point won't
3595 * have the flag set.
3596 *
3597 * Note that if we fail to write some buffer, we may leave buffers with
3598 * BM_CHECKPOINT_NEEDED still set. This is OK since any such buffer would
3599 * certainly need to be written for the next checkpoint attempt, too.
3600 */
3601 num_to_scan = 0;
3602 for (buf_id = 0; buf_id < NBuffers; buf_id++)
3603 {
3605 uint64 set_bits = 0;
3606
3607 /*
3608 * Header spinlock is enough to examine BM_DIRTY, see comment in
3609 * SyncOneBuffer.
3610 */
3612
3613 if ((buf_state & mask) == mask)
3614 {
3615 CkptSortItem *item;
3616
3618
3619 item = &CkptBufferIds[num_to_scan++];
3620 item->buf_id = buf_id;
3621 item->tsId = bufHdr->tag.spcOid;
3622 item->relNumber = BufTagGetRelNumber(&bufHdr->tag);
3623 item->forkNum = BufTagGetForkNum(&bufHdr->tag);
3624 item->blockNum = bufHdr->tag.blockNum;
3625 }
3626
3628 set_bits, 0,
3629 0);
3630
3631 /* Check for barrier events in case NBuffers is large. */
3634 }
3635
3636 if (num_to_scan == 0)
3637 return; /* nothing to do */
3638
3640
3642
3643 /*
3644 * Sort buffers that need to be written to reduce the likelihood of random
3645 * IO. The sorting is also important for the implementation of balancing
3646 * writes between tablespaces. Without balancing writes we'd potentially
3647 * end up writing to the tablespaces one-by-one; possibly overloading the
3648 * underlying system.
3649 */
3651
3652 num_spaces = 0;
3653
3654 /*
3655 * Allocate progress status for each tablespace with buffers that need to
3656 * be flushed. This requires the to-be-flushed array to be sorted.
3657 */
3659 for (i = 0; i < num_to_scan; i++)
3660 {
3661 CkptTsStatus *s;
3662 Oid cur_tsid;
3663
3665
3666 /*
3667 * Grow array of per-tablespace status structs, every time a new
3668 * tablespace is found.
3669 */
3671 {
3672 Size sz;
3673
3674 num_spaces++;
3675
3676 /*
3677 * Not worth adding grow-by-power-of-2 logic here - even with a
3678 * few hundred tablespaces this should be fine.
3679 */
3680 sz = sizeof(CkptTsStatus) * num_spaces;
3681
3682 if (per_ts_stat == NULL)
3684 else
3686
3687 s = &per_ts_stat[num_spaces - 1];
3688 memset(s, 0, sizeof(*s));
3689 s->tsId = cur_tsid;
3690
3691 /*
3692 * The first buffer in this tablespace. As CkptBufferIds is sorted
3693 * by tablespace all (s->num_to_scan) buffers in this tablespace
3694 * will follow afterwards.
3695 */
3696 s->index = i;
3697
3698 /*
3699 * progress_slice will be determined once we know how many buffers
3700 * are in each tablespace, i.e. after this loop.
3701 */
3702
3704 }
3705 else
3706 {
3707 s = &per_ts_stat[num_spaces - 1];
3708 }
3709
3710 s->num_to_scan++;
3711
3712 /* Check for barrier events. */
3715 }
3716
3717 Assert(num_spaces > 0);
3718
3719 /*
3720 * Build a min-heap over the write-progress in the individual tablespaces,
3721 * and compute how large a portion of the total progress a single
3722 * processed buffer is.
3723 */
3726 NULL);
3727
3728 for (i = 0; i < num_spaces; i++)
3729 {
3731
3732 ts_stat->progress_slice = (float8) num_to_scan / ts_stat->num_to_scan;
3733
3735 }
3736
3738
3739 /*
3740 * Iterate through to-be-checkpointed buffers and write the ones (still)
3741 * marked with BM_CHECKPOINT_NEEDED. The writes are balanced between
3742 * tablespaces; otherwise the sorting would lead to only one tablespace
3743 * receiving writes at a time, making inefficient use of the hardware.
3744 */
3745 num_processed = 0;
3746 num_written = 0;
3747 while (!binaryheap_empty(ts_heap))
3748 {
3752
3753 buf_id = CkptBufferIds[ts_stat->index].buf_id;
3754 Assert(buf_id != -1);
3755
3756 bufHdr = GetBufferDescriptor(buf_id);
3757
3758 num_processed++;
3759
3760 /*
3761 * We don't need to acquire the lock here, because we're only looking
3762 * at a single bit. It's possible that someone else writes the buffer
3763 * and clears the flag right after we check, but that doesn't matter
3764 * since SyncOneBuffer will then do nothing. However, there is a
3765 * further race condition: it's conceivable that between the time we
3766 * examine the bit here and the time SyncOneBuffer acquires the lock,
3767 * someone else not only wrote the buffer but replaced it with another
3768 * page and dirtied it. In that improbable case, SyncOneBuffer will
3769 * write the buffer though we didn't need to. It doesn't seem worth
3770 * guarding against this, though.
3771 */
3773 {
3774 if (SyncOneBuffer(buf_id, false, &wb_context) & BUF_WRITTEN)
3775 {
3778 num_written++;
3779 }
3780 }
3781
3782 /*
3783 * Measure progress independent of actually having to flush the buffer
3784 * - otherwise writing becomes unbalanced.
3785 */
3786 ts_stat->progress += ts_stat->progress_slice;
3787 ts_stat->num_scanned++;
3788 ts_stat->index++;
3789
3790 /* Have all the buffers from the tablespace been processed? */
3791 if (ts_stat->num_scanned == ts_stat->num_to_scan)
3792 {
3794 }
3795 else
3796 {
3797 /* update heap with the new progress */
3799 }
3800
3801 /*
3802 * Sleep to throttle our I/O rate.
3803 *
3804 * (This will check for barrier events even if it doesn't sleep.)
3805 */
3806 CheckpointWriteDelay(flags, (double) num_processed / num_to_scan);
3807 }
3808
3809 /*
3810 * Issue all pending flushes. Only checkpointer calls BufferSync(), so
3811 * IOContext will always be IOCONTEXT_NORMAL.
3812 */
3814
3816 per_ts_stat = NULL;
3818
3819 /*
3820 * Update checkpoint statistics. As noted above, this doesn't include
3821 * buffers written by other backends or bgwriter scan.
3822 */
3824
3826}
void binaryheap_build(binaryheap *heap)
Definition binaryheap.c:136
void binaryheap_replace_first(binaryheap *heap, bh_node_type d)
Definition binaryheap.c:253
bh_node_type binaryheap_first(binaryheap *heap)
Definition binaryheap.c:175
bh_node_type binaryheap_remove_first(binaryheap *heap)
Definition binaryheap.c:190
void binaryheap_free(binaryheap *heap)
Definition binaryheap.c:73
void binaryheap_add_unordered(binaryheap *heap, bh_node_type d)
Definition binaryheap.c:114
binaryheap * binaryheap_allocate(int capacity, binaryheap_comparator compare, void *arg)
Definition binaryheap.c:37
#define binaryheap_empty(h)
Definition binaryheap.h:65
CkptSortItem * CkptBufferIds
Definition buf_init.c:28
static RelFileNumber BufTagGetRelNumber(const BufferTag *tag)
#define BM_CHECKPOINT_NEEDED
static int ts_ckpt_progress_comparator(Datum a, Datum b, void *arg)
Definition bufmgr.c:7664
int checkpoint_flush_after
Definition bufmgr.c:223
void WritebackContextInit(WritebackContext *context, int *max_pending)
Definition bufmgr.c:7687
void IssuePendingWritebacks(WritebackContext *wb_context, IOContext io_context)
Definition bufmgr.c:7749
double float8
Definition c.h:714
size_t Size
Definition c.h:689
void CheckpointWriteDelay(int flags, double progress)
volatile sig_atomic_t ProcSignalBarrierPending
Definition globals.c:40
void * repalloc(void *pointer, Size size)
Definition mcxt.c:1635
void pfree(void *pointer)
Definition mcxt.c:1619
void * palloc(Size size)
Definition mcxt.c:1390
PgStat_CheckpointerStats PendingCheckpointerStats
static Pointer DatumGetPointer(Datum X)
Definition postgres.h:332
#define PointerGetDatum(X)
Definition postgres.h:354
#define InvalidOid
unsigned int Oid
void ProcessProcSignalBarrier(void)
Definition procsignal.c:503
int ckpt_bufs_written
Definition xlog.h:179
ForkNumber forkNum
RelFileNumber relNumber
BlockNumber blockNum
float8 progress_slice
Definition bufmgr.c:164
int num_to_scan
Definition bufmgr.c:167
PgStat_Counter buffers_written
Definition pgstat.h:270
CheckpointStatsData CheckpointStats
Definition xlog.c:216
#define CHECKPOINT_FLUSH_UNLOGGED
Definition xlog.h:155
#define CHECKPOINT_END_OF_RECOVERY
Definition xlog.h:152
#define CHECKPOINT_IS_SHUTDOWN
Definition xlog.h:151

References Assert, binaryheap_add_unordered(), binaryheap_allocate(), binaryheap_build(), binaryheap_empty, binaryheap_first(), binaryheap_free(), binaryheap_remove_first(), binaryheap_replace_first(), CkptSortItem::blockNum, BM_CHECKPOINT_NEEDED, BM_DIRTY, BM_PERMANENT, CkptSortItem::buf_id, BUF_WRITTEN, PgStat_CheckpointerStats::buffers_written, BufTagGetForkNum(), BufTagGetRelNumber(), CHECKPOINT_END_OF_RECOVERY, checkpoint_flush_after, CHECKPOINT_FLUSH_UNLOGGED, CHECKPOINT_IS_SHUTDOWN, CheckpointStats, CheckpointWriteDelay(), CheckpointStatsData::ckpt_bufs_written, CkptBufferIds, DatumGetPointer(), fb(), CkptSortItem::forkNum, GetBufferDescriptor(), i, CkptTsStatus::index, InvalidOid, IOCONTEXT_NORMAL, IssuePendingWritebacks(), LockBufHdr(), NBuffers, CkptTsStatus::num_to_scan, palloc(), PendingCheckpointerStats, pfree(), pg_atomic_read_u64(), PointerGetDatum, ProcessProcSignalBarrier(), ProcSignalBarrierPending, CkptTsStatus::progress_slice, CkptSortItem::relNumber, repalloc(), SyncOneBuffer(), ts_ckpt_progress_comparator(), CkptTsStatus::tsId, CkptSortItem::tsId, UnlockBufHdrExt(), and WritebackContextInit().

Referenced by CheckPointBuffers().

◆ buffertag_comparator()

static int buffertag_comparator ( const BufferTag ba,
const BufferTag bb 
)
inlinestatic

Definition at line 7599 of file bufmgr.c.

7600{
7601 int ret;
7604
7607
7609
7610 if (ret != 0)
7611 return ret;
7612
7614 return -1;
7616 return 1;
7617
7618 if (ba->blockNum < bb->blockNum)
7619 return -1;
7620 if (ba->blockNum > bb->blockNum)
7621 return 1;
7622
7623 return 0;
7624}
static int rlocator_comparator(const void *p1, const void *p2)
Definition bufmgr.c:7500

References BufTagGetForkNum(), BufTagGetRelFileLocator(), fb(), and rlocator_comparator().

◆ CheckBufferIsPinnedOnce()

void CheckBufferIsPinnedOnce ( Buffer  buffer)

Definition at line 6646 of file bufmgr.c.

6647{
6648 if (BufferIsLocal(buffer))
6649 {
6650 if (LocalRefCount[-buffer - 1] != 1)
6651 elog(ERROR, "incorrect local pin count: %d",
6652 LocalRefCount[-buffer - 1]);
6653 }
6654 else
6655 {
6656 if (GetPrivateRefCount(buffer) != 1)
6657 elog(ERROR, "incorrect local pin count: %d",
6658 GetPrivateRefCount(buffer));
6659 }
6660}

References PrivateRefCountEntry::buffer, BufferIsLocal, elog, ERROR, GetPrivateRefCount(), and LocalRefCount.

Referenced by GetVictimBuffer(), lazy_scan_heap(), and LockBufferForCleanup().

◆ CheckForBufferLeaks()

static void CheckForBufferLeaks ( void  )
static

Definition at line 4272 of file bufmgr.c.

4273{
4274#ifdef USE_ASSERT_CHECKING
4275 int RefCountErrors = 0;
4277 int i;
4278 char *s;
4279
4280 /* check the array */
4281 for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
4282 {
4284 {
4285 res = &PrivateRefCountArray[i];
4286
4288 elog(WARNING, "buffer refcount leak: %s", s);
4289 pfree(s);
4290
4292 }
4293 }
4294
4295 /* if necessary search the hash */
4297 {
4298 refcount_iterator iter;
4299
4301 while ((res = refcount_iterate(PrivateRefCountHash, &iter)) != NULL)
4302 {
4304 elog(WARNING, "buffer refcount leak: %s", s);
4305 pfree(s);
4307 }
4308 }
4309
4310 Assert(RefCountErrors == 0);
4311#endif
4312}
#define InvalidBuffer
Definition buf.h:25
static Buffer PrivateRefCountArrayKeys[REFCOUNT_ARRAY_ENTRIES]
Definition bufmgr.c:263
static refcount_hash * PrivateRefCountHash
Definition bufmgr.c:265
char * DebugPrintBufferRefcount(Buffer buffer)
Definition bufmgr.c:4398
#define REFCOUNT_ARRAY_ENTRIES
Definition bufmgr.c:145
static struct PrivateRefCountEntry PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES]
Definition bufmgr.c:264

References Assert, PrivateRefCountEntry::buffer, DebugPrintBufferRefcount(), elog, fb(), i, InvalidBuffer, pfree(), PrivateRefCountArray, PrivateRefCountArrayKeys, PrivateRefCountHash, PrivateRefCountOverflowed, REFCOUNT_ARRAY_ENTRIES, and WARNING.

Referenced by AtEOXact_Buffers(), and AtProcExit_Buffers().

◆ CheckPointBuffers()

void CheckPointBuffers ( int  flags)

Definition at line 4441 of file bufmgr.c.

4442{
4443 BufferSync(flags);
4444}
static void BufferSync(int flags)
Definition bufmgr.c:3561

References BufferSync().

Referenced by CheckPointGuts().

◆ CheckReadBuffersOperation()

static void CheckReadBuffersOperation ( ReadBuffersOperation operation,
bool  is_complete 
)
static

Definition at line 1656 of file bufmgr.c.

1657{
1658#ifdef USE_ASSERT_CHECKING
1659 Assert(operation->nblocks_done <= operation->nblocks);
1660 Assert(!is_complete || operation->nblocks == operation->nblocks_done);
1661
1662 for (int i = 0; i < operation->nblocks; i++)
1663 {
1664 Buffer buffer = operation->buffers[i];
1665 BufferDesc *buf_hdr = BufferIsLocal(buffer) ?
1666 GetLocalBufferDescriptor(-buffer - 1) :
1667 GetBufferDescriptor(buffer - 1);
1668
1669 Assert(BufferGetBlockNumber(buffer) == operation->blocknum + i);
1671
1672 if (i < operation->nblocks_done)
1674 }
1675#endif
1676}

References Assert, BM_TAG_VALID, BM_VALID, PrivateRefCountEntry::buffer, BufferGetBlockNumber(), BufferIsLocal, fb(), GetBufferDescriptor(), GetLocalBufferDescriptor(), i, operation, and pg_atomic_read_u64().

Referenced by StartReadBuffersImpl(), and WaitReadBuffers().

◆ ckpt_buforder_comparator()

static int ckpt_buforder_comparator ( const CkptSortItem a,
const CkptSortItem b 
)
inlinestatic

Definition at line 7633 of file bufmgr.c.

7634{
7635 /* compare tablespace */
7636 if (a->tsId < b->tsId)
7637 return -1;
7638 else if (a->tsId > b->tsId)
7639 return 1;
7640 /* compare relation */
7641 if (a->relNumber < b->relNumber)
7642 return -1;
7643 else if (a->relNumber > b->relNumber)
7644 return 1;
7645 /* compare fork */
7646 else if (a->forkNum < b->forkNum)
7647 return -1;
7648 else if (a->forkNum > b->forkNum)
7649 return 1;
7650 /* compare block number */
7651 else if (a->blockNum < b->blockNum)
7652 return -1;
7653 else if (a->blockNum > b->blockNum)
7654 return 1;
7655 /* equal page IDs are unlikely, but not impossible */
7656 return 0;
7657}
int b
Definition isn.c:74
int a
Definition isn.c:73

References a, and b.

◆ ConditionalLockBuffer()

bool ConditionalLockBuffer ( Buffer  buffer)

Definition at line 6626 of file bufmgr.c.

6627{
6628 BufferDesc *buf;
6629
6630 Assert(BufferIsPinned(buffer));
6631 if (BufferIsLocal(buffer))
6632 return true; /* act as though we got it */
6633
6634 buf = GetBufferDescriptor(buffer - 1);
6635
6637}
static bool BufferLockConditional(Buffer buffer, BufferDesc *buf_hdr, BufferLockMode mode)
Definition bufmgr.c:6059

References Assert, buf, PrivateRefCountEntry::buffer, BUFFER_LOCK_EXCLUSIVE, BufferIsLocal, BufferIsPinned, BufferLockConditional(), and GetBufferDescriptor().

Referenced by _bt_conditionallockbuf(), BloomNewBuffer(), ConditionalLockBufferForCleanup(), GinNewBuffer(), gistNewBuffer(), RelationGetBufferForTuple(), spgdoinsert(), SpGistGetBuffer(), SpGistNewBuffer(), and SpGistUpdateMetaPage().

◆ ConditionalLockBufferForCleanup()

bool ConditionalLockBufferForCleanup ( Buffer  buffer)

Definition at line 6852 of file bufmgr.c.

6853{
6856 refcount;
6857
6858 Assert(BufferIsValid(buffer));
6859
6860 /* see AIO related comment in LockBufferForCleanup() */
6861
6862 if (BufferIsLocal(buffer))
6863 {
6864 refcount = LocalRefCount[-buffer - 1];
6865 /* There should be exactly one pin */
6866 Assert(refcount > 0);
6867 if (refcount != 1)
6868 return false;
6869 /* Nobody else to wait for */
6870 return true;
6871 }
6872
6873 /* There should be exactly one local pin */
6874 refcount = GetPrivateRefCount(buffer);
6875 Assert(refcount);
6876 if (refcount != 1)
6877 return false;
6878
6879 /* Try to acquire lock */
6880 if (!ConditionalLockBuffer(buffer))
6881 return false;
6882
6883 bufHdr = GetBufferDescriptor(buffer - 1);
6886
6887 Assert(refcount > 0);
6888 if (refcount == 1)
6889 {
6890 /* Successfully acquired exclusive lock with pincount 1 */
6892 return true;
6893 }
6894
6895 /* Failed, so release the lock */
6898 return false;
6899}
bool ConditionalLockBuffer(Buffer buffer)
Definition bufmgr.c:6626
static void LockBuffer(Buffer buffer, BufferLockMode mode)
Definition bufmgr.h:334

References Assert, BUF_STATE_GET_REFCOUNT, PrivateRefCountEntry::buffer, BUFFER_LOCK_UNLOCK, BufferIsLocal, BufferIsValid(), ConditionalLockBuffer(), fb(), GetBufferDescriptor(), GetPrivateRefCount(), LocalRefCount, LockBuffer(), LockBufHdr(), and UnlockBufHdr().

Referenced by _hash_finish_split(), _hash_getbuf_with_condlock_cleanup(), heap_page_prune_opt(), and lazy_scan_heap().

◆ CreateAndCopyRelationData()

void CreateAndCopyRelationData ( RelFileLocator  src_rlocator,
RelFileLocator  dst_rlocator,
bool  permanent 
)

Definition at line 5471 of file bufmgr.c.

5473{
5474 char relpersistence;
5477
5478 /* Set the relpersistence. */
5479 relpersistence = permanent ?
5481
5484
5485 /*
5486 * Create and copy all forks of the relation. During create database we
5487 * have a separate cleanup mechanism which deletes complete database
5488 * directory. Therefore, each individual relation doesn't need to be
5489 * registered for cleanup.
5490 */
5491 RelationCreateStorage(dst_rlocator, relpersistence, false);
5492
5493 /* copy main fork. */
5495 permanent);
5496
5497 /* copy those extra forks that exist */
5498 for (ForkNumber forkNum = MAIN_FORKNUM + 1;
5499 forkNum <= MAX_FORKNUM; forkNum++)
5500 {
5501 if (smgrexists(src_rel, forkNum))
5502 {
5503 smgrcreate(dst_rel, forkNum, false);
5504
5505 /*
5506 * WAL log creation if the relation is persistent, or this is the
5507 * init fork of an unlogged relation.
5508 */
5509 if (permanent || forkNum == INIT_FORKNUM)
5510 log_smgrcreate(&dst_rlocator, forkNum);
5511
5512 /* Copy a fork's data, block by block. */
5514 permanent);
5515 }
5516 }
5517}
static void RelationCopyStorageUsingBuffer(RelFileLocator srclocator, RelFileLocator dstlocator, ForkNumber forkNum, bool permanent)
Definition bufmgr.c:5357
@ MAIN_FORKNUM
Definition relpath.h:58
#define MAX_FORKNUM
Definition relpath.h:70
SMgrRelation smgropen(RelFileLocator rlocator, ProcNumber backend)
Definition smgr.c:240
void smgrcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo)
Definition smgr.c:481
bool smgrexists(SMgrRelation reln, ForkNumber forknum)
Definition smgr.c:462
SMgrRelation RelationCreateStorage(RelFileLocator rlocator, char relpersistence, bool register_delete)
Definition storage.c:122
void log_smgrcreate(const RelFileLocator *rlocator, ForkNumber forkNum)
Definition storage.c:187

References fb(), INIT_FORKNUM, INVALID_PROC_NUMBER, log_smgrcreate(), MAIN_FORKNUM, MAX_FORKNUM, RelationCopyStorageUsingBuffer(), RelationCreateStorage(), smgrcreate(), smgrexists(), and smgropen().

Referenced by CreateDatabaseUsingWalLog().

◆ DebugPrintBufferRefcount()

char * DebugPrintBufferRefcount ( Buffer  buffer)

Definition at line 4398 of file bufmgr.c.

4399{
4400 BufferDesc *buf;
4402 char *result;
4403 ProcNumber backend;
4405
4406 Assert(BufferIsValid(buffer));
4407 if (BufferIsLocal(buffer))
4408 {
4409 buf = GetLocalBufferDescriptor(-buffer - 1);
4410 loccount = LocalRefCount[-buffer - 1];
4411 backend = MyProcNumber;
4412 }
4413 else
4414 {
4415 buf = GetBufferDescriptor(buffer - 1);
4416 loccount = GetPrivateRefCount(buffer);
4417 backend = INVALID_PROC_NUMBER;
4418 }
4419
4420 /* theoretically we should lock the bufHdr here */
4421 buf_state = pg_atomic_read_u64(&buf->state);
4422
4423 result = psprintf("[%03d] (rel=%s, blockNum=%u, flags=0x%" PRIx64 ", refcount=%u %d)",
4424 buffer,
4426 BufTagGetForkNum(&buf->tag)).str,
4427 buf->tag.blockNum, buf_state & BUF_FLAG_MASK,
4429 return result;
4430}
#define BUF_FLAG_MASK
char * psprintf(const char *fmt,...)
Definition psprintf.c:43

References Assert, buf, BUF_FLAG_MASK, BUF_STATE_GET_REFCOUNT, PrivateRefCountEntry::buffer, BufferIsLocal, BufferIsValid(), BufTagGetForkNum(), BufTagGetRelFileLocator(), fb(), GetBufferDescriptor(), GetLocalBufferDescriptor(), GetPrivateRefCount(), INVALID_PROC_NUMBER, LocalRefCount, MyProcNumber, pg_atomic_read_u64(), psprintf(), relpathbackend, and result.

Referenced by buffer_call_start_io(), buffer_call_terminate_io(), CheckForBufferLeaks(), CheckForLocalBufferLeaks(), and ResOwnerPrintBuffer().

◆ DropDatabaseBuffers()

void DropDatabaseBuffers ( Oid  dbid)

Definition at line 5124 of file bufmgr.c.

5125{
5126 int i;
5127
5128 /*
5129 * We needn't consider local buffers, since by assumption the target
5130 * database isn't our own.
5131 */
5132
5133 for (i = 0; i < NBuffers; i++)
5134 {
5136
5137 /*
5138 * As in DropRelationBuffers, an unlocked precheck should be safe and
5139 * saves some cycles.
5140 */
5141 if (bufHdr->tag.dbOid != dbid)
5142 continue;
5143
5145 if (bufHdr->tag.dbOid == dbid)
5146 InvalidateBuffer(bufHdr); /* releases spinlock */
5147 else
5149 }
5150}
static void InvalidateBuffer(BufferDesc *buf)
Definition bufmgr.c:2370

References fb(), GetBufferDescriptor(), i, InvalidateBuffer(), LockBufHdr(), NBuffers, and UnlockBufHdr().

Referenced by createdb_failure_callback(), dbase_redo(), dropdb(), and movedb().

◆ DropRelationBuffers()

void DropRelationBuffers ( SMgrRelation  smgr_reln,
ForkNumber forkNum,
int  nforks,
BlockNumber firstDelBlock 
)

Definition at line 4774 of file bufmgr.c.

4776{
4777 int i;
4778 int j;
4779 RelFileLocatorBackend rlocator;
4782
4783 rlocator = smgr_reln->smgr_rlocator;
4784
4785 /* If it's a local relation, it's localbuf.c's problem. */
4786 if (RelFileLocatorBackendIsTemp(rlocator))
4787 {
4788 if (rlocator.backend == MyProcNumber)
4789 DropRelationLocalBuffers(rlocator.locator, forkNum, nforks,
4791
4792 return;
4793 }
4794
4795 /*
4796 * To remove all the pages of the specified relation forks from the buffer
4797 * pool, we need to scan the entire buffer pool but we can optimize it by
4798 * finding the buffers from BufMapping table provided we know the exact
4799 * size of each fork of the relation. The exact size is required to ensure
4800 * that we don't leave any buffer for the relation being dropped as
4801 * otherwise the background writer or checkpointer can lead to a PANIC
4802 * error while flushing buffers corresponding to files that don't exist.
4803 *
4804 * To know the exact size, we rely on the size cached for each fork by us
4805 * during recovery which limits the optimization to recovery and on
4806 * standbys but we can easily extend it once we have a shared cache for
4807 * relation size.
4808 *
4809 * In recovery, we cache the value returned by the first lseek(SEEK_END)
4810 * and future writes keep the cached value up-to-date. See
4811 * smgrextend. It is possible that the value of the first lseek is smaller
4812 * than the actual number of existing blocks in the file due to buggy
4813 * Linux kernels that might not have accounted for the recent write. But
4814 * that should be fine because there must not be any buffers after that
4815 * file size.
4816 */
4817 for (i = 0; i < nforks; i++)
4818 {
4819 /* Get the number of blocks for a relation's fork */
4821
4823 {
4825 break;
4826 }
4827
4828 /* calculate the number of blocks to be invalidated */
4830 }
4831
4832 /*
4833 * We apply the optimization iff the total number of blocks to invalidate
4834 * is below the BUF_DROP_FULL_SCAN_THRESHOLD.
4835 */
4838 {
4839 for (j = 0; j < nforks; j++)
4840 FindAndDropRelationBuffers(rlocator.locator, forkNum[j],
4842 return;
4843 }
4844
4845 for (i = 0; i < NBuffers; i++)
4846 {
4848
4849 /*
4850 * We can make this a tad faster by prechecking the buffer tag before
4851 * we attempt to lock the buffer; this saves a lot of lock
4852 * acquisitions in typical cases. It should be safe because the
4853 * caller must have AccessExclusiveLock on the relation, or some other
4854 * reason to be certain that no one is loading new pages of the rel
4855 * into the buffer pool. (Otherwise we might well miss such pages
4856 * entirely.) Therefore, while the tag might be changing while we
4857 * look at it, it can't be changing *to* a value we care about, only
4858 * *away* from such a value. So false negatives are impossible, and
4859 * false positives are safe because we'll recheck after getting the
4860 * buffer lock.
4861 *
4862 * We could check forkNum and blockNum as well as the rlocator, but
4863 * the incremental win from doing so seems small.
4864 */
4865 if (!BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator.locator))
4866 continue;
4867
4869
4870 for (j = 0; j < nforks; j++)
4871 {
4872 if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator.locator) &&
4873 BufTagGetForkNum(&bufHdr->tag) == forkNum[j] &&
4874 bufHdr->tag.blockNum >= firstDelBlock[j])
4875 {
4876 InvalidateBuffer(bufHdr); /* releases spinlock */
4877 break;
4878 }
4879 }
4880 if (j >= nforks)
4882 }
4883}
#define InvalidBlockNumber
Definition block.h:33
static bool BlockNumberIsValid(BlockNumber blockNumber)
Definition block.h:71
static bool BufTagMatchesRelFileLocator(const BufferTag *tag, const RelFileLocator *rlocator)
#define BUF_DROP_FULL_SCAN_THRESHOLD
Definition bufmgr.c:95
static void FindAndDropRelationBuffers(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber nForkBlock, BlockNumber firstDelBlock)
Definition bufmgr.c:5064
int j
Definition isn.c:78
void DropRelationLocalBuffers(RelFileLocator rlocator, ForkNumber *forkNum, int nforks, BlockNumber *firstDelBlock)
Definition localbuf.c:689
#define RelFileLocatorBackendIsTemp(rlocator)
BlockNumber smgrnblocks_cached(SMgrRelation reln, ForkNumber forknum)
Definition smgr.c:847

References RelFileLocatorBackend::backend, BlockNumberIsValid(), BUF_DROP_FULL_SCAN_THRESHOLD, BufTagGetForkNum(), BufTagMatchesRelFileLocator(), DropRelationLocalBuffers(), fb(), FindAndDropRelationBuffers(), GetBufferDescriptor(), i, InvalidateBuffer(), InvalidBlockNumber, j, RelFileLocatorBackend::locator, LockBufHdr(), MAX_FORKNUM, MyProcNumber, NBuffers, RelFileLocatorBackendIsTemp, smgrnblocks_cached(), and UnlockBufHdr().

Referenced by smgrtruncate().

◆ DropRelationsAllBuffers()

void DropRelationsAllBuffers ( SMgrRelation * smgr_reln,
int  nlocators 
)

Definition at line 4894 of file bufmgr.c.

4895{
4896 int i;
4897 int n = 0;
4898 SMgrRelation *rels;
4899 BlockNumber (*block)[MAX_FORKNUM + 1];
4902 bool cached = true;
4903 bool use_bsearch;
4904
4905 if (nlocators == 0)
4906 return;
4907
4908 rels = palloc_array(SMgrRelation, nlocators); /* non-local relations */
4909
4910 /* If it's a local relation, it's localbuf.c's problem. */
4911 for (i = 0; i < nlocators; i++)
4912 {
4913 if (RelFileLocatorBackendIsTemp(smgr_reln[i]->smgr_rlocator))
4914 {
4915 if (smgr_reln[i]->smgr_rlocator.backend == MyProcNumber)
4916 DropRelationAllLocalBuffers(smgr_reln[i]->smgr_rlocator.locator);
4917 }
4918 else
4919 rels[n++] = smgr_reln[i];
4920 }
4921
4922 /*
4923 * If there are no non-local relations, then we're done. Release the
4924 * memory and return.
4925 */
4926 if (n == 0)
4927 {
4928 pfree(rels);
4929 return;
4930 }
4931
4932 /*
4933 * This is used to remember the number of blocks for all the relations
4934 * forks.
4935 */
4936 block = (BlockNumber (*)[MAX_FORKNUM + 1])
4937 palloc(sizeof(BlockNumber) * n * (MAX_FORKNUM + 1));
4938
4939 /*
4940 * We can avoid scanning the entire buffer pool if we know the exact size
4941 * of each of the given relation forks. See DropRelationBuffers.
4942 */
4943 for (i = 0; i < n && cached; i++)
4944 {
4945 for (int j = 0; j <= MAX_FORKNUM; j++)
4946 {
4947 /* Get the number of blocks for a relation's fork. */
4948 block[i][j] = smgrnblocks_cached(rels[i], j);
4949
4950 /* We only need to consider the relation forks that exist. */
4951 if (block[i][j] == InvalidBlockNumber)
4952 {
4953 if (!smgrexists(rels[i], j))
4954 continue;
4955 cached = false;
4956 break;
4957 }
4958
4959 /* calculate the total number of blocks to be invalidated */
4960 nBlocksToInvalidate += block[i][j];
4961 }
4962 }
4963
4964 /*
4965 * We apply the optimization iff the total number of blocks to invalidate
4966 * is below the BUF_DROP_FULL_SCAN_THRESHOLD.
4967 */
4969 {
4970 for (i = 0; i < n; i++)
4971 {
4972 for (int j = 0; j <= MAX_FORKNUM; j++)
4973 {
4974 /* ignore relation forks that don't exist */
4975 if (!BlockNumberIsValid(block[i][j]))
4976 continue;
4977
4978 /* drop all the buffers for a particular relation fork */
4979 FindAndDropRelationBuffers(rels[i]->smgr_rlocator.locator,
4980 j, block[i][j], 0);
4981 }
4982 }
4983
4984 pfree(block);
4985 pfree(rels);
4986 return;
4987 }
4988
4989 pfree(block);
4990 locators = palloc_array(RelFileLocator, n); /* non-local relations */
4991 for (i = 0; i < n; i++)
4992 locators[i] = rels[i]->smgr_rlocator.locator;
4993
4994 /*
4995 * For a low number of relations to drop, just use a simple walk-through, to
4996 * save the bsearch overhead. The threshold to use is rather a guess than
4997 * an exactly determined value, as it depends on many factors (CPU and RAM
4998 * speeds, amount of shared buffers etc.).
4999 */
5001
5002 /* sort the list of rlocators if necessary */
5003 if (use_bsearch)
5005
5006 for (i = 0; i < NBuffers; i++)
5007 {
5008 RelFileLocator *rlocator = NULL;
5010
5011 /*
5012 * As in DropRelationBuffers, an unlocked precheck should be safe and
5013 * saves some cycles.
5014 */
5015
5016 if (!use_bsearch)
5017 {
5018 int j;
5019
5020 for (j = 0; j < n; j++)
5021 {
5023 {
5024 rlocator = &locators[j];
5025 break;
5026 }
5027 }
5028 }
5029 else
5030 {
5031 RelFileLocator locator;
5032
5033 locator = BufTagGetRelFileLocator(&bufHdr->tag);
5034 rlocator = bsearch(&locator,
5035 locators, n, sizeof(RelFileLocator),
5037 }
5038
5039 /* buffer doesn't belong to any of the given relfilelocators; skip it */
5040 if (rlocator == NULL)
5041 continue;
5042
5044 if (BufTagMatchesRelFileLocator(&bufHdr->tag, rlocator))
5045 InvalidateBuffer(bufHdr); /* releases spinlock */
5046 else
5048 }
5049
5050 pfree(locators);
5051 pfree(rels);
5052}
#define RELS_BSEARCH_THRESHOLD
Definition bufmgr.c:87
#define palloc_array(type, count)
Definition fe_memutils.h:91
void DropRelationAllLocalBuffers(RelFileLocator rlocator)
Definition localbuf.c:726
#define qsort(a, b, c, d)
Definition port.h:496

References BlockNumberIsValid(), BUF_DROP_FULL_SCAN_THRESHOLD, BufTagGetRelFileLocator(), BufTagMatchesRelFileLocator(), DropRelationAllLocalBuffers(), fb(), FindAndDropRelationBuffers(), GetBufferDescriptor(), i, InvalidateBuffer(), InvalidBlockNumber, j, LockBufHdr(), MAX_FORKNUM, MyProcNumber, NBuffers, palloc(), palloc_array, pfree(), qsort, RelFileLocatorBackendIsTemp, RELS_BSEARCH_THRESHOLD, rlocator_comparator(), smgrexists(), smgrnblocks_cached(), and UnlockBufHdr().

Referenced by smgrdounlinkall().

◆ EvictAllUnpinnedBuffers()

void EvictAllUnpinnedBuffers ( int32 * buffers_evicted,
int32 * buffers_flushed,
int32 * buffers_skipped
)

Definition at line 7991 of file bufmgr.c.

7993{
7994 *buffers_evicted = 0;
7995 *buffers_skipped = 0;
7996 *buffers_flushed = 0;
7997
7998 for (int buf = 1; buf <= NBuffers; buf++)
7999 {
8000 BufferDesc *desc = GetBufferDescriptor(buf - 1);
8002 bool buffer_flushed;
8003
8005
8007 if (!(buf_state & BM_VALID))
8008 continue;
8009
8012
8013 LockBufHdr(desc);
8014
8016 (*buffers_evicted)++;
8017 else
8018 (*buffers_skipped)++;
8019
8020 if (buffer_flushed)
8021 (*buffers_flushed)++;
8022 }
8023}
static bool EvictUnpinnedBufferInternal(BufferDesc *desc, bool *buffer_flushed)
Definition bufmgr.c:7900
#define CHECK_FOR_INTERRUPTS()
Definition miscadmin.h:125
pg_atomic_uint64 state

References BM_VALID, buf, CHECK_FOR_INTERRUPTS, CurrentResourceOwner, EvictUnpinnedBufferInternal(), fb(), GetBufferDescriptor(), LockBufHdr(), NBuffers, pg_atomic_read_u64(), ReservePrivateRefCountEntry(), ResourceOwnerEnlarge(), and BufferDesc::state.

Referenced by pg_buffercache_evict_all().

◆ EvictRelUnpinnedBuffers()

void EvictRelUnpinnedBuffers ( Relation  rel,
int32 * buffers_evicted,
int32 * buffers_flushed,
int32 * buffers_skipped
)

Definition at line 8041 of file bufmgr.c.

8043{
8045
8046 *buffers_skipped = 0;
8047 *buffers_evicted = 0;
8048 *buffers_flushed = 0;
8049
8050 for (int buf = 1; buf <= NBuffers; buf++)
8051 {
8052 BufferDesc *desc = GetBufferDescriptor(buf - 1);
8054 bool buffer_flushed;
8055
8057
8058 /* An unlocked precheck should be safe and saves some cycles. */
8059 if ((buf_state & BM_VALID) == 0 ||
8061 continue;
8062
8063 /* Make sure we can pin the buffer. */
8066
8067 buf_state = LockBufHdr(desc);
8068
8069 /* recheck, could have changed without the lock */
8070 if ((buf_state & BM_VALID) == 0 ||
8072 {
8073 UnlockBufHdr(desc);
8074 continue;
8075 }
8076
8078 (*buffers_evicted)++;
8079 else
8080 (*buffers_skipped)++;
8081
8082 if (buffer_flushed)
8083 (*buffers_flushed)++;
8084 }
8085}
#define RelationUsesLocalBuffers(relation)
Definition rel.h:648
RelFileLocator rd_locator
Definition rel.h:57

References Assert, BM_VALID, buf, BufTagMatchesRelFileLocator(), CHECK_FOR_INTERRUPTS, CurrentResourceOwner, EvictUnpinnedBufferInternal(), fb(), GetBufferDescriptor(), LockBufHdr(), NBuffers, pg_atomic_read_u64(), RelationData::rd_locator, RelationUsesLocalBuffers, ReservePrivateRefCountEntry(), ResourceOwnerEnlarge(), BufferDesc::state, BufferDesc::tag, and UnlockBufHdr().

Referenced by evict_rel(), and pg_buffercache_evict_relation().

◆ EvictUnpinnedBuffer()

bool EvictUnpinnedBuffer ( Buffer  buf,
bool * buffer_flushed
)

◆ EvictUnpinnedBufferInternal()

static bool EvictUnpinnedBufferInternal ( BufferDesc * desc,
bool * buffer_flushed
)
static

Definition at line 7900 of file bufmgr.c.

7901{
7903 bool result;
7904
7905 *buffer_flushed = false;
7906
7909
7910 if ((buf_state & BM_VALID) == 0)
7911 {
7912 UnlockBufHdr(desc);
7913 return false;
7914 }
7915
7916 /* Check that it's not pinned already. */
7918 {
7919 UnlockBufHdr(desc);
7920 return false;
7921 }
7922
7923 PinBuffer_Locked(desc); /* releases spinlock */
7924
7925 /* If it was dirty, try to clean it once. */
7926 if (buf_state & BM_DIRTY)
7927 {
7929 *buffer_flushed = true;
7930 }
7931
7932 /* This will return false if it becomes dirty or someone else pins it. */
7934
7935 UnpinBuffer(desc);
7936
7937 return result;
7938}
#define BM_LOCKED
static void FlushUnlockedBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object, IOContext io_context)
Definition bufmgr.c:4635
static void PinBuffer_Locked(BufferDesc *buf)
Definition bufmgr.c:3397
static bool InvalidateVictimBuffer(BufferDesc *buf_hdr)
Definition bufmgr.c:2471

References Assert, BM_DIRTY, BM_LOCKED, BM_VALID, BUF_STATE_GET_REFCOUNT, fb(), FlushUnlockedBuffer(), InvalidateVictimBuffer(), IOCONTEXT_NORMAL, IOOBJECT_RELATION, pg_atomic_read_u64(), PinBuffer_Locked(), result, BufferDesc::state, UnlockBufHdr(), and UnpinBuffer().

Referenced by EvictAllUnpinnedBuffers(), EvictRelUnpinnedBuffers(), and EvictUnpinnedBuffer().

◆ ExtendBufferedRel()

Buffer ExtendBufferedRel ( BufferManagerRelation  bmr,
ForkNumber  forkNum,
BufferAccessStrategy  strategy,
uint32  flags 
)

Definition at line 970 of file bufmgr.c.

974{
975 Buffer buf;
976 uint32 extend_by = 1;
977
978 ExtendBufferedRelBy(bmr, forkNum, strategy, flags, extend_by,
979 &buf, &extend_by);
980
981 return buf;
982}
BlockNumber ExtendBufferedRelBy(BufferManagerRelation bmr, ForkNumber fork, BufferAccessStrategy strategy, uint32 flags, uint32 extend_by, Buffer *buffers, uint32 *extended_by)
Definition bufmgr.c:1002

References buf, ExtendBufferedRelBy(), and fb().

Referenced by _bt_allocbuf(), _hash_getnewbuf(), BloomNewBuffer(), brinbuild(), brinbuildempty(), fill_seq_fork_with_data(), ginbuildempty(), GinNewBuffer(), gistbuildempty(), gistNewBuffer(), ReadBuffer_common(), revmap_physical_extend(), and SpGistNewBuffer().

◆ ExtendBufferedRelBy()

BlockNumber ExtendBufferedRelBy ( BufferManagerRelation  bmr,
ForkNumber  fork,
BufferAccessStrategy  strategy,
uint32  flags,
uint32  extend_by,
Buffer * buffers,
uint32 * extended_by
)

Definition at line 1002 of file bufmgr.c.

1009{
1010 Assert((bmr.rel != NULL) != (bmr.smgr != NULL));
1011 Assert(bmr.smgr == NULL || bmr.relpersistence != '\0');
1012 Assert(extend_by > 0);
1013
1014 if (bmr.relpersistence == '\0')
1015 bmr.relpersistence = bmr.rel->rd_rel->relpersistence;
1016
1017 return ExtendBufferedRelCommon(bmr, fork, strategy, flags,
1019 buffers, extended_by);
1020}
static BlockNumber ExtendBufferedRelCommon(BufferManagerRelation bmr, ForkNumber fork, BufferAccessStrategy strategy, uint32 flags, uint32 extend_by, BlockNumber extend_upto, Buffer *buffers, uint32 *extended_by)
Definition bufmgr.c:2751

References Assert, ExtendBufferedRelCommon(), fb(), and InvalidBlockNumber.

Referenced by ExtendBufferedRel(), grow_rel(), and RelationAddBlocks().

◆ ExtendBufferedRelCommon()

static BlockNumber ExtendBufferedRelCommon ( BufferManagerRelation  bmr,
ForkNumber  fork,
BufferAccessStrategy  strategy,
uint32  flags,
uint32  extend_by,
BlockNumber  extend_upto,
Buffer * buffers,
uint32 * extended_by
)
static

Definition at line 2751 of file bufmgr.c.

2759{
2761
2763 BMR_GET_SMGR(bmr)->smgr_rlocator.locator.spcOid,
2764 BMR_GET_SMGR(bmr)->smgr_rlocator.locator.dbOid,
2765 BMR_GET_SMGR(bmr)->smgr_rlocator.locator.relNumber,
2766 BMR_GET_SMGR(bmr)->smgr_rlocator.backend,
2767 extend_by);
2768
2769 if (bmr.relpersistence == RELPERSISTENCE_TEMP)
2772 buffers, &extend_by);
2773 else
2774 first_block = ExtendBufferedRelShared(bmr, fork, strategy, flags,
2776 buffers, &extend_by);
2778
2780 BMR_GET_SMGR(bmr)->smgr_rlocator.locator.spcOid,
2781 BMR_GET_SMGR(bmr)->smgr_rlocator.locator.dbOid,
2782 BMR_GET_SMGR(bmr)->smgr_rlocator.locator.relNumber,
2783 BMR_GET_SMGR(bmr)->smgr_rlocator.backend,
2784 *extended_by,
2785 first_block);
2786
2787 return first_block;
2788}
static BlockNumber ExtendBufferedRelShared(BufferManagerRelation bmr, ForkNumber fork, BufferAccessStrategy strategy, uint32 flags, uint32 extend_by, BlockNumber extend_upto, Buffer *buffers, uint32 *extended_by)
Definition bufmgr.c:2795
#define BMR_GET_SMGR(bmr)
Definition bufmgr.h:118
BlockNumber ExtendBufferedRelLocal(BufferManagerRelation bmr, ForkNumber fork, uint32 flags, uint32 extend_by, BlockNumber extend_upto, Buffer *buffers, uint32 *extended_by)
Definition localbuf.c:355

References BMR_GET_SMGR, ExtendBufferedRelLocal(), ExtendBufferedRelShared(), and fb().

Referenced by ExtendBufferedRelBy(), and ExtendBufferedRelTo().

◆ ExtendBufferedRelShared()

static BlockNumber ExtendBufferedRelShared ( BufferManagerRelation  bmr,
ForkNumber  fork,
BufferAccessStrategy  strategy,
uint32  flags,
uint32  extend_by,
BlockNumber  extend_upto,
Buffer * buffers,
uint32 * extended_by
)
static

Definition at line 2795 of file bufmgr.c.

2803{
2807
2809
2810 /*
2811 * Acquire victim buffers for extension without holding extension lock.
2812 * Writing out victim buffers is the most expensive part of extending the
2813 * relation, particularly when doing so requires WAL flushes. Zeroing out
2814 * the buffers is also quite expensive, so do that before holding the
2815 * extension lock as well.
2816 *
2817 * These pages are pinned by us and not valid. While we hold the pin they
2818 * can't be acquired as victim buffers by another backend.
2819 */
2820 for (uint32 i = 0; i < extend_by; i++)
2821 {
2823
2824 buffers[i] = GetVictimBuffer(strategy, io_context);
2826
2827 /* new buffers are zero-filled */
2828 MemSet(buf_block, 0, BLCKSZ);
2829 }
2830
2831 /*
2832 * Lock relation against concurrent extensions, unless requested not to.
2833 *
2834 * We use the same extension lock for all forks. That's unnecessarily
2835 * restrictive, but currently extensions for forks don't happen often
2836 * enough to make it worth locking more granularly.
2837 *
2838 * Note that another backend might have extended the relation by the time
2839 * we get the lock.
2840 */
2841 if (!(flags & EB_SKIP_EXTENSION_LOCK))
2843
2844 /*
2845 * If requested, invalidate size cache, so that smgrnblocks asks the
2846 * kernel.
2847 */
2848 if (flags & EB_CLEAR_SIZE_CACHE)
2849 BMR_GET_SMGR(bmr)->smgr_cached_nblocks[fork] = InvalidBlockNumber;
2850
2852
2853 /*
2854 * Now that we have the accurate relation size, check if the caller wants
2855 * us to extend to only up to a specific size. If there were concurrent
2856 * extensions, we might have acquired too many buffers and need to release
2857 * them.
2858 */
2860 {
2862
2864 extend_by = 0;
2865 else if ((uint64) first_block + extend_by > extend_upto)
2867
2868 for (uint32 i = extend_by; i < orig_extend_by; i++)
2869 {
2870 BufferDesc *buf_hdr = GetBufferDescriptor(buffers[i] - 1);
2871
2873 }
2874
2875 if (extend_by == 0)
2876 {
2877 if (!(flags & EB_SKIP_EXTENSION_LOCK))
2880 return first_block;
2881 }
2882 }
2883
2884 /* Fail if relation is already at maximum possible length */
2886 ereport(ERROR,
2888 errmsg("cannot extend relation %s beyond %u blocks",
2889 relpath(BMR_GET_SMGR(bmr)->smgr_rlocator, fork).str,
2890 MaxBlockNumber)));
2891
2892 /*
2893 * Insert buffers into buffer table, mark as IO_IN_PROGRESS.
2894 *
2895 * This needs to happen before we extend the relation, because as soon as
2896 * we do, other backends can start to read in those pages.
2897 */
2898 for (uint32 i = 0; i < extend_by; i++)
2899 {
2900 Buffer victim_buf = buffers[i];
2902 BufferTag tag;
2903 uint32 hash;
2905 int existing_id;
2906
2907 /* in case we need to pin an existing buffer below */
2910
2911 InitBufferTag(&tag, &BMR_GET_SMGR(bmr)->smgr_rlocator.locator, fork,
2912 first_block + i);
2913 hash = BufTableHashCode(&tag);
2915
2917
2919
2920 /*
2921 * We get here only in the corner case where we are trying to extend
2922 * the relation but we found a pre-existing buffer. This can happen
2923 * because a prior attempt at extending the relation failed, and
2924 * because mdread doesn't complain about reads beyond EOF (when
2925 * zero_damaged_pages is ON) and so a previous attempt to read a block
2926 * beyond EOF could have left a "valid" zero-filled buffer.
2927 *
2928 * This has also been observed when the relation was overwritten by
2929 * an external process. Since the legitimate cases should always have
2930 * left a zero-filled buffer, complain if not PageIsNew.
2931 */
2932 if (existing_id >= 0)
2933 {
2936 bool valid;
2937
2938 /*
2939 * Pin the existing buffer before releasing the partition lock,
2940 * preventing it from being evicted.
2941 */
2942 valid = PinBuffer(existing_hdr, strategy, false);
2943
2946
2949
2950 if (valid && !PageIsNew((Page) buf_block))
2951 ereport(ERROR,
2952 (errmsg("unexpected data beyond EOF in block %u of relation \"%s\"",
2953 existing_hdr->tag.blockNum,
2954 relpath(BMR_GET_SMGR(bmr)->smgr_rlocator, fork).str)));
2955
2956 /*
2957 * We *must* do smgr[zero]extend before succeeding, else the page
2958 * will not be reserved by the kernel, and the next P_NEW call
2959 * will decide to return the same page. Clear the BM_VALID bit,
2960 * do StartSharedBufferIO() and proceed.
2961 *
2962 * Loop to handle the very small possibility that someone re-sets
2963 * BM_VALID between our clearing it and StartSharedBufferIO
2964 * inspecting it.
2965 */
2966 while (true)
2967 {
2969
2971
2973
2975 break;
2976 }
2977 }
2978 else
2979 {
2981 uint64 set_bits = 0;
2982
2984
2985 /* some sanity checks while we hold the buffer header lock */
2988
2989 victim_buf_hdr->tag = tag;
2990
2992 if (bmr.relpersistence == RELPERSISTENCE_PERMANENT || fork == INIT_FORKNUM)
2994
2996 set_bits, 0,
2997 0);
2998
3000
3001 /* XXX: could combine the locked operations in it with the above */
3003 }
3004 }
3005
3007
3008 /*
3009 * Note: if smgrzeroextend fails, we will end up with buffers that are
3010 * allocated but not marked BM_VALID. The next relation extension will
3011 * still select the same block number (because the relation didn't get any
3012 * longer on disk) and so future attempts to extend the relation will find
3013 * the same buffers (if they have not been recycled) but come right back
3014 * here to try smgrzeroextend again.
3015 *
3016 * We don't need to set checksum for all-zero pages.
3017 */
3019
3020 /*
3021 * Release the file-extension lock; it's now OK for someone else to extend
3022 * the relation some more.
3023 *
3024 * We remove IO_IN_PROGRESS after this, as waking up waiting backends can
3025 * take noticeable time.
3026 */
3027 if (!(flags & EB_SKIP_EXTENSION_LOCK))
3029
3031 io_start, 1, extend_by * BLCKSZ);
3032
3033 /* Set BM_VALID, terminate IO, and wake up any waiters */
3034 for (uint32 i = 0; i < extend_by; i++)
3035 {
3036 Buffer buf = buffers[i];
3038 bool lock = false;
3039
3040 if (flags & EB_LOCK_FIRST && i == 0)
3041 lock = true;
3042 else if (flags & EB_LOCK_TARGET)
3043 {
3045 if (first_block + i + 1 == extend_upto)
3046 lock = true;
3047 }
3048
3049 if (lock)
3051
3052 TerminateBufferIO(buf_hdr, false, BM_VALID, true, false);
3053 }
3054
3056
3058
3059 return first_block;
3060}
#define MaxBlockNumber
Definition block.h:35
#define BufHdrGetBlock(bufHdr)
Definition bufmgr.c:76
StartBufferIOResult StartSharedBufferIO(BufferDesc *buf, bool forInput, bool wait, PgAioWaitRef *io_wref)
Definition bufmgr.c:7250
void LimitAdditionalPins(uint32 *additional_pins)
Definition bufmgr.c:2733
void * Block
Definition bufmgr.h:26
@ EB_LOCK_TARGET
Definition bufmgr.h:93
@ EB_CLEAR_SIZE_CACHE
Definition bufmgr.h:90
@ EB_SKIP_EXTENSION_LOCK
Definition bufmgr.h:75
@ EB_LOCK_FIRST
Definition bufmgr.h:87
static bool PageIsNew(const PageData *page)
Definition bufpage.h:258
#define MemSet(start, val, len)
Definition c.h:1107
void LockRelationForExtension(Relation relation, LOCKMODE lockmode)
Definition lmgr.c:424
void UnlockRelationForExtension(Relation relation, LOCKMODE lockmode)
Definition lmgr.c:474
#define ExclusiveLock
Definition lockdefs.h:42
@ IOOP_EXTEND
Definition pgstat.h:318
static unsigned hash(unsigned *uv, int n)
Definition rege_dfa.c:724
#define relpath(rlocator, forknum)
Definition relpath.h:150
BlockNumber smgrnblocks(SMgrRelation reln, ForkNumber forknum)
Definition smgr.c:819
void smgrzeroextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, int nblocks, bool skipFsync)
Definition smgr.c:649
int64 shared_blks_written
Definition instrument.h:29

References Assert, BM_DIRTY, BM_PERMANENT, BM_TAG_VALID, BM_VALID, BMR_GET_SMGR, buf, BUF_STATE_GET_REFCOUNT, BUF_USAGECOUNT_ONE, BUFFER_IO_ALREADY_DONE, BUFFER_LOCK_EXCLUSIVE, BufferDescriptorGetBuffer(), BufHdrGetBlock, BufMappingPartitionLock(), BufTableHashCode(), BufTableInsert(), CurrentResourceOwner, EB_CLEAR_SIZE_CACHE, EB_LOCK_FIRST, EB_LOCK_TARGET, EB_SKIP_EXTENSION_LOCK, ereport, errcode(), errmsg, ERROR, ExclusiveLock, fb(), GetBufferDescriptor(), GetVictimBuffer(), hash(), i, INIT_FORKNUM, InitBufferTag(), InvalidBlockNumber, IOContextForStrategy(), IOOBJECT_RELATION, IOOP_EXTEND, LimitAdditionalPins(), LockBuffer(), LockBufHdr(), LockRelationForExtension(), LW_EXCLUSIVE, LWLockAcquire(), LWLockRelease(), MaxBlockNumber, MemSet, PageIsNew(), pg_atomic_fetch_and_u64(), pgBufferUsage, pgstat_count_io_op_time(), pgstat_prepare_io_time(), PinBuffer(), relpath, ReservePrivateRefCountEntry(), ResourceOwnerEnlarge(), BufferUsage::shared_blks_written, smgrnblocks(), smgrzeroextend(), StartSharedBufferIO(), str, TerminateBufferIO(), track_io_timing, UnlockBufHdrExt(), UnlockRelationForExtension(), and UnpinBuffer().

Referenced by ExtendBufferedRelCommon().

◆ ExtendBufferedRelTo()

Buffer ExtendBufferedRelTo ( BufferManagerRelation  bmr,
ForkNumber  fork,
BufferAccessStrategy  strategy,
uint32  flags,
BlockNumber  extend_to,
ReadBufferMode  mode 
)

Definition at line 1031 of file bufmgr.c.

1037{
1039 uint32 extended_by = 0;
1040 Buffer buffer = InvalidBuffer;
1041 Buffer buffers[64];
1042
1043 Assert((bmr.rel != NULL) != (bmr.smgr != NULL));
1044 Assert(bmr.smgr == NULL || bmr.relpersistence != '\0');
1046
1047 if (bmr.relpersistence == '\0')
1048 bmr.relpersistence = bmr.rel->rd_rel->relpersistence;
1049
1050 /*
1051 * If desired, create the file if it doesn't exist. If
1052 * smgr_cached_nblocks[fork] is positive then it must exist, no need for
1053 * an smgrexists call.
1054 */
1055 if ((flags & EB_CREATE_FORK_IF_NEEDED) &&
1056 (BMR_GET_SMGR(bmr)->smgr_cached_nblocks[fork] == 0 ||
1057 BMR_GET_SMGR(bmr)->smgr_cached_nblocks[fork] == InvalidBlockNumber) &&
1059 {
1061
1062 /* recheck, fork might have been created concurrently */
1065
1067 }
1068
1069 /*
1070 * If requested, invalidate size cache, so that smgrnblocks asks the
1071 * kernel.
1072 */
1073 if (flags & EB_CLEAR_SIZE_CACHE)
1074 BMR_GET_SMGR(bmr)->smgr_cached_nblocks[fork] = InvalidBlockNumber;
1075
1076 /*
1077 * Estimate how many pages we'll need to extend by. This avoids acquiring
1078 * unnecessarily many victim buffers.
1079 */
1081
1082 /*
1083 * Since no-one else can be looking at the page contents yet, there is no
1084 * difference between an exclusive lock and a cleanup-strength lock. Note
1085 * that we pass the original mode to ReadBuffer_common() below, when
1086 * falling back to reading the buffer due to a concurrent relation extension.
1087 */
1089 flags |= EB_LOCK_TARGET;
1090
1091 while (current_size < extend_to)
1092 {
1093 uint32 num_pages = lengthof(buffers);
1095
1096 if ((uint64) current_size + num_pages > extend_to)
1097 num_pages = extend_to - current_size;
1098
1099 first_block = ExtendBufferedRelCommon(bmr, fork, strategy, flags,
1100 num_pages, extend_to,
1101 buffers, &extended_by);
1102
1104 Assert(num_pages != 0 || current_size >= extend_to);
1105
1106 for (uint32 i = 0; i < extended_by; i++)
1107 {
1108 if (first_block + i != extend_to - 1)
1109 ReleaseBuffer(buffers[i]);
1110 else
1111 buffer = buffers[i];
1112 }
1113 }
1114
1115 /*
1116 * It's possible that another backend concurrently extended the relation.
1117 * In that case read the buffer.
1118 *
1119 * XXX: Should we control this via a flag?
1120 */
1121 if (buffer == InvalidBuffer)
1122 {
1123 Assert(extended_by == 0);
1124 buffer = ReadBuffer_common(bmr.rel, BMR_GET_SMGR(bmr), bmr.relpersistence,
1125 fork, extend_to - 1, mode, strategy);
1126 }
1127
1128 return buffer;
1129}
static Buffer ReadBuffer_common(Relation rel, SMgrRelation smgr, char smgr_persistence, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy)
Definition bufmgr.c:1276
void ReleaseBuffer(Buffer buffer)
Definition bufmgr.c:5595
@ EB_PERFORMING_RECOVERY
Definition bufmgr.h:78
@ EB_CREATE_FORK_IF_NEEDED
Definition bufmgr.h:84
@ RBM_ZERO_AND_CLEANUP_LOCK
Definition bufmgr.h:49
@ RBM_ZERO_AND_LOCK
Definition bufmgr.h:47
#define lengthof(array)
Definition c.h:873
static int64 current_size

References Assert, BMR_GET_SMGR, PrivateRefCountEntry::buffer, current_size, EB_CLEAR_SIZE_CACHE, EB_CREATE_FORK_IF_NEEDED, EB_LOCK_TARGET, EB_PERFORMING_RECOVERY, ExclusiveLock, ExtendBufferedRelCommon(), fb(), i, InvalidBlockNumber, InvalidBuffer, lengthof, LockRelationForExtension(), mode, RBM_ZERO_AND_CLEANUP_LOCK, RBM_ZERO_AND_LOCK, ReadBuffer_common(), ReleaseBuffer(), smgrcreate(), smgrexists(), smgrnblocks(), and UnlockRelationForExtension().

Referenced by fsm_extend(), vm_extend(), and XLogReadBufferExtended().

◆ FindAndDropRelationBuffers()

static void FindAndDropRelationBuffers ( RelFileLocator  rlocator,
ForkNumber  forkNum,
BlockNumber  nForkBlock,
BlockNumber  firstDelBlock 
)
static

Definition at line 5064 of file bufmgr.c.

5067{
5068 BlockNumber curBlock;
5069
5070 for (curBlock = firstDelBlock; curBlock < nForkBlock; curBlock++)
5071 {
5072 uint32 bufHash; /* hash value for tag */
5073 BufferTag bufTag; /* identity of requested block */
5074 LWLock *bufPartitionLock; /* buffer partition lock for it */
5075 int buf_id;
5077
5078 /* create a tag so we can lookup the buffer */
5079 InitBufferTag(&bufTag, &rlocator, forkNum, curBlock);
5080
5081 /* determine its hash code and partition lock ID */
5084
5085 /* Check that it is in the buffer pool. If not, do nothing. */
5087 buf_id = BufTableLookup(&bufTag, bufHash);
5089
5090 if (buf_id < 0)
5091 continue;
5092
5093 bufHdr = GetBufferDescriptor(buf_id);
5094
5095 /*
5096 * We need to lock the buffer header and recheck if the buffer is
5097 * still associated with the same block because the buffer could be
5098 * evicted by some other backend loading blocks for a different
5099 * relation after we release lock on the BufMapping table.
5100 */
5102
5103 if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator) &&
5104 BufTagGetForkNum(&bufHdr->tag) == forkNum &&
5105 bufHdr->tag.blockNum >= firstDelBlock)
5106 InvalidateBuffer(bufHdr); /* releases spinlock */
5107 else
5109 }
5110}

References BufMappingPartitionLock(), BufTableHashCode(), BufTableLookup(), BufTagGetForkNum(), BufTagMatchesRelFileLocator(), fb(), GetBufferDescriptor(), InitBufferTag(), InvalidateBuffer(), LockBufHdr(), LW_SHARED, LWLockAcquire(), LWLockRelease(), and UnlockBufHdr().

Referenced by DropRelationBuffers(), and DropRelationsAllBuffers().

◆ FlushBuffer()

static void FlushBuffer ( BufferDesc * buf,
SMgrRelation  reln,
IOObject  io_object,
IOContext  io_context 
)
static

Definition at line 4512 of file bufmgr.c.

4514{
4516 ErrorContextCallback errcallback;
4519
4522
4523 /*
4524 * Try to start an I/O operation. If StartSharedBufferIO reports
4525 * BUFFER_IO_ALREADY_DONE, then someone else flushed the buffer before we
4526 * could, so we need not do anything.
4527 */
4528 if (StartSharedBufferIO(buf, false, true, NULL) == BUFFER_IO_ALREADY_DONE)
4529 return;
4530
4531 /* Setup error traceback support for ereport() */
4533 errcallback.arg = buf;
4534 errcallback.previous = error_context_stack;
4535 error_context_stack = &errcallback;
4536
4537 /* Find smgr relation for buffer */
4538 if (reln == NULL)
4540
4542 buf->tag.blockNum,
4543 reln->smgr_rlocator.locator.spcOid,
4544 reln->smgr_rlocator.locator.dbOid,
4545 reln->smgr_rlocator.locator.relNumber);
4546
4547 /*
4548 * As we hold at least a share-exclusive lock on the buffer, the LSN
4549 * cannot change during the flush (and thus can't be torn).
4550 */
4552
4553 /*
4554 * Force XLOG flush up to buffer's LSN. This implements the basic WAL
4555 * rule that log updates must hit disk before any of the data-file changes
4556 * they describe do.
4557 *
4558 * However, this rule does not apply to unlogged relations, which will be
4559 * lost after a crash anyway. Most unlogged relation pages do not bear
4560 * LSNs since we never emit WAL records for them, and therefore flushing
4561 * up through the buffer LSN would be useless, but harmless. However,
4562 * some index AMs use LSNs internally to detect concurrent page
4563 * modifications, and therefore unlogged index pages bear "fake" LSNs
4564 * generated by XLogGetFakeLSN. It is unlikely but possible that the fake
4565 * LSN counter could advance past the WAL insertion point; and if it did
4566 * happen, attempting to flush WAL through that location would fail, with
4567 * disastrous system-wide consequences. To make sure that can't happen,
4568 * skip the flush if the buffer isn't permanent.
4569 */
4570 if (pg_atomic_read_u64(&buf->state) & BM_PERMANENT)
4572
4573 /*
4574 * Now it's safe to write the buffer to disk. Note that no one else should
4575 * have been able to write it, while we were busy with log flushing,
4576 * because we got the exclusive right to perform I/O by setting the
4577 * BM_IO_IN_PROGRESS bit.
4578 */
4580
4581 /* Update page checksum if desired. */
4582 PageSetChecksum((Page) bufBlock, buf->tag.blockNum);
4583
4585
4587 BufTagGetForkNum(&buf->tag),
4588 buf->tag.blockNum,
4589 bufBlock,
4590 false);
4591
4592 /*
4593 * When a strategy is in use, only flushes of dirty buffers already in the
4594 * strategy ring are counted as strategy writes (IOCONTEXT
4595 * [BULKREAD|BULKWRITE|VACUUM] IOOP_WRITE) for the purpose of IO
4596 * statistics tracking.
4597 *
4598 * If a shared buffer initially added to the ring must be flushed before
4599 * being used, this is counted as an IOCONTEXT_NORMAL IOOP_WRITE.
4600 *
4601 * If a shared buffer which was added to the ring later because the
4602 * current strategy buffer is pinned or in use or because all strategy
4603 * buffers were dirty and rejected (for BAS_BULKREAD operations only)
4604 * requires flushing, this is counted as an IOCONTEXT_NORMAL IOOP_WRITE
4605 * (from_ring will be false).
4606 *
4607 * When a strategy is not in use, the write can only be a "regular" write
4608 * of a dirty shared buffer (IOCONTEXT_NORMAL IOOP_WRITE).
4609 */
4612
4614
4615 /*
4616 * Mark the buffer as clean and end the BM_IO_IN_PROGRESS state.
4617 */
4618 TerminateBufferIO(buf, true, 0, true, false);
4619
4621 buf->tag.blockNum,
4622 reln->smgr_rlocator.locator.spcOid,
4623 reln->smgr_rlocator.locator.dbOid,
4624 reln->smgr_rlocator.locator.relNumber);
4625
4626 /* Pop the error context stack */
4627 error_context_stack = errcallback.previous;
4628}
#define BufferGetLSN(bufHdr)
Definition bufmgr.c:77
static void shared_buffer_write_error_callback(void *arg)
Definition bufmgr.c:7468
void PageSetChecksum(Page page, BlockNumber blkno)
Definition bufpage.c:1518
ErrorContextCallback * error_context_stack
Definition elog.c:100
@ IOOP_WRITE
Definition pgstat.h:320
static void smgrwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const void *buffer, bool skipFsync)
Definition smgr.h:131
struct ErrorContextCallback * previous
Definition elog.h:299
void(* callback)(void *arg)
Definition elog.h:300
void XLogFlush(XLogRecPtr record)
Definition xlog.c:2801

References ErrorContextCallback::arg, Assert, BM_PERMANENT, buf, BUFFER_IO_ALREADY_DONE, BUFFER_LOCK_EXCLUSIVE, BUFFER_LOCK_SHARE_EXCLUSIVE, BufferGetLSN, BufferLockHeldByMeInMode(), BufHdrGetBlock, BufTagGetForkNum(), BufTagGetRelFileLocator(), ErrorContextCallback::callback, error_context_stack, fb(), INVALID_PROC_NUMBER, IOOP_WRITE, PageSetChecksum(), pg_atomic_read_u64(), pgBufferUsage, pgstat_count_io_op_time(), pgstat_prepare_io_time(), ErrorContextCallback::previous, BufferUsage::shared_blks_written, shared_buffer_write_error_callback(), smgropen(), smgrwrite(), StartSharedBufferIO(), TerminateBufferIO(), track_io_timing, and XLogFlush().

Referenced by FlushOneBuffer(), FlushUnlockedBuffer(), and GetVictimBuffer().

◆ FlushDatabaseBuffers()

void FlushDatabaseBuffers ( Oid  dbid)

Definition at line 5535 of file bufmgr.c.

5536{
5537 int i;
5539
5540 for (i = 0; i < NBuffers; i++)
5541 {
5543
5545
5546 /*
5547 * As in DropRelationBuffers, an unlocked precheck should be safe and
5548 * saves some cycles.
5549 */
5550 if (bufHdr->tag.dbOid != dbid)
5551 continue;
5552
5553 /* Make sure we can handle the pin */
5556
5558 if (bufHdr->tag.dbOid == dbid &&
5560 {
5564 }
5565 else
5567 }
5568}

References BM_DIRTY, BM_VALID, CurrentResourceOwner, fb(), FlushUnlockedBuffer(), GetBufferDescriptor(), i, IOCONTEXT_NORMAL, IOOBJECT_RELATION, LockBufHdr(), NBuffers, PinBuffer_Locked(), ReservePrivateRefCountEntry(), ResourceOwnerEnlarge(), UnlockBufHdr(), and UnpinBuffer().

Referenced by dbase_redo().

◆ FlushOneBuffer()

void FlushOneBuffer ( Buffer  buffer)

Definition at line 5575 of file bufmgr.c.

5576{
5578
5579 /* currently not needed, but no fundamental reason not to support */
5580 Assert(!BufferIsLocal(buffer));
5581
5582 Assert(BufferIsPinned(buffer));
5583
5584 bufHdr = GetBufferDescriptor(buffer - 1);
5585
5586 Assert(BufferIsLockedByMe(buffer));
5587
5589}
static void FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object, IOContext io_context)
Definition bufmgr.c:4512
bool BufferIsLockedByMe(Buffer buffer)
Definition bufmgr.c:3070

References Assert, PrivateRefCountEntry::buffer, BufferIsLocal, BufferIsLockedByMe(), BufferIsPinned, fb(), FlushBuffer(), GetBufferDescriptor(), IOCONTEXT_NORMAL, and IOOBJECT_RELATION.

Referenced by hash_xlog_init_bitmap_page(), hash_xlog_init_meta_page(), invalidate_one_block(), and XLogReadBufferForRedoExtended().

◆ FlushRelationBuffers()

void FlushRelationBuffers ( Relation  rel)

Definition at line 5171 of file bufmgr.c.

5172{
5173 int i;
5175 SMgrRelation srel = RelationGetSmgr(rel);
5176
5177 if (RelationUsesLocalBuffers(rel))
5178 {
5179 for (i = 0; i < NLocBuffer; i++)
5180 {
5182
5184 if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rel->rd_locator) &&
5185 ((buf_state = pg_atomic_read_u64(&bufHdr->state)) &
5186 (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
5187 {
5188 ErrorContextCallback errcallback;
5189
5190 /* Setup error traceback support for ereport() */
5192 errcallback.arg = bufHdr;
5193 errcallback.previous = error_context_stack;
5194 error_context_stack = &errcallback;
5195
5196 /* Make sure we can handle the pin */
5199
5200 /*
5201 * Pin/unpin mostly to make valgrind work, but it also seems
5202 * like the right thing to do.
5203 */
5204 PinLocalBuffer(bufHdr, false);
5205
5206
5207 FlushLocalBuffer(bufHdr, srel);
5208
5210
5211 /* Pop the error context stack */
5212 error_context_stack = errcallback.previous;
5213 }
5214 }
5215
5216 return;
5217 }
5218
5219 for (i = 0; i < NBuffers; i++)
5220 {
5222
5224
5225 /*
5226 * As in DropRelationBuffers, an unlocked precheck should be safe and
5227 * saves some cycles.
5228 */
5230 continue;
5231
5232 /* Make sure we can handle the pin */
5235
5237 if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rel->rd_locator) &&
5239 {
5243 }
5244 else
5246 }
5247}
static void local_buffer_write_error_callback(void *arg)
Definition bufmgr.c:7484
void FlushLocalBuffer(BufferDesc *bufHdr, SMgrRelation reln)
Definition localbuf.c:183
void UnpinLocalBuffer(Buffer buffer)
Definition localbuf.c:865
bool PinLocalBuffer(BufferDesc *buf_hdr, bool adjust_usagecount)
Definition localbuf.c:829
int NLocBuffer
Definition localbuf.c:45
static SMgrRelation RelationGetSmgr(Relation rel)
Definition rel.h:578

References ErrorContextCallback::arg, BM_DIRTY, BM_VALID, BufferDescriptorGetBuffer(), BufTagMatchesRelFileLocator(), ErrorContextCallback::callback, CurrentResourceOwner, error_context_stack, fb(), FlushLocalBuffer(), FlushUnlockedBuffer(), GetBufferDescriptor(), GetLocalBufferDescriptor(), i, IOCONTEXT_NORMAL, IOOBJECT_RELATION, local_buffer_write_error_callback(), LockBufHdr(), NBuffers, NLocBuffer, pg_atomic_read_u64(), PinBuffer_Locked(), PinLocalBuffer(), ErrorContextCallback::previous, RelationData::rd_locator, RelationGetSmgr(), RelationUsesLocalBuffers, ReservePrivateRefCountEntry(), ResourceOwnerEnlarge(), UnlockBufHdr(), UnpinBuffer(), and UnpinLocalBuffer().

Referenced by fill_seq_with_data(), heapam_relation_copy_data(), and index_copy_data().

◆ FlushRelationsAllBuffers()

void FlushRelationsAllBuffers ( SMgrRelation smgrs,
int  nrels 
)

Definition at line 5259 of file bufmgr.c.

5260{
5261 int i;
5263 bool use_bsearch;
5264
5265 if (nrels == 0)
5266 return;
5267
5268 /* fill-in array for qsort */
5270
5271 for (i = 0; i < nrels; i++)
5272 {
5273 Assert(!RelFileLocatorBackendIsTemp(smgrs[i]->smgr_rlocator));
5274
5275 srels[i].rlocator = smgrs[i]->smgr_rlocator.locator;
5276 srels[i].srel = smgrs[i];
5277 }
5278
5279 /*
5280 * Save the bsearch overhead for low number of relations to sync. See
5281 * DropRelationsAllBuffers for details.
5282 */
5284
5285 /* sort the list of SMgrRelations if necessary */
5286 if (use_bsearch)
5287 qsort(srels, nrels, sizeof(SMgrSortArray), rlocator_comparator);
5288
5289 for (i = 0; i < NBuffers; i++)
5290 {
5294
5295 /*
5296 * As in DropRelationBuffers, an unlocked precheck should be safe and
5297 * saves some cycles.
5298 */
5299
5300 if (!use_bsearch)
5301 {
5302 int j;
5303
5304 for (j = 0; j < nrels; j++)
5305 {
5306 if (BufTagMatchesRelFileLocator(&bufHdr->tag, &srels[j].rlocator))
5307 {
5308 srelent = &srels[j];
5309 break;
5310 }
5311 }
5312 }
5313 else
5314 {
5315 RelFileLocator rlocator;
5316
5317 rlocator = BufTagGetRelFileLocator(&bufHdr->tag);
5318 srelent = bsearch(&rlocator,
5319 srels, nrels, sizeof(SMgrSortArray),
5321 }
5322
5323 /* buffer doesn't belong to any of the given relfilelocators; skip it */
5324 if (srelent == NULL)
5325 continue;
5326
5327 /* Make sure we can handle the pin */
5330
5332 if (BufTagMatchesRelFileLocator(&bufHdr->tag, &srelent->rlocator) &&
5334 {
5338 }
5339 else
5341 }
5342
5343 pfree(srels);
5344}

References Assert, BM_DIRTY, BM_VALID, BufTagGetRelFileLocator(), BufTagMatchesRelFileLocator(), CurrentResourceOwner, fb(), FlushUnlockedBuffer(), GetBufferDescriptor(), i, IOCONTEXT_NORMAL, IOOBJECT_RELATION, j, LockBufHdr(), NBuffers, palloc_array, pfree(), PinBuffer_Locked(), qsort, RelFileLocatorBackendIsTemp, RELS_BSEARCH_THRESHOLD, ReservePrivateRefCountEntry(), ResourceOwnerEnlarge(), rlocator_comparator(), UnlockBufHdr(), and UnpinBuffer().

Referenced by smgrdosyncall().

◆ FlushUnlockedBuffer()

static void FlushUnlockedBuffer ( BufferDesc buf,
SMgrRelation  reln,
IOObject  io_object,
IOContext  io_context 
)
static

Definition at line 4635 of file bufmgr.c.

4637{
4639
4642 BufferLockUnlock(buffer, buf);
4643}
static void BufferLockAcquire(Buffer buffer, BufferDesc *buf_hdr, BufferLockMode mode)
Definition bufmgr.c:5907
static void BufferLockUnlock(Buffer buffer, BufferDesc *buf_hdr)
Definition bufmgr.c:6023

References buf, PrivateRefCountEntry::buffer, BUFFER_LOCK_SHARE_EXCLUSIVE, BufferDescriptorGetBuffer(), BufferLockAcquire(), BufferLockUnlock(), fb(), and FlushBuffer().

Referenced by EvictUnpinnedBufferInternal(), FlushDatabaseBuffers(), FlushRelationBuffers(), FlushRelationsAllBuffers(), and SyncOneBuffer().

◆ ForgetPrivateRefCountEntry()

static void ForgetPrivateRefCountEntry ( PrivateRefCountEntry ref)
static

Definition at line 565 of file bufmgr.c.

566{
567 Assert(ref->data.refcount == 0);
568 Assert(ref->data.lockmode == BUFFER_LOCK_UNLOCK);
569
570 if (ref >= &PrivateRefCountArray[0] &&
572 {
573 ref->buffer = InvalidBuffer;
575
576
577 /*
578 * Mark the just used entry as reserved - in many scenarios that
579 * allows us to avoid ever having to search the array/hash for free
580 * entries.
581 */
583 }
584 else
585 {
589 }
590}
static int ReservedRefCountSlot
Definition bufmgr.c:268

References Assert, BUFFER_LOCK_UNLOCK, fb(), InvalidBuffer, PrivateRefCountArray, PrivateRefCountArrayKeys, PrivateRefCountHash, PrivateRefCountOverflowed, REFCOUNT_ARRAY_ENTRIES, and ReservedRefCountSlot.

Referenced by UnlockReleaseBuffer(), and UnpinBufferNoOwner().

◆ GetAdditionalPinLimit()

uint32 GetAdditionalPinLimit ( void  )

Definition at line 2707 of file bufmgr.c.

2708{
2710
2711 /*
2712 * We get the number of "overflowed" pins for free, but don't know the
2713 * number of pins in PrivateRefCountArray. The cost of calculating that
2714 * exactly doesn't seem worth it, so just assume the max.
2715 */
2717
2718 /* Is this backend already holding more than its fair share? */
2720 return 0;
2721
2723}
static uint32 MaxProportionalPins
Definition bufmgr.c:271

References fb(), MaxProportionalPins, PrivateRefCountOverflowed, and REFCOUNT_ARRAY_ENTRIES.

Referenced by LimitAdditionalPins(), and read_stream_start_pending_read().

◆ GetPinLimit()

uint32 GetPinLimit ( void  )

Definition at line 2695 of file bufmgr.c.

2696{
2697 return MaxProportionalPins;
2698}

References MaxProportionalPins.

Referenced by GetAccessStrategy(), and read_stream_begin_impl().

◆ GetPrivateRefCount()

static int32 GetPrivateRefCount ( Buffer  buffer)
inlinestatic

Definition at line 542 of file bufmgr.c.

543{
545
546 Assert(BufferIsValid(buffer));
547 Assert(!BufferIsLocal(buffer));
548
549 /*
550 * Not moving the entry - that's ok for the current users, but we might
551 * want to change this one day.
552 */
553 ref = GetPrivateRefCountEntry(buffer, false);
554
555 if (ref == NULL)
556 return 0;
557 return ref->data.refcount;
558}

References Assert, PrivateRefCountEntry::buffer, BufferIsLocal, BufferIsValid(), fb(), and GetPrivateRefCountEntry().

Referenced by CheckBufferIsPinnedOnce(), ConditionalLockBufferForCleanup(), DebugPrintBufferRefcount(), HoldingBufferPinThatDelaysRecovery(), InvalidateBuffer(), InvalidateVictimBuffer(), IsBufferCleanupOK(), and MarkSharedBufferDirtyHint().

◆ GetPrivateRefCountEntry()

static PrivateRefCountEntry * GetPrivateRefCountEntry ( Buffer  buffer,
bool  do_move 
)
inlinestatic

Definition at line 507 of file bufmgr.c.

508{
509 Assert(BufferIsValid(buffer));
510 Assert(!BufferIsLocal(buffer));
511
512 /*
513 * It's very common to look up the same buffer repeatedly. To make that
514 * fast, we have a one-entry cache.
515 *
516 * In contrast to the loop in GetPrivateRefCountEntrySlow(), here it is
517 * faster to check PrivateRefCountArray[].buffer, as in the case of a hit
518 * fewer addresses are computed and fewer cachelines are accessed. Whereas
519 * in GetPrivateRefCountEntrySlow()'s case, checking
520 * PrivateRefCountArrayKeys saves a lot of memory accesses.
521 */
522 if (likely(PrivateRefCountEntryLast != -1) &&
524 {
526 }
527
528 /*
529 * The code for the cached lookup is small enough to be worth inlining
530 * into the caller. In the miss case however, that empirically doesn't
531 * seem worth it.
532 */
533 return GetPrivateRefCountEntrySlow(buffer, do_move);
534}
static pg_noinline PrivateRefCountEntry * GetPrivateRefCountEntrySlow(Buffer buffer, bool do_move)
Definition bufmgr.c:419
static int PrivateRefCountEntryLast
Definition bufmgr.c:269

References Assert, PrivateRefCountEntry::buffer, BufferIsLocal, BufferIsValid(), fb(), GetPrivateRefCountEntrySlow(), likely, PrivateRefCountArray, and PrivateRefCountEntryLast.

Referenced by BufferLockAcquire(), BufferLockConditional(), BufferLockDisownInternal(), BufferLockHeldByMe(), BufferLockHeldByMeInMode(), GetPrivateRefCount(), IncrBufferRefCount(), PinBuffer(), PinBuffer_Locked(), ResOwnerReleaseBuffer(), SharedBufferBeginSetHintBits(), UnlockReleaseBuffer(), and UnpinBufferNoOwner().

◆ GetPrivateRefCountEntrySlow()

static pg_noinline PrivateRefCountEntry * GetPrivateRefCountEntrySlow ( Buffer  buffer,
bool  do_move 
)
static

Definition at line 419 of file bufmgr.c.

420{
422 int match = -1;
423 int i;
424
425 /*
426 * First search for references in the array, that'll be sufficient in the
427 * majority of cases.
428 */
429 for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
430 {
431 if (PrivateRefCountArrayKeys[i] == buffer)
432 {
433 match = i;
434 /* see ReservePrivateRefCountEntry() for why we don't return */
435 }
436 }
437
438 if (likely(match != -1))
439 {
440 /* update cache for the next lookup */
442
443 return &PrivateRefCountArray[match];
444 }
445
446 /*
447 * By here we know that the buffer, if already pinned, isn't residing in
448 * the array.
449 *
450 * Only look up the buffer in the hashtable if we've previously overflowed
451 * into it.
452 */
454 return NULL;
455
457
458 if (res == NULL)
459 return NULL;
460 else if (!do_move)
461 {
462 /* caller doesn't want us to move the hash entry into the array */
463 return res;
464 }
465 else
466 {
467 /* move buffer from hashtable into the free array slot */
470
471 /* Save data and delete from hashtable while res is still valid */
472 data = res->data;
476
477 /* Ensure there's a free array slot */
479
480 /* Use up the reserved slot */
484 Assert(free->buffer == InvalidBuffer);
485
486 /* and fill it */
487 free->buffer = buffer;
488 free->data = data;
490 /* update cache for the next lookup */
492
494
495 return free;
496 }
497}
const void * data
#define free(a)

References Assert, PrivateRefCountEntry::buffer, PrivateRefCountEntry::data, data, fb(), free, i, InvalidBuffer, likely, PrivateRefCountArray, PrivateRefCountArrayKeys, PrivateRefCountEntryLast, PrivateRefCountHash, PrivateRefCountOverflowed, REFCOUNT_ARRAY_ENTRIES, ReservedRefCountSlot, and ReservePrivateRefCountEntry().

Referenced by GetPrivateRefCountEntry().

◆ GetVictimBuffer()

static Buffer GetVictimBuffer ( BufferAccessStrategy  strategy,
IOContext  io_context 
)
static

Definition at line 2548 of file bufmgr.c.

2549{
2551 Buffer buf;
2553 bool from_ring;
2554
2555 /*
2556 * Ensure, before we pin a victim buffer, that there's a free refcount
2557 * entry and resource owner slot for the pin.
2558 */
2561
2562 /* we return here if a prospective victim buffer gets used concurrently */
2563again:
2564
2565 /*
2566 * Select a victim buffer. The buffer is returned pinned and owned by
2567 * this backend.
2568 */
2571
2572 /*
2573 * We shouldn't have any other pins for this buffer.
2574 */
2576
2577 /*
2578 * If the buffer was dirty, try to write it out. There is a race
2579 * condition here, another backend could dirty the buffer between
2580 * StrategyGetBuffer() checking that it is not in use and invalidating the
2581 * buffer below. That's addressed by InvalidateVictimBuffer() verifying
2582 * that the buffer is not dirty.
2583 */
2584 if (buf_state & BM_DIRTY)
2585 {
2588
2589 /*
2590 * We need a share-exclusive lock on the buffer contents to write it
2591 * out (else we might write invalid data, eg because someone else is
2592 * compacting the page contents while we write). We must use a
2593 * conditional lock acquisition here to avoid deadlock. Even though
2594 * the buffer was not pinned (and therefore surely not locked) when
2595 * StrategyGetBuffer returned it, someone else could have pinned and
2596 * (share-)exclusive-locked it by the time we get here. If we try to
2597 * get the lock unconditionally, we'd block waiting for them; if they
2598 * later block waiting for us, deadlock ensues. (This has been
2599 * observed to happen when two backends are both trying to split btree
2600 * index pages, and the second one just happens to be trying to split
2601 * the page the first one got from StrategyGetBuffer.)
2602 */
2604 {
2605 /*
2606 * Someone else has locked the buffer, so give it up and loop back
2607 * to get another one.
2608 */
2610 goto again;
2611 }
2612
2613 /*
2614 * If using a nondefault strategy, and this victim came from the
2615 * strategy ring, let the strategy decide whether to reject it when
2616 * reusing it would require a WAL flush. This only applies to
2617 * permanent buffers; unlogged buffers can have fake LSNs, so
2618 * XLogNeedsFlush() is not meaningful for them.
2619 *
2620 * We need to hold the content lock in at least share-exclusive mode
2621 * to safely inspect the page LSN, so this couldn't have been done
2622 * inside StrategyGetBuffer().
2623 */
2624 if (strategy && from_ring &&
2628 {
2630 goto again;
2631 }
2632
2633 /* OK, do the I/O */
2636
2638 &buf_hdr->tag);
2639 }
2640
2641
2642 if (buf_state & BM_VALID)
2643 {
2644 /*
2645 * When a BufferAccessStrategy is in use, blocks evicted from shared
2646 * buffers are counted as IOOP_EVICT in the corresponding context
2647 * (e.g. IOCONTEXT_BULKWRITE). Shared buffers are evicted by a
2648 * strategy in two cases: 1) while initially claiming buffers for the
2649 * strategy ring 2) to replace an existing strategy ring buffer
2650 * because it is pinned or in use and cannot be reused.
2651 *
2652 * Blocks evicted from buffers already in the strategy ring are
2653 * counted as IOOP_REUSE in the corresponding strategy context.
2654 *
2655 * At this point, we can accurately count evictions and reuses,
2656 * because we have successfully claimed the valid buffer. Previously,
2657 * we may have been forced to release the buffer due to concurrent
2658 * pinners or erroring out.
2659 */
2661 from_ring ? IOOP_REUSE : IOOP_EVICT, 1, 0);
2662 }
2663
2664 /*
2665 * If the buffer has an entry in the buffer mapping table, delete it. This
2666 * can fail because another backend could have pinned or dirtied the
2667 * buffer.
2668 */
2670 {
2672 goto again;
2673 }
2674
2675 /* a final set of sanity checks */
2676#ifdef USE_ASSERT_CHECKING
2678
2681
2683#endif
2684
2685 return buf;
2686}
WritebackContext BackendWritebackContext
Definition buf_init.c:27
void CheckBufferIsPinnedOnce(Buffer buffer)
Definition bufmgr.c:6646
void UnlockReleaseBuffer(Buffer buffer)
Definition bufmgr.c:5612
void ScheduleBufferTagForWriteback(WritebackContext *wb_context, IOContext io_context, BufferTag *tag)
Definition bufmgr.c:7699
BufferDesc * StrategyGetBuffer(BufferAccessStrategy strategy, uint64 *buf_state, bool *from_ring)
Definition freelist.c:184
bool StrategyRejectBuffer(BufferAccessStrategy strategy, BufferDesc *buf, bool from_ring)
Definition freelist.c:752
@ IOOP_EVICT
Definition pgstat.h:311
@ IOOP_REUSE
Definition pgstat.h:314
void pgstat_count_io_op(IOObject io_object, IOContext io_context, IOOp io_op, uint32 cnt, uint64 bytes)
Definition pgstat_io.c:68
bool XLogNeedsFlush(XLogRecPtr record)
Definition xlog.c:3159

References Assert, BackendWritebackContext, BM_DIRTY, BM_PERMANENT, BM_TAG_VALID, BM_VALID, buf, BUF_STATE_GET_REFCOUNT, BUFFER_LOCK_SHARE_EXCLUSIVE, BUFFER_LOCK_UNLOCK, BufferDescriptorGetBuffer(), BufferGetLSN, BufferLockConditional(), CheckBufferIsPinnedOnce(), CurrentResourceOwner, fb(), FlushBuffer(), InvalidateVictimBuffer(), IOOBJECT_RELATION, IOOP_EVICT, IOOP_REUSE, LockBuffer(), pg_atomic_read_u64(), pgstat_count_io_op(), ReservePrivateRefCountEntry(), ResourceOwnerEnlarge(), ScheduleBufferTagForWriteback(), StrategyGetBuffer(), StrategyRejectBuffer(), UnlockReleaseBuffer(), UnpinBuffer(), and XLogNeedsFlush().

Referenced by BufferAlloc(), and ExtendBufferedRelShared().

◆ HoldingBufferPinThatDelaysRecovery()

bool HoldingBufferPinThatDelaysRecovery ( void  )

Definition at line 6826 of file bufmgr.c.

6827{
6829
6830 /*
6831 * If we get woken slowly then it's possible that the Startup process was
6832 * already woken by other backends before we got here. Also possible that
6833 * we get here by multiple interrupts or interrupts at inappropriate
6834 * times, so make sure we do nothing if the bufid is not set.
6835 */
6836 if (bufid < 0)
6837 return false;
6838
6839 if (GetPrivateRefCount(bufid + 1) > 0)
6840 return true;
6841
6842 return false;
6843}
int GetStartupBufferPinWaitBufId(void)
Definition proc.c:771

References fb(), GetPrivateRefCount(), and GetStartupBufferPinWaitBufId().

Referenced by CheckRecoveryConflictDeadlock(), and ProcessRecoveryConflictInterrupt().

◆ IncrBufferRefCount()

void IncrBufferRefCount ( Buffer  buffer)

Definition at line 5679 of file bufmgr.c.

5680{
5681 Assert(BufferIsPinned(buffer));
5683 if (BufferIsLocal(buffer))
5684 LocalRefCount[-buffer - 1]++;
5685 else
5686 {
5688
5689 ref = GetPrivateRefCountEntry(buffer, true);
5690 Assert(ref != NULL);
5691 ref->data.refcount++;
5692 }
5694}
static void ResourceOwnerRememberBuffer(ResourceOwner owner, Buffer buffer)

References Assert, PrivateRefCountEntry::buffer, BufferIsLocal, BufferIsPinned, CurrentResourceOwner, fb(), GetPrivateRefCountEntry(), LocalRefCount, ResourceOwnerEnlarge(), and ResourceOwnerRememberBuffer().

Referenced by _bt_steppage(), btrestrpos(), entryLoadMoreItems(), ReadBufferBI(), RelationAddBlocks(), scanPostingTree(), startScanEntry(), and tts_buffer_heap_store_tuple().

◆ InitBufferManagerAccess()

void InitBufferManagerAccess ( void  )

Definition at line 4225 of file bufmgr.c.

4226{
4227 /*
4228 * An advisory limit on the number of pins each backend should hold, based
4229 * on shared_buffers and the maximum number of connections possible.
4230 * That's very pessimistic, but outside toy-sized shared_buffers it should
4231 * allow plenty of pins. LimitAdditionalPins() and
4232 * GetAdditionalPinLimit() can be used to check the remaining balance.
4233 */
4235
4238
4240
4241 /*
4242 * AtProcExit_Buffers needs LWLock access, and thereby has to be called at
4243 * the corresponding phase of backend shutdown.
4244 */
4245 Assert(MyProc != NULL);
4247}
static void AtProcExit_Buffers(int code, Datum arg)
Definition bufmgr.c:4254
int MaxBackends
Definition globals.c:149
void on_shmem_exit(pg_on_exit_callback function, Datum arg)
Definition ipc.c:372
MemoryContext CurrentMemoryContext
Definition mcxt.c:161
#define NUM_AUXILIARY_PROCS
Definition proc.h:527

References Assert, AtProcExit_Buffers(), CurrentMemoryContext, fb(), MaxBackends, MaxProportionalPins, MyProc, NBuffers, NUM_AUXILIARY_PROCS, on_shmem_exit(), PrivateRefCountArray, PrivateRefCountArrayKeys, and PrivateRefCountHash.

Referenced by BaseInit().

◆ InvalidateBuffer()

static void InvalidateBuffer ( BufferDesc buf)
static

Definition at line 2370 of file bufmgr.c.

2371{
2373 uint32 oldHash; /* hash value for oldTag */
2374 LWLock *oldPartitionLock; /* buffer partition lock for it */
2377
2378 /* Save the original buffer tag before dropping the spinlock */
2379 oldTag = buf->tag;
2380
2382
2383 /*
2384 * Need to compute the old tag's hashcode and partition lock ID. XXX is it
2385 * worth storing the hashcode in BufferDesc so we need not recompute it
2386 * here? Probably not.
2387 */
2390
2391retry:
2392
2393 /*
2394 * Acquire exclusive mapping lock in preparation for changing the buffer's
2395 * association.
2396 */
2398
2399 /* Re-lock the buffer header */
2401
2402 /* If it's changed while we were waiting for lock, do nothing */
2403 if (!BufferTagsEqual(&buf->tag, &oldTag))
2404 {
2407 return;
2408 }
2409
2410 /*
2411 * We assume the reason for it to be pinned is that either we were
2412 * asynchronously reading the page in before erroring out or someone else
2413 * is flushing the page out. Wait for the IO to finish. (This could be
2414 * an infinite loop if the refcount is messed up... it would be nice to
2415 * time out after a while, but there seems no way to be sure how many loops
2416 * may be needed. Note that if the other guy has pinned the buffer but
2417 * not yet done StartBufferIO, WaitIO will fall through and we'll
2418 * effectively be busy-looping here.)
2419 */
2421 {
2424 /* safety check: should definitely not be our *own* pin */
2426 elog(ERROR, "buffer is pinned in InvalidateBuffer");
2427 WaitIO(buf);
2428 goto retry;
2429 }
2430
2431 /*
2432 * An invalidated buffer should not have any backends waiting to lock the
2433 * buffer, therefore BM_LOCK_WAKE_IN_PROGRESS should not be set.
2434 */
2436
2437 /*
2438 * Clear out the buffer's tag and flags. We must do this to ensure that
2439 * linear scans of the buffer array don't think the buffer is valid.
2440 */
2442 ClearBufferTag(&buf->tag);
2443
2445 0,
2447 0);
2448
2449 /*
2450 * Remove the buffer from the lookup hashtable, if it was in there.
2451 */
2452 if (oldFlags & BM_TAG_VALID)
2454
2455 /*
2456 * Done with mapping lock.
2457 */
2459}
#define BUF_USAGECOUNT_MASK
static bool BufferTagsEqual(const BufferTag *tag1, const BufferTag *tag2)
static void ClearBufferTag(BufferTag *tag)
void BufTableDelete(BufferTag *tagPtr, uint32 hashcode)
Definition buf_table.c:154
static void WaitIO(BufferDesc *buf)
Definition bufmgr.c:7148

References Assert, BM_LOCK_WAKE_IN_PROGRESS, BM_TAG_VALID, buf, BUF_FLAG_MASK, BUF_STATE_GET_REFCOUNT, BUF_USAGECOUNT_MASK, BufferDescriptorGetBuffer(), BufferTagsEqual(), BufMappingPartitionLock(), BufTableDelete(), BufTableHashCode(), ClearBufferTag(), elog, ERROR, fb(), GetPrivateRefCount(), LockBufHdr(), LW_EXCLUSIVE, LWLockAcquire(), LWLockRelease(), UnlockBufHdr(), UnlockBufHdrExt(), and WaitIO().

Referenced by DropDatabaseBuffers(), DropRelationBuffers(), DropRelationsAllBuffers(), and FindAndDropRelationBuffers().

◆ InvalidateVictimBuffer()

static bool InvalidateVictimBuffer ( BufferDesc buf_hdr)
static

Definition at line 2471 of file bufmgr.c.

2472{
2474 uint32 hash;
2476 BufferTag tag;
2477
2479
2480 /* have buffer pinned, so it's safe to read tag without lock */
2481 tag = buf_hdr->tag;
2482
2483 hash = BufTableHashCode(&tag);
2485
2487
2488 /* lock the buffer header */
2490
2491 /*
2492 * We have the buffer pinned, so nobody else should have been able to unset
2493 * this concurrently.
2494 */
2497 Assert(BufferTagsEqual(&buf_hdr->tag, &tag));
2498
2499 /*
2500 * If somebody else pinned the buffer since, or even worse, dirtied it,
2501 * give up on this buffer: It's clearly in use.
2502 */
2504 {
2506
2509
2510 return false;
2511 }
2512
2513 /*
2514 * An invalidated buffer should not have any backends waiting to lock the
2515 * buffer, therefore BM_LOCK_WAKE_IN_PROGRESS should not be set.
2516 */
2518
2519 /*
2520 * Clear out the buffer's tag and flags and usagecount. This is not
2521 * strictly required, as BM_TAG_VALID/BM_VALID needs to be checked before
2522 * doing anything with the buffer. But currently it's beneficial, as the
2523 * cheaper pre-check for several linear scans of shared buffers use the
2524 * tag (see e.g. FlushDatabaseBuffers()).
2525 */
2526 ClearBufferTag(&buf_hdr->tag);
2528 0,
2530 0);
2531
2533
2534 /* finally delete buffer from the buffer mapping table */
2535 BufTableDelete(&tag, hash);
2536
2538
2543
2544 return true;
2545}

References Assert, BM_DIRTY, BM_LOCK_WAKE_IN_PROGRESS, BM_TAG_VALID, BM_VALID, BUF_FLAG_MASK, BUF_STATE_GET_REFCOUNT, BUF_USAGECOUNT_MASK, BufferDescriptorGetBuffer(), BufferTagsEqual(), BufMappingPartitionLock(), BufTableDelete(), BufTableHashCode(), ClearBufferTag(), fb(), GetPrivateRefCount(), hash(), LockBufHdr(), LW_EXCLUSIVE, LWLockAcquire(), LWLockRelease(), pg_atomic_read_u64(), UnlockBufHdr(), and UnlockBufHdrExt().

Referenced by EvictUnpinnedBufferInternal(), and GetVictimBuffer().

◆ IsBufferCleanupOK()

bool IsBufferCleanupOK ( Buffer  buffer)

Definition at line 6910 of file bufmgr.c.

6911{
6914
6915 Assert(BufferIsValid(buffer));
6916
6917 /* see AIO related comment in LockBufferForCleanup() */
6918
6919 if (BufferIsLocal(buffer))
6920 {
6921 /* There should be exactly one pin */
6922 if (LocalRefCount[-buffer - 1] != 1)
6923 return false;
6924 /* Nobody else to wait for */
6925 return true;
6926 }
6927
6928 /* There should be exactly one local pin */
6929 if (GetPrivateRefCount(buffer) != 1)
6930 return false;
6931
6932 bufHdr = GetBufferDescriptor(buffer - 1);
6933
6934 /* caller must hold exclusive lock on buffer */
6936
6938
6941 {
6942 /* pincount is OK. */
6944 return true;
6945 }
6946
6948 return false;
6949}

References Assert, BUF_STATE_GET_REFCOUNT, PrivateRefCountEntry::buffer, BUFFER_LOCK_EXCLUSIVE, BufferIsLocal, BufferIsLockedByMeInMode(), BufferIsValid(), fb(), GetBufferDescriptor(), GetPrivateRefCount(), LocalRefCount, LockBufHdr(), and UnlockBufHdr().

Referenced by _hash_doinsert(), _hash_expandtable(), _hash_splitbucket(), and hashbucketcleanup().

◆ IssuePendingWritebacks()

void IssuePendingWritebacks ( WritebackContext wb_context,
IOContext  io_context 
)

Definition at line 7749 of file bufmgr.c.

7750{
7752 int i;
7753
7754 if (wb_context->nr_pending == 0)
7755 return;
7756
7757 /*
7758 * Executing the writes in-order can make them a lot faster, and allows
7759 * merging writeback requests to consecutive blocks into larger writebacks.
7760 */
7761 sort_pending_writebacks(wb_context->pending_writebacks,
7762 wb_context->nr_pending);
7763
7765
7766 /*
7767 * Coalesce neighbouring writes, but nothing else. For that we iterate
7768 * through the, now sorted, array of pending flushes, and look forward to
7769 * find all neighbouring (or identical) writes.
7770 */
7771 for (i = 0; i < wb_context->nr_pending; i++)
7772 {
7776 int ahead;
7777 BufferTag tag;
7779 Size nblocks = 1;
7780
7781 cur = &wb_context->pending_writebacks[i];
7782 tag = cur->tag;
7784
7785 /*
7786 * Peek ahead, into following writeback requests, to see if they can
7787 * be combined with the current one.
7788 */
7789 for (ahead = 0; i + ahead + 1 < wb_context->nr_pending; ahead++)
7790 {
7791
7792 next = &wb_context->pending_writebacks[i + ahead + 1];
7793
7794 /* different file, stop */
7796 BufTagGetRelFileLocator(&next->tag)) ||
7797 BufTagGetForkNum(&cur->tag) != BufTagGetForkNum(&next->tag))
7798 break;
7799
7800 /* ok, block queued twice, skip */
7801 if (cur->tag.blockNum == next->tag.blockNum)
7802 continue;
7803
7804 /* only merge consecutive writes */
7805 if (cur->tag.blockNum + 1 != next->tag.blockNum)
7806 break;
7807
7808 nblocks++;
7809 cur = next;
7810 }
7811
7812 i += ahead;
7813
7814 /* and finally tell the kernel to write the data to storage */
7816 smgrwriteback(reln, BufTagGetForkNum(&tag), tag.blockNum, nblocks);
7817 }
7818
7819 /*
7820 * Assume that writeback requests are only issued for buffers containing
7821 * blocks of permanent relations.
7822 */
7824 IOOP_WRITEBACK, io_start, wb_context->nr_pending, 0);
7825
7826 wb_context->nr_pending = 0;
7827}
static int32 next
Definition blutils.c:225
struct cursor * cur
Definition ecpg.c:29
@ IOOP_WRITEBACK
Definition pgstat.h:315
#define RelFileLocatorEquals(locator1, locator2)
void smgrwriteback(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, BlockNumber nblocks)
Definition smgr.c:805

References buftag::blockNum, BufTagGetForkNum(), BufTagGetRelFileLocator(), cur, fb(), i, INVALID_PROC_NUMBER, IOOBJECT_RELATION, IOOP_WRITEBACK, next, pgstat_count_io_op_time(), pgstat_prepare_io_time(), RelFileLocatorEquals, smgropen(), smgrwriteback(), and track_io_timing.

Referenced by BufferSync(), and ScheduleBufferTagForWriteback().

◆ LimitAdditionalPins()

void LimitAdditionalPins ( uint32 additional_pins)

Definition at line 2733 of file bufmgr.c.

2734{
2735 uint32 limit;
2736
2737 if (*additional_pins <= 1)
2738 return;
2739
2740 limit = GetAdditionalPinLimit();
2741 limit = Max(limit, 1);
2742 if (limit < *additional_pins)
2743 *additional_pins = limit;
2744}
uint32 GetAdditionalPinLimit(void)
Definition bufmgr.c:2707
#define Max(x, y)
Definition c.h:1085

References fb(), GetAdditionalPinLimit(), and Max.

Referenced by ExtendBufferedRelShared().

◆ local_buffer_readv_complete()

static PgAioResult local_buffer_readv_complete ( PgAioHandle * ioh,
PgAioResult  prior_result,
uint8  cb_data 
)
static

Definition at line 8940 of file bufmgr.c.

8942{
8944}
static pg_attribute_always_inline PgAioResult buffer_readv_complete(PgAioHandle *ioh, PgAioResult prior_result, uint8 cb_data, bool is_temp)
Definition bufmgr.c:8685

References buffer_readv_complete(), and fb().

◆ local_buffer_readv_stage()

static void local_buffer_readv_stage ( PgAioHandle * ioh,
uint8  cb_data 
)
static

Definition at line 8934 of file bufmgr.c.

8935{
8936 buffer_stage_common(ioh, false, true);
8937}
static pg_attribute_always_inline void buffer_stage_common(PgAioHandle *ioh, bool is_write, bool is_temp)
Definition bufmgr.c:8290

References buffer_stage_common(), and fb().

◆ local_buffer_write_error_callback()

static void local_buffer_write_error_callback ( void * arg)
static

Definition at line 7484 of file bufmgr.c.

7485{
7487
7488 if (bufHdr != NULL)
7489 errcontext("writing block %u of relation \"%s\"",
7490 bufHdr->tag.blockNum,
7493 BufTagGetForkNum(&bufHdr->tag)).str);
7494}
Datum arg
Definition elog.c:1323
#define errcontext
Definition elog.h:200

References arg, BufTagGetForkNum(), BufTagGetRelFileLocator(), errcontext, fb(), MyProcNumber, and relpathbackend.

Referenced by FlushRelationBuffers().

◆ LockBufferForCleanup()

void LockBufferForCleanup ( Buffer  buffer)

Definition at line 6679 of file bufmgr.c.

6680{
6682 TimestampTz waitStart = 0;
6683 bool waiting = false;
6684 bool logged_recovery_conflict = false;
6685
6686 Assert(BufferIsPinned(buffer));
6688
6690
6691 /*
6692 * We do not yet need to be worried about in-progress AIOs holding a pin,
6693 * as we, so far, only support doing reads via AIO and this function can
6694 * only be called once the buffer is valid (i.e. no read can be in
6695 * flight).
6696 */
6697
6698 /* Nobody else to wait for */
6699 if (BufferIsLocal(buffer))
6700 return;
6701
6702 bufHdr = GetBufferDescriptor(buffer - 1);
6703
6704 for (;;)
6705 {
6707 uint64 unset_bits = 0;
6708
6709 /* Try to acquire lock */
6712
6715 {
6716 /* Successfully acquired exclusive lock with pincount 1 */
6718
6719 /*
6720 * Emit the log message if recovery conflict on buffer pin was
6721 * resolved but the startup process waited longer than
6722 * deadlock_timeout for it.
6723 */
6726 waitStart, GetCurrentTimestamp(),
6727 NULL, false);
6728
6729 if (waiting)
6730 {
6731 /* reset ps display to remove the suffix if we added one */
6733 waiting = false;
6734 }
6735 return;
6736 }
6737 /* Failed, so mark myself as waiting for pincount 1 */
6739 {
6742 elog(ERROR, "multiple backends attempting to wait for pincount 1");
6743 }
6744 bufHdr->wait_backend_pgprocno = MyProcNumber;
6748 0);
6750
6751 /* Wait to be signaled by UnpinBuffer() */
6752 if (InHotStandby)
6753 {
6754 if (!waiting)
6755 {
6756 /* adjust the process title to indicate that it's waiting */
6757 set_ps_display_suffix("waiting");
6758 waiting = true;
6759 }
6760
6761 /*
6762 * Emit the log message if the startup process is waiting longer
6763 * than deadlock_timeout for recovery conflict on buffer pin.
6764 *
6765 * Skip this if first time through because the startup process has
6766 * not started waiting yet in this case. So, the wait start
6767 * timestamp is set after this logic.
6768 */
6769 if (waitStart != 0 && !logged_recovery_conflict)
6770 {
6772
6773 if (TimestampDifferenceExceeds(waitStart, now,
6775 {
6777 waitStart, now, NULL, true);
6779 }
6780 }
6781
6782 /*
6783 * Set the wait start timestamp if logging is enabled and first
6784 * time through.
6785 */
6786 if (log_recovery_conflict_waits && waitStart == 0)
6787 waitStart = GetCurrentTimestamp();
6788
6789 /* Publish the bufid that Startup process waits on */
6790 SetStartupBufferPinWaitBufId(buffer - 1);
6791 /* Set alarm and then wait to be signaled by UnpinBuffer() */
6793 /* Reset the published bufid */
6795 }
6796 else
6798
6799 /*
6800 * Remove flag marking us as waiter. Normally this will not be set
6801 * anymore, but ProcWaitForSignal() can return for other signals as
6802 * well. We take care to only reset the flag if we're the waiter, as
6803 * theoretically another backend could have started waiting. That's
6804 * impossible with the current usages due to table level locking, but
6805 * better be safe.
6806 */
6808 if ((buf_state & BM_PIN_COUNT_WAITER) != 0 &&
6809 bufHdr->wait_backend_pgprocno == MyProcNumber)
6811
6813 0, unset_bits,
6814 0);
6815
6817 /* Loop back and try again */
6818 }
6819}
bool TimestampDifferenceExceeds(TimestampTz start_time, TimestampTz stop_time, int msec)
Definition timestamp.c:1789
TimestampTz GetCurrentTimestamp(void)
Definition timestamp.c:1649
Datum now(PG_FUNCTION_ARGS)
Definition timestamp.c:1613
#define BM_PIN_COUNT_WAITER
static BufferDesc * PinCountWaitBuf
Definition bufmgr.c:228
int64 TimestampTz
Definition timestamp.h:39
void set_ps_display_remove_suffix(void)
Definition ps_status.c:440
void set_ps_display_suffix(const char *suffix)
Definition ps_status.c:388
int DeadlockTimeout
Definition proc.c:62
void SetStartupBufferPinWaitBufId(int bufid)
Definition proc.c:759
void ProcWaitForSignal(uint32 wait_event_info)
Definition proc.c:2015
void ResolveRecoveryConflictWithBufferPin(void)
Definition standby.c:795
bool log_recovery_conflict_waits
Definition standby.c:43
void LogRecoveryConflict(RecoveryConflictReason reason, TimestampTz wait_start, TimestampTz now, VirtualTransactionId *wait_list, bool still_waiting)
Definition standby.c:275
@ RECOVERY_CONFLICT_BUFFERPIN
Definition standby.h:49
static volatile sig_atomic_t waiting
#define InHotStandby
Definition xlogutils.h:60

References Assert, BM_PIN_COUNT_WAITER, BUF_STATE_GET_REFCOUNT, PrivateRefCountEntry::buffer, BUFFER_LOCK_EXCLUSIVE, BUFFER_LOCK_UNLOCK, BufferIsLocal, BufferIsPinned, CheckBufferIsPinnedOnce(), DeadlockTimeout, elog, ERROR, fb(), GetBufferDescriptor(), GetCurrentTimestamp(), InHotStandby, LockBuffer(), LockBufHdr(), log_recovery_conflict_waits, LogRecoveryConflict(), MyProcNumber, now(), PinCountWaitBuf, ProcWaitForSignal(), RECOVERY_CONFLICT_BUFFERPIN, ResolveRecoveryConflictWithBufferPin(), set_ps_display_remove_suffix(), set_ps_display_suffix(), SetStartupBufferPinWaitBufId(), TimestampDifferenceExceeds(), UnlockBufHdr(), UnlockBufHdrExt(), and waiting.

Referenced by _bt_upgradelockbufcleanup(), ginVacuumPostingTree(), hashbulkdelete(), heap_force_common(), lazy_scan_heap(), XLogReadBufferForRedoExtended(), and ZeroAndLockBuffer().

◆ LockBufferInternal()

void LockBufferInternal ( Buffer  buffer,
BufferLockMode  mode 
)

Definition at line 6583 of file bufmgr.c.

6584{
6586
6587 /*
6588 * We can't wait if we haven't got a PGPROC. This should only occur
6589 * during bootstrap or shared memory initialization. Put an Assert here
6590 * to catch unsafe coding practices.
6591 */
6593
6594 /* handled in LockBuffer() wrapper */
6596
6597 Assert(BufferIsPinned(buffer));
6598 if (BufferIsLocal(buffer))
6599 return; /* local buffers need no lock */
6600
6601 buf_hdr = GetBufferDescriptor(buffer - 1);
6602
6603 /*
6604 * Test the most frequent lock modes first. While a switch (mode) would be
6605 * nice, at least gcc generates considerably worse code for it.
6606 *
6607 * Call BufferLockAcquire() with a constant argument for mode, to generate
6608 * more efficient code for the different lock modes.
6609 */
6610 if (mode == BUFFER_LOCK_SHARE)
6612 else if (mode == BUFFER_LOCK_EXCLUSIVE)
6616 else
6617 elog(ERROR, "unrecognized buffer lock mode: %d", mode);
6618}
bool IsUnderPostmaster
Definition globals.c:122

References Assert, PrivateRefCountEntry::buffer, BUFFER_LOCK_EXCLUSIVE, BUFFER_LOCK_SHARE, BUFFER_LOCK_SHARE_EXCLUSIVE, BUFFER_LOCK_UNLOCK, BufferIsLocal, BufferIsPinned, BufferLockAcquire(), elog, ERROR, fb(), GetBufferDescriptor(), IsUnderPostmaster, mode, and MyProc.

Referenced by LockBuffer().

◆ LockBufHdr()

uint64 LockBufHdr ( BufferDesc * desc)

Definition at line 7527 of file bufmgr.c.

7528{
7530
7532
7533 while (true)
7534 {
7535 /*
7536 * Always try once to acquire the lock directly, without setting up
7537 * the spin-delay infrastructure. The work necessary for that shows up
7538 * in profiles and is rarely necessary.
7539 */
7541 if (likely(!(old_buf_state & BM_LOCKED)))
7542 break; /* got lock */
7543
7544 /* and then spin without atomic operations until lock is released */
7545 {
7547
7549
7550 while (old_buf_state & BM_LOCKED)
7551 {
7554 }
7556 }
7557
7558 /*
7559 * Retry. The lock might obviously already be re-acquired by the time
7560 * we're attempting to get it again.
7561 */
7562 }
7563
7564 return old_buf_state | BM_LOCKED;
7565}
void perform_spin_delay(SpinDelayStatus *status)
Definition s_lock.c:126
void finish_spin_delay(SpinDelayStatus *status)
Definition s_lock.c:186
#define init_local_spin_delay(status)
Definition s_lock.h:749

References Assert, BM_LOCKED, BufferDescriptorGetBuffer(), BufferIsLocal, fb(), finish_spin_delay(), init_local_spin_delay, likely, perform_spin_delay(), pg_atomic_fetch_or_u64(), pg_atomic_read_u64(), and BufferDesc::state.

Referenced by AbortBufferIO(), apw_dump_now(), buffer_stage_common(), BufferAlloc(), BufferGetLSNAtomic(), BufferLockDequeueSelf(), BufferLockQueueSelf(), BufferLockWakeup(), BufferSync(), ConditionalLockBufferForCleanup(), create_toy_buffer(), DropDatabaseBuffers(), DropRelationBuffers(), DropRelationsAllBuffers(), EvictAllUnpinnedBuffers(), EvictRelUnpinnedBuffers(), EvictUnpinnedBuffer(), ExtendBufferedRelShared(), FindAndDropRelationBuffers(), FlushDatabaseBuffers(), FlushRelationBuffers(), FlushRelationsAllBuffers(), InvalidateBuffer(), InvalidateVictimBuffer(), IsBufferCleanupOK(), LockBufferForCleanup(), MarkDirtyAllUnpinnedBuffers(), MarkDirtyRelUnpinnedBuffers(), MarkDirtyUnpinnedBuffer(), MarkSharedBufferDirtyHint(), pg_buffercache_os_pages_internal(), pg_buffercache_pages(), StartSharedBufferIO(), SyncOneBuffer(), TerminateBufferIO(), UnlockBuffers(), WaitIO(), and WakePinCountWaiter().

◆ MarkBufferDirty()

void MarkBufferDirty ( Buffer  buffer)

Definition at line 3156 of file bufmgr.c.

3157{
3161
3162 if (!BufferIsValid(buffer))
3163 elog(ERROR, "bad buffer ID: %d", buffer);
3164
3165 if (BufferIsLocal(buffer))
3166 {
3167 MarkLocalBufferDirty(buffer);
3168 return;
3169 }
3170
3171 bufHdr = GetBufferDescriptor(buffer - 1);
3172
3173 Assert(BufferIsPinned(buffer));
3175
3176 /*
3177 * NB: We have to wait for the buffer header spinlock to be not held, as
3178 * TerminateBufferIO() relies on the spinlock.
3179 */
3181 for (;;)
3182 {
3185
3187
3190
3192 buf_state))
3193 break;
3194 }
3195
3196 /*
3197 * If the buffer was not dirty already, do vacuum accounting.
3198 */
3199 if (!(old_buf_state & BM_DIRTY))
3200 {
3202 if (VacuumCostActive)
3204 }
3205}
pg_noinline uint64 WaitBufHdrUnlocked(BufferDesc *buf)
Definition bufmgr.c:7575
int VacuumCostPageDirty
Definition globals.c:156
int64 shared_blks_dirtied
Definition instrument.h:28

References Assert, BM_DIRTY, BM_LOCKED, BUF_STATE_GET_REFCOUNT, PrivateRefCountEntry::buffer, BUFFER_LOCK_EXCLUSIVE, BufferIsLocal, BufferIsLockedByMeInMode(), BufferIsPinned, BufferIsValid(), elog, ERROR, fb(), GetBufferDescriptor(), MarkLocalBufferDirty(), pg_atomic_compare_exchange_u64(), pg_atomic_read_u64(), pgBufferUsage, BufferUsage::shared_blks_dirtied, VacuumCostActive, VacuumCostBalance, VacuumCostPageDirty, and WaitBufHdrUnlocked().

Referenced by _bt_clear_incomplete_split(), _bt_dedup_pass(), _bt_delitems_delete(), _bt_delitems_vacuum(), _bt_getroot(), _bt_insertonpg(), _bt_mark_page_halfdead(), _bt_newlevel(), _bt_restore_meta(), _bt_set_cleanup_info(), _bt_split(), _bt_unlink_halfdead_page(), _hash_addovflpage(), _hash_doinsert(), _hash_expandtable(), _hash_freeovflpage(), _hash_init(), _hash_splitbucket(), _hash_squeezebucket(), _hash_vacuum_one_page(), addLeafTuple(), brin_doinsert(), brin_doupdate(), brin_initialize_empty_new_buffer(), brin_xlog_createidx(), brin_xlog_desummarize_page(), brin_xlog_insert_update(), brin_xlog_revmap_extend(), brin_xlog_samepage_update(), brin_xlog_update(), brinbuild(), brinbuildempty(), brinRevmapDesummarizeRange(), btree_xlog_dedup(), btree_xlog_delete(), btree_xlog_insert(), btree_xlog_mark_page_halfdead(), btree_xlog_newroot(), btree_xlog_split(), btree_xlog_unlink_page(), btree_xlog_vacuum(), createPostingTree(), dataExecPlaceToPageInternal(), dataExecPlaceToPageLeaf(), doPickSplit(), entryExecPlaceToPage(), fill_seq_fork_with_data(), FreeSpaceMapPrepareTruncateRel(), generic_redo(), GenericXLogFinish(), ginbuild(), ginbuildempty(), ginbulkdelete(), ginDeletePostingPage(), ginHeapTupleFastInsert(), ginPlaceToPage(), ginRedoClearIncompleteSplit(), ginRedoCreatePTree(), ginRedoDeleteListPages(), ginRedoDeletePage(), ginRedoInsert(), ginRedoInsertListPage(), ginRedoUpdateMetapage(), ginRedoVacuumDataLeafPage(), ginUpdateStats(), ginVacuumPostingTreeLeaf(), gistbuild(), gistbuildempty(), gistdeletepage(), gistplacetopage(), gistprunepage(), gistRedoClearFollowRight(), gistRedoDeleteRecord(), gistRedoPageDelete(), gistRedoPageSplitRecord(), gistRedoPageUpdateRecord(), gistvacuumpage(), hash_xlog_add_ovfl_page(), hash_xlog_delete(), hash_xlog_init_bitmap_page(), hash_xlog_init_meta_page(), hash_xlog_insert(), hash_xlog_move_page_contents(), hash_xlog_split_allocate_page(), hash_xlog_split_cleanup(), hash_xlog_split_complete(), hash_xlog_squeeze_page(), 
hash_xlog_update_meta_page(), hash_xlog_vacuum_one_page(), hashbucketcleanup(), hashbulkdelete(), heap_abort_speculative(), heap_delete(), heap_finish_speculative(), heap_force_common(), heap_inplace_update_and_unlock(), heap_insert(), heap_lock_tuple(), heap_lock_updated_tuple_rec(), heap_multi_insert(), heap_page_prune_and_freeze(), heap_update(), heap_xlog_confirm(), heap_xlog_delete(), heap_xlog_inplace(), heap_xlog_insert(), heap_xlog_lock(), heap_xlog_lock_updated(), heap_xlog_multi_insert(), heap_xlog_prune_freeze(), heap_xlog_update(), lazy_scan_new_or_empty(), lazy_vacuum_heap_page(), log_newpage_range(), MarkDirtyUnpinnedBufferInternal(), moveLeafs(), nextval_internal(), ProcessSingleRelationFork(), RelationAddBlocks(), RelationCopyStorageUsingBuffer(), RelationGetBufferForTuple(), revmap_physical_extend(), saveNodeLink(), seq_redo(), SetSequence(), shiftList(), spgAddNodeAction(), spgbuild(), SpGistUpdateMetaPage(), spgRedoAddLeaf(), spgRedoAddNode(), spgRedoMoveLeafs(), spgRedoPickSplit(), spgRedoSplitTuple(), spgRedoVacuumLeaf(), spgRedoVacuumRedirect(), spgRedoVacuumRoot(), spgSplitNodeAction(), vacuumLeafPage(), vacuumLeafRoot(), vacuumRedirectAndPlaceholder(), visibilitymap_clear(), visibilitymap_prepare_truncate(), visibilitymap_set(), writeListPage(), XLogReadBufferForRedoExtended(), and XLogRecordPageWithFreeSpace().

◆ MarkBufferDirtyHint()

void MarkBufferDirtyHint ( Buffer  buffer,
bool  buffer_std 
)
inline

◆ MarkDirtyAllUnpinnedBuffers()

void MarkDirtyAllUnpinnedBuffers ( int32 * buffers_dirtied,
int32 * buffers_already_dirty,
int32 * buffers_skipped
)

Definition at line 8241 of file bufmgr.c.

8244{
8245 *buffers_dirtied = 0;
8247 *buffers_skipped = 0;
8248
8249 for (int buf = 1; buf <= NBuffers; buf++)
8250 {
8251 BufferDesc *desc = GetBufferDescriptor(buf - 1);
8254
8256
8258 if (!(buf_state & BM_VALID))
8259 continue;
8260
8263
8264 LockBufHdr(desc);
8265
8267 (*buffers_dirtied)++;
8268 else if (buffer_already_dirty)
8269 (*buffers_already_dirty)++;
8270 else
8271 (*buffers_skipped)++;
8272 }
8273}
static bool MarkDirtyUnpinnedBufferInternal(Buffer buf, BufferDesc *desc, bool *buffer_already_dirty)
Definition bufmgr.c:8092

References BM_VALID, buf, CHECK_FOR_INTERRUPTS, CurrentResourceOwner, fb(), GetBufferDescriptor(), LockBufHdr(), MarkDirtyUnpinnedBufferInternal(), NBuffers, pg_atomic_read_u64(), ReservePrivateRefCountEntry(), ResourceOwnerEnlarge(), and BufferDesc::state.

Referenced by pg_buffercache_mark_dirty_all().

◆ MarkDirtyRelUnpinnedBuffers()

void MarkDirtyRelUnpinnedBuffers ( Relation  rel,
int32 * buffers_dirtied,
int32 * buffers_already_dirty,
int32 * buffers_skipped
)

Definition at line 8184 of file bufmgr.c.

8188{
8190
8191 *buffers_dirtied = 0;
8193 *buffers_skipped = 0;
8194
8195 for (int buf = 1; buf <= NBuffers; buf++)
8196 {
8197 BufferDesc *desc = GetBufferDescriptor(buf - 1);
8200
8202
8203 /* An unlocked precheck should be safe and saves some cycles. */
8204 if ((buf_state & BM_VALID) == 0 ||
8206 continue;
8207
8208 /* Make sure we can pin the buffer. */
8211
8212 buf_state = LockBufHdr(desc);
8213
8214 /* recheck, could have changed without the lock */
8215 if ((buf_state & BM_VALID) == 0 ||
8217 {
8218 UnlockBufHdr(desc);
8219 continue;
8220 }
8221
8223 (*buffers_dirtied)++;
8224 else if (buffer_already_dirty)
8225 (*buffers_already_dirty)++;
8226 else
8227 (*buffers_skipped)++;
8228 }
8229}

References Assert, BM_VALID, buf, BufTagMatchesRelFileLocator(), CHECK_FOR_INTERRUPTS, CurrentResourceOwner, fb(), GetBufferDescriptor(), LockBufHdr(), MarkDirtyUnpinnedBufferInternal(), NBuffers, pg_atomic_read_u64(), RelationData::rd_locator, RelationUsesLocalBuffers, ReservePrivateRefCountEntry(), ResourceOwnerEnlarge(), BufferDesc::state, BufferDesc::tag, and UnlockBufHdr().

Referenced by pg_buffercache_mark_dirty_relation().

◆ MarkDirtyUnpinnedBuffer()

bool MarkDirtyUnpinnedBuffer ( Buffer  buf,
bool * buffer_already_dirty
)

Definition at line 8148 of file bufmgr.c.

8149{
8150 BufferDesc *desc;
8151 bool buffer_dirtied = false;
8152
8154
8155 /* Make sure we can pin the buffer. */
8158
8159 desc = GetBufferDescriptor(buf - 1);
8160 LockBufHdr(desc);
8161
8163 /* Both cannot be true at the same time */
8165
8166 return buffer_dirtied;
8167}

References Assert, buf, BufferIsLocal, CurrentResourceOwner, fb(), GetBufferDescriptor(), LockBufHdr(), MarkDirtyUnpinnedBufferInternal(), ReservePrivateRefCountEntry(), and ResourceOwnerEnlarge().

Referenced by pg_buffercache_mark_dirty().

◆ MarkDirtyUnpinnedBufferInternal()

static bool MarkDirtyUnpinnedBufferInternal ( Buffer  buf,
BufferDesc * desc,
bool * buffer_already_dirty
)
static

Definition at line 8092 of file bufmgr.c.

8094{
8096 bool result = false;
8097
8098 *buffer_already_dirty = false;
8099
8102
8103 if ((buf_state & BM_VALID) == 0)
8104 {
8105 UnlockBufHdr(desc);
8106 return false;
8107 }
8108
8109 /* Check that it's not pinned already. */
8111 {
8112 UnlockBufHdr(desc);
8113 return false;
8114 }
8115
8116 /* Pin the buffer and then release the buffer spinlock */
8117 PinBuffer_Locked(desc);
8118
8119 /* If it was not already dirty, mark it as dirty. */
8120 if (!(buf_state & BM_DIRTY))
8121 {
8124 result = true;
8125 BufferLockUnlock(buf, desc);
8126 }
8127 else
8128 *buffer_already_dirty = true;
8129
8130 UnpinBuffer(desc);
8131
8132 return result;
8133}
void MarkBufferDirty(Buffer buffer)
Definition bufmgr.c:3156

References Assert, BM_DIRTY, BM_LOCKED, BM_VALID, buf, BUF_STATE_GET_REFCOUNT, BUFFER_LOCK_EXCLUSIVE, BufferLockAcquire(), BufferLockUnlock(), fb(), MarkBufferDirty(), pg_atomic_read_u64(), PinBuffer_Locked(), result, BufferDesc::state, UnlockBufHdr(), and UnpinBuffer().

Referenced by MarkDirtyAllUnpinnedBuffers(), MarkDirtyRelUnpinnedBuffers(), and MarkDirtyUnpinnedBuffer().

◆ MarkSharedBufferDirtyHint()

static void MarkSharedBufferDirtyHint ( Buffer  buffer,
BufferDesc * bufHdr,
uint64  lockstate,
bool  buffer_std 
)
inline static

Definition at line 5705 of file bufmgr.c.

5707{
5708 Page page = BufferGetPage(buffer);
5709
5710 Assert(GetPrivateRefCount(buffer) > 0);
5711
5712 /* here, either share-exclusive or exclusive lock is OK */
5715
5716 /*
5717 * This routine might get called many times on the same page, if we are
5718 * making the first scan after commit of an xact that added/deleted many
5719 * tuples. So, be as quick as we can if the buffer is already dirty.
5720 *
5721 * As we are holding (at least) a share-exclusive lock, nobody could have
5722 * cleaned or dirtied the page concurrently, so we can just rely on the
5723 * previously fetched value here without any danger of races.
5724 */
5725 if (unlikely(!(lockstate & BM_DIRTY)))
5726 {
5728 bool wal_log = false;
5730
5731 /*
5732 * If we need to protect hint bit updates from torn writes, WAL-log a
5733 * full page image of the page. This full page image is only necessary
5734 * if the hint bit update is the first change to the page since the
5735 * last checkpoint.
5736 *
5737 * We don't check full_page_writes here because that logic is included
5738 * when we call XLogInsert() since the value changes dynamically.
5739 */
5741 {
5742 /*
5743 * If we must not write WAL, due to a relfilelocator-specific
5744 * condition or being in recovery, don't dirty the page. We can
5745 * set the hint, just not dirty the page as a result so the hint
5746 * is lost when we evict the page or shutdown.
5747 *
5748 * See src/backend/storage/page/README for longer discussion.
5749 */
5750 if (RecoveryInProgress() ||
5752 return;
5753
5754 wal_log = true;
5755 }
5756
5757 /*
5758 * We must mark the page dirty before we emit the WAL record, as per
5759 * the usual rules, to ensure that BufferSync()/SyncOneBuffer() try to
5760 * flush the buffer, even if we haven't inserted the WAL record yet.
5761 * As we hold at least a share-exclusive lock, checkpoints will wait
5762 * for this backend to be done with the buffer before continuing. If
5763 * we did it the other way round, a checkpoint could start between
5764 * writing the WAL record and marking the buffer dirty.
5765 */
5767
5768 /*
5769 * It should not be possible for the buffer to already be dirty, see
5770 * comment above.
5771 */
5775 BM_DIRTY,
5776 0, 0);
5777
5778 /*
5779 * If the block is already dirty because we either made a change or
5780 * set a hint already, then we don't need to write a full page image.
5781 * Note that aggressive cleaning of blocks dirtied by hint bit setting
5782 * would increase the call rate. Bulk setting of hint bits would
5783 * reduce the call rate...
5784 */
5785 if (wal_log)
5786 lsn = XLogSaveBufferForHint(buffer, buffer_std);
5787
5788 if (XLogRecPtrIsValid(lsn))
5789 {
5790 /*
5791 * Set the page LSN if we wrote a backup block. To allow backends
5792 * that only hold a share lock on the buffer to read the LSN in a
5793 * tear-free manner, we set the page LSN while holding the buffer
5794 * header lock. This allows any reader of an LSN who holds only a
5795 * share lock to also obtain a buffer header lock before using
5796 * PageGetLSN() to read the LSN in a tear free way. This is done
5797 * in BufferGetLSNAtomic().
5798 *
5799 * If checksums are enabled, you might think we should reset the
5800 * checksum here. That will happen when the page is written
5801 * sometime later in this checkpoint cycle.
5802 */
5804 PageSetLSN(page, lsn);
5806 }
5807
5809 if (VacuumCostActive)
5811 }
5812}
static void PageSetLSN(Page page, XLogRecPtr lsn)
Definition bufpage.h:416
bool RelFileLocatorSkippingWAL(RelFileLocator rlocator)
Definition storage.c:573
bool RecoveryInProgress(void)
Definition xlog.c:6832
#define XLogRecPtrIsValid(r)
Definition xlogdefs.h:29
#define InvalidXLogRecPtr
Definition xlogdefs.h:28
XLogRecPtr XLogSaveBufferForHint(Buffer buffer, bool buffer_std)

References Assert, BM_DIRTY, BM_PERMANENT, BUF_STATE_GET_REFCOUNT, PrivateRefCountEntry::buffer, BUFFER_LOCK_EXCLUSIVE, BUFFER_LOCK_SHARE_EXCLUSIVE, BufferGetPage(), BufferLockHeldByMeInMode(), BufTagGetRelFileLocator(), fb(), GetPrivateRefCount(), InvalidXLogRecPtr, LockBufHdr(), PageSetLSN(), pgBufferUsage, RecoveryInProgress(), RelFileLocatorSkippingWAL(), BufferUsage::shared_blks_dirtied, unlikely, UnlockBufHdr(), UnlockBufHdrExt(), VacuumCostActive, VacuumCostBalance, VacuumCostPageDirty, XLogHintBitIsNeeded, XLogRecPtrIsValid, and XLogSaveBufferForHint().

Referenced by BufferSetHintBits16(), and MarkBufferDirtyHint().

◆ NewPrivateRefCountEntry()

static PrivateRefCountEntry * NewPrivateRefCountEntry ( Buffer  buffer)
static

Definition at line 388 of file bufmgr.c.

389{
391
392 /* only allowed to be called when a reservation has been made */
394
395 /* use up the reserved entry */
397
398 /* and fill it */
400 res->buffer = buffer;
401 res->data.refcount = 0;
403
404 /* update cache for the next lookup */
406
408
409 return res;
410}

References Assert, PrivateRefCountEntry::buffer, BUFFER_LOCK_UNLOCK, PrivateRefCountEntry::data, PrivateRefCountData::lockmode, PrivateRefCountArray, PrivateRefCountArrayKeys, PrivateRefCountEntryLast, PrivateRefCountData::refcount, and ReservedRefCountSlot.

Referenced by TrackNewBufferPin().

◆ PinBuffer()

static bool PinBuffer ( BufferDesc * buf,
BufferAccessStrategy  strategy,
bool  skip_if_not_valid 
)
static

Definition at line 3281 of file bufmgr.c.

3283{
3285 bool result;
3287
3290
3291 ref = GetPrivateRefCountEntry(b, true);
3292
3293 if (ref == NULL)
3294 {
3297
3299 for (;;)
3300 {
3302 return false;
3303
3304 /*
3305 * We're not allowed to increase the refcount while the buffer
3306 * header spinlock is held. Wait for the lock to be released.
3307 */
3309 {
3311
3312 /* perform checks at the top of the loop again */
3313 continue;
3314 }
3315
3317
3318 /* increase refcount */
3320
3321 if (strategy == NULL)
3322 {
3323 /* Default case: increase usagecount unless already max. */
3326 }
3327 else
3328 {
3329 /*
3330 * Ring buffers shouldn't evict others from pool. Thus we
3331 * don't make usagecount more than 1.
3332 */
3335 }
3336
3338 buf_state))
3339 {
3340 result = (buf_state & BM_VALID) != 0;
3341
3343 break;
3344 }
3345 }
3346 }
3347 else
3348 {
3349 /*
3350 * If we previously pinned the buffer, it is likely to be valid, but
3351 * it may not be if StartReadBuffers() was called and
3352 * WaitReadBuffers() hasn't been called yet. We'll check by loading
3353 * the flags without locking. This is racy, but it's OK to return
3354 * false spuriously: when WaitReadBuffers() calls StartBufferIO(),
3355 * it'll see that it's now valid.
3356 *
3357 * Note: We deliberately avoid a Valgrind client request here.
3358 * Individual access methods can optionally superimpose buffer page
3359 * client requests on top of our client requests to enforce that
3360 * buffers are only accessed while locked (and pinned). It's possible
3361 * that the buffer page is legitimately non-accessible here. We
3362 * cannot meddle with that.
3363 */
3364 result = (pg_atomic_read_u64(&buf->state) & BM_VALID) != 0;
3365
3366 Assert(ref->data.refcount > 0);
3367 ref->data.refcount++;
3369 }
3370
3371 return result;
3372}
#define BM_MAX_USAGE_COUNT
#define BUF_STATE_GET_USAGECOUNT(state)
void TrackNewBufferPin(Buffer buf)
Definition bufmgr.c:3521

References Assert, b, BM_LOCKED, BM_MAX_USAGE_COUNT, BM_VALID, buf, BUF_REFCOUNT_ONE, BUF_STATE_GET_USAGECOUNT, BUF_USAGECOUNT_ONE, BufferDescriptorGetBuffer(), BufferIsLocal, CurrentResourceOwner, fb(), GetPrivateRefCountEntry(), pg_atomic_compare_exchange_u64(), pg_atomic_read_u64(), ReservedRefCountSlot, ResourceOwnerRememberBuffer(), result, TrackNewBufferPin(), unlikely, and WaitBufHdrUnlocked().

Referenced by BufferAlloc(), ExtendBufferedRelShared(), and ReadRecentBuffer().

◆ PinBuffer_Locked()

static void PinBuffer_Locked ( BufferDesc * buf)
static

Definition at line 3397 of file bufmgr.c.

3398{
3400
3401 /*
3402 * As explained, we don't expect any preexisting pins. That allows us to
3403 * manipulate the PrivateRefCount after releasing the spinlock
3404 */
3406
3407 /*
3408 * Since we hold the buffer spinlock, we can update the buffer state and
3409 * release the lock in one operation.
3410 */
3412
3414 0, 0, 1);
3415
3417}

References Assert, buf, BufferDescriptorGetBuffer(), fb(), GetPrivateRefCountEntry(), pg_atomic_read_u64(), TrackNewBufferPin(), and UnlockBufHdrExt().

Referenced by EvictUnpinnedBufferInternal(), FlushDatabaseBuffers(), FlushRelationBuffers(), FlushRelationsAllBuffers(), MarkDirtyUnpinnedBufferInternal(), and SyncOneBuffer().

◆ PinBufferForBlock()

static pg_attribute_always_inline Buffer PinBufferForBlock ( Relation  rel,
SMgrRelation  smgr,
char  persistence,
ForkNumber  forkNum,
BlockNumber  blockNum,
BufferAccessStrategy  strategy,
IOObject  io_object,
IOContext  io_context,
bool * foundPtr
)
static

Definition at line 1223 of file bufmgr.c.

1232{
1234
1235 Assert(blockNum != P_NEW);
1236
1237 /* Persistence should be set before */
1238 Assert((persistence == RELPERSISTENCE_TEMP ||
1239 persistence == RELPERSISTENCE_PERMANENT ||
1240 persistence == RELPERSISTENCE_UNLOGGED));
1241
1242 TRACE_POSTGRESQL_BUFFER_READ_START(forkNum, blockNum,
1246 smgr->smgr_rlocator.backend);
1247
1248 if (persistence == RELPERSISTENCE_TEMP)
1249 bufHdr = LocalBufferAlloc(smgr, forkNum, blockNum, foundPtr);
1250 else
1251 bufHdr = BufferAlloc(smgr, persistence, forkNum, blockNum,
1252 strategy, foundPtr, io_context);
1253
1254 if (*foundPtr)
1255 TrackBufferHit(io_object, io_context, rel, persistence, smgr, forkNum, blockNum);
1256
1257 if (rel)
1258 {
1259 /*
1260 * While pgBufferUsage's "read" counter isn't bumped unless we reach
1261 * WaitReadBuffers() (so, not for hits, and not for buffers that are
1262 * zeroed instead), the per-relation stats always count them.
1263 */
1265 }
1266
1268}
static BufferDesc * BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, BlockNumber blockNum, BufferAccessStrategy strategy, bool *foundPtr, IOContext io_context)
Definition bufmgr.c:2197
#define P_NEW
Definition bufmgr.h:200
BufferDesc * LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum, bool *foundPtr)
Definition localbuf.c:119
#define pgstat_count_buffer_read(rel)
Definition pgstat.h:742
RelFileNumber relNumber

References Assert, RelFileLocatorBackend::backend, BufferAlloc(), BufferDescriptorGetBuffer(), RelFileLocator::dbOid, fb(), LocalBufferAlloc(), RelFileLocatorBackend::locator, P_NEW, pgstat_count_buffer_read, RelFileLocator::relNumber, SMgrRelationData::smgr_rlocator, RelFileLocator::spcOid, and TrackBufferHit().

Referenced by ReadBuffer_common(), and StartReadBuffersImpl().

◆ PrefetchBuffer()

PrefetchBufferResult PrefetchBuffer ( Relation  reln,
ForkNumber  forkNum,
BlockNumber  blockNum 
)

Definition at line 787 of file bufmgr.c.

788{
790 Assert(BlockNumberIsValid(blockNum));
791
793 {
794 /* see comments in ReadBuffer_common */
798 errmsg("cannot access temporary tables of other sessions")));
799
800 /* pass it off to localbuf.c */
801 return PrefetchLocalBuffer(RelationGetSmgr(reln), forkNum, blockNum);
802 }
803 else
804 {
805 /* pass it to the shared buffer version */
806 return PrefetchSharedBuffer(RelationGetSmgr(reln), forkNum, blockNum);
807 }
808}
PrefetchBufferResult PrefetchSharedBuffer(SMgrRelation smgr_reln, ForkNumber forkNum, BlockNumber blockNum)
Definition bufmgr.c:697
PrefetchBufferResult PrefetchLocalBuffer(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum)
Definition localbuf.c:72
#define RELATION_IS_OTHER_TEMP(relation)
Definition rel.h:678
#define RelationIsValid(relation)
Definition rel.h:491

References Assert, BlockNumberIsValid(), ereport, errcode(), errmsg, ERROR, fb(), PrefetchLocalBuffer(), PrefetchSharedBuffer(), RELATION_IS_OTHER_TEMP, RelationGetSmgr(), RelationIsValid, and RelationUsesLocalBuffers.

Referenced by count_nondeletable_pages(), invalidate_one_block(), and pg_prewarm().

◆ PrefetchSharedBuffer()

PrefetchBufferResult PrefetchSharedBuffer ( SMgrRelation  smgr_reln,
ForkNumber  forkNum,
BlockNumber  blockNum 
)

Definition at line 697 of file bufmgr.c.

700{
702 BufferTag newTag; /* identity of requested block */
703 uint32 newHash; /* hash value for newTag */
704 LWLock *newPartitionLock; /* buffer partition lock for it */
705 int buf_id;
706
707 Assert(BlockNumberIsValid(blockNum));
708
709 /* create a tag so we can lookup the buffer */
710 InitBufferTag(&newTag, &smgr_reln->smgr_rlocator.locator,
711 forkNum, blockNum);
712
713 /* determine its hash code and partition lock ID */
716
717 /* see if the block is in the buffer pool already */
719 buf_id = BufTableLookup(&newTag, newHash);
721
722 /* If not in buffers, initiate prefetch */
723 if (buf_id < 0)
724 {
725#ifdef USE_PREFETCH
726 /*
727 * Try to initiate an asynchronous read. This returns false in
728 * recovery if the relation file doesn't exist.
729 */
730 if ((io_direct_flags & IO_DIRECT_DATA) == 0 &&
731 smgrprefetch(smgr_reln, forkNum, blockNum, 1))
732 {
733 result.initiated_io = true;
734 }
735#endif /* USE_PREFETCH */
736 }
737 else
738 {
739 /*
740 * Report the buffer it was in at that time. The caller may be able
741 * to avoid a buffer table lookup, but it's not pinned and it must be
742 * rechecked!
743 */
744 result.recent_buffer = buf_id + 1;
745 }
746
747 /*
748 * If the block *is* in buffers, we do nothing. This is not really ideal:
749 * the block might be just about to be evicted, which would be stupid
750 * since we know we are going to need it soon. But the only easy answer
751 * is to bump the usage_count, which does not seem like a great solution:
752 * when the caller does ultimately touch the block, usage_count would get
753 * bumped again, resulting in too much favoritism for blocks that are
754 * involved in a prefetch sequence. A real fix would involve some
755 * additional per-buffer state, and it's not clear that there's enough of
756 * a problem to justify that.
757 */
758
759 return result;
760}
int io_direct_flags
Definition fd.c:172
#define IO_DIRECT_DATA
Definition fd.h:54
bool smgrprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, int nblocks)
Definition smgr.c:678

References Assert, BlockNumberIsValid(), BufMappingPartitionLock(), BufTableHashCode(), BufTableLookup(), fb(), InitBufferTag(), InvalidBuffer, IO_DIRECT_DATA, io_direct_flags, LW_SHARED, LWLockAcquire(), LWLockRelease(), result, and smgrprefetch().

Referenced by PrefetchBuffer(), and XLogPrefetcherNextBlock().

◆ ProcessReadBuffersResult()

static void ProcessReadBuffersResult ( ReadBuffersOperation * operation)
static

Definition at line 1714 of file bufmgr.c.

1715{
1716 PgAioReturn *aio_ret = &operation->io_return;
1718 int newly_read_blocks = 0;
1719
1720 Assert(pgaio_wref_valid(&operation->io_wref));
1721 Assert(aio_ret->result.status != PGAIO_RS_UNKNOWN);
1722
1723 /*
1724 * SMGR reports the number of blocks successfully read as the result of
1725 * the IO operation. Thus we can simply add that to ->nblocks_done.
1726 */
1727
1728 if (likely(rs != PGAIO_RS_ERROR))
1729 newly_read_blocks = aio_ret->result.result;
1730
1731 if (rs == PGAIO_RS_ERROR || rs == PGAIO_RS_WARNING)
1732 pgaio_result_report(aio_ret->result, &aio_ret->target_data,
1733 rs == PGAIO_RS_ERROR ? ERROR : WARNING);
1734 else if (aio_ret->result.status == PGAIO_RS_PARTIAL)
1735 {
1736 /*
1737 * We'll retry, so we just emit a debug message to the server log (or
1738 * not even that in prod scenarios).
1739 */
1740 pgaio_result_report(aio_ret->result, &aio_ret->target_data, DEBUG1);
1741 elog(DEBUG3, "partial read, will retry");
1742 }
1743
1746
1747 operation->nblocks_done += newly_read_blocks;
1748
1749 Assert(operation->nblocks_done <= operation->nblocks);
1750}
PgAioResultStatus
Definition aio_types.h:79
@ PGAIO_RS_UNKNOWN
Definition aio_types.h:80
@ PGAIO_RS_PARTIAL
Definition aio_types.h:82
#define DEBUG3
Definition elog.h:29
uint32 status
Definition aio_types.h:108
PgAioResult result
Definition aio_types.h:132

References Assert, DEBUG1, DEBUG3, elog, ERROR, fb(), likely, MAX_IO_COMBINE_LIMIT, operation, pgaio_result_report(), PGAIO_RS_ERROR, PGAIO_RS_PARTIAL, PGAIO_RS_UNKNOWN, PGAIO_RS_WARNING, pgaio_wref_valid(), PgAioReturn::result, PgAioResult::status, and WARNING.

Referenced by WaitReadBuffers().

◆ ReadBuffer()

Buffer ReadBuffer ( Relation  reln,
BlockNumber  blockNum 
)

Definition at line 879 of file bufmgr.c.

880{
882}
Buffer ReadBufferExtended(Relation reln, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy)
Definition bufmgr.c:926
@ RBM_NORMAL
Definition bufmgr.h:46

References fb(), MAIN_FORKNUM, RBM_NORMAL, and ReadBufferExtended().

Referenced by _bt_allocbuf(), _bt_getbuf(), _bt_relandgetbuf(), _bt_search_insert(), _hash_getbuf(), _hash_getbuf_with_condlock_cleanup(), blbulkdelete(), blinsert(), BloomNewBuffer(), brin_getinsertbuffer(), brinGetStats(), brinGetTupleForHeapBlock(), brinRevmapDesummarizeRange(), brinRevmapInitialize(), bt_metap(), bt_multi_page_stats(), bt_page_items_internal(), bt_page_stats_internal(), ginFindLeafPage(), ginFindParents(), ginGetStats(), ginHeapTupleFastInsert(), ginInsertCleanup(), GinNewBuffer(), ginStepRight(), ginUpdateStats(), gistBufferingFindCorrectParent(), gistbufferinginserttuples(), gistdoinsert(), gistFindCorrectParent(), gistFindPath(), gistfixsplit(), gistGetMaxLevel(), gistkillitems(), gistNewBuffer(), gistProcessItup(), gistScanPage(), heap_abort_speculative(), heap_delete(), heap_fetch(), heap_finish_speculative(), heap_force_common(), heap_get_latest_tid(), heap_index_delete_tuples(), heap_lock_tuple(), heap_update(), heapam_index_fetch_tuple(), initBloomState(), pg_visibility(), pgstatginindex_internal(), read_seq_tuple(), RelationGetBufferForTuple(), ReleaseAndReadBuffer(), revmap_get_buffer(), revmap_physical_extend(), scanGetCandidate(), scanPendingInsert(), shiftList(), spgdoinsert(), spgGetCache(), SpGistGetBuffer(), SpGistNewBuffer(), SpGistUpdateMetaPage(), and spgWalk().

◆ ReadBuffer_common()

static pg_attribute_always_inline Buffer ReadBuffer_common ( Relation  rel,
SMgrRelation  smgr,
char  smgr_persistence,
ForkNumber  forkNum,
BlockNumber  blockNum,
ReadBufferMode  mode,
BufferAccessStrategy  strategy 
)
static

Definition at line 1276 of file bufmgr.c.

1280{
1282 Buffer buffer;
1283 int flags;
1284 char persistence;
1285
1286 /*
1287 * Reject attempts to read non-local temporary relations; we would be
1288 * likely to get wrong data since we have no visibility into the owning
1289 * session's local buffers. This is the canonical place for the check,
1290 * covering the ReadBufferExtended() entry point and any other caller that
1291 * supplies a Relation.
1292 */
1293 if (rel && RELATION_IS_OTHER_TEMP(rel))
1294 ereport(ERROR,
1296 errmsg("cannot access temporary tables of other sessions")));
1297
1298 /*
1299 * Backward compatibility path, most code should use ExtendBufferedRel()
1300 * instead, as acquiring the extension lock inside ExtendBufferedRel()
1301 * scales a lot better.
1302 */
1303 if (unlikely(blockNum == P_NEW))
1304 {
1306
1307 /*
1308 * Since no-one else can be looking at the page contents yet, there is
1309 * no difference between an exclusive lock and a cleanup-strength
1310 * lock.
1311 */
1313 flags |= EB_LOCK_FIRST;
1314
1315 return ExtendBufferedRel(BMR_REL(rel), forkNum, strategy, flags);
1316 }
1317
1318 if (rel)
1319 persistence = rel->rd_rel->relpersistence;
1320 else
1321 persistence = smgr_persistence;
1322
1325 {
1326 bool found;
1329
1330 if (persistence == RELPERSISTENCE_TEMP)
1331 {
1334 }
1335 else
1336 {
1337 io_context = IOContextForStrategy(strategy);
1339 }
1340
1341 buffer = PinBufferForBlock(rel, smgr, persistence,
1342 forkNum, blockNum, strategy,
1343 io_object, io_context, &found);
1344 ZeroAndLockBuffer(buffer, mode, found);
1345 return buffer;
1346 }
1347
1348 /*
1349 * Signal that we are going to immediately wait. If we're immediately
1350 * waiting, there is no benefit in actually executing the IO
1351 * asynchronously, it would just add dispatch overhead.
1352 */
1354 if (mode == RBM_ZERO_ON_ERROR)
1356 operation.smgr = smgr;
1357 operation.rel = rel;
1358 operation.persistence = persistence;
1359 operation.forknum = forkNum;
1360 operation.strategy = strategy;
1362 &buffer,
1363 blockNum,
1364 flags))
1366
1367 return buffer;
1368}
Buffer ExtendBufferedRel(BufferManagerRelation bmr, ForkNumber forkNum, BufferAccessStrategy strategy, uint32 flags)
Definition bufmgr.c:970
static void ZeroAndLockBuffer(Buffer buffer, ReadBufferMode mode, bool already_valid)
Definition bufmgr.c:1137
static pg_attribute_always_inline Buffer PinBufferForBlock(Relation rel, SMgrRelation smgr, char persistence, ForkNumber forkNum, BlockNumber blockNum, BufferAccessStrategy strategy, IOObject io_object, IOContext io_context, bool *foundPtr)
Definition bufmgr.c:1223
bool WaitReadBuffers(ReadBuffersOperation *operation)
Definition bufmgr.c:1759
bool StartReadBuffer(ReadBuffersOperation *operation, Buffer *buffer, BlockNumber blocknum, int flags)
Definition bufmgr.c:1637
@ RBM_ZERO_ON_ERROR
Definition bufmgr.h:51
#define BMR_REL(p_rel)
Definition bufmgr.h:114
Form_pg_class rd_rel
Definition rel.h:111

References BMR_REL, PrivateRefCountEntry::buffer, EB_LOCK_FIRST, EB_SKIP_EXTENSION_LOCK, ereport, errcode(), errmsg, ERROR, ExtendBufferedRel(), fb(), IOCONTEXT_NORMAL, IOContextForStrategy(), IOOBJECT_RELATION, IOOBJECT_TEMP_RELATION, mode, operation, P_NEW, PinBufferForBlock(), RBM_ZERO_AND_CLEANUP_LOCK, RBM_ZERO_AND_LOCK, RBM_ZERO_ON_ERROR, RelationData::rd_rel, READ_BUFFERS_SYNCHRONOUSLY, READ_BUFFERS_ZERO_ON_ERROR, RELATION_IS_OTHER_TEMP, StartReadBuffer(), unlikely, WaitReadBuffers(), and ZeroAndLockBuffer().

Referenced by ExtendBufferedRelTo(), ReadBufferExtended(), and ReadBufferWithoutRelcache().

◆ ReadBufferExtended()

◆ ReadBufferWithoutRelcache()

Buffer ReadBufferWithoutRelcache ( RelFileLocator  rlocator,
ForkNumber  forkNum,
BlockNumber  blockNum,
ReadBufferMode  mode,
BufferAccessStrategy  strategy,
bool  permanent 
)

Definition at line 954 of file bufmgr.c.

957{
958 SMgrRelation smgr = smgropen(rlocator, INVALID_PROC_NUMBER);
959
960 return ReadBuffer_common(NULL, smgr,
962 forkNum, blockNum,
963 mode, strategy);
964}

References fb(), INVALID_PROC_NUMBER, mode, ReadBuffer_common(), and smgropen().

Referenced by RelationCopyStorageUsingBuffer(), ScanSourceDatabasePgClass(), and XLogReadBufferExtended().

◆ ReadRecentBuffer()

bool ReadRecentBuffer ( RelFileLocator  rlocator,
ForkNumber  forkNum,
BlockNumber  blockNum,
Buffer  recent_buffer 
)

Definition at line 818 of file bufmgr.c.

820{
822 BufferTag tag;
824
825 Assert(BufferIsValid(recent_buffer));
826
829 InitBufferTag(&tag, &rlocator, forkNum, blockNum);
830
831 if (BufferIsLocal(recent_buffer))
832 {
833 int b = -recent_buffer - 1;
834
837
838 /* Is it still valid and holding the right tag? */
839 if ((buf_state & BM_VALID) && BufferTagsEqual(&tag, &bufHdr->tag))
840 {
841 PinLocalBuffer(bufHdr, true);
842
844
845 return true;
846 }
847 }
848 else
849 {
850 bufHdr = GetBufferDescriptor(recent_buffer - 1);
851
852 /*
853 * Is it still valid and holding the right tag? We do an unlocked tag
854 * comparison first, to make it unlikely that we'll increment the
855 * usage counter of the wrong buffer, if someone calls us with a very
856 * out of date recent_buffer. Then we'll check it again if we get the
857 * pin.
858 */
859 if (BufferTagsEqual(&tag, &bufHdr->tag) &&
860 PinBuffer(bufHdr, NULL, true))
861 {
862 if (BufferTagsEqual(&tag, &bufHdr->tag))
863 {
865 return true;
866 }
868 }
869 }
870
871 return false;
872}
int64 local_blks_hit
Definition instrument.h:30
int64 shared_blks_hit
Definition instrument.h:26

References Assert, b, BM_VALID, BufferIsLocal, BufferIsValid(), BufferTagsEqual(), CurrentResourceOwner, fb(), GetBufferDescriptor(), GetLocalBufferDescriptor(), InitBufferTag(), BufferUsage::local_blks_hit, pg_atomic_read_u64(), pgBufferUsage, PinBuffer(), PinLocalBuffer(), ReservePrivateRefCountEntry(), ResourceOwnerEnlarge(), BufferUsage::shared_blks_hit, and UnpinBuffer().

Referenced by invalidate_one_block(), and XLogReadBufferExtended().

◆ RelationCopyStorageUsingBuffer()

static void RelationCopyStorageUsingBuffer ( RelFileLocator  srclocator,
RelFileLocator  dstlocator,
ForkNumber  forkNum,
bool  permanent 
)
static

Definition at line 5357 of file bufmgr.c.

5360{
5361 Buffer srcBuf;
5362 Buffer dstBuf;
5363 Page srcPage;
5364 Page dstPage;
5365 bool use_wal;
5366 BlockNumber nblocks;
5367 BlockNumber blkno;
5374
5375 /*
5376 * In general, we want to write WAL whenever wal_level > 'minimal', but we
5377 * can skip it when copying any fork of an unlogged relation other than
5378 * the init fork.
5379 */
5380 use_wal = XLogIsNeeded() && (permanent || forkNum == INIT_FORKNUM);
5381
5382 /* Get number of blocks in the source relation. */
5384 forkNum);
5385
5386 /* Nothing to copy; just return. */
5387 if (nblocks == 0)
5388 return;
5389
5390 /*
5391 * Bulk extend the destination relation to the same size as the source
5392 * relation before starting to copy block by block.
5393 */
5394 memset(buf.data, 0, BLCKSZ);
5395 smgrextend(smgropen(dstlocator, INVALID_PROC_NUMBER), forkNum, nblocks - 1,
5396 buf.data, true);
5397
5398 /* This is a bulk operation, so use buffer access strategies. */
5401
5402 /* Initialize streaming read */
5403 p.current_blocknum = 0;
5404 p.last_exclusive = nblocks;
5406
5407 /*
5408 * It is safe to use batchmode as block_range_read_stream_cb takes no
5409 * locks.
5410 */
5414 src_smgr,
5416 forkNum,
5418 &p,
5419 0);
5420
5421 /* Iterate over each block of the source relation file. */
5422 for (blkno = 0; blkno < nblocks; blkno++)
5423 {
5425
5426 /* Read block from source relation. */
5430
5434 permanent);
5436
5438
5439 /* Copy page data from the source to the destination. */
5442
5443 /* WAL-log the copied page. */
5444 if (use_wal)
5446
5448
5451 }
5454
5457}
Buffer ReadBufferWithoutRelcache(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy, bool permanent)
Definition bufmgr.c:954
@ BAS_BULKREAD
Definition bufmgr.h:37
@ BAS_BULKWRITE
Definition bufmgr.h:39
memcpy(sums, checksumBaseOffsets, sizeof(checksumBaseOffsets))
BufferAccessStrategy GetAccessStrategy(BufferAccessStrategyType btype)
Definition freelist.c:426
void FreeAccessStrategy(BufferAccessStrategy strategy)
Definition freelist.c:608
#define START_CRIT_SECTION()
Definition miscadmin.h:152
#define END_CRIT_SECTION()
Definition miscadmin.h:154
ReadStream * read_stream_begin_smgr_relation(int flags, BufferAccessStrategy strategy, SMgrRelation smgr, char smgr_persistence, ForkNumber forknum, ReadStreamBlockNumberCB callback, void *callback_private_data, size_t per_buffer_data_size)
Buffer read_stream_next_buffer(ReadStream *stream, void **per_buffer_data)
void read_stream_end(ReadStream *stream)
BlockNumber block_range_read_stream_cb(ReadStream *stream, void *callback_private_data, void *per_buffer_data)
#define READ_STREAM_USE_BATCHING
Definition read_stream.h:64
#define READ_STREAM_FULL
Definition read_stream.h:43
void smgrextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const void *buffer, bool skipFsync)
Definition smgr.c:620
#define XLogIsNeeded()
Definition xlog.h:112
XLogRecPtr log_newpage_buffer(Buffer buffer, bool page_std)

References Assert, BAS_BULKREAD, BAS_BULKWRITE, block_range_read_stream_cb(), buf, BUFFER_LOCK_SHARE, BufferGetBlockNumber(), BufferGetPage(), CHECK_FOR_INTERRUPTS, BlockRangeReadStreamPrivate::current_blocknum, END_CRIT_SECTION, fb(), FreeAccessStrategy(), GetAccessStrategy(), INIT_FORKNUM, INVALID_PROC_NUMBER, InvalidBuffer, BlockRangeReadStreamPrivate::last_exclusive, LockBuffer(), log_newpage_buffer(), MarkBufferDirty(), memcpy(), RBM_ZERO_AND_LOCK, read_stream_begin_smgr_relation(), read_stream_end(), READ_STREAM_FULL, read_stream_next_buffer(), READ_STREAM_USE_BATCHING, ReadBufferWithoutRelcache(), smgrextend(), smgrnblocks(), smgropen(), START_CRIT_SECTION, UnlockReleaseBuffer(), and XLogIsNeeded.

Referenced by CreateAndCopyRelationData().

◆ RelationGetNumberOfBlocksInFork()

BlockNumber RelationGetNumberOfBlocksInFork ( Relation  relation,
ForkNumber  forkNum 
)

Definition at line 4654 of file bufmgr.c.

4655{
4656 if (RELKIND_HAS_TABLE_AM(relation->rd_rel->relkind))
4657 {
4658 /*
4659 * Not every table AM uses BLCKSZ wide fixed size blocks. Therefore
4660 * tableam returns the size in bytes - but for the purpose of this
4661 * routine, we want the number of blocks. Therefore divide, rounding
4662 * up.
4663 */
4665
4666 szbytes = table_relation_size(relation, forkNum);
4667
4668 return (szbytes + (BLCKSZ - 1)) / BLCKSZ;
4669 }
4670 else if (RELKIND_HAS_STORAGE(relation->rd_rel->relkind))
4671 {
4672 return smgrnblocks(RelationGetSmgr(relation), forkNum);
4673 }
4674 else
4675 Assert(false);
4676
4677 return 0; /* keep compiler quiet */
4678}
static uint64 table_relation_size(Relation rel, ForkNumber forkNumber)
Definition tableam.h:1938

References Assert, fb(), RelationData::rd_rel, RelationGetSmgr(), smgrnblocks(), and table_relation_size().

Referenced by _hash_getnewbuf(), _hash_init(), autoprewarm_database_main(), get_raw_page_internal(), pg_prewarm(), and ProcessSingleRelationFork().

◆ ReleaseAndReadBuffer()

Buffer ReleaseAndReadBuffer ( Buffer  buffer,
Relation  relation,
BlockNumber  blockNum 
)

Definition at line 3221 of file bufmgr.c.

3224{
3225 ForkNumber forkNum = MAIN_FORKNUM;
3227
3228 if (BufferIsValid(buffer))
3229 {
3230 Assert(BufferIsPinned(buffer));
3231 if (BufferIsLocal(buffer))
3232 {
3233 bufHdr = GetLocalBufferDescriptor(-buffer - 1);
3234 if (bufHdr->tag.blockNum == blockNum &&
3235 BufTagMatchesRelFileLocator(&bufHdr->tag, &relation->rd_locator) &&
3236 BufTagGetForkNum(&bufHdr->tag) == forkNum)
3237 return buffer;
3238 UnpinLocalBuffer(buffer);
3239 }
3240 else
3241 {
3242 bufHdr = GetBufferDescriptor(buffer - 1);
3243 /* we have pin, so it's ok to examine tag without spinlock */
3244 if (bufHdr->tag.blockNum == blockNum &&
3245 BufTagMatchesRelFileLocator(&bufHdr->tag, &relation->rd_locator) &&
3246 BufTagGetForkNum(&bufHdr->tag) == forkNum)
3247 return buffer;
3249 }
3250 }
3251
3252 return ReadBuffer(relation, blockNum);
3253}
Buffer ReadBuffer(Relation reln, BlockNumber blockNum)
Definition bufmgr.c:879

References Assert, PrivateRefCountEntry::buffer, BufferIsLocal, BufferIsPinned, BufferIsValid(), BufTagGetForkNum(), BufTagMatchesRelFileLocator(), fb(), GetBufferDescriptor(), GetLocalBufferDescriptor(), MAIN_FORKNUM, RelationData::rd_locator, ReadBuffer(), UnpinBuffer(), and UnpinLocalBuffer().

Referenced by ginFindLeafPage().

◆ ReleaseBuffer()

void ReleaseBuffer ( Buffer  buffer)

Definition at line 5595 of file bufmgr.c.

5596{
5597 if (!BufferIsValid(buffer))
5598 elog(ERROR, "bad buffer ID: %d", buffer);
5599
5600 if (BufferIsLocal(buffer))
5601 UnpinLocalBuffer(buffer);
5602 else
5603 UnpinBuffer(GetBufferDescriptor(buffer - 1));
5604}

References PrivateRefCountEntry::buffer, BufferIsLocal, BufferIsValid(), elog, ERROR, GetBufferDescriptor(), UnpinBuffer(), and UnpinLocalBuffer().

Referenced by _bt_allocbuf(), _bt_pagedel(), _bt_search_insert(), _bt_unlink_halfdead_page(), _hash_dropbuf(), _hash_getbuf_with_condlock_cleanup(), autoprewarm_database_main(), BitmapHeapScanNextBlock(), blinsert(), BloomNewBuffer(), brin_getinsertbuffer(), brin_vacuum_scan(), bringetbitmap(), brinGetTupleForHeapBlock(), brininsert(), brinRevmapTerminate(), brinsummarize(), buffer_create_toy(), collect_corrupt_items(), collect_visibility_data(), entryLoadMoreItems(), ExecEndIndexOnlyScan(), ExtendBufferedRelTo(), FreeBulkInsertState(), freeGinBtreeStack(), fsm_search(), get_actual_variable_endpoint(), GetRecordedFreeSpace(), ginFindParents(), ginFinishSplit(), ginFreeScanKeys(), ginInsertCleanup(), GinNewBuffer(), gistdoinsert(), gistFindCorrectParent(), gistNewBuffer(), gistvacuum_delete_empty_pages(), grow_rel(), heap_abort_speculative(), heap_delete(), heap_endscan(), heap_fetch(), heap_fetch_next_buffer(), heap_force_common(), heap_insert(), heap_lock_tuple(), heap_lock_updated_tuple_rec(), heap_multi_insert(), heap_rescan(), heap_update(), heap_vac_scan_next_block(), heap_xlog_delete(), heap_xlog_insert(), heap_xlog_lock(), heap_xlog_lock_updated(), heap_xlog_multi_insert(), heap_xlog_update(), heapam_index_fetch_end(), heapam_index_fetch_tuple(), heapam_scan_sample_next_block(), heapam_tuple_lock(), heapgettup(), heapgettup_pagemode(), lazy_scan_heap(), lazy_vacuum_heap_rel(), pg_prewarm(), pg_visibility(), pg_visibility_map(), pgstatindex_impl(), read_buffers(), read_rel_block_ll(), read_stream_for_blocks(), read_stream_reset(), ReadBufferBI(), RelationAddBlocks(), ReleaseBulkInsertStatePin(), revmap_get_buffer(), spgdoinsert(), SpGistGetBuffer(), SpGistNewBuffer(), SpGistUpdateMetaPage(), statapprox_heap(), summarize_range(), terminate_brin_buildstate(), tts_buffer_heap_clear(), tts_buffer_heap_materialize(), tts_buffer_heap_store_tuple(), verify_heapam(), visibilitymap_count(), visibilitymap_get_status(), visibilitymap_pin(), and XLogReadBufferExtended().

◆ ReservePrivateRefCountEntry()

static void ReservePrivateRefCountEntry ( void  )
static

Definition at line 309 of file bufmgr.c.

310{
311 /* Already reserved (or freed), nothing to do */
312 if (ReservedRefCountSlot != -1)
313 return;
314
315 /*
316 * First search for a free entry in the array; that'll be sufficient in
317 * the majority of cases.
318 */
319 {
320 int i;
321
322 for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
323 {
325 {
327
328 /*
329 * We could return immediately, but iterating till the end of
330 * the array allows compiler-autovectorization.
331 */
332 }
333 }
334
335 if (ReservedRefCountSlot != -1)
336 return;
337 }
338
339 /*
340 * No luck. All array entries are full. Move one array entry into the hash
341 * table.
342 */
343 {
344 /*
345 * Move entry from the current clock position in the array into the
346 * hashtable. Use that slot.
347 */
348 int victim_slot;
351 bool found;
352
353 /* select victim slot */
357
358 /* Better be used, otherwise we shouldn't get here. */
362
363 /* enter victim array entry into hashtable */
366 &found);
367 Assert(!found);
368 /* move data from the entry in the array to the hash entry */
369 hashent->data = victim_entry->data;
370
371 /* clear the now free array slot */
373 victim_entry->buffer = InvalidBuffer;
374
375 /* clear the whole data member, just for future proofing */
376 memset(&victim_entry->data, 0, sizeof(victim_entry->data));
377 victim_entry->data.refcount = 0;
378 victim_entry->data.lockmode = BUFFER_LOCK_UNLOCK;
379
381 }
382}
static uint32 PrivateRefCountClock
Definition bufmgr.c:267

References Assert, PrivateRefCountEntry::buffer, BUFFER_LOCK_UNLOCK, fb(), i, InvalidBuffer, PrivateRefCountArray, PrivateRefCountArrayKeys, PrivateRefCountClock, PrivateRefCountHash, PrivateRefCountOverflowed, REFCOUNT_ARRAY_ENTRIES, and ReservedRefCountSlot.

Referenced by BufferAlloc(), EvictAllUnpinnedBuffers(), EvictRelUnpinnedBuffers(), EvictUnpinnedBuffer(), ExtendBufferedRelShared(), FlushDatabaseBuffers(), FlushRelationBuffers(), FlushRelationsAllBuffers(), GetPrivateRefCountEntrySlow(), GetVictimBuffer(), MarkDirtyAllUnpinnedBuffers(), MarkDirtyRelUnpinnedBuffers(), MarkDirtyUnpinnedBuffer(), ReadRecentBuffer(), and SyncOneBuffer().

◆ ResOwnerPrintBuffer()

static char * ResOwnerPrintBuffer ( Datum  res)
static

Definition at line 7890 of file bufmgr.c.

7891{
7893}
static int32 DatumGetInt32(Datum X)
Definition postgres.h:202

References DatumGetInt32(), and DebugPrintBufferRefcount().

◆ ResOwnerPrintBufferIO()

static char * ResOwnerPrintBufferIO ( Datum  res)
static

Definition at line 7840 of file bufmgr.c.

7841{
7842 Buffer buffer = DatumGetInt32(res);
7843
7844 return psprintf("lost track of buffer IO on buffer %d", buffer);
7845}

References PrivateRefCountEntry::buffer, DatumGetInt32(), and psprintf().

◆ ResOwnerReleaseBuffer()

static void ResOwnerReleaseBuffer ( Datum  res)
static

Definition at line 7854 of file bufmgr.c.

7855{
7856 Buffer buffer = DatumGetInt32(res);
7857
7858 /* Like ReleaseBuffer, but don't call ResourceOwnerForgetBuffer */
7859 if (!BufferIsValid(buffer))
7860 elog(ERROR, "bad buffer ID: %d", buffer);
7861
7862 if (BufferIsLocal(buffer))
7864 else
7865 {
7867
7868 ref = GetPrivateRefCountEntry(buffer, false);
7869
7870 /* not having a private refcount would imply resowner corruption */
7871 Assert(ref != NULL);
7872
7873 /*
7874 * If the buffer was locked at the time of the resowner release,
7875 * release the lock now. This should only happen after errors.
7876 */
7877 if (ref->data.lockmode != BUFFER_LOCK_UNLOCK)
7878 {
7879 BufferDesc *buf = GetBufferDescriptor(buffer - 1);
7880
7881 HOLD_INTERRUPTS(); /* match the upcoming RESUME_INTERRUPTS */
7882 BufferLockUnlock(buffer, buf);
7883 }
7884
7886 }
7887}
static void UnpinBufferNoOwner(BufferDesc *buf)
Definition bufmgr.c:3474
void UnpinLocalBufferNoOwner(Buffer buffer)
Definition localbuf.c:872

References Assert, buf, PrivateRefCountEntry::buffer, BUFFER_LOCK_UNLOCK, BufferIsLocal, BufferIsValid(), BufferLockUnlock(), DatumGetInt32(), elog, ERROR, fb(), GetBufferDescriptor(), GetPrivateRefCountEntry(), HOLD_INTERRUPTS, UnpinBufferNoOwner(), and UnpinLocalBufferNoOwner().

◆ ResOwnerReleaseBufferIO()

static void ResOwnerReleaseBufferIO ( Datum  res)
static

Definition at line 7832 of file bufmgr.c.

7833{
7834 Buffer buffer = DatumGetInt32(res);
7835
7836 AbortBufferIO(buffer);
7837}
static void AbortBufferIO(Buffer buffer)
Definition bufmgr.c:7429

References AbortBufferIO(), PrivateRefCountEntry::buffer, and DatumGetInt32().

◆ rlocator_comparator()

static int rlocator_comparator ( const void * p1,
const void * p2 
)
static

Definition at line 7500 of file bufmgr.c.

7501{
7502 RelFileLocator n1 = *(const RelFileLocator *) p1;
7503 RelFileLocator n2 = *(const RelFileLocator *) p2;
7504
7505 if (n1.relNumber < n2.relNumber)
7506 return -1;
7507 else if (n1.relNumber > n2.relNumber)
7508 return 1;
7509
7510 if (n1.dbOid < n2.dbOid)
7511 return -1;
7512 else if (n1.dbOid > n2.dbOid)
7513 return 1;
7514
7515 if (n1.spcOid < n2.spcOid)
7516 return -1;
7517 else if (n1.spcOid > n2.spcOid)
7518 return 1;
7519 else
7520 return 0;
7521}

References fb().

Referenced by buffertag_comparator(), DropRelationsAllBuffers(), and FlushRelationsAllBuffers().

◆ ScheduleBufferTagForWriteback()

void ScheduleBufferTagForWriteback ( WritebackContext * wb_context,
IOContext  io_context,
BufferTag * tag 
)

Definition at line 7699 of file bufmgr.c.

7701{
7702 PendingWriteback *pending;
7703
7704 /*
7705 * As pg_flush_data() doesn't do anything with fsync disabled, there's no
7706 * point in tracking in that case.
7707 */
7709 !enableFsync)
7710 return;
7711
7712 /*
7713 * Add buffer to the pending writeback array, unless writeback control is
7714 * disabled.
7715 */
7716 if (*wb_context->max_pending > 0)
7717 {
7719
7720 pending = &wb_context->pending_writebacks[wb_context->nr_pending++];
7721
7722 pending->tag = *tag;
7723 }
7724
7725 /*
7726 * Perform pending flushes if the writeback limit is exceeded. This
7727 * includes the case where previously an item has been added, but control
7728 * is now disabled.
7729 */
7730 if (wb_context->nr_pending >= *wb_context->max_pending)
7732}
bool enableFsync
Definition globals.c:131
#define WRITEBACK_MAX_PENDING_FLUSHES

References Assert, enableFsync, fb(), IO_DIRECT_DATA, io_direct_flags, IssuePendingWritebacks(), PendingWriteback::tag, and WRITEBACK_MAX_PENDING_FLUSHES.

Referenced by GetVictimBuffer(), and SyncOneBuffer().

◆ shared_buffer_readv_complete()

static PgAioResult shared_buffer_readv_complete ( PgAioHandle ioh,
PgAioResult  prior_result,
uint8  cb_data 
)
static

Definition at line 8889 of file bufmgr.c.

8891{
8893}

References buffer_readv_complete(), and fb().

◆ shared_buffer_readv_complete_local()

static PgAioResult shared_buffer_readv_complete_local ( PgAioHandle ioh,
PgAioResult  prior_result,
uint8  cb_data 
)
static

◆ shared_buffer_readv_stage()

static void shared_buffer_readv_stage ( PgAioHandle ioh,
uint8  cb_data 
)
static

Definition at line 8883 of file bufmgr.c.

8884{
8885 buffer_stage_common(ioh, false, false);
8886}

References buffer_stage_common(), and fb().

◆ shared_buffer_write_error_callback()

static void shared_buffer_write_error_callback ( void * arg)
static

Definition at line 7468 of file bufmgr.c.

7469{
7471
7472 /* Buffer is pinned, so we can read the tag without locking the spinlock */
7473 if (bufHdr != NULL)
7474 errcontext("writing block %u of relation \"%s\"",
7475 bufHdr->tag.blockNum,
7477 BufTagGetForkNum(&bufHdr->tag)).str);
7478}

References arg, BufTagGetForkNum(), BufTagGetRelFileLocator(), errcontext, fb(), and relpathperm.

Referenced by FlushBuffer().

◆ SharedBufferBeginSetHintBits()

static bool SharedBufferBeginSetHintBits ( Buffer  buffer,
BufferDesc buf_hdr,
uint64 lockstate 
)
inline static

Definition at line 6960 of file bufmgr.c.

6961{
6965
6966 ref = GetPrivateRefCountEntry(buffer, true);
6967
6968 if (ref == NULL)
6969 elog(ERROR, "buffer is not pinned");
6970
6971 mode = ref->data.lockmode;
6972 if (mode == BUFFER_LOCK_UNLOCK)
6973 elog(ERROR, "buffer is not locked");
6974
6975 /* we're done if we are already holding a sufficient lock level */
6977 {
6979 return true;
6980 }
6981
6982 /*
6983 * We are only holding a share lock right now, try to upgrade it to
6984 * SHARE_EXCLUSIVE.
6985 */
6987
6989 while (true)
6990 {
6992
6994
6995 /*
6996 * Can't upgrade if somebody else holds the lock in exclusive or
6997 * share-exclusive mode.
6998 */
7000 {
7001 return false;
7002 }
7003
7004 /* currently held lock state */
7006
7007 /* new lock level */
7009
7012 {
7013 ref->data.lockmode = BUFFER_LOCK_SHARE_EXCLUSIVE;
7015
7016 return true;
7017 }
7018 }
7019}

References Assert, BM_LOCK_VAL_EXCLUSIVE, BM_LOCK_VAL_SHARE_EXCLUSIVE, BM_LOCK_VAL_SHARED, PrivateRefCountEntry::buffer, BUFFER_LOCK_EXCLUSIVE, BUFFER_LOCK_SHARE, BUFFER_LOCK_SHARE_EXCLUSIVE, BUFFER_LOCK_UNLOCK, elog, ERROR, fb(), GetPrivateRefCountEntry(), likely, mode, pg_atomic_compare_exchange_u64(), pg_atomic_read_u64(), and unlikely.

Referenced by BufferBeginSetHintBits(), and BufferSetHintBits16().

◆ StartBufferIO()

StartBufferIOResult StartBufferIO ( Buffer  buffer,
bool  forInput,
bool  wait,
PgAioWaitRef * io_wref 
)

Definition at line 7330 of file bufmgr.c.

7331{
7333
7334 if (BufferIsLocal(buffer))
7335 {
7336 buf_hdr = GetLocalBufferDescriptor(-buffer - 1);
7337
7338 return StartLocalBufferIO(buf_hdr, forInput, wait, io_wref);
7339 }
7340 else
7341 {
7342 buf_hdr = GetBufferDescriptor(buffer - 1);
7343
7344 return StartSharedBufferIO(buf_hdr, forInput, wait, io_wref);
7345 }
7346}
StartBufferIOResult StartLocalBufferIO(BufferDesc *bufHdr, bool forInput, bool wait, PgAioWaitRef *io_wref)
Definition localbuf.c:532

References PrivateRefCountEntry::buffer, BufferIsLocal, fb(), GetBufferDescriptor(), GetLocalBufferDescriptor(), StartLocalBufferIO(), and StartSharedBufferIO().

Referenced by AsyncReadBuffers().

◆ StartReadBuffer()

bool StartReadBuffer ( ReadBuffersOperation * operation,
Buffer * buffer,
BlockNumber  blocknum,
int  flags 
)

Definition at line 1637 of file bufmgr.c.

1641{
1642 int nblocks = 1;
1643 bool result;
1644
1645 result = StartReadBuffersImpl(operation, buffer, blocknum, &nblocks, flags,
1646 false /* single block, no forwarding */ );
1647 Assert(nblocks == 1); /* single block can't be short */
1648
1649 return result;
1650}
static pg_attribute_always_inline bool StartReadBuffersImpl(ReadBuffersOperation *operation, Buffer *buffers, BlockNumber blockNum, int *nblocks, int flags, bool allow_forwarding)
Definition bufmgr.c:1371

References Assert, PrivateRefCountEntry::buffer, operation, result, and StartReadBuffersImpl().

Referenced by read_stream_next_buffer(), and ReadBuffer_common().

◆ StartReadBuffers()

bool StartReadBuffers ( ReadBuffersOperation *operation,
Buffer *buffers,
BlockNumber  blockNum,
int *nblocks,
int  flags 
)

Definition at line 1618 of file bufmgr.c.

1623{
1624 return StartReadBuffersImpl(operation, buffers, blockNum, nblocks, flags,
1625 true /* expect forwarded buffers */ );
1626}

References operation, and StartReadBuffersImpl().

Referenced by read_buffers(), and read_stream_start_pending_read().

◆ StartReadBuffersImpl()

static pg_attribute_always_inline bool StartReadBuffersImpl ( ReadBuffersOperation *operation,
Buffer *buffers,
BlockNumber  blockNum,
int *nblocks,
int  flags,
bool  allow_forwarding 
)
static

Definition at line 1371 of file bufmgr.c.

1377{
1378 int actual_nblocks = *nblocks;
1379 int maxcombine = 0;
1380 bool did_start_io;
1383
1384 Assert(*nblocks == 1 || allow_forwarding);
1385 Assert(*nblocks > 0);
1386 Assert(*nblocks <= MAX_IO_COMBINE_LIMIT);
1387
1388 /* see comments in ReadBuffer_common */
1389 if (operation->rel && RELATION_IS_OTHER_TEMP(operation->rel))
1390 ereport(ERROR,
1392 errmsg("cannot access temporary tables of other sessions")));
1393
1394 if (operation->persistence == RELPERSISTENCE_TEMP)
1395 {
1398 }
1399 else
1400 {
1403 }
1404
1405 for (int i = 0; i < actual_nblocks; ++i)
1406 {
1407 bool found;
1408
1409 if (allow_forwarding && buffers[i] != InvalidBuffer)
1410 {
1412
1413 /*
1414 * This is a buffer that was pinned by an earlier call to
1415 * StartReadBuffers(), but couldn't be handled in one operation at
1416 * that time. The operation was split, and the caller has passed
1417 * an already pinned buffer back to us to handle the rest of the
1418 * operation. It must continue at the expected block number.
1419 */
1420 Assert(BufferGetBlockNumber(buffers[i]) == blockNum + i);
1421
1422 /*
1423 * It might be an already valid buffer (a hit) that followed the
1424 * final contiguous block of an earlier I/O (a miss) marking the
1425 * end of it, or a buffer that some other backend has since made
1426 * valid by performing the I/O for us, in which case we can handle
1427 * it as a hit now. It is safe to check for a BM_VALID flag with
1428 * a relaxed load, because we got a fresh view of it while pinning
1429 * it in the previous call.
1430 *
1431 * On the other hand if we don't see BM_VALID yet, it must be an
1432 * I/O that was split by the previous call and we need to try to
1433 * start a new I/O from this block. We're also racing against any
1434 * other backend that might start the I/O or even manage to mark
1435 * it BM_VALID after this check, but StartBufferIO() will handle
1436 * those cases.
1437 */
1438 if (BufferIsLocal(buffers[i]))
1439 bufHdr = GetLocalBufferDescriptor(-buffers[i] - 1);
1440 else
1441 bufHdr = GetBufferDescriptor(buffers[i] - 1);
1443 found = pg_atomic_read_u64(&bufHdr->state) & BM_VALID;
1444 }
1445 else
1446 {
1447 buffers[i] = PinBufferForBlock(operation->rel,
1448 operation->smgr,
1449 operation->persistence,
1450 operation->forknum,
1451 blockNum + i,
1452 operation->strategy,
1454 &found);
1455 }
1456
1457 if (found)
1458 {
1459 /*
1460 * We have a hit. If it's the first block in the requested range,
1461 * we can return it immediately and report that WaitReadBuffers()
1462 * does not need to be called. If the initial value of *nblocks
1463 * was larger, the caller will have to call again for the rest.
1464 */
1465 if (i == 0)
1466 {
1467 *nblocks = 1;
1468
1469#ifdef USE_ASSERT_CHECKING
1470
1471 /*
1472 * Initialize enough of ReadBuffersOperation to make
1473 * CheckReadBuffersOperation() work. Outside of assertions
1474 * that's not necessary when no IO is issued.
1475 */
1476 operation->buffers = buffers;
1477 operation->blocknum = blockNum;
1478 operation->nblocks = 1;
1479 operation->nblocks_done = 1;
1481#endif
1482 return false;
1483 }
1484
1485 /*
1486 * Otherwise we already have an I/O to perform, but this block
1487 * can't be included as it is already valid. Split the I/O here.
1488 * There may or may not be more blocks requiring I/O after this
1489 * one, we haven't checked, but they can't be contiguous with this
1490 * one in the way. We'll leave this buffer pinned, forwarding it
1491 * to the next call, avoiding the need to unpin it here and re-pin
1492 * it in the next call.
1493 */
1494 actual_nblocks = i;
1495 break;
1496 }
1497 else
1498 {
1499 /*
1500 * Check how many blocks we can cover with the same IO. The smgr
1501 * implementation might e.g. be limited due to a segment boundary.
1502 */
1503 if (i == 0 && actual_nblocks > 1)
1504 {
1506 operation->forknum,
1507 blockNum);
1509 {
1510 elog(DEBUG2, "limiting nblocks at %u from %u to %u",
1511 blockNum, actual_nblocks, maxcombine);
1513 }
1514 }
1515 }
1516 }
1517 *nblocks = actual_nblocks;
1518
1519 /* Populate information needed for I/O. */
1520 operation->buffers = buffers;
1521 operation->blocknum = blockNum;
1522 operation->flags = flags;
1523 operation->nblocks = actual_nblocks;
1524 operation->nblocks_done = 0;
1525 pgaio_wref_clear(&operation->io_wref);
1526
1527 /*
1528 * When using AIO, start the IO in the background. If not, issue prefetch
1529 * requests if desired by the caller.
1530 *
1531 * The reason we have a dedicated path for IOMETHOD_SYNC here is to
1532 * de-risk the introduction of AIO somewhat. It's a large architectural
1533 * change, with lots of chances for unanticipated performance effects.
1534 *
1535 * Use of IOMETHOD_SYNC already leads to not actually performing IO
1536 * asynchronously, but without the check here we'd execute IO earlier than
1537 * we used to. Eventually this IOMETHOD_SYNC specific path should go away.
1538 */
1539 if (io_method != IOMETHOD_SYNC)
1540 {
1541 /*
1542 * Try to start IO asynchronously. It's possible that no IO needs to
1543 * be started, if another backend already performed the IO.
1544 *
1545 * Note that if an IO is started, it might not cover the entire
1546 * requested range, e.g. because an intermediary block has been read
1547 * in by another backend. In that case any "trailing" buffers we
1548 * already pinned above will be "forwarded" by read_stream.c to the
1549 * next call to StartReadBuffers().
1550 *
1551 * This is signalled to the caller by decrementing *nblocks *and*
1552 * reducing operation->nblocks. The latter is done here, but not below
1553 * WaitReadBuffers(), as in WaitReadBuffers() we can't "shorten" the
1554 * overall read size anymore, we need to retry until done in its
1555 * entirety or until failed.
1556 */
1558
1559 operation->nblocks = *nblocks;
1560 }
1561 else
1562 {
1564
1565 if (flags & READ_BUFFERS_ISSUE_ADVICE)
1566 {
1567 /*
1568 * In theory we should only do this if PinBufferForBlock() had to
1569 * allocate new buffers above. That way, if two calls to
1570 * StartReadBuffers() were made for the same blocks before
1571 * WaitReadBuffers(), only the first would issue the advice.
1572 * That'd be a better simulation of true asynchronous I/O, which
1573 * would only start the I/O once, but isn't done here for
1574 * simplicity.
1575 */
1576 smgrprefetch(operation->smgr,
1577 operation->forknum,
1578 blockNum,
1580 }
1581
1582 /*
1583 * Indicate that WaitReadBuffers() should be called. WaitReadBuffers()
1584 * will initiate the necessary IO.
1585 */
1586 did_start_io = true;
1587 }
1588
1590
1591 return did_start_io;
1592}
int io_method
Definition aio.c:74
@ IOMETHOD_SYNC
Definition aio.h:34
static void CheckReadBuffersOperation(ReadBuffersOperation *operation, bool is_complete)
Definition bufmgr.c:1656
static bool AsyncReadBuffers(ReadBuffersOperation *operation, int *nblocks_progress)
Definition bufmgr.c:1938
#define READ_BUFFERS_ISSUE_ADVICE
Definition bufmgr.h:124
uint32 smgrmaxcombine(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
Definition smgr.c:697

References Assert, AsyncReadBuffers(), BM_TAG_VALID, BM_VALID, BufferGetBlockNumber(), BufferIsLocal, CheckReadBuffersOperation(), DEBUG2, elog, ereport, errcode(), errmsg, ERROR, fb(), GetBufferDescriptor(), GetLocalBufferDescriptor(), i, InvalidBuffer, io_method, IOCONTEXT_NORMAL, IOContextForStrategy(), IOMETHOD_SYNC, IOOBJECT_RELATION, IOOBJECT_TEMP_RELATION, MAX_IO_COMBINE_LIMIT, operation, pg_atomic_read_u64(), pgaio_wref_clear(), PinBufferForBlock(), READ_BUFFERS_ISSUE_ADVICE, READ_BUFFERS_SYNCHRONOUSLY, RELATION_IS_OTHER_TEMP, smgrmaxcombine(), smgrprefetch(), and unlikely.

Referenced by StartReadBuffer(), and StartReadBuffers().

◆ StartSharedBufferIO()

StartBufferIOResult StartSharedBufferIO ( BufferDesc *buf,
bool  forInput,
bool  wait,
PgAioWaitRef *io_wref 
)

Definition at line 7250 of file bufmgr.c.

7251{
7253
7255
7256 for (;;)
7257 {
7259
7261 break;
7262
7263 /* Join the existing IO */
7264 if (io_wref != NULL && pgaio_wref_valid(&buf->io_wref))
7265 {
7266 *io_wref = buf->io_wref;
7268
7269 return BUFFER_IO_IN_PROGRESS;
7270 }
7271 else if (!wait)
7272 {
7274 return BUFFER_IO_IN_PROGRESS;
7275 }
7276 else
7277 {
7278 /*
7279 * With wait = true, we always have to wait if the caller has
7280 * passed io_wref = NULL.
7281 *
7282 * Even with io_wref != NULL, we have to wait if the buffer's wait
7283 * ref is not valid but the IO is in progress, someone else
7284 * started IO but hasn't set the wait ref yet. We have no choice
7285 * but to wait until the IO completes.
7286 */
7288
7289 /*
7290 * If this backend currently has staged IO, submit it before
7291 * waiting for in-progress IO, to avoid potential deadlocks and
7292 * unnecessary delays.
7293 */
7295
7296 WaitIO(buf);
7297 }
7298 }
7299
7300 /* Once we get here, there is definitely no I/O active on this buffer */
7301
7302 /* Check if someone else already did the I/O */
7303 if (forInput ? (buf_state & BM_VALID) : !(buf_state & BM_DIRTY))
7304 {
7307 }
7308
7309 /*
7310 * No IO in progress and not already done; we will start IO. It's possible
7311 * that the IO was in progress but we're not done, because the IO errored
7312 * out. We'll do the IO ourselves.
7313 */
7316 0);
7317
7320
7322}
static void ResourceOwnerRememberBufferIO(ResourceOwner owner, Buffer buffer)

References BM_DIRTY, BM_IO_IN_PROGRESS, BM_VALID, buf, BUFFER_IO_ALREADY_DONE, BUFFER_IO_IN_PROGRESS, BUFFER_IO_READY_FOR_IO, BufferDescriptorGetBuffer(), CurrentResourceOwner, fb(), LockBufHdr(), pgaio_submit_staged(), pgaio_wref_valid(), ResourceOwnerEnlarge(), ResourceOwnerRememberBufferIO(), UnlockBufHdr(), UnlockBufHdrExt(), and WaitIO().

Referenced by buffer_call_start_io(), ExtendBufferedRelShared(), FlushBuffer(), read_rel_block_ll(), StartBufferIO(), and ZeroAndLockBuffer().

◆ SyncOneBuffer()

static int SyncOneBuffer ( int  buf_id,
bool  skip_recently_used,
WritebackContext *wb_context 
)
static

Definition at line 4138 of file bufmgr.c.

4139{
4141 int result = 0;
4143 BufferTag tag;
4144
4145 /* Make sure we can handle the pin */
4148
4149 /*
4150 * Check whether buffer needs writing.
4151 *
4152 * We can make this check without taking the buffer content lock so long
4153 * as we mark pages dirty in access methods *before* logging changes with
4154 * XLogInsert(): if someone marks the buffer dirty just after our check we
4155 * don't worry because our checkpoint.redo points before log record for
4156 * upcoming changes and so we are not required to write such dirty buffer.
4157 */
4159
4162 {
4164 }
4165 else if (skip_recently_used)
4166 {
4167 /* Caller told us not to write recently-used buffers */
4169 return result;
4170 }
4171
4172 if (!(buf_state & BM_VALID) || !(buf_state & BM_DIRTY))
4173 {
4174 /* It's clean, so nothing to do */
4176 return result;
4177 }
4178
4179 /*
4180 * Pin it, share-exclusive-lock it, write it. (FlushBuffer will do
4181 * nothing if the buffer is clean by the time we've locked it.)
4182 */
4184
4186
4187 tag = bufHdr->tag;
4188
4190
4191 /*
4192 * SyncOneBuffer() is only called by checkpointer and bgwriter, so
4193 * IOContext will always be IOCONTEXT_NORMAL.
4194 */
4196
4197 return result | BUF_WRITTEN;
4198}

References BM_DIRTY, BM_VALID, BUF_REUSABLE, BUF_STATE_GET_REFCOUNT, BUF_STATE_GET_USAGECOUNT, BUF_WRITTEN, CurrentResourceOwner, fb(), FlushUnlockedBuffer(), GetBufferDescriptor(), IOCONTEXT_NORMAL, IOOBJECT_RELATION, LockBufHdr(), PinBuffer_Locked(), ReservePrivateRefCountEntry(), ResourceOwnerEnlarge(), result, ScheduleBufferTagForWriteback(), UnlockBufHdr(), and UnpinBuffer().

Referenced by BgBufferSync(), and BufferSync().

◆ TerminateBufferIO()

void TerminateBufferIO ( BufferDesc *buf,
bool  clear_dirty,
uint64  set_flag_bits,
bool  forget_owner,
bool  release_aio 
)

Definition at line 7367 of file bufmgr.c.

7369{
7372 int refcount_change = 0;
7373
7375
7378
7379 /* Clear earlier errors, if this IO failed, it'll be marked again */
7381
7382 if (clear_dirty)
7384
7385 if (release_aio)
7386 {
7387 /* release ownership by the AIO subsystem */
7389 refcount_change = -1;
7390 pgaio_wref_clear(&buf->io_wref);
7391 }
7392
7396
7397 if (forget_owner)
7400
7402
7403 /*
7404 * Support LockBufferForCleanup()
7405 *
7406 * We may have just released the last pin other than the waiter's. In most
7407 * cases, this backend holds another pin on the buffer. But, if, for
7408 * example, this backend is completing an IO issued by another backend, it
7409 * may be time to wake the waiter.
7410 */
7413}
static ConditionVariable * BufferDescriptorGetIOCV(const BufferDesc *bdesc)
static void WakePinCountWaiter(BufferDesc *buf)
Definition bufmgr.c:3429
void ConditionVariableBroadcast(ConditionVariable *cv)

References Assert, BM_CHECKPOINT_NEEDED, BM_DIRTY, BM_IO_ERROR, BM_IO_IN_PROGRESS, BM_PIN_COUNT_WAITER, buf, BUF_STATE_GET_REFCOUNT, BufferDescriptorGetBuffer(), BufferDescriptorGetIOCV(), ConditionVariableBroadcast(), CurrentResourceOwner, fb(), LockBufHdr(), pgaio_wref_clear(), ResourceOwnerForgetBufferIO(), UnlockBufHdrExt(), and WakePinCountWaiter().

Referenced by AbortBufferIO(), buffer_call_terminate_io(), buffer_readv_complete_one(), ExtendBufferedRelShared(), FlushBuffer(), and ZeroAndLockBuffer().

◆ TrackBufferHit()

◆ TrackNewBufferPin()

void TrackNewBufferPin ( Buffer  buf)
inline

Definition at line 3521 of file bufmgr.c.

3522{
3524
3526 ref->data.refcount++;
3527
3529
3530 /*
3531 * This is the first pin for this page by this backend, mark its page as
3532 * defined to valgrind. While the page contents might not actually be
3533 * valid yet, we don't currently guarantee that such pages are marked
3534 * undefined or non-accessible.
3535 *
3536 * It's not necessarily the prettiest to do this here, but otherwise we'd
3537 * need this block of code in multiple places.
3538 */
3540 BLCKSZ);
3541}
static PrivateRefCountEntry * NewPrivateRefCountEntry(Buffer buffer)
Definition bufmgr.c:388

References buf, BufHdrGetBlock, CurrentResourceOwner, fb(), GetBufferDescriptor(), NewPrivateRefCountEntry(), ResourceOwnerRememberBuffer(), and VALGRIND_MAKE_MEM_DEFINED.

Referenced by GetBufferFromRing(), PinBuffer(), PinBuffer_Locked(), and StrategyGetBuffer().

◆ ts_ckpt_progress_comparator()

static int ts_ckpt_progress_comparator ( Datum  a,
Datum  b,
void *arg 
)
static

Definition at line 7664 of file bufmgr.c.

7665{
7668
7669 /* we want a min-heap, so return 1 for the a < b */
7670 if (sa->progress < sb->progress)
7671 return 1;
7672 else if (sa->progress == sb->progress)
7673 return 0;
7674 else
7675 return -1;
7676}

References a, b, DatumGetPointer(), and fb().

Referenced by BufferSync().

◆ UnlockBuffer()

void UnlockBuffer ( Buffer  buffer)

Definition at line 6567 of file bufmgr.c.

6568{
6570
6571 Assert(BufferIsPinned(buffer));
6572 if (BufferIsLocal(buffer))
6573 return; /* local buffers need no lock */
6574
6575 buf_hdr = GetBufferDescriptor(buffer - 1);
6576 BufferLockUnlock(buffer, buf_hdr);
6577}

References Assert, PrivateRefCountEntry::buffer, BufferIsLocal, BufferIsPinned, BufferLockUnlock(), fb(), and GetBufferDescriptor().

Referenced by LockBuffer().

◆ UnlockBuffers()

void UnlockBuffers ( void  )

Definition at line 5861 of file bufmgr.c.

5862{
5864
5865 if (buf)
5866 {
5868 uint64 unset_bits = 0;
5869
5871
5872 /*
5873 * Don't complain if flag bit not set; it could have been reset but we
5874 * got a cancel/die interrupt before getting the signal.
5875 */
5876 if ((buf_state & BM_PIN_COUNT_WAITER) != 0 &&
5877 buf->wait_backend_pgprocno == MyProcNumber)
5879
5881 0, unset_bits,
5882 0);
5883
5885 }
5886}

References BM_PIN_COUNT_WAITER, buf, fb(), LockBufHdr(), MyProcNumber, PinCountWaitBuf, and UnlockBufHdrExt().

Referenced by AbortSubTransaction(), AbortTransaction(), AtProcExit_Buffers(), AutoVacLauncherMain(), BackgroundWriterMain(), CheckpointerMain(), and WalWriterMain().

◆ UnlockReleaseBuffer()

void UnlockReleaseBuffer ( Buffer  buffer)

Definition at line 5612 of file bufmgr.c.

5613{
5614 int mode;
5615 BufferDesc *buf;
5617 uint64 sub;
5619
5620 Assert(BufferIsPinned(buffer));
5621
5622 if (BufferIsLocal(buffer))
5623 {
5624 UnpinLocalBuffer(buffer);
5625 return;
5626 }
5627
5629
5630 buf = GetBufferDescriptor(buffer - 1);
5631
5633
5634 /* compute state modification for lock release */
5636
5637 /* compute state modification for pin release */
5638 ref = GetPrivateRefCountEntry(buffer, false);
5639 Assert(ref != NULL);
5640 Assert(ref->data.refcount > 0);
5641 ref->data.refcount--;
5642
5643 /* no more backend local pins, reduce shared pin count */
5644 if (likely(ref->data.refcount == 0))
5645 {
5646 /* See comment in UnpinBufferNoOwner() */
5648
5649 sub |= BUF_REFCOUNT_ONE;
5651 }
5652
5653 /* perform the lock and pin release in one atomic op */
5654 lockstate = pg_atomic_sub_fetch_u64(&buf->state, sub);
5655
5656 /* wake up waiters for the lock */
5658
5659 /* wake up waiter for the pin release */
5662
5663 /*
5664 * Now okay to allow cancel/die interrupts again, which were held when the
5665 * lock was acquired.
5666 */
5668}
static void ResourceOwnerForgetBuffer(ResourceOwner owner, Buffer buffer)
static void ForgetPrivateRefCountEntry(PrivateRefCountEntry *ref)
Definition bufmgr.c:565

References Assert, BM_PIN_COUNT_WAITER, buf, BUF_REFCOUNT_ONE, PrivateRefCountEntry::buffer, BufferIsLocal, BufferIsPinned, BufferLockDisownInternal(), BufferLockProcessRelease(), BufferLockReleaseSub(), BufHdrGetBlock, CurrentResourceOwner, fb(), ForgetPrivateRefCountEntry(), GetBufferDescriptor(), GetPrivateRefCountEntry(), likely, mode, pg_atomic_sub_fetch_u64(), ResourceOwnerForgetBuffer(), RESUME_INTERRUPTS, UnpinLocalBuffer(), VALGRIND_MAKE_MEM_NOACCESS, and WakePinCountWaiter().

Referenced by _bt_clear_incomplete_split(), _bt_relbuf(), _bt_restore_meta(), _hash_relbuf(), allocNewBuffer(), AlterSequence(), blbulkdelete(), blgetbitmap(), blinsert(), BloomInitMetapage(), blvacuumcleanup(), brin_doinsert(), brin_doupdate(), brin_evacuate_page(), brin_getinsertbuffer(), brin_xlog_createidx(), brin_xlog_desummarize_page(), brin_xlog_insert_update(), brin_xlog_revmap_extend(), brin_xlog_samepage_update(), brin_xlog_update(), brinbuild(), brinbuildempty(), brinGetStats(), brinRevmapDesummarizeRange(), bt_metap(), bt_multi_page_stats(), bt_page_items_internal(), bt_page_stats_internal(), bt_recheck_sibling_links(), btree_xlog_dedup(), btree_xlog_delete(), btree_xlog_insert(), btree_xlog_mark_page_halfdead(), btree_xlog_newroot(), btree_xlog_split(), btree_xlog_unlink_page(), btree_xlog_vacuum(), collect_corrupt_items(), collect_visibility_data(), count_nondeletable_pages(), createPostingTree(), doPickSplit(), entryLoadMoreItems(), fill_seq_fork_with_data(), flushCachedPage(), FreeSpaceMapPrepareTruncateRel(), fsm_search(), fsm_set_and_search(), fsm_vacuum_page(), generic_redo(), get_raw_page_internal(), GetVictimBuffer(), gin_check_parent_keys_consistency(), gin_check_posting_tree_parent_keys_consistency(), gin_refind_parent(), ginbuild(), ginbuildempty(), ginbulkdelete(), ginGetStats(), ginHeapTupleFastInsert(), ginInsertCleanup(), ginPlaceToPage(), ginRedoClearIncompleteSplit(), ginRedoCreatePTree(), ginRedoDeleteListPages(), ginRedoDeletePage(), ginRedoInsert(), ginRedoInsertListPage(), ginRedoSplit(), ginRedoUpdateMetapage(), ginRedoVacuumDataLeafPage(), ginRedoVacuumPage(), ginScanPostingTreeToDelete(), ginStepRight(), ginUpdateStats(), ginvacuumcleanup(), ginVacuumPostingTree(), ginVacuumPostingTreeLeaves(), gistbufferinginserttuples(), gistbuild(), gistbuildempty(), gistdoinsert(), gistFindCorrectParent(), gistFindPath(), gistGetMaxLevel(), gistinserttuples(), gistkillitems(), gistplacetopage(), gistProcessItup(), gistRedoClearFollowRight(), 
gistRedoDeleteRecord(), gistRedoPageDelete(), gistRedoPageSplitRecord(), gistRedoPageUpdateRecord(), gistScanPage(), gistvacuum_delete_empty_pages(), gistvacuumpage(), hash_xlog_add_ovfl_page(), hash_xlog_delete(), hash_xlog_init_bitmap_page(), hash_xlog_init_meta_page(), hash_xlog_insert(), hash_xlog_move_page_contents(), hash_xlog_split_allocate_page(), hash_xlog_split_cleanup(), hash_xlog_split_complete(), hash_xlog_split_page(), hash_xlog_squeeze_page(), hash_xlog_update_meta_page(), hash_xlog_vacuum_one_page(), heap_delete(), heap_fetch(), heap_finish_speculative(), heap_force_common(), heap_get_latest_tid(), heap_index_delete_tuples(), heap_insert(), heap_lock_updated_tuple_rec(), heap_multi_insert(), heap_update(), heap_xlog_confirm(), heap_xlog_delete(), heap_xlog_inplace(), heap_xlog_insert(), heap_xlog_lock(), heap_xlog_lock_updated(), heap_xlog_multi_insert(), heap_xlog_prune_freeze(), heap_xlog_update(), heapam_scan_analyze_next_tuple(), initBloomState(), invalidate_one_block(), lazy_scan_heap(), lazy_scan_new_or_empty(), lazy_vacuum_heap_rel(), log_newpage_range(), modify_rel_block(), moveLeafs(), nextval_internal(), palloc_btree_page(), pg_get_sequence_data(), pg_sequence_last_value(), pg_visibility(), pgstat_gist_page(), pgstat_heap(), pgstatginindex_internal(), pgstathashindex(), pgstatindex_impl(), ProcessSingleRelationFork(), RelationCopyStorageUsingBuffer(), RelationGetBufferForTuple(), ResetSequence(), revmap_physical_extend(), scanGetCandidate(), scanPendingInsert(), scanPostingTree(), ScanSourceDatabasePgClass(), seq_redo(), SequenceChangePersistence(), SetSequence(), shiftList(), spgAddNodeAction(), spgbuild(), spgdoinsert(), spgGetCache(), SpGistGetBuffer(), SpGistUpdateMetaPage(), spgMatchNodeAction(), spgprocesspending(), spgRedoAddLeaf(), spgRedoAddNode(), spgRedoMoveLeafs(), spgRedoPickSplit(), spgRedoSplitTuple(), spgRedoVacuumLeaf(), spgRedoVacuumRedirect(), spgRedoVacuumRoot(), spgSplitNodeAction(), spgvacuumpage(), spgWalk(), 
statapprox_heap(), verify_heapam(), verifyBackupPageConsistency(), visibilitymap_prepare_truncate(), writeListPage(), xlog_redo(), and XLogRecordPageWithFreeSpace().

◆ UnpinBuffer()

◆ UnpinBufferNoOwner()

static void UnpinBufferNoOwner ( BufferDesc *buf)
static

Definition at line 3474 of file bufmgr.c.

3475{
3478
3480
3481 /* not moving as we're likely deleting it soon anyway */
3482 ref = GetPrivateRefCountEntry(b, false);
3483 Assert(ref != NULL);
3484 Assert(ref->data.refcount > 0);
3485 ref->data.refcount--;
3486 if (ref->data.refcount == 0)
3487 {
3489
3490 /*
3491 * Mark buffer non-accessible to Valgrind.
3492 *
3493 * Note that the buffer may have already been marked non-accessible
3494 * within access method code that enforces that buffers are only
3495 * accessed while a buffer lock is held.
3496 */
3498
3499 /*
3500 * I'd better not still hold the buffer content lock. Can't use
3501 * BufferIsLockedByMe(), as that asserts the buffer is pinned.
3502 */
3504
3505 /* decrement the shared reference count */
3507
3508 /* Support LockBufferForCleanup() */
3511
3513 }
3514}
static uint64 pg_atomic_fetch_sub_u64(volatile pg_atomic_uint64 *ptr, int64 sub_)
Definition atomics.h:541

References Assert, b, BM_PIN_COUNT_WAITER, buf, BUF_REFCOUNT_ONE, BufferDescriptorGetBuffer(), BufferIsLocal, BufferLockHeldByMe(), BufHdrGetBlock, fb(), ForgetPrivateRefCountEntry(), GetPrivateRefCountEntry(), pg_atomic_fetch_sub_u64(), VALGRIND_MAKE_MEM_NOACCESS, and WakePinCountWaiter().

Referenced by ResOwnerReleaseBuffer(), and UnpinBuffer().

◆ WaitBufHdrUnlocked()

pg_noinline uint64 WaitBufHdrUnlocked ( BufferDesc *buf)

◆ WaitIO()

static void WaitIO ( BufferDesc *buf)
static

Definition at line 7148 of file bufmgr.c.

7149{
7151
7152 /*
7153 * Should never end up here with unsubmitted IO, as no AIO unaware code
7154 * may be used while in batch mode and AIO aware code needs to have
7155 * submitted all staged IO to avoid deadlocks & slowness.
7156 */
7158
7160 for (;;)
7161 {
7164
7165 /*
7166 * It may not be necessary to acquire the spinlock to check the flag
7167 * here, but since this test is essential for correctness, we'd better
7168 * play it safe.
7169 */
7171
7172 /*
7173 * Copy the wait reference while holding the spinlock. This protects
7174 * against a concurrent TerminateBufferIO() in another backend from
7175 * clearing the wref while it's being read.
7176 */
7177 iow = buf->io_wref;
7179
7180 /* no IO in progress, we don't need to wait */
7182 break;
7183
7184 /*
7185 * The buffer has asynchronous IO in progress, wait for it to
7186 * complete.
7187 */
7188 if (pgaio_wref_valid(&iow))
7189 {
7191
7192 /*
7193 * The AIO subsystem internally uses condition variables and thus
7194 * might remove this backend from the BufferDesc's CV. While that
7195 * wouldn't cause a correctness issue (the first CV sleep just
7196 * immediately returns if not already registered), it seems worth
7197 * avoiding unnecessary loop iterations, given that we take care
7198 * to do so at the start of the function.
7199 */
7201 continue;
7202 }
7203
7204 /* wait on BufferDesc->cv, e.g. for concurrent synchronous IO */
7206 }
7208}
bool pgaio_have_staged(void)
Definition aio.c:1117
void pgaio_wref_wait(PgAioWaitRef *iow)
Definition aio.c:991
bool ConditionVariableCancelSleep(void)
void ConditionVariablePrepareToSleep(ConditionVariable *cv)
void ConditionVariableSleep(ConditionVariable *cv, uint32 wait_event_info)

References Assert, BM_IO_IN_PROGRESS, buf, BufferDescriptorGetIOCV(), ConditionVariableCancelSleep(), ConditionVariablePrepareToSleep(), ConditionVariableSleep(), fb(), LockBufHdr(), pgaio_have_staged(), pgaio_wref_valid(), pgaio_wref_wait(), and UnlockBufHdr().

Referenced by InvalidateBuffer(), and StartSharedBufferIO().

◆ WaitReadBuffers()

bool WaitReadBuffers ( ReadBuffersOperation *operation)

Definition at line 1759 of file bufmgr.c.

1760{
1761 PgAioReturn *aio_ret = &operation->io_return;
1764 bool needed_wait = false;
1765
1766 if (operation->persistence == RELPERSISTENCE_TEMP)
1767 {
1770 }
1771 else
1772 {
1775 }
1776
1777 /*
1778 * If we get here without an IO operation having been issued, the
1779 * io_method == IOMETHOD_SYNC path must have been used. Otherwise the
1780 * caller should not have called WaitReadBuffers().
1781 *
1782 * In the case of IOMETHOD_SYNC, we start - as we used to before the
1783 * introducing of AIO - the IO in WaitReadBuffers(). This is done as part
1784 * of the retry logic below, no extra code is required.
1785 *
1786 * This path is expected to eventually go away.
1787 */
1788 if (!pgaio_wref_valid(&operation->io_wref) && io_method != IOMETHOD_SYNC)
1789 elog(ERROR, "waiting for read operation that didn't read");
1790
1791 /*
1792 * To handle partial reads, and IOMETHOD_SYNC, we re-issue IO until we're
1793 * done. We may need multiple retries, not just because we could get
1794 * multiple partial reads, but also because some of the remaining
1795 * to-be-read buffers may have been read in by other backends, limiting
1796 * the IO size.
1797 */
1798 while (true)
1799 {
1801
1803
1804 /*
1805 * If there is an IO associated with the operation, we may need to
1806 * wait for it.
1807 */
1808 if (pgaio_wref_valid(&operation->io_wref))
1809 {
1810 /*
1811 * Track the time spent waiting for the IO to complete. As
1812 * tracking a wait even if we don't actually need to wait
1813 *
1814 * a) is not cheap, due to the timestamping overhead
1815 *
1816 * b) reports some time as waiting, even if we never waited
1817 *
1818 * we first check if we already know the IO is complete.
1819 *
1820 * Note that operation->io_return is uninitialized for foreign IO,
1821 * so we cannot use the cheaper PGAIO_RS_UNKNOWN pre-check.
1822 */
1823 if ((operation->foreign_io || aio_ret->result.status == PGAIO_RS_UNKNOWN) &&
1824 !pgaio_wref_check_done(&operation->io_wref))
1825 {
1827
1828 pgaio_wref_wait(&operation->io_wref);
1829 needed_wait = true;
1830
1831 /*
1832 * The IO operation itself was already counted earlier, in
1833 * AsyncReadBuffers(), this just accounts for the wait time.
1834 */
1836 io_start, 0, 0);
1837 }
1838 else
1839 {
1841 }
1842
1843 if (unlikely(operation->foreign_io))
1844 {
1845 Buffer buffer = operation->buffers[operation->nblocks_done];
1846 BufferDesc *desc = BufferIsLocal(buffer) ?
1847 GetLocalBufferDescriptor(-buffer - 1) :
1848 GetBufferDescriptor(buffer - 1);
1850
1851 if (buf_state & BM_VALID)
1852 {
1853 BlockNumber blocknum = operation->blocknum + operation->nblocks_done;
1854
1855 operation->nblocks_done += 1;
1856 Assert(operation->nblocks_done <= operation->nblocks);
1857
1858 /*
1859 * Track this as a 'hit' for this backend. The backend
1860 * performing the IO will track it as a 'read'.
1861 */
1863 operation->rel, operation->persistence,
1864 operation->smgr, operation->forknum,
1865 blocknum);
1866 }
1867
1868 /*
1869 * If the foreign IO failed and left the buffer invalid,
1870 * nblocks_done is not incremented. The retry loop below will
1871 * call AsyncReadBuffers() which will attempt the IO itself.
1872 */
1873 }
1874 else
1875 {
1876 /*
1877 * We now are sure the IO completed. Check the results. This
1878 * includes reporting on errors if there were any.
1879 */
1881 }
1882 }
1883
1884 /*
1885 * Most of the time, the one IO we already started, will read in
1886 * everything. But we need to deal with partial reads and buffers not
1887 * needing IO anymore.
1888 */
1889 if (operation->nblocks_done == operation->nblocks)
1890 break;
1891
1893
1894 /*
1895 * If the IO completed only partially, we need to perform additional
1896 * work, consider that a form of having had to wait.
1897 */
1898 needed_wait = true;
1899
1900 /*
1901 * This may only complete the IO partially, either because some
1902 * buffers were already valid, or because of a partial read.
1903 *
1904 * NB: In contrast to after the AsyncReadBuffers() call in
1905 * StartReadBuffers(), we do *not* reduce
1906 * ReadBuffersOperation->nblocks here, callers expect the full
1907 * operation to be completed at this point (as more operations may
1908 * have been queued).
1909 */
1911 }
1912
1914
1915 /* NB: READ_DONE tracepoint was already executed in completion callback */
1916 return needed_wait;
1917}
bool pgaio_wref_check_done(PgAioWaitRef *iow)
Definition aio.c:1005
static void ProcessReadBuffersResult(ReadBuffersOperation *operation)
Definition bufmgr.c:1714

References Assert, AsyncReadBuffers(), BM_VALID, PrivateRefCountEntry::buffer, BufferIsLocal, CHECK_FOR_INTERRUPTS, CheckReadBuffersOperation(), elog, ERROR, fb(), GetBufferDescriptor(), GetLocalBufferDescriptor(), io_method, IOCONTEXT_NORMAL, IOContextForStrategy(), IOMETHOD_SYNC, IOOBJECT_RELATION, IOOBJECT_TEMP_RELATION, IOOP_READ, operation, pg_atomic_read_u64(), PGAIO_RS_UNKNOWN, pgaio_wref_check_done(), pgaio_wref_valid(), pgaio_wref_wait(), pgstat_count_io_op_time(), pgstat_prepare_io_time(), ProcessReadBuffersResult(), BufferDesc::state, track_io_timing, TrackBufferHit(), and unlikely.

Referenced by read_buffers(), read_stream_next_buffer(), and ReadBuffer_common().

◆ WakePinCountWaiter()

static void WakePinCountWaiter ( BufferDesc * buf)
static

Definition at line 3429 of file bufmgr.c.

3430{
3431 /*
3432 * Acquire the buffer header lock, re-check that there's a waiter. Another
3433 * backend could have unpinned this buffer, and already woken up the
3434 * waiter.
3435 *
3436 * There's no danger of the buffer being replaced after we unpinned it
3437 * above, as it's pinned by the waiter. The waiter removes
3438 * BM_PIN_COUNT_WAITER if it stops waiting for a reason other than this
3439 * backend waking it up.
3440 */
3442
3445 {
3446 /* we just released the last pin other than the waiter's */
3447 int wait_backend_pgprocno = buf->wait_backend_pgprocno;
3448
3451 0);
3452 ProcSendSignal(wait_backend_pgprocno);
3453 }
3454 else
3456}
void ProcSendSignal(ProcNumber procNumber)
Definition proc.c:2027

References BM_PIN_COUNT_WAITER, buf, BUF_STATE_GET_REFCOUNT, fb(), LockBufHdr(), ProcSendSignal(), UnlockBufHdr(), and UnlockBufHdrExt().

Referenced by TerminateBufferIO(), UnlockReleaseBuffer(), and UnpinBufferNoOwner().

◆ WritebackContextInit()

void WritebackContextInit ( WritebackContext * context,
int * max_pending 
)

Definition at line 7687 of file bufmgr.c.

7688{
7689 Assert(*max_pending <= WRITEBACK_MAX_PENDING_FLUSHES);
7690
7691 context->max_pending = max_pending;
7692 context->nr_pending = 0;
7693}

References Assert, WritebackContext::max_pending, WritebackContext::nr_pending, and WRITEBACK_MAX_PENDING_FLUSHES.

Referenced by BackgroundWriterMain(), BufferManagerShmemAttach(), BufferManagerShmemInit(), and BufferSync().

◆ ZeroAndLockBuffer()

static void ZeroAndLockBuffer ( Buffer  buffer,
ReadBufferMode  mode,
bool  already_valid 
)
static

Definition at line 1137 of file bufmgr.c.

1138{
1140 bool need_to_zero;
1141 bool isLocalBuf = BufferIsLocal(buffer);
1143
1145
1146 if (already_valid)
1147 {
1148 /*
1149 * If the caller already knew the buffer was valid, we can skip some
1150 * header interaction. The caller just wants to lock the buffer.
1151 */
1152 need_to_zero = false;
1153 }
1154 else
1155 {
1156 if (isLocalBuf)
1157 {
1158 /* Simple case for non-shared buffers. */
1159 bufHdr = GetLocalBufferDescriptor(-buffer - 1);
1160 sbres = StartLocalBufferIO(bufHdr, true, true, NULL);
1161 }
1162 else
1163 {
1164 /*
1165 * Take BM_IO_IN_PROGRESS, or discover that BM_VALID has been set
1166 * concurrently. Even though we aren't doing I/O, that ensures
1167 * that we don't zero a page that someone else has pinned. An
1168 * exclusive content lock wouldn't be enough, because readers are
1169 * allowed to drop the content lock after determining that a tuple
1170 * is visible (see buffer access rules in README).
1171 */
1172 bufHdr = GetBufferDescriptor(buffer - 1);
1173 sbres = StartSharedBufferIO(bufHdr, true, true, NULL);
1174 }
1175
1178 }
1179
1180 if (need_to_zero)
1181 {
1182 memset(BufferGetPage(buffer), 0, BLCKSZ);
1183
1184 /*
1185 * Grab the buffer content lock before marking the page as valid, to
1186 * make sure that no other backend sees the zeroed page before the
1187 * caller has had a chance to initialize it.
1188 *
1189 * Since no-one else can be looking at the page contents yet, there is
1190 * no difference between an exclusive lock and a cleanup-strength
1191 * lock. (Note that we cannot use LockBuffer() or
1192 * LockBufferForCleanup() here, because they assert that the buffer is
1193 * already valid.)
1194 */
1195 if (!isLocalBuf)
1197
1198 /* Set BM_VALID, terminate IO, and wake up any waiters */
1199 if (isLocalBuf)
1200 TerminateLocalBufferIO(bufHdr, false, BM_VALID, false);
1201 else
1202 TerminateBufferIO(bufHdr, false, BM_VALID, true, false);
1203 }
1204 else if (!isLocalBuf)
1205 {
1206 /*
1207 * The buffer is valid, so we can't zero it. The caller still expects
1208 * the page to be locked on return.
1209 */
1210 if (mode == RBM_ZERO_AND_LOCK)
1212 else
1213 LockBufferForCleanup(buffer);
1214 }
1215}
void LockBufferForCleanup(Buffer buffer)
Definition bufmgr.c:6679

References Assert, BM_VALID, PrivateRefCountEntry::buffer, BUFFER_IO_IN_PROGRESS, BUFFER_IO_READY_FOR_IO, BUFFER_LOCK_EXCLUSIVE, BufferGetPage(), BufferIsLocal, fb(), GetBufferDescriptor(), GetLocalBufferDescriptor(), LockBuffer(), LockBufferForCleanup(), mode, RBM_ZERO_AND_CLEANUP_LOCK, RBM_ZERO_AND_LOCK, StartLocalBufferIO(), StartSharedBufferIO(), TerminateBufferIO(), and TerminateLocalBufferIO().

Referenced by ReadBuffer_common().

Variable Documentation

◆ aio_local_buffer_readv_cb

const PgAioHandleCallbacks aio_local_buffer_readv_cb
Initial value:
= {
.stage = local_buffer_readv_stage,
.complete_local = local_buffer_readv_complete,
.report = buffer_readv_report,
}
static PgAioResult local_buffer_readv_complete(PgAioHandle *ioh, PgAioResult prior_result, uint8 cb_data)
Definition bufmgr.c:8940
static void local_buffer_readv_stage(PgAioHandle *ioh, uint8 cb_data)
Definition bufmgr.c:8934
static void buffer_readv_report(PgAioResult result, const PgAioTargetData *td, int elevel)
Definition bufmgr.c:8788

Definition at line 8956 of file bufmgr.c.

8956 {
8957 .stage = local_buffer_readv_stage,
8958
8959 /*
8960 * Note that this, in contrast to the shared_buffers case, uses
8961 * complete_local, as only the issuing backend has access to the required
8962 * datastructures. This is important in case the IO completion may be
8963 * consumed incidentally by another backend.
8964 */
8965 .complete_local = local_buffer_readv_complete,
8966 .report = buffer_readv_report,
8967};

◆ aio_shared_buffer_readv_cb

const PgAioHandleCallbacks aio_shared_buffer_readv_cb
Initial value:
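= {
.stage = shared_buffer_readv_stage,
.complete_shared = shared_buffer_readv_complete,
.complete_local = shared_buffer_readv_complete_local,
.report = buffer_readv_report,
}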
static PgAioResult shared_buffer_readv_complete_local(PgAioHandle *ioh, PgAioResult prior_result, uint8 cb_data)
Definition bufmgr.c:8903
static void shared_buffer_readv_stage(PgAioHandle *ioh, uint8 cb_data)
Definition bufmgr.c:8883
static PgAioResult shared_buffer_readv_complete(PgAioHandle *ioh, PgAioResult prior_result, uint8 cb_data)
Definition bufmgr.c:8889

Definition at line 8947 of file bufmgr.c.

8947 {
8948 .stage = shared_buffer_readv_stage,
8949 .complete_shared = shared_buffer_readv_complete,
8950 /* need a local callback to report checksum failures */
8951 .complete_local = shared_buffer_readv_complete_local,
8952 .report = buffer_readv_report,
8953};

◆ backend_flush_after

int backend_flush_after = DEFAULT_BACKEND_FLUSH_AFTER

Definition at line 225 of file bufmgr.c.

Referenced by BufferManagerShmemAttach(), and BufferManagerShmemInit().

◆ bgwriter_flush_after

int bgwriter_flush_after = DEFAULT_BGWRITER_FLUSH_AFTER

Definition at line 224 of file bufmgr.c.

Referenced by BackgroundWriterMain().

◆ bgwriter_lru_maxpages

int bgwriter_lru_maxpages = 100

Definition at line 190 of file bufmgr.c.

Referenced by BgBufferSync().

◆ bgwriter_lru_multiplier

double bgwriter_lru_multiplier = 2.0

Definition at line 191 of file bufmgr.c.

Referenced by BgBufferSync().

◆ buffer_io_resowner_desc

const ResourceOwnerDesc buffer_io_resowner_desc
Initial value:
=
{
.name = "buffer io",
.release_phase = RESOURCE_RELEASE_BEFORE_LOCKS,
.release_priority = RELEASE_PRIO_BUFFER_IOS,
.ReleaseResource = ResOwnerReleaseBufferIO,
.DebugPrint = ResOwnerPrintBufferIO
}
static void ResOwnerReleaseBufferIO(Datum res)
Definition bufmgr.c:7832
static char * ResOwnerPrintBufferIO(Datum res)
Definition bufmgr.c:7840
#define RELEASE_PRIO_BUFFER_IOS
Definition resowner.h:62
@ RESOURCE_RELEASE_BEFORE_LOCKS
Definition resowner.h:54

Definition at line 285 of file bufmgr.c.

286{
287 .name = "buffer io",
288 .release_phase = RESOURCE_RELEASE_BEFORE_LOCKS,
289 .release_priority = RELEASE_PRIO_BUFFER_IOS,
290 .ReleaseResource = ResOwnerReleaseBufferIO,
291 .DebugPrint = ResOwnerPrintBufferIO
292};

Referenced by ResourceOwnerForgetBufferIO(), and ResourceOwnerRememberBufferIO().

◆ buffer_resowner_desc

const ResourceOwnerDesc buffer_resowner_desc
Initial value:
=
{
.name = "buffer",
.release_phase = RESOURCE_RELEASE_BEFORE_LOCKS,
.release_priority = RELEASE_PRIO_BUFFER_PINS,
.ReleaseResource = ResOwnerReleaseBuffer,
.DebugPrint = ResOwnerPrintBuffer
}
static void ResOwnerReleaseBuffer(Datum res)
Definition bufmgr.c:7854
static char * ResOwnerPrintBuffer(Datum res)
Definition bufmgr.c:7890
#define RELEASE_PRIO_BUFFER_PINS
Definition resowner.h:63

Definition at line 294 of file bufmgr.c.

295{
296 .name = "buffer",
297 .release_phase = RESOURCE_RELEASE_BEFORE_LOCKS,
298 .release_priority = RELEASE_PRIO_BUFFER_PINS,
299 .ReleaseResource = ResOwnerReleaseBuffer,
300 .DebugPrint = ResOwnerPrintBuffer
301};

Referenced by ResourceOwnerForgetBuffer(), and ResourceOwnerRememberBuffer().

◆ checkpoint_flush_after

int checkpoint_flush_after = DEFAULT_CHECKPOINT_FLUSH_AFTER

Definition at line 223 of file bufmgr.c.

Referenced by BufferSync().

◆ effective_io_concurrency

◆ io_combine_limit

◆ io_combine_limit_guc

int io_combine_limit_guc = DEFAULT_IO_COMBINE_LIMIT

Definition at line 216 of file bufmgr.c.

Referenced by assign_io_max_combine_limit().

◆ io_max_combine_limit

◆ maintenance_io_concurrency

◆ MaxProportionalPins

uint32 MaxProportionalPins
static

Definition at line 271 of file bufmgr.c.

Referenced by GetAdditionalPinLimit(), GetPinLimit(), and InitBufferManagerAccess().

◆ PinCountWaitBuf

BufferDesc* PinCountWaitBuf = NULL
static

Definition at line 228 of file bufmgr.c.

Referenced by LockBufferForCleanup(), and UnlockBuffers().

◆ PrivateRefCountArray

◆ PrivateRefCountArrayKeys

◆ PrivateRefCountClock

uint32 PrivateRefCountClock = 0
static

Definition at line 267 of file bufmgr.c.

Referenced by ReservePrivateRefCountEntry().

◆ PrivateRefCountEntryLast

int PrivateRefCountEntryLast = -1
static

◆ PrivateRefCountHash

◆ PrivateRefCountOverflowed

◆ ReservedRefCountSlot

int ReservedRefCountSlot = -1
static

◆ track_io_timing

◆ zero_damaged_pages

bool zero_damaged_pages = false

Definition at line 189 of file bufmgr.c.

Referenced by AsyncReadBuffers(), mdreadv(), and read_rel_block_ll().