PostgreSQL Source Code git master
bufmgr.c File Reference
#include "postgres.h"
#include <sys/file.h>
#include <unistd.h>
#include "access/tableam.h"
#include "access/xloginsert.h"
#include "access/xlogutils.h"
#include "catalog/storage.h"
#include "catalog/storage_xlog.h"
#include "executor/instrument.h"
#include "lib/binaryheap.h"
#include "miscadmin.h"
#include "pg_trace.h"
#include "pgstat.h"
#include "postmaster/bgwriter.h"
#include "storage/aio.h"
#include "storage/buf_internals.h"
#include "storage/bufmgr.h"
#include "storage/fd.h"
#include "storage/ipc.h"
#include "storage/lmgr.h"
#include "storage/proc.h"
#include "storage/read_stream.h"
#include "storage/smgr.h"
#include "storage/standby.h"
#include "utils/memdebug.h"
#include "utils/ps_status.h"
#include "utils/rel.h"
#include "utils/resowner.h"
#include "utils/timestamp.h"
#include "lib/sort_template.h"

Data Structures

struct  PrivateRefCountEntry
 
struct  CkptTsStatus
 
struct  SMgrSortArray
 

Macros

#define BufHdrGetBlock(bufHdr)   ((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ))
 
#define BufferGetLSN(bufHdr)   (PageGetLSN(BufHdrGetBlock(bufHdr)))
 
#define LocalBufHdrGetBlock(bufHdr)    LocalBufferBlockPointers[-((bufHdr)->buf_id + 2)]
 
#define BUF_WRITTEN   0x01
 
#define BUF_REUSABLE   0x02
 
#define RELS_BSEARCH_THRESHOLD   20
 
#define BUF_DROP_FULL_SCAN_THRESHOLD   (uint64) (NBuffers / 32)
 
#define REFCOUNT_ARRAY_ENTRIES   8
 
#define BufferIsPinned(bufnum)
 
#define ST_SORT   sort_checkpoint_bufferids
 
#define ST_ELEMENT_TYPE   CkptSortItem
 
#define ST_COMPARE(a, b)   ckpt_buforder_comparator(a, b)
 
#define ST_SCOPE   static
 
#define ST_DEFINE
 
#define ST_SORT   sort_pending_writebacks
 
#define ST_ELEMENT_TYPE   PendingWriteback
 
#define ST_COMPARE(a, b)   buffertag_comparator(&a->tag, &b->tag)
 
#define ST_SCOPE   static
 
#define ST_DEFINE
 
#define READV_COUNT_BITS   7
 
#define READV_COUNT_MASK   ((1 << READV_COUNT_BITS) - 1)
 

Typedefs

typedef struct PrivateRefCountEntry PrivateRefCountEntry
 
typedef struct CkptTsStatus CkptTsStatus
 
typedef struct SMgrSortArray SMgrSortArray
 

Functions

static void ReservePrivateRefCountEntry (void)
 
static PrivateRefCountEntry * NewPrivateRefCountEntry (Buffer buffer)
 
static PrivateRefCountEntry * GetPrivateRefCountEntry (Buffer buffer, bool do_move)
 
static int32 GetPrivateRefCount (Buffer buffer)
 
static void ForgetPrivateRefCountEntry (PrivateRefCountEntry *ref)
 
static void ResOwnerReleaseBufferIO (Datum res)
 
static char * ResOwnerPrintBufferIO (Datum res)
 
static void ResOwnerReleaseBufferPin (Datum res)
 
static char * ResOwnerPrintBufferPin (Datum res)
 
static Buffer ReadBuffer_common (Relation rel, SMgrRelation smgr, char smgr_persistence, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy)
 
static BlockNumber ExtendBufferedRelCommon (BufferManagerRelation bmr, ForkNumber fork, BufferAccessStrategy strategy, uint32 flags, uint32 extend_by, BlockNumber extend_upto, Buffer *buffers, uint32 *extended_by)
 
static BlockNumber ExtendBufferedRelShared (BufferManagerRelation bmr, ForkNumber fork, BufferAccessStrategy strategy, uint32 flags, uint32 extend_by, BlockNumber extend_upto, Buffer *buffers, uint32 *extended_by)
 
static bool PinBuffer (BufferDesc *buf, BufferAccessStrategy strategy, bool skip_if_not_valid)
 
static void PinBuffer_Locked (BufferDesc *buf)
 
static void UnpinBuffer (BufferDesc *buf)
 
static void UnpinBufferNoOwner (BufferDesc *buf)
 
static void BufferSync (int flags)
 
static int SyncOneBuffer (int buf_id, bool skip_recently_used, WritebackContext *wb_context)
 
static void WaitIO (BufferDesc *buf)
 
static void AbortBufferIO (Buffer buffer)
 
static void shared_buffer_write_error_callback (void *arg)
 
static void local_buffer_write_error_callback (void *arg)
 
static BufferDesc * BufferAlloc (SMgrRelation smgr, char relpersistence, ForkNumber forkNum, BlockNumber blockNum, BufferAccessStrategy strategy, bool *foundPtr, IOContext io_context)
 
static bool AsyncReadBuffers (ReadBuffersOperation *operation, int *nblocks_progress)
 
static void CheckReadBuffersOperation (ReadBuffersOperation *operation, bool is_complete)
 
static Buffer GetVictimBuffer (BufferAccessStrategy strategy, IOContext io_context)
 
static void FlushUnlockedBuffer (BufferDesc *buf, SMgrRelation reln, IOObject io_object, IOContext io_context)
 
static void FlushBuffer (BufferDesc *buf, SMgrRelation reln, IOObject io_object, IOContext io_context)
 
static void FindAndDropRelationBuffers (RelFileLocator rlocator, ForkNumber forkNum, BlockNumber nForkBlock, BlockNumber firstDelBlock)
 
static void RelationCopyStorageUsingBuffer (RelFileLocator srclocator, RelFileLocator dstlocator, ForkNumber forkNum, bool permanent)
 
static void AtProcExit_Buffers (int code, Datum arg)
 
static void CheckForBufferLeaks (void)
 
static int rlocator_comparator (const void *p1, const void *p2)
 
static int buffertag_comparator (const BufferTag *ba, const BufferTag *bb)
 
static int ckpt_buforder_comparator (const CkptSortItem *a, const CkptSortItem *b)
 
static int ts_ckpt_progress_comparator (Datum a, Datum b, void *arg)
 
PrefetchBufferResult PrefetchSharedBuffer (SMgrRelation smgr_reln, ForkNumber forkNum, BlockNumber blockNum)
 
PrefetchBufferResult PrefetchBuffer (Relation reln, ForkNumber forkNum, BlockNumber blockNum)
 
bool ReadRecentBuffer (RelFileLocator rlocator, ForkNumber forkNum, BlockNumber blockNum, Buffer recent_buffer)
 
Buffer ReadBuffer (Relation reln, BlockNumber blockNum)
 
Buffer ReadBufferExtended (Relation reln, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy)
 
Buffer ReadBufferWithoutRelcache (RelFileLocator rlocator, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy, bool permanent)
 
Buffer ExtendBufferedRel (BufferManagerRelation bmr, ForkNumber forkNum, BufferAccessStrategy strategy, uint32 flags)
 
BlockNumber ExtendBufferedRelBy (BufferManagerRelation bmr, ForkNumber fork, BufferAccessStrategy strategy, uint32 flags, uint32 extend_by, Buffer *buffers, uint32 *extended_by)
 
Buffer ExtendBufferedRelTo (BufferManagerRelation bmr, ForkNumber fork, BufferAccessStrategy strategy, uint32 flags, BlockNumber extend_to, ReadBufferMode mode)
 
static void ZeroAndLockBuffer (Buffer buffer, ReadBufferMode mode, bool already_valid)
 
static pg_attribute_always_inline Buffer PinBufferForBlock (Relation rel, SMgrRelation smgr, char persistence, ForkNumber forkNum, BlockNumber blockNum, BufferAccessStrategy strategy, bool *foundPtr)
 
static pg_attribute_always_inline bool StartReadBuffersImpl (ReadBuffersOperation *operation, Buffer *buffers, BlockNumber blockNum, int *nblocks, int flags, bool allow_forwarding)
 
bool StartReadBuffers (ReadBuffersOperation *operation, Buffer *buffers, BlockNumber blockNum, int *nblocks, int flags)
 
bool StartReadBuffer (ReadBuffersOperation *operation, Buffer *buffer, BlockNumber blocknum, int flags)
 
static bool ReadBuffersCanStartIOOnce (Buffer buffer, bool nowait)
 
static bool ReadBuffersCanStartIO (Buffer buffer, bool nowait)
 
static void ProcessReadBuffersResult (ReadBuffersOperation *operation)
 
void WaitReadBuffers (ReadBuffersOperation *operation)
 
static void InvalidateBuffer (BufferDesc *buf)
 
static bool InvalidateVictimBuffer (BufferDesc *buf_hdr)
 
uint32 GetPinLimit (void)
 
uint32 GetAdditionalPinLimit (void)
 
void LimitAdditionalPins (uint32 *additional_pins)
 
bool BufferIsLockedByMe (Buffer buffer)
 
bool BufferIsLockedByMeInMode (Buffer buffer, BufferLockMode mode)
 
bool BufferIsDirty (Buffer buffer)
 
void MarkBufferDirty (Buffer buffer)
 
Buffer ReleaseAndReadBuffer (Buffer buffer, Relation relation, BlockNumber blockNum)
 
static void WakePinCountWaiter (BufferDesc *buf)
 
void TrackNewBufferPin (Buffer buf)
 
bool BgBufferSync (WritebackContext *wb_context)
 
void AtEOXact_Buffers (bool isCommit)
 
void InitBufferManagerAccess (void)
 
char * DebugPrintBufferRefcount (Buffer buffer)
 
void CheckPointBuffers (int flags)
 
BlockNumber BufferGetBlockNumber (Buffer buffer)
 
void BufferGetTag (Buffer buffer, RelFileLocator *rlocator, ForkNumber *forknum, BlockNumber *blknum)
 
BlockNumber RelationGetNumberOfBlocksInFork (Relation relation, ForkNumber forkNum)
 
bool BufferIsPermanent (Buffer buffer)
 
XLogRecPtr BufferGetLSNAtomic (Buffer buffer)
 
void DropRelationBuffers (SMgrRelation smgr_reln, ForkNumber *forkNum, int nforks, BlockNumber *firstDelBlock)
 
void DropRelationsAllBuffers (SMgrRelation *smgr_reln, int nlocators)
 
void DropDatabaseBuffers (Oid dbid)
 
void FlushRelationBuffers (Relation rel)
 
void FlushRelationsAllBuffers (SMgrRelation *smgrs, int nrels)
 
void CreateAndCopyRelationData (RelFileLocator src_rlocator, RelFileLocator dst_rlocator, bool permanent)
 
void FlushDatabaseBuffers (Oid dbid)
 
void FlushOneBuffer (Buffer buffer)
 
void ReleaseBuffer (Buffer buffer)
 
void UnlockReleaseBuffer (Buffer buffer)
 
void IncrBufferRefCount (Buffer buffer)
 
void MarkBufferDirtyHint (Buffer buffer, bool buffer_std)
 
void UnlockBuffers (void)
 
void LockBuffer (Buffer buffer, BufferLockMode mode)
 
bool ConditionalLockBuffer (Buffer buffer)
 
void CheckBufferIsPinnedOnce (Buffer buffer)
 
void LockBufferForCleanup (Buffer buffer)
 
bool HoldingBufferPinThatDelaysRecovery (void)
 
bool ConditionalLockBufferForCleanup (Buffer buffer)
 
bool IsBufferCleanupOK (Buffer buffer)
 
bool StartBufferIO (BufferDesc *buf, bool forInput, bool nowait)
 
void TerminateBufferIO (BufferDesc *buf, bool clear_dirty, uint32 set_flag_bits, bool forget_owner, bool release_aio)
 
uint32 LockBufHdr (BufferDesc *desc)
 
pg_noinline uint32 WaitBufHdrUnlocked (BufferDesc *buf)
 
void WritebackContextInit (WritebackContext *context, int *max_pending)
 
void ScheduleBufferTagForWriteback (WritebackContext *wb_context, IOContext io_context, BufferTag *tag)
 
void IssuePendingWritebacks (WritebackContext *wb_context, IOContext io_context)
 
static bool EvictUnpinnedBufferInternal (BufferDesc *desc, bool *buffer_flushed)
 
bool EvictUnpinnedBuffer (Buffer buf, bool *buffer_flushed)
 
void EvictAllUnpinnedBuffers (int32 *buffers_evicted, int32 *buffers_flushed, int32 *buffers_skipped)
 
void EvictRelUnpinnedBuffers (Relation rel, int32 *buffers_evicted, int32 *buffers_flushed, int32 *buffers_skipped)
 
static bool MarkDirtyUnpinnedBufferInternal (Buffer buf, BufferDesc *desc, bool *buffer_already_dirty)
 
bool MarkDirtyUnpinnedBuffer (Buffer buf, bool *buffer_already_dirty)
 
void MarkDirtyRelUnpinnedBuffers (Relation rel, int32 *buffers_dirtied, int32 *buffers_already_dirty, int32 *buffers_skipped)
 
void MarkDirtyAllUnpinnedBuffers (int32 *buffers_dirtied, int32 *buffers_already_dirty, int32 *buffers_skipped)
 
static pg_attribute_always_inline void buffer_stage_common (PgAioHandle *ioh, bool is_write, bool is_temp)
 
static void buffer_readv_decode_error (PgAioResult result, bool *zeroed_any, bool *ignored_any, uint8 *zeroed_or_error_count, uint8 *checkfail_count, uint8 *first_off)
 
static void buffer_readv_encode_error (PgAioResult *result, bool is_temp, bool zeroed_any, bool ignored_any, uint8 error_count, uint8 zeroed_count, uint8 checkfail_count, uint8 first_error_off, uint8 first_zeroed_off, uint8 first_ignored_off)
 
static pg_attribute_always_inline void buffer_readv_complete_one (PgAioTargetData *td, uint8 buf_off, Buffer buffer, uint8 flags, bool failed, bool is_temp, bool *buffer_invalid, bool *failed_checksum, bool *ignored_checksum, bool *zeroed_buffer)
 
static pg_attribute_always_inline PgAioResult buffer_readv_complete (PgAioHandle *ioh, PgAioResult prior_result, uint8 cb_data, bool is_temp)
 
static void buffer_readv_report (PgAioResult result, const PgAioTargetData *td, int elevel)
 
static void shared_buffer_readv_stage (PgAioHandle *ioh, uint8 cb_data)
 
static PgAioResult shared_buffer_readv_complete (PgAioHandle *ioh, PgAioResult prior_result, uint8 cb_data)
 
static PgAioResult shared_buffer_readv_complete_local (PgAioHandle *ioh, PgAioResult prior_result, uint8 cb_data)
 
static void local_buffer_readv_stage (PgAioHandle *ioh, uint8 cb_data)
 
static PgAioResult local_buffer_readv_complete (PgAioHandle *ioh, PgAioResult prior_result, uint8 cb_data)
 

Variables

bool zero_damaged_pages = false
 
int bgwriter_lru_maxpages = 100
 
double bgwriter_lru_multiplier = 2.0
 
bool track_io_timing = false
 
int effective_io_concurrency = DEFAULT_EFFECTIVE_IO_CONCURRENCY
 
int maintenance_io_concurrency = DEFAULT_MAINTENANCE_IO_CONCURRENCY
 
int io_combine_limit = DEFAULT_IO_COMBINE_LIMIT
 
int io_combine_limit_guc = DEFAULT_IO_COMBINE_LIMIT
 
int io_max_combine_limit = DEFAULT_IO_COMBINE_LIMIT
 
int checkpoint_flush_after = DEFAULT_CHECKPOINT_FLUSH_AFTER
 
int bgwriter_flush_after = DEFAULT_BGWRITER_FLUSH_AFTER
 
int backend_flush_after = DEFAULT_BACKEND_FLUSH_AFTER
 
static BufferDesc * PinCountWaitBuf = NULL
 
static struct PrivateRefCountEntry PrivateRefCountArray [REFCOUNT_ARRAY_ENTRIES]
 
static HTAB * PrivateRefCountHash = NULL
 
static int32 PrivateRefCountOverflowed = 0
 
static uint32 PrivateRefCountClock = 0
 
static PrivateRefCountEntry * ReservedRefCountEntry = NULL
 
static uint32 MaxProportionalPins
 
const ResourceOwnerDesc buffer_io_resowner_desc
 
const ResourceOwnerDesc buffer_pin_resowner_desc
 
const PgAioHandleCallbacks aio_shared_buffer_readv_cb
 
const PgAioHandleCallbacks aio_local_buffer_readv_cb
 

Macro Definition Documentation

◆ BUF_DROP_FULL_SCAN_THRESHOLD

#define BUF_DROP_FULL_SCAN_THRESHOLD   (uint64) (NBuffers / 32)

Definition at line 91 of file bufmgr.c.

◆ BUF_REUSABLE

#define BUF_REUSABLE   0x02

Definition at line 81 of file bufmgr.c.

◆ BUF_WRITTEN

#define BUF_WRITTEN   0x01

Definition at line 80 of file bufmgr.c.

◆ BufferGetLSN

#define BufferGetLSN (   bufHdr)    (PageGetLSN(BufHdrGetBlock(bufHdr)))

Definition at line 73 of file bufmgr.c.

◆ BufferIsPinned

#define BufferIsPinned (   bufnum)
Value:
( \
!BufferIsValid(bufnum) ? \
false \
: \
BufferIsLocal(bufnum) ? \
(LocalRefCount[-(bufnum) - 1] > 0) \
: \
(GetPrivateRefCount(bufnum) > 0) \
)
static int32 GetPrivateRefCount(Buffer buffer)
Definition: bufmgr.c:425
static bool BufferIsValid(Buffer bufnum)
Definition: bufmgr.h:387
int32 * LocalRefCount
Definition: localbuf.c:49

Definition at line 483 of file bufmgr.c.
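
Spelled out as an equivalent function, the macro's nested conditionals read as follows (a sketch for clarity; BufferIsPinnedFn is a hypothetical name, the real definition is the macro above):

/* Sketch: BufferIsPinned() as a plain function. Local buffers use negative
 * Buffer numbers indexing the per-backend LocalRefCount array; shared
 * buffers go through the private refcount machinery. */
static bool
BufferIsPinnedFn(Buffer bufnum)
{
    if (!BufferIsValid(bufnum))
        return false;                           /* InvalidBuffer: never pinned */
    if (BufferIsLocal(bufnum))
        return LocalRefCount[-bufnum - 1] > 0;  /* temp-table buffer */
    return GetPrivateRefCount(bufnum) > 0;      /* shared buffer */
}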

◆ BufHdrGetBlock

#define BufHdrGetBlock (   bufHdr)    ((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ))

Definition at line 72 of file bufmgr.c.

◆ LocalBufHdrGetBlock

#define LocalBufHdrGetBlock (   bufHdr)     LocalBufferBlockPointers[-((bufHdr)->buf_id + 2)]

Definition at line 76 of file bufmgr.c.
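
Both local-buffer expressions rely on the same id encoding, inferred from this macro together with BufferIsPinned above: the local buffer at array index i stores buf_id = -(i + 2), while its user-visible Buffer number is -(i + 1). A small consistency sketch:

/* Sketch (encoding inferred from the macros above). */
int    i = 0;                 /* first local buffer slot */
Buffer bufnum = -(i + 1);     /* == -1; BufferIsLocal() is true */
int    buf_id = -(i + 2);     /* == -2, as stored in the descriptor */

Assert(-bufnum - 1 == i);     /* LocalRefCount index, per BufferIsPinned */
Assert(-(buf_id + 2) == i);   /* LocalBufferBlockPointers index, above */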

◆ READV_COUNT_BITS

#define READV_COUNT_BITS   7

◆ READV_COUNT_MASK

#define READV_COUNT_MASK   ((1 << READV_COUNT_BITS) - 1)

◆ REFCOUNT_ARRAY_ENTRIES

#define REFCOUNT_ARRAY_ENTRIES   8

Definition at line 100 of file bufmgr.c.
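
This constant sizes the per-backend fast path for pin counts: entries live in PrivateRefCountArray until all eight slots are occupied, after which they spill into PrivateRefCountHash and PrivateRefCountOverflowed is bumped. A minimal lookup sketch under that assumption (LookupPrivateRefCount is a hypothetical helper, not the real GetPrivateRefCountEntry()):

static PrivateRefCountEntry *
LookupPrivateRefCount(Buffer buffer)
{
    /* fast path: linear scan of the small array */
    for (int i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
    {
        if (PrivateRefCountArray[i].buffer == buffer)
            return &PrivateRefCountArray[i];
    }

    /* slow path: only worth consulting once entries have overflowed */
    if (PrivateRefCountOverflowed > 0)
        return hash_search(PrivateRefCountHash, &buffer, HASH_FIND, NULL);

    return NULL;
}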

◆ RELS_BSEARCH_THRESHOLD

#define RELS_BSEARCH_THRESHOLD   20

Definition at line 83 of file bufmgr.c.

◆ ST_COMPARE [1/2]

#define ST_COMPARE (   a,
  b 
)    ckpt_buforder_comparator(a, b)

Definition at line 6455 of file bufmgr.c.

◆ ST_COMPARE [2/2]

#define ST_COMPARE (   a,
  b 
)    buffertag_comparator(&a->tag, &b->tag)

Definition at line 6455 of file bufmgr.c.

◆ ST_DEFINE [1/2]

#define ST_DEFINE

Definition at line 6457 of file bufmgr.c.

◆ ST_DEFINE [2/2]

#define ST_DEFINE

Definition at line 6457 of file bufmgr.c.

◆ ST_ELEMENT_TYPE [1/2]

#define ST_ELEMENT_TYPE   CkptSortItem

Definition at line 6454 of file bufmgr.c.

◆ ST_ELEMENT_TYPE [2/2]

#define ST_ELEMENT_TYPE   PendingWriteback

Definition at line 6454 of file bufmgr.c.

◆ ST_SCOPE [1/2]

#define ST_SCOPE   static

Definition at line 6456 of file bufmgr.c.

◆ ST_SCOPE [2/2]

#define ST_SCOPE   static

Definition at line 6456 of file bufmgr.c.

◆ ST_SORT [1/2]

#define ST_SORT   sort_checkpoint_bufferids

Definition at line 6453 of file bufmgr.c.

◆ ST_SORT [2/2]

#define ST_SORT   sort_pending_writebacks

Definition at line 6453 of file bufmgr.c.
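
Each ST_* macro set parameterizes lib/sort_template.h, which expands into a specialized static sort routine. Roughly, as a sketch of the generated interface rather than the template's literal output:

/* Including lib/sort_template.h after the definitions above yields
 * specialized sorters approximately equivalent to:
 *
 *   static void sort_checkpoint_bufferids(CkptSortItem *first, size_t n);
 *   static void sort_pending_writebacks(PendingWriteback *first, size_t n);
 *
 * which callers then invoke directly, e.g. a call of the form: */
sort_checkpoint_bufferids(CkptBufferIds, num_to_scan);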

Typedef Documentation

◆ CkptTsStatus

typedef struct CkptTsStatus CkptTsStatus

◆ PrivateRefCountEntry

typedef struct PrivateRefCountEntry PrivateRefCountEntry

◆ SMgrSortArray

typedef struct SMgrSortArray SMgrSortArray

Function Documentation

◆ AbortBufferIO()

static void AbortBufferIO ( Buffer  buffer)
static

Definition at line 6166 of file bufmgr.c.

6167{
6168 BufferDesc *buf_hdr = GetBufferDescriptor(buffer - 1);
6169 uint32 buf_state;
6170
6171 buf_state = LockBufHdr(buf_hdr);
6172 Assert(buf_state & (BM_IO_IN_PROGRESS | BM_TAG_VALID));
6173
6174 if (!(buf_state & BM_VALID))
6175 {
6176 Assert(!(buf_state & BM_DIRTY));
6177 UnlockBufHdr(buf_hdr);
6178 }
6179 else
6180 {
6181 Assert(buf_state & BM_DIRTY);
6182 UnlockBufHdr(buf_hdr);
6183
6184 /* Issue notice if this is not the first failure... */
6185 if (buf_state & BM_IO_ERROR)
6186 {
6187 /* Buffer is pinned, so we can read tag without spinlock */
 6188 ereport(WARNING,
 6189 (errcode(ERRCODE_IO_ERROR),
 6190 errmsg("could not write block %u of %s",
 6191 buf_hdr->tag.blockNum,
 6192 relpathperm(BufTagGetRelFileLocator(&buf_hdr->tag),
 6193 BufTagGetForkNum(&buf_hdr->tag)).str),
6194 errdetail("Multiple failures --- write error might be permanent.")));
6195 }
6196 }
6197
6198 TerminateBufferIO(buf_hdr, false, BM_IO_ERROR, false, false);
6199}
#define BM_TAG_VALID
Definition: buf_internals.h:71
static ForkNumber BufTagGetForkNum(const BufferTag *tag)
static void UnlockBufHdr(BufferDesc *desc)
#define BM_DIRTY
Definition: buf_internals.h:69
#define BM_IO_IN_PROGRESS
Definition: buf_internals.h:72
static RelFileLocator BufTagGetRelFileLocator(const BufferTag *tag)
#define BM_VALID
Definition: buf_internals.h:70
#define BM_IO_ERROR
Definition: buf_internals.h:73
static BufferDesc * GetBufferDescriptor(uint32 id)
void TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint32 set_flag_bits, bool forget_owner, bool release_aio)
Definition: bufmgr.c:6104
uint32 LockBufHdr(BufferDesc *desc)
Definition: bufmgr.c:6264
uint32_t uint32
Definition: c.h:541
int errdetail(const char *fmt,...)
Definition: elog.c:1216
int errcode(int sqlerrcode)
Definition: elog.c:863
int errmsg(const char *fmt,...)
Definition: elog.c:1080
#define WARNING
Definition: elog.h:36
#define ereport(elevel,...)
Definition: elog.h:150
Assert(PointerIsAligned(start, uint64))
#define relpathperm(rlocator, forknum)
Definition: relpath.h:146
BufferTag tag
BlockNumber blockNum

References Assert(), buftag::blockNum, BM_DIRTY, BM_IO_ERROR, BM_IO_IN_PROGRESS, BM_TAG_VALID, BM_VALID, PrivateRefCountEntry::buffer, BufTagGetForkNum(), BufTagGetRelFileLocator(), ereport, errcode(), errdetail(), errmsg(), GetBufferDescriptor(), LockBufHdr(), relpathperm, BufferDesc::tag, TerminateBufferIO(), UnlockBufHdr(), and WARNING.

Referenced by ResOwnerReleaseBufferIO().

◆ AsyncReadBuffers()

static bool AsyncReadBuffers ( ReadBuffersOperation *  operation,
int *  nblocks_progress 
)
static

Definition at line 1745 of file bufmgr.c.

1746{
1747 Buffer *buffers = &operation->buffers[0];
1748 int flags = operation->flags;
1749 BlockNumber blocknum = operation->blocknum;
1750 ForkNumber forknum = operation->forknum;
1751 char persistence = operation->persistence;
1752 int16 nblocks_done = operation->nblocks_done;
1753 Buffer *io_buffers = &operation->buffers[nblocks_done];
1754 int io_buffers_len = 0;
1755 PgAioHandle *ioh;
1756 uint32 ioh_flags = 0;
1757 void *io_pages[MAX_IO_COMBINE_LIMIT];
1758 IOContext io_context;
1759 IOObject io_object;
1760 bool did_start_io;
1761
1762 /*
1763 * When this IO is executed synchronously, either because the caller will
1764 * immediately block waiting for the IO or because IOMETHOD_SYNC is used,
1765 * the AIO subsystem needs to know.
1766 */
1767 if (flags & READ_BUFFERS_SYNCHRONOUSLY)
1768 ioh_flags |= PGAIO_HF_SYNCHRONOUS;
1769
1770 if (persistence == RELPERSISTENCE_TEMP)
1771 {
1772 io_context = IOCONTEXT_NORMAL;
1773 io_object = IOOBJECT_TEMP_RELATION;
1774 ioh_flags |= PGAIO_HF_REFERENCES_LOCAL;
1775 }
1776 else
1777 {
1778 io_context = IOContextForStrategy(operation->strategy);
1779 io_object = IOOBJECT_RELATION;
1780 }
1781
1782 /*
1783 * If zero_damaged_pages is enabled, add the READ_BUFFERS_ZERO_ON_ERROR
1784 * flag. The reason for that is that, hopefully, zero_damaged_pages isn't
1785 * set globally, but on a per-session basis. The completion callback,
1786 * which may be run in other processes, e.g. in IO workers, may have a
1787 * different value of the zero_damaged_pages GUC.
1788 *
1789 * XXX: We probably should eventually use a different flag for
1790 * zero_damaged_pages, so we can report different log levels / error codes
1791 * for zero_damaged_pages and ZERO_ON_ERROR.
1792 */
 1793 if (zero_damaged_pages)
 1794 flags |= READ_BUFFERS_ZERO_ON_ERROR;
 1795
1796 /*
1797 * For the same reason as with zero_damaged_pages we need to use this
1798 * backend's ignore_checksum_failure value.
1799 */
 1800 if (ignore_checksum_failure)
 1801 flags |= READ_BUFFERS_IGNORE_CHECKSUM_FAILURES;
 1802
1803
1804 /*
1805 * To be allowed to report stats in the local completion callback we need
1806 * to prepare to report stats now. This ensures we can safely report the
1807 * checksum failure even in a critical section.
1808 */
 1809 pgstat_prepare_report_checksum_failure(operation->smgr->smgr_rlocator.locator.dbOid);
 1810
1811 /*
1812 * Get IO handle before ReadBuffersCanStartIO(), as pgaio_io_acquire()
1813 * might block, which we don't want after setting IO_IN_PROGRESS.
1814 *
1815 * If we need to wait for IO before we can get a handle, submit
1816 * already-staged IO first, so that other backends don't need to wait.
1817 * There wouldn't be a deadlock risk, as pgaio_io_acquire() just needs to
1818 * wait for already submitted IO, which doesn't require additional locks,
1819 * but it could still cause undesirable waits.
1820 *
1821 * A secondary benefit is that this would allow us to measure the time in
1822 * pgaio_io_acquire() without causing undue timer overhead in the common,
1823 * non-blocking, case. However, currently the pgstats infrastructure
1824 * doesn't really allow that, as it a) asserts that an operation can't
1825 * have time without operations b) doesn't have an API to report
1826 * "accumulated" time.
1827 */
 1828 ioh = pgaio_io_acquire_nb(CurrentResourceOwner, &operation->io_return);
 1829 if (unlikely(!ioh))
 1830 {
 1831 pgaio_submit_staged();
 1832
 1833 ioh = pgaio_io_acquire(CurrentResourceOwner, &operation->io_return);
 1834 }
1835
1836 /*
1837 * Check if we can start IO on the first to-be-read buffer.
1838 *
1839 * If an I/O is already in progress in another backend, we want to wait
1840 * for the outcome: either done, or something went wrong and we will
1841 * retry.
1842 */
1843 if (!ReadBuffersCanStartIO(buffers[nblocks_done], false))
1844 {
1845 /*
1846 * Someone else has already completed this block, we're done.
1847 *
1848 * When IO is necessary, ->nblocks_done is updated in
1849 * ProcessReadBuffersResult(), but that is not called if no IO is
1850 * necessary. Thus update here.
1851 */
1852 operation->nblocks_done += 1;
1853 *nblocks_progress = 1;
1854
1855 pgaio_io_release(ioh);
1856 pgaio_wref_clear(&operation->io_wref);
1857 did_start_io = false;
1858
1859 /*
1860 * Report and track this as a 'hit' for this backend, even though it
1861 * must have started out as a miss in PinBufferForBlock(). The other
1862 * backend will track this as a 'read'.
1863 */
1864 TRACE_POSTGRESQL_BUFFER_READ_DONE(forknum, blocknum + operation->nblocks_done,
1865 operation->smgr->smgr_rlocator.locator.spcOid,
1866 operation->smgr->smgr_rlocator.locator.dbOid,
1867 operation->smgr->smgr_rlocator.locator.relNumber,
1868 operation->smgr->smgr_rlocator.backend,
1869 true);
1870
 1871 if (persistence == RELPERSISTENCE_TEMP)
 1872 pgBufferUsage.local_blks_hit++;
 1873 else
 1874 pgBufferUsage.shared_blks_hit++;
 1875
1876 if (operation->rel)
1877 pgstat_count_buffer_hit(operation->rel);
1878
1879 pgstat_count_io_op(io_object, io_context, IOOP_HIT, 1, 0);
1880
 1881 if (VacuumCostActive)
 1882 VacuumCostBalance += VacuumCostPageHit;
 1883 }
1884 else
1885 {
1886 instr_time io_start;
1887
1888 /* We found a buffer that we need to read in. */
1889 Assert(io_buffers[0] == buffers[nblocks_done]);
1890 io_pages[0] = BufferGetBlock(buffers[nblocks_done]);
1891 io_buffers_len = 1;
1892
1893 /*
1894 * How many neighboring-on-disk blocks can we scatter-read into other
1895 * buffers at the same time? In this case we don't wait if we see an
1896 * I/O already in progress. We already set BM_IO_IN_PROGRESS for the
1897 * head block, so we should get on with that I/O as soon as possible.
1898 */
1899 for (int i = nblocks_done + 1; i < operation->nblocks; i++)
1900 {
1901 if (!ReadBuffersCanStartIO(buffers[i], true))
1902 break;
1903 /* Must be consecutive block numbers. */
1904 Assert(BufferGetBlockNumber(buffers[i - 1]) ==
1905 BufferGetBlockNumber(buffers[i]) - 1);
1906 Assert(io_buffers[io_buffers_len] == buffers[i]);
1907
1908 io_pages[io_buffers_len++] = BufferGetBlock(buffers[i]);
1909 }
1910
1911 /* get a reference to wait for in WaitReadBuffers() */
1912 pgaio_io_get_wref(ioh, &operation->io_wref);
1913
1914 /* provide the list of buffers to the completion callbacks */
1915 pgaio_io_set_handle_data_32(ioh, (uint32 *) io_buffers, io_buffers_len);
1916
 1917 pgaio_io_register_callbacks(ioh,
 1918 persistence == RELPERSISTENCE_TEMP ?
 1919 PGAIO_HCB_LOCAL_BUFFER_READV :
 1920 PGAIO_HCB_SHARED_BUFFER_READV,
 1921 flags);
1922
1923 pgaio_io_set_flag(ioh, ioh_flags);
1924
1925 /* ---
1926 * Even though we're trying to issue IO asynchronously, track the time
1927 * in smgrstartreadv():
1928 * - if io_method == IOMETHOD_SYNC, we will always perform the IO
1929 * immediately
1930 * - the io method might not support the IO (e.g. worker IO for a temp
1931 * table)
1932 * ---
1933 */
 1934 io_start = pgstat_prepare_io_time(track_io_timing);
 1935 smgrstartreadv(ioh, operation->smgr, forknum,
1936 blocknum + nblocks_done,
1937 io_pages, io_buffers_len);
1938 pgstat_count_io_op_time(io_object, io_context, IOOP_READ,
1939 io_start, 1, io_buffers_len * BLCKSZ);
1940
1941 if (persistence == RELPERSISTENCE_TEMP)
1942 pgBufferUsage.local_blks_read += io_buffers_len;
1943 else
1944 pgBufferUsage.shared_blks_read += io_buffers_len;
1945
1946 /*
1947 * Track vacuum cost when issuing IO, not after waiting for it.
1948 * Otherwise we could end up issuing a lot of IO in a short timespan,
1949 * despite a low cost limit.
1950 */
1951 if (VacuumCostActive)
1952 VacuumCostBalance += VacuumCostPageMiss * io_buffers_len;
1953
1954 *nblocks_progress = io_buffers_len;
1955 did_start_io = true;
1956 }
1957
1958 return did_start_io;
1959}
PgAioHandle * pgaio_io_acquire(struct ResourceOwnerData *resowner, PgAioReturn *ret)
Definition: aio.c:162
void pgaio_wref_clear(PgAioWaitRef *iow)
Definition: aio.c:964
void pgaio_io_get_wref(PgAioHandle *ioh, PgAioWaitRef *iow)
Definition: aio.c:366
void pgaio_io_set_flag(PgAioHandle *ioh, PgAioHandleFlags flag)
Definition: aio.c:330
void pgaio_submit_staged(void)
Definition: aio.c:1123
void pgaio_io_release(PgAioHandle *ioh)
Definition: aio.c:240
PgAioHandle * pgaio_io_acquire_nb(struct ResourceOwnerData *resowner, PgAioReturn *ret)
Definition: aio.c:188
@ PGAIO_HCB_LOCAL_BUFFER_READV
Definition: aio.h:200
@ PGAIO_HCB_SHARED_BUFFER_READV
Definition: aio.h:198
@ PGAIO_HF_SYNCHRONOUS
Definition: aio.h:70
@ PGAIO_HF_REFERENCES_LOCAL
Definition: aio.h:60
void pgaio_io_set_handle_data_32(PgAioHandle *ioh, uint32 *data, uint8 len)
Definition: aio_callback.c:140
void pgaio_io_register_callbacks(PgAioHandle *ioh, PgAioHandleCallbackID cb_id, uint8 cb_data)
Definition: aio_callback.c:86
uint32 BlockNumber
Definition: block.h:31
int Buffer
Definition: buf.h:23
bool track_io_timing
Definition: bufmgr.c:147
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition: bufmgr.c:4223
static bool ReadBuffersCanStartIO(Buffer buffer, bool nowait)
Definition: bufmgr.c:1545
bool zero_damaged_pages
Definition: bufmgr.c:144
#define READ_BUFFERS_ZERO_ON_ERROR
Definition: bufmgr.h:122
static Block BufferGetBlock(Buffer buffer)
Definition: bufmgr.h:403
#define MAX_IO_COMBINE_LIMIT
Definition: bufmgr.h:173
#define READ_BUFFERS_IGNORE_CHECKSUM_FAILURES
Definition: bufmgr.h:126
#define READ_BUFFERS_SYNCHRONOUSLY
Definition: bufmgr.h:128
bool ignore_checksum_failure
Definition: bufpage.c:27
int16_t int16
Definition: c.h:536
#define unlikely(x)
Definition: c.h:407
IOContext IOContextForStrategy(BufferAccessStrategy strategy)
Definition: freelist.c:747
int VacuumCostPageMiss
Definition: globals.c:152
bool VacuumCostActive
Definition: globals.c:158
int VacuumCostBalance
Definition: globals.c:157
int VacuumCostPageHit
Definition: globals.c:151
BufferUsage pgBufferUsage
Definition: instrument.c:20
int i
Definition: isn.c:77
IOObject
Definition: pgstat.h:276
@ IOOBJECT_RELATION
Definition: pgstat.h:277
@ IOOBJECT_TEMP_RELATION
Definition: pgstat.h:278
IOContext
Definition: pgstat.h:285
@ IOCONTEXT_NORMAL
Definition: pgstat.h:289
@ IOOP_READ
Definition: pgstat.h:315
@ IOOP_HIT
Definition: pgstat.h:309
#define pgstat_count_buffer_hit(rel)
Definition: pgstat.h:720
void pgstat_prepare_report_checksum_failure(Oid dboid)
instr_time pgstat_prepare_io_time(bool track_io_guc)
Definition: pgstat_io.c:91
void pgstat_count_io_op(IOObject io_object, IOContext io_context, IOOp io_op, uint32 cnt, uint64 bytes)
Definition: pgstat_io.c:68
void pgstat_count_io_op_time(IOObject io_object, IOContext io_context, IOOp io_op, instr_time start_time, uint32 cnt, uint64 bytes)
Definition: pgstat_io.c:122
ForkNumber
Definition: relpath.h:56
ResourceOwner CurrentResourceOwner
Definition: resowner.c:173
void smgrstartreadv(PgAioHandle *ioh, SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, void **buffers, BlockNumber nblocks)
Definition: smgr.c:753
int64 local_blks_hit
Definition: instrument.h:30
int64 shared_blks_read
Definition: instrument.h:27
int64 local_blks_read
Definition: instrument.h:31
int64 shared_blks_hit
Definition: instrument.h:26
ForkNumber forknum
Definition: bufmgr.h:137
PgAioWaitRef io_wref
Definition: bufmgr.h:150
Buffer * buffers
Definition: bufmgr.h:145
SMgrRelation smgr
Definition: bufmgr.h:135
BufferAccessStrategy strategy
Definition: bufmgr.h:138
BlockNumber blocknum
Definition: bufmgr.h:146
PgAioReturn io_return
Definition: bufmgr.h:151
RelFileLocator locator
RelFileNumber relNumber
RelFileLocatorBackend smgr_rlocator
Definition: smgr.h:38

References Assert(), RelFileLocatorBackend::backend, ReadBuffersOperation::blocknum, BufferGetBlock(), BufferGetBlockNumber(), ReadBuffersOperation::buffers, CurrentResourceOwner, RelFileLocator::dbOid, ReadBuffersOperation::flags, ReadBuffersOperation::forknum, i, ignore_checksum_failure, ReadBuffersOperation::io_return, ReadBuffersOperation::io_wref, IOCONTEXT_NORMAL, IOContextForStrategy(), IOOBJECT_RELATION, IOOBJECT_TEMP_RELATION, IOOP_HIT, IOOP_READ, BufferUsage::local_blks_hit, BufferUsage::local_blks_read, RelFileLocatorBackend::locator, MAX_IO_COMBINE_LIMIT, ReadBuffersOperation::nblocks, ReadBuffersOperation::nblocks_done, ReadBuffersOperation::persistence, PGAIO_HCB_LOCAL_BUFFER_READV, PGAIO_HCB_SHARED_BUFFER_READV, PGAIO_HF_REFERENCES_LOCAL, PGAIO_HF_SYNCHRONOUS, pgaio_io_acquire(), pgaio_io_acquire_nb(), pgaio_io_get_wref(), pgaio_io_register_callbacks(), pgaio_io_release(), pgaio_io_set_flag(), pgaio_io_set_handle_data_32(), pgaio_submit_staged(), pgaio_wref_clear(), pgBufferUsage, pgstat_count_buffer_hit, pgstat_count_io_op(), pgstat_count_io_op_time(), pgstat_prepare_io_time(), pgstat_prepare_report_checksum_failure(), READ_BUFFERS_IGNORE_CHECKSUM_FAILURES, READ_BUFFERS_SYNCHRONOUSLY, READ_BUFFERS_ZERO_ON_ERROR, ReadBuffersCanStartIO(), ReadBuffersOperation::rel, RelFileLocator::relNumber, BufferUsage::shared_blks_hit, BufferUsage::shared_blks_read, ReadBuffersOperation::smgr, SMgrRelationData::smgr_rlocator, smgrstartreadv(), RelFileLocator::spcOid, ReadBuffersOperation::strategy, track_io_timing, unlikely, VacuumCostActive, VacuumCostBalance, VacuumCostPageHit, VacuumCostPageMiss, and zero_damaged_pages.

Referenced by StartReadBuffersImpl(), and WaitReadBuffers().
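
AsyncReadBuffers() is internal; external callers drive reads through the StartReadBuffers()/WaitReadBuffers() pair listed above. A minimal usage sketch, given an open Relation rel (the struct-field setup shown is an assumption patterned after in-tree callers, not a documented contract):

ReadBuffersOperation op = {0};
Buffer      buffers[4];
int         nblocks = 4;
BlockNumber blocknum = 0;       /* starting block, for the example */

op.rel = rel;
op.smgr = RelationGetSmgr(rel);
op.persistence = rel->rd_rel->relpersistence;
op.forknum = MAIN_FORKNUM;
op.strategy = NULL;

if (StartReadBuffers(&op, buffers, blocknum, &nblocks, 0))
    WaitReadBuffers(&op);       /* an I/O was started; wait for completion */

/* on return, buffers[0 .. nblocks - 1] are pinned and valid */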

◆ AtEOXact_Buffers()

void AtEOXact_Buffers ( bool  isCommit)

Definition at line 3990 of file bufmgr.c.

3991{
 3992 CheckForBufferLeaks();
 3993
3994 AtEOXact_LocalBuffers(isCommit);
3995
 3996 Assert(PrivateRefCountOverflowed == 0);
 3997}
static void CheckForBufferLeaks(void)
Definition: bufmgr.c:4059
static int32 PrivateRefCountOverflowed
Definition: bufmgr.c:217
void AtEOXact_LocalBuffers(bool isCommit)
Definition: localbuf.c:1003

References Assert(), AtEOXact_LocalBuffers(), CheckForBufferLeaks(), and PrivateRefCountOverflowed.

Referenced by AbortTransaction(), AutoVacLauncherMain(), BackgroundWriterMain(), CheckpointerMain(), CommitTransaction(), PrepareTransaction(), and WalWriterMain().

◆ AtProcExit_Buffers()

static void AtProcExit_Buffers ( int  code,
Datum  arg 
)
static

Definition at line 4041 of file bufmgr.c.

4042{
4043 UnlockBuffers();
4044
 4045 CheckForBufferLeaks();
 4046
4047 /* localbuf.c needs a chance too */
 4048 AtProcExit_LocalBuffers();
 4049}
void UnlockBuffers(void)
Definition: bufmgr.c:5573
void AtProcExit_LocalBuffers(void)
Definition: localbuf.c:1014

References AtProcExit_LocalBuffers(), CheckForBufferLeaks(), and UnlockBuffers().

Referenced by InitBufferManagerAccess().

◆ BgBufferSync()

bool BgBufferSync ( WritebackContext *  wb_context)

Definition at line 3622 of file bufmgr.c.

3623{
3624 /* info obtained from freelist.c */
3625 int strategy_buf_id;
3626 uint32 strategy_passes;
3627 uint32 recent_alloc;
3628
3629 /*
3630 * Information saved between calls so we can determine the strategy
3631 * point's advance rate and avoid scanning already-cleaned buffers.
3632 */
3633 static bool saved_info_valid = false;
3634 static int prev_strategy_buf_id;
3635 static uint32 prev_strategy_passes;
3636 static int next_to_clean;
3637 static uint32 next_passes;
3638
3639 /* Moving averages of allocation rate and clean-buffer density */
3640 static float smoothed_alloc = 0;
3641 static float smoothed_density = 10.0;
3642
3643 /* Potentially these could be tunables, but for now, not */
3644 float smoothing_samples = 16;
3645 float scan_whole_pool_milliseconds = 120000.0;
3646
3647 /* Used to compute how far we scan ahead */
3648 long strategy_delta;
3649 int bufs_to_lap;
3650 int bufs_ahead;
3651 float scans_per_alloc;
3652 int reusable_buffers_est;
3653 int upcoming_alloc_est;
3654 int min_scan_buffers;
3655
3656 /* Variables for the scanning loop proper */
3657 int num_to_scan;
3658 int num_written;
3659 int reusable_buffers;
3660
3661 /* Variables for final smoothed_density update */
3662 long new_strategy_delta;
3663 uint32 new_recent_alloc;
3664
3665 /*
3666 * Find out where the clock-sweep currently is, and how many buffer
3667 * allocations have happened since our last call.
3668 */
3669 strategy_buf_id = StrategySyncStart(&strategy_passes, &recent_alloc);
3670
3671 /* Report buffer alloc counts to pgstat */
3672 PendingBgWriterStats.buf_alloc += recent_alloc;
3673
3674 /*
3675 * If we're not running the LRU scan, just stop after doing the stats
3676 * stuff. We mark the saved state invalid so that we can recover sanely
3677 * if LRU scan is turned back on later.
3678 */
3679 if (bgwriter_lru_maxpages <= 0)
3680 {
3681 saved_info_valid = false;
3682 return true;
3683 }
3684
3685 /*
3686 * Compute strategy_delta = how many buffers have been scanned by the
3687 * clock-sweep since last time. If first time through, assume none. Then
3688 * see if we are still ahead of the clock-sweep, and if so, how many
3689 * buffers we could scan before we'd catch up with it and "lap" it. Note:
3690 * weird-looking coding of xxx_passes comparisons are to avoid bogus
3691 * behavior when the passes counts wrap around.
3692 */
3693 if (saved_info_valid)
3694 {
3695 int32 passes_delta = strategy_passes - prev_strategy_passes;
3696
3697 strategy_delta = strategy_buf_id - prev_strategy_buf_id;
3698 strategy_delta += (long) passes_delta * NBuffers;
3699
3700 Assert(strategy_delta >= 0);
3701
3702 if ((int32) (next_passes - strategy_passes) > 0)
3703 {
3704 /* we're one pass ahead of the strategy point */
3705 bufs_to_lap = strategy_buf_id - next_to_clean;
3706#ifdef BGW_DEBUG
3707 elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
3708 next_passes, next_to_clean,
3709 strategy_passes, strategy_buf_id,
3710 strategy_delta, bufs_to_lap);
3711#endif
3712 }
3713 else if (next_passes == strategy_passes &&
3714 next_to_clean >= strategy_buf_id)
3715 {
3716 /* on same pass, but ahead or at least not behind */
3717 bufs_to_lap = NBuffers - (next_to_clean - strategy_buf_id);
3718#ifdef BGW_DEBUG
3719 elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
3720 next_passes, next_to_clean,
3721 strategy_passes, strategy_buf_id,
3722 strategy_delta, bufs_to_lap);
3723#endif
3724 }
3725 else
3726 {
3727 /*
3728 * We're behind, so skip forward to the strategy point and start
3729 * cleaning from there.
3730 */
3731#ifdef BGW_DEBUG
3732 elog(DEBUG2, "bgwriter behind: bgw %u-%u strategy %u-%u delta=%ld",
3733 next_passes, next_to_clean,
3734 strategy_passes, strategy_buf_id,
3735 strategy_delta);
3736#endif
3737 next_to_clean = strategy_buf_id;
3738 next_passes = strategy_passes;
3739 bufs_to_lap = NBuffers;
3740 }
3741 }
3742 else
3743 {
3744 /*
3745 * Initializing at startup or after LRU scanning had been off. Always
3746 * start at the strategy point.
3747 */
3748#ifdef BGW_DEBUG
3749 elog(DEBUG2, "bgwriter initializing: strategy %u-%u",
3750 strategy_passes, strategy_buf_id);
3751#endif
3752 strategy_delta = 0;
3753 next_to_clean = strategy_buf_id;
3754 next_passes = strategy_passes;
3755 bufs_to_lap = NBuffers;
3756 }
3757
3758 /* Update saved info for next time */
3759 prev_strategy_buf_id = strategy_buf_id;
3760 prev_strategy_passes = strategy_passes;
3761 saved_info_valid = true;
3762
3763 /*
3764 * Compute how many buffers had to be scanned for each new allocation, ie,
3765 * 1/density of reusable buffers, and track a moving average of that.
3766 *
3767 * If the strategy point didn't move, we don't update the density estimate
3768 */
3769 if (strategy_delta > 0 && recent_alloc > 0)
3770 {
3771 scans_per_alloc = (float) strategy_delta / (float) recent_alloc;
3772 smoothed_density += (scans_per_alloc - smoothed_density) /
3773 smoothing_samples;
3774 }
3775
3776 /*
3777 * Estimate how many reusable buffers there are between the current
3778 * strategy point and where we've scanned ahead to, based on the smoothed
3779 * density estimate.
3780 */
3781 bufs_ahead = NBuffers - bufs_to_lap;
3782 reusable_buffers_est = (float) bufs_ahead / smoothed_density;
3783
3784 /*
3785 * Track a moving average of recent buffer allocations. Here, rather than
3786 * a true average we want a fast-attack, slow-decline behavior: we
3787 * immediately follow any increase.
3788 */
3789 if (smoothed_alloc <= (float) recent_alloc)
3790 smoothed_alloc = recent_alloc;
3791 else
3792 smoothed_alloc += ((float) recent_alloc - smoothed_alloc) /
3793 smoothing_samples;
3794
3795 /* Scale the estimate by a GUC to allow more aggressive tuning. */
3796 upcoming_alloc_est = (int) (smoothed_alloc * bgwriter_lru_multiplier);
3797
3798 /*
3799 * If recent_alloc remains at zero for many cycles, smoothed_alloc will
3800 * eventually underflow to zero, and the underflows produce annoying
3801 * kernel warnings on some platforms. Once upcoming_alloc_est has gone to
3802 * zero, there's no point in tracking smaller and smaller values of
3803 * smoothed_alloc, so just reset it to exactly zero to avoid this
3804 * syndrome. It will pop back up as soon as recent_alloc increases.
3805 */
3806 if (upcoming_alloc_est == 0)
3807 smoothed_alloc = 0;
3808
3809 /*
3810 * Even in cases where there's been little or no buffer allocation
3811 * activity, we want to make a small amount of progress through the buffer
3812 * cache so that as many reusable buffers as possible are clean after an
3813 * idle period.
3814 *
3815 * (scan_whole_pool_milliseconds / BgWriterDelay) computes how many times
3816 * the BGW will be called during the scan_whole_pool time; slice the
3817 * buffer pool into that many sections.
3818 */
3819 min_scan_buffers = (int) (NBuffers / (scan_whole_pool_milliseconds / BgWriterDelay));
3820
3821 if (upcoming_alloc_est < (min_scan_buffers + reusable_buffers_est))
3822 {
3823#ifdef BGW_DEBUG
3824 elog(DEBUG2, "bgwriter: alloc_est=%d too small, using min=%d + reusable_est=%d",
3825 upcoming_alloc_est, min_scan_buffers, reusable_buffers_est);
3826#endif
3827 upcoming_alloc_est = min_scan_buffers + reusable_buffers_est;
3828 }
3829
3830 /*
3831 * Now write out dirty reusable buffers, working forward from the
3832 * next_to_clean point, until we have lapped the strategy scan, or cleaned
3833 * enough buffers to match our estimate of the next cycle's allocation
3834 * requirements, or hit the bgwriter_lru_maxpages limit.
3835 */
3836
3837 num_to_scan = bufs_to_lap;
3838 num_written = 0;
3839 reusable_buffers = reusable_buffers_est;
3840
3841 /* Execute the LRU scan */
3842 while (num_to_scan > 0 && reusable_buffers < upcoming_alloc_est)
3843 {
3844 int sync_state = SyncOneBuffer(next_to_clean, true,
3845 wb_context);
3846
3847 if (++next_to_clean >= NBuffers)
3848 {
3849 next_to_clean = 0;
3850 next_passes++;
3851 }
3852 num_to_scan--;
3853
3854 if (sync_state & BUF_WRITTEN)
3855 {
3856 reusable_buffers++;
3857 if (++num_written >= bgwriter_lru_maxpages)
3858 {
 3859 PendingBgWriterStats.maxwritten_clean++;
 3860 break;
3861 }
3862 }
3863 else if (sync_state & BUF_REUSABLE)
3864 reusable_buffers++;
3865 }
3866
 3867 PendingBgWriterStats.buf_written_clean += num_written;
 3868
3869#ifdef BGW_DEBUG
3870 elog(DEBUG1, "bgwriter: recent_alloc=%u smoothed=%.2f delta=%ld ahead=%d density=%.2f reusable_est=%d upcoming_est=%d scanned=%d wrote=%d reusable=%d",
3871 recent_alloc, smoothed_alloc, strategy_delta, bufs_ahead,
3872 smoothed_density, reusable_buffers_est, upcoming_alloc_est,
3873 bufs_to_lap - num_to_scan,
3874 num_written,
3875 reusable_buffers - reusable_buffers_est);
3876#endif
3877
3878 /*
3879 * Consider the above scan as being like a new allocation scan.
3880 * Characterize its density and update the smoothed one based on it. This
3881 * effectively halves the moving average period in cases where both the
3882 * strategy and the background writer are doing some useful scanning,
3883 * which is helpful because a long memory isn't as desirable on the
3884 * density estimates.
3885 */
3886 new_strategy_delta = bufs_to_lap - num_to_scan;
3887 new_recent_alloc = reusable_buffers - reusable_buffers_est;
3888 if (new_strategy_delta > 0 && new_recent_alloc > 0)
3889 {
3890 scans_per_alloc = (float) new_strategy_delta / (float) new_recent_alloc;
3891 smoothed_density += (scans_per_alloc - smoothed_density) /
3892 smoothing_samples;
3893
3894#ifdef BGW_DEBUG
3895 elog(DEBUG2, "bgwriter: cleaner density alloc=%u scan=%ld density=%.2f new smoothed=%.2f",
3896 new_recent_alloc, new_strategy_delta,
3897 scans_per_alloc, smoothed_density);
3898#endif
3899 }
3900
3901 /* Return true if OK to hibernate */
3902 return (bufs_to_lap == 0 && recent_alloc == 0);
3903}
int BgWriterDelay
Definition: bgwriter.c:58
#define BUF_REUSABLE
Definition: bufmgr.c:81
double bgwriter_lru_multiplier
Definition: bufmgr.c:146
static int SyncOneBuffer(int buf_id, bool skip_recently_used, WritebackContext *wb_context)
Definition: bufmgr.c:3920
int bgwriter_lru_maxpages
Definition: bufmgr.c:145
#define BUF_WRITTEN
Definition: bufmgr.c:80
int32_t int32
Definition: c.h:537
#define DEBUG2
Definition: elog.h:29
#define DEBUG1
Definition: elog.h:30
#define elog(elevel,...)
Definition: elog.h:226
int StrategySyncStart(uint32 *complete_passes, uint32 *num_buf_alloc)
Definition: freelist.c:321
int NBuffers
Definition: globals.c:142
PgStat_BgWriterStats PendingBgWriterStats
PgStat_Counter buf_written_clean
Definition: pgstat.h:242
PgStat_Counter maxwritten_clean
Definition: pgstat.h:243
PgStat_Counter buf_alloc
Definition: pgstat.h:244

References Assert(), bgwriter_lru_maxpages, bgwriter_lru_multiplier, BgWriterDelay, PgStat_BgWriterStats::buf_alloc, BUF_REUSABLE, BUF_WRITTEN, PgStat_BgWriterStats::buf_written_clean, DEBUG1, DEBUG2, elog, PgStat_BgWriterStats::maxwritten_clean, NBuffers, PendingBgWriterStats, StrategySyncStart(), and SyncOneBuffer().

Referenced by BackgroundWriterMain().
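
For a sense of scale, the min_scan_buffers floor computed above slices the pool into scan_whole_pool_milliseconds / BgWriterDelay rounds. A worked example under assumed settings:

/* Assumed settings: shared_buffers = 128MB => NBuffers = 16384, and
 * bgwriter_delay (BgWriterDelay) = 200ms. The 120s whole-pool target is
 * spread over 120000 / 200 = 600 rounds, so each BgBufferSync() call
 * scans at least 16384 / 600 ~= 27 buffers even when allocation is idle. */
int min_scan_buffers = (int) (16384 / (120000.0 / 200));    /* == 27 */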

◆ buffer_readv_complete()

static pg_attribute_always_inline PgAioResult buffer_readv_complete ( PgAioHandle *  ioh,
PgAioResult  prior_result,
uint8  cb_data,
bool  is_temp 
)
static

Definition at line 7374 of file bufmgr.c.

7376{
7377 PgAioResult result = prior_result;
 7378 PgAioTargetData *td = pgaio_io_get_target_data(ioh);
 7379 uint8 first_error_off = 0;
7380 uint8 first_zeroed_off = 0;
7381 uint8 first_ignored_off = 0;
7382 uint8 error_count = 0;
7383 uint8 zeroed_count = 0;
7384 uint8 ignored_count = 0;
7385 uint8 checkfail_count = 0;
7386 uint64 *io_data;
7387 uint8 handle_data_len;
7388
7389 if (is_temp)
7390 {
7391 Assert(td->smgr.is_temp);
 7392 Assert(pgaio_io_get_owner(ioh) == MyProcNumber);
 7393 }
7394 else
7395 Assert(!td->smgr.is_temp);
7396
7397 /*
7398 * Iterate over all the buffers affected by this IO and call the
7399 * per-buffer completion function for each buffer.
7400 */
7401 io_data = pgaio_io_get_handle_data(ioh, &handle_data_len);
7402 for (uint8 buf_off = 0; buf_off < handle_data_len; buf_off++)
7403 {
7404 Buffer buf = io_data[buf_off];
7405 bool failed;
7406 bool failed_verification = false;
7407 bool failed_checksum = false;
7408 bool zeroed_buffer = false;
7409 bool ignored_checksum = false;
7410
 7411
 7412 Assert(BufferIsValid(buf));
7413 /*
7414 * If the entire I/O failed on a lower-level, each buffer needs to be
7415 * marked as failed. In case of a partial read, the first few buffers
7416 * may be ok.
7417 */
7418 failed =
7419 prior_result.status == PGAIO_RS_ERROR
7420 || prior_result.result <= buf_off;
7421
7422 buffer_readv_complete_one(td, buf_off, buf, cb_data, failed, is_temp,
7423 &failed_verification,
7424 &failed_checksum,
7425 &ignored_checksum,
7426 &zeroed_buffer);
7427
7428 /*
7429 * Track information about the number of different kinds of error
7430 * conditions across all pages, as there can be multiple pages failing
7431 * verification as part of one IO.
7432 */
7433 if (failed_verification && !zeroed_buffer && error_count++ == 0)
7434 first_error_off = buf_off;
7435 if (zeroed_buffer && zeroed_count++ == 0)
7436 first_zeroed_off = buf_off;
7437 if (ignored_checksum && ignored_count++ == 0)
7438 first_ignored_off = buf_off;
7439 if (failed_checksum)
7440 checkfail_count++;
7441 }
7442
7443 /*
7444 * If the smgr read succeeded [partially] and page verification failed for
7445 * some of the pages, adjust the IO's result state appropriately.
7446 */
7447 if (prior_result.status != PGAIO_RS_ERROR &&
7448 (error_count > 0 || ignored_count > 0 || zeroed_count > 0))
7449 {
7450 buffer_readv_encode_error(&result, is_temp,
7451 zeroed_count > 0, ignored_count > 0,
7452 error_count, zeroed_count, checkfail_count,
7453 first_error_off, first_zeroed_off,
7454 first_ignored_off);
7455 pgaio_result_report(result, td, DEBUG1);
7456 }
7457
7458 /*
7459 * For shared relations this reporting is done in
7460 * shared_buffer_readv_complete_local().
7461 */
7462 if (is_temp && checkfail_count > 0)
 7463 pgstat_report_checksum_failures_in_db(td->smgr.rlocator.dbOid,
 7464 checkfail_count);
7465
7466 return result;
7467}
ProcNumber pgaio_io_get_owner(PgAioHandle *ioh)
Definition: aio.c:355
uint64 * pgaio_io_get_handle_data(PgAioHandle *ioh, uint8 *len)
Definition: aio_callback.c:156
void pgaio_result_report(PgAioResult result, const PgAioTargetData *target_data, int elevel)
Definition: aio_callback.c:173
PgAioTargetData * pgaio_io_get_target_data(PgAioHandle *ioh)
Definition: aio_target.c:73
@ PGAIO_RS_ERROR
Definition: aio_types.h:84
static pg_attribute_always_inline void buffer_readv_complete_one(PgAioTargetData *td, uint8 buf_off, Buffer buffer, uint8 flags, bool failed, bool is_temp, bool *buffer_invalid, bool *failed_checksum, bool *ignored_checksum, bool *zeroed_buffer)
Definition: bufmgr.c:7230
static void buffer_readv_encode_error(PgAioResult *result, bool is_temp, bool zeroed_any, bool ignored_any, uint8 error_count, uint8 zeroed_count, uint8 checkfail_count, uint8 first_error_off, uint8 first_zeroed_off, uint8 first_ignored_off)
Definition: bufmgr.c:7135
uint8_t uint8
Definition: c.h:539
uint64_t uint64
Definition: c.h:542
ProcNumber MyProcNumber
Definition: globals.c:90
static char * buf
Definition: pg_test_fsync.c:72
void pgstat_report_checksum_failures_in_db(Oid dboid, int failurecount)
uint32 status
Definition: aio_types.h:108
int32 result
Definition: aio_types.h:113
RelFileLocator rlocator
Definition: aio_types.h:65
struct PgAioTargetData::@126 smgr

References Assert(), buf, buffer_readv_complete_one(), buffer_readv_encode_error(), BufferIsValid(), RelFileLocator::dbOid, DEBUG1, PgAioTargetData::is_temp, MyProcNumber, pgaio_io_get_handle_data(), pgaio_io_get_owner(), pgaio_io_get_target_data(), pgaio_result_report(), PGAIO_RS_ERROR, pgstat_report_checksum_failures_in_db(), PgAioResult::result, PgAioTargetData::rlocator, PgAioTargetData::smgr, and PgAioResult::status.

Referenced by local_buffer_readv_complete(), and shared_buffer_readv_complete().
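
The per-buffer failure test above (prior_result.status == PGAIO_RS_ERROR || prior_result.result <= buf_off) is what makes partial reads work: result counts the blocks the smgr read actually transferred. A worked example:

/* A 4-block readv whose prior_result.result is 2: offsets 0 and 1
 * completed; offsets 2 and 3 are treated as failed. */
int32 blocks_done = 2;                      /* stand-in for prior_result.result */
for (uint8 buf_off = 0; buf_off < 4; buf_off++)
{
    bool failed = (blocks_done <= buf_off); /* false, false, true, true */
    /* ... per-buffer handling as in buffer_readv_complete_one() ... */
}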

◆ buffer_readv_complete_one()

static pg_attribute_always_inline void buffer_readv_complete_one ( PgAioTargetData *  td,
uint8  buf_off,
Buffer  buffer,
uint8  flags,
bool  failed,
bool  is_temp,
bool *  buffer_invalid,
bool *  failed_checksum,
bool *  ignored_checksum,
bool *  zeroed_buffer 
)
static

Definition at line 7230 of file bufmgr.c.

7236{
7237 BufferDesc *buf_hdr = is_temp ?
7238 GetLocalBufferDescriptor(-buffer - 1)
7239 : GetBufferDescriptor(buffer - 1);
7240 BufferTag tag = buf_hdr->tag;
7241 char *bufdata = BufferGetBlock(buffer);
7242 uint32 set_flag_bits;
7243 int piv_flags;
7244
7245 /* check that the buffer is in the expected state for a read */
7246#ifdef USE_ASSERT_CHECKING
7247 {
7248 uint32 buf_state = pg_atomic_read_u32(&buf_hdr->state);
7249
7250 Assert(buf_state & BM_TAG_VALID);
7251 Assert(!(buf_state & BM_VALID));
7252 /* temp buffers don't use BM_IO_IN_PROGRESS */
7253 if (!is_temp)
7254 Assert(buf_state & BM_IO_IN_PROGRESS);
7255 Assert(!(buf_state & BM_DIRTY));
7256 }
7257#endif
7258
7259 *buffer_invalid = false;
7260 *failed_checksum = false;
7261 *ignored_checksum = false;
7262 *zeroed_buffer = false;
7263
7264 /*
7265 * We ask PageIsVerified() to only log the message about checksum errors,
7266 * as the completion might be run in any backend (or IO workers). We will
7267 * report checksum errors in buffer_readv_report().
7268 */
7269 piv_flags = PIV_LOG_LOG;
7270
7271 /* the local zero_damaged_pages may differ from the definer's */
 7272 if (flags & READ_BUFFERS_IGNORE_CHECKSUM_FAILURES)
 7273 piv_flags |= PIV_IGNORE_CHECKSUM_FAILURE;
7274
7275 /* Check for garbage data. */
7276 if (!failed)
7277 {
7278 /*
7279 * If the buffer is not currently pinned by this backend, e.g. because
7280 * we're completing this IO after an error, the buffer data will have
7281 * been marked as inaccessible when the buffer was unpinned. The AIO
7282 * subsystem holds a pin, but that doesn't prevent the buffer from
7283 * having been marked as inaccessible. The completion might also be
7284 * executed in a different process.
7285 */
7286#ifdef USE_VALGRIND
7287 if (!BufferIsPinned(buffer))
7288 VALGRIND_MAKE_MEM_DEFINED(bufdata, BLCKSZ);
7289#endif
7290
7291 if (!PageIsVerified((Page) bufdata, tag.blockNum, piv_flags,
7292 failed_checksum))
7293 {
7294 if (flags & READ_BUFFERS_ZERO_ON_ERROR)
7295 {
7296 memset(bufdata, 0, BLCKSZ);
7297 *zeroed_buffer = true;
7298 }
7299 else
7300 {
7301 *buffer_invalid = true;
7302 /* mark buffer as having failed */
7303 failed = true;
7304 }
7305 }
7306 else if (*failed_checksum)
7307 *ignored_checksum = true;
7308
7309 /* undo what we did above */
7310#ifdef USE_VALGRIND
7311 if (!BufferIsPinned(buffer))
7312 VALGRIND_MAKE_MEM_NOACCESS(bufdata, BLCKSZ);
7313#endif
7314
7315 /*
7316 * Immediately log a message about the invalid page, but only to the
7317 * server log. The reason to do so immediately is that this may be
7318 * executed in a different backend than the one that originated the
7319 * request. The reason to do so immediately is that the originator
7320 * might not process the query result immediately (because it is busy
7321 * doing another part of query processing) or at all (e.g. if it was
7322 * cancelled or errored out due to another IO also failing). The
7323 * definer of the IO will emit an ERROR or WARNING when processing the
7324 * IO's results
7325 *
7326 * To avoid duplicating the code to emit these log messages, we reuse
7327 * buffer_readv_report().
7328 */
7329 if (*buffer_invalid || *failed_checksum || *zeroed_buffer)
7330 {
7331 PgAioResult result_one = {0};
7332
7333 buffer_readv_encode_error(&result_one, is_temp,
7334 *zeroed_buffer,
7335 *ignored_checksum,
7336 *buffer_invalid,
7337 *zeroed_buffer ? 1 : 0,
7338 *failed_checksum ? 1 : 0,
7339 buf_off, buf_off, buf_off);
7340 pgaio_result_report(result_one, td, LOG_SERVER_ONLY);
7341 }
7342 }
7343
7344 /* Terminate I/O and set BM_VALID. */
7345 set_flag_bits = failed ? BM_IO_ERROR : BM_VALID;
7346 if (is_temp)
7347 TerminateLocalBufferIO(buf_hdr, false, set_flag_bits, true);
7348 else
7349 TerminateBufferIO(buf_hdr, false, set_flag_bits, false, true);
7350
7351 /*
7352 * Call the BUFFER_READ_DONE tracepoint in the callback, even though the
7353 * callback may not be executed in the same backend that called
7354 * BUFFER_READ_START. The alternative would be to defer calling the
7355 * tracepoint to a later point (e.g. the local completion callback for
7356 * shared buffer reads), which seems even less helpful.
7357 */
7358 TRACE_POSTGRESQL_BUFFER_READ_DONE(tag.forkNum,
7359 tag.blockNum,
7360 tag.spcOid,
7361 tag.dbOid,
7362 tag.relNumber,
 7363 is_temp ? MyProcNumber : INVALID_PROC_NUMBER,
 7364 false);
7365}
static uint32 pg_atomic_read_u32(volatile pg_atomic_uint32 *ptr)
Definition: atomics.h:237
static BufferDesc * GetLocalBufferDescriptor(uint32 id)
#define BufferIsPinned(bufnum)
Definition: bufmgr.c:483
bool PageIsVerified(PageData *page, BlockNumber blkno, int flags, bool *checksum_failure_p)
Definition: bufpage.c:94
#define PIV_LOG_LOG
Definition: bufpage.h:468
PageData * Page
Definition: bufpage.h:81
#define PIV_IGNORE_CHECKSUM_FAILURE
Definition: bufpage.h:469
#define LOG_SERVER_ONLY
Definition: elog.h:32
void TerminateLocalBufferIO(BufferDesc *bufHdr, bool clear_dirty, uint32 set_flag_bits, bool release_aio)
Definition: localbuf.c:562
#define VALGRIND_MAKE_MEM_DEFINED(addr, size)
Definition: memdebug.h:26
#define VALGRIND_MAKE_MEM_NOACCESS(addr, size)
Definition: memdebug.h:27
#define INVALID_PROC_NUMBER
Definition: procnumber.h:26
pg_atomic_uint32 state
RelFileNumber relNumber
ForkNumber forkNum
Oid spcOid

References Assert(), buftag::blockNum, BM_DIRTY, BM_IO_ERROR, BM_IO_IN_PROGRESS, BM_TAG_VALID, BM_VALID, PrivateRefCountEntry::buffer, buffer_readv_encode_error(), BufferGetBlock(), BufferIsPinned, buftag::dbOid, buftag::forkNum, GetBufferDescriptor(), GetLocalBufferDescriptor(), INVALID_PROC_NUMBER, LOG_SERVER_ONLY, MyProcNumber, PageIsVerified(), pg_atomic_read_u32(), pgaio_result_report(), PIV_IGNORE_CHECKSUM_FAILURE, PIV_LOG_LOG, READ_BUFFERS_IGNORE_CHECKSUM_FAILURES, READ_BUFFERS_ZERO_ON_ERROR, buftag::relNumber, buftag::spcOid, BufferDesc::state, BufferDesc::tag, TerminateBufferIO(), TerminateLocalBufferIO(), VALGRIND_MAKE_MEM_DEFINED, and VALGRIND_MAKE_MEM_NOACCESS.

Referenced by buffer_readv_complete().

◆ buffer_readv_decode_error()

static void buffer_readv_decode_error ( PgAioResult  result,
bool *  zeroed_any,
bool *  ignored_any,
uint8 *  zeroed_or_error_count,
uint8 *  checkfail_count,
uint8 *  first_off 
)
inline static

Definition at line 7093 of file bufmgr.c.

7099{
7100 uint32 rem_error = result.error_data;
7101
7102 /* see static asserts in buffer_readv_encode_error */
7103#define READV_COUNT_BITS 7
7104#define READV_COUNT_MASK ((1 << READV_COUNT_BITS) - 1)
7105
7106 *zeroed_any = rem_error & 1;
7107 rem_error >>= 1;
7108
7109 *ignored_any = rem_error & 1;
7110 rem_error >>= 1;
7111
7112 *zeroed_or_error_count = rem_error & READV_COUNT_MASK;
7113 rem_error >>= READV_COUNT_BITS;
7114
7115 *checkfail_count = rem_error & READV_COUNT_MASK;
7116 rem_error >>= READV_COUNT_BITS;
7117
7118 *first_off = rem_error & READV_COUNT_MASK;
7119 rem_error >>= READV_COUNT_BITS;
7120}
#define READV_COUNT_BITS
#define READV_COUNT_MASK
uint32 error_data
Definition: aio_types.h:111

References PgAioResult::error_data, READV_COUNT_BITS, and READV_COUNT_MASK.

Referenced by buffer_readv_encode_error(), buffer_readv_report(), and shared_buffer_readv_complete_local().
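
Read together with the shifts above, the function implies the following packing of PgAioResult.error_data into its low 23 bits. A hand-encoded example:

/* Layout implied by the decode sequence above:
 *   bit  0       zeroed_any
 *   bit  1       ignored_any
 *   bits 2..8    zeroed_or_error_count   (READV_COUNT_BITS == 7)
 *   bits 9..15   checkfail_count
 *   bits 16..22  first_off
 * Example: 3 zeroed pages, no checksum failures, first affected offset 5. */
uint32 error_data = (1u << 0)   /* zeroed_any */
    | (0u << 1)                 /* ignored_any */
    | (3u << 2)                 /* zeroed_or_error_count */
    | (0u << 9)                 /* checkfail_count */
    | (5u << 16);               /* first_off */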

◆ buffer_readv_encode_error()

static void buffer_readv_encode_error ( PgAioResult *  result,
bool  is_temp,
bool  zeroed_any,
bool  ignored_any,
uint8  error_count,
uint8  zeroed_count,
uint8  checkfail_count,
uint8  first_error_off,
uint8  first_zeroed_off,
uint8  first_ignored_off 
)
inline static

Definition at line 7135 of file bufmgr.c.

7145{
7146
7147 uint8 shift = 0;
7148 uint8 zeroed_or_error_count =
7149 error_count > 0 ? error_count : zeroed_count;
7150 uint8 first_off;
7151
7153 "PG_IOV_MAX is bigger than reserved space for error data");
7155 "PGAIO_RESULT_ERROR_BITS is insufficient for buffer_readv");
7156
7157 /*
7158 * We only have space to encode one offset - but luckily that's good
7159 * enough. If there is an error, the error is the interesting offset, same
7160 * with a zeroed buffer vs an ignored buffer.
7161 */
7162 if (error_count > 0)
7163 first_off = first_error_off;
7164 else if (zeroed_count > 0)
7165 first_off = first_zeroed_off;
7166 else
7167 first_off = first_ignored_off;
7168
7169 Assert(!zeroed_any || error_count == 0);
7170
7171 result->error_data = 0;
7172
7173 result->error_data |= zeroed_any << shift;
7174 shift += 1;
7175
7176 result->error_data |= ignored_any << shift;
7177 shift += 1;
7178
7179 result->error_data |= ((uint32) zeroed_or_error_count) << shift;
7180 shift += READV_COUNT_BITS;
7181
7182 result->error_data |= ((uint32) checkfail_count) << shift;
7183 shift += READV_COUNT_BITS;
7184
7185 result->error_data |= ((uint32) first_off) << shift;
7186 shift += READV_COUNT_BITS;
7187
7188 result->id = is_temp ? PGAIO_HCB_LOCAL_BUFFER_READV :
7189 PGAIO_HCB_SHARED_BUFFER_READV;
7190
7191 if (error_count > 0)
7192 result->status = PGAIO_RS_ERROR;
7193 else
7194 result->status = PGAIO_RS_WARNING;
7195
7196 /*
7197 * The encoding is complicated enough to warrant cross-checking it against
7198 * the decode function.
7199 */
7200#ifdef USE_ASSERT_CHECKING
7201 {
7202 bool zeroed_any_2,
7203 ignored_any_2;
7204 uint8 zeroed_or_error_count_2,
7205 checkfail_count_2,
7206 first_off_2;
7207
7208 buffer_readv_decode_error(*result,
7209 &zeroed_any_2, &ignored_any_2,
7210 &zeroed_or_error_count_2,
7211 &checkfail_count_2,
7212 &first_off_2);
7213 Assert(zeroed_any == zeroed_any_2);
7214 Assert(ignored_any == ignored_any_2);
7215 Assert(zeroed_or_error_count == zeroed_or_error_count_2);
7216 Assert(checkfail_count == checkfail_count_2);
7217 Assert(first_off == first_off_2);
7218 }
7219#endif
7220
7221#undef READV_COUNT_BITS
7222#undef READV_COUNT_MASK
7223}
#define PGAIO_RESULT_ERROR_BITS
Definition: aio_types.h:98
@ PGAIO_RS_WARNING
Definition: aio_types.h:83
static void buffer_readv_decode_error(PgAioResult result, bool *zeroed_any, bool *ignored_any, uint8 *zeroed_or_error_count, uint8 *checkfail_count, uint8 *first_off)
Definition: bufmgr.c:7093
#define StaticAssertStmt(condition, errmessage)
Definition: c.h:940
#define PG_IOV_MAX
Definition: pg_iovec.h:47
uint32 id
Definition: aio_types.h:105

References Assert(), buffer_readv_decode_error(), PgAioResult::error_data, PgAioResult::id, PG_IOV_MAX, PGAIO_HCB_LOCAL_BUFFER_READV, PGAIO_HCB_SHARED_BUFFER_READV, PGAIO_RESULT_ERROR_BITS, PGAIO_RS_ERROR, PGAIO_RS_WARNING, READV_COUNT_BITS, StaticAssertStmt, and PgAioResult::status.

Referenced by buffer_readv_complete(), and buffer_readv_complete_one().
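
The encode/decode pair above are inverses over PgAioResult.error_data: bit 0 is zeroed_any, bit 1 is ignored_any, followed by zeroed_or_error_count, checkfail_count and first_off in three 7-bit fields (READV_COUNT_BITS each). A minimal standalone sketch of the same bit layout, assuming nothing from the PostgreSQL headers (pack_readv_error is an illustrative helper, not part of bufmgr.c):

#include <assert.h>
#include <stdint.h>

#define READV_COUNT_BITS 7
#define READV_COUNT_MASK ((1u << READV_COUNT_BITS) - 1)

/* Pack the fields in the same order buffer_readv_encode_error() does. */
static uint32_t
pack_readv_error(int zeroed_any, int ignored_any,
                 uint8_t zeroed_or_error_count,
                 uint8_t checkfail_count, uint8_t first_off)
{
    uint32_t v = 0;
    int      shift = 0;

    v |= (uint32_t) (zeroed_any != 0) << shift;
    shift += 1;
    v |= (uint32_t) (ignored_any != 0) << shift;
    shift += 1;
    v |= (uint32_t) zeroed_or_error_count << shift;
    shift += READV_COUNT_BITS;
    v |= (uint32_t) checkfail_count << shift;
    shift += READV_COUNT_BITS;
    v |= (uint32_t) first_off << shift;
    return v;
}

int
main(void)
{
    /* e.g. two pages zeroed, one checksum failure, first bad page at offset 3 */
    uint32_t v = pack_readv_error(1, 0, 2, 1, 3);

    /* Unpack in the same order buffer_readv_decode_error() does. */
    assert((v & 1) == 1);                   /* zeroed_any */
    v >>= 1;
    assert((v & 1) == 0);                   /* ignored_any */
    v >>= 1;
    assert((v & READV_COUNT_MASK) == 2);    /* zeroed_or_error_count */
    v >>= READV_COUNT_BITS;
    assert((v & READV_COUNT_MASK) == 1);    /* checkfail_count */
    v >>= READV_COUNT_BITS;
    assert((v & READV_COUNT_MASK) == 3);    /* first_off */
    return 0;
}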

◆ buffer_readv_report()

static void buffer_readv_report ( PgAioResult  result,
const PgAioTargetData td,
int  elevel 
)
static

Definition at line 7477 of file bufmgr.c.

7479{
7480 int nblocks = td->smgr.nblocks;
7481 BlockNumber first = td->smgr.blockNum;
7482 BlockNumber last = first + nblocks - 1;
7483 ProcNumber errProc =
7484 td->smgr.is_temp ? MyProcNumber : INVALID_PROC_NUMBER;
7485 RelPathStr rpath =
7486 relpathbackend(td->smgr.rlocator, errProc, td->smgr.forkNum);
7487 bool zeroed_any,
7488 ignored_any;
7489 uint8 zeroed_or_error_count,
7490 checkfail_count,
7491 first_off;
7492 uint8 affected_count;
7493 const char *msg_one,
7494 *msg_mult,
7495 *det_mult,
7496 *hint_mult;
7497
7498 buffer_readv_decode_error(result, &zeroed_any, &ignored_any,
7499 &zeroed_or_error_count,
7500 &checkfail_count,
7501 &first_off);
7502
7503 /*
7504 * Treat a read that had both zeroed buffers *and* ignored checksums as a
7505 * special case, it's too irregular to be emitted the same way as the
7506 * other cases.
7507 */
7508 if (zeroed_any && ignored_any)
7509 {
7510 Assert(zeroed_any && ignored_any);
7511 Assert(nblocks > 1); /* same block can't be both zeroed and ignored */
7512 Assert(result.status != PGAIO_RS_ERROR);
7513 affected_count = zeroed_or_error_count;
7514
7515 ereport(elevel,
7516 errcode(ERRCODE_DATA_CORRUPTED),
7517 errmsg("zeroing %u page(s) and ignoring %u checksum failure(s) among blocks %u..%u of relation \"%s\"",
7518 affected_count, checkfail_count, first, last, rpath.str),
7519 affected_count > 1 ?
7520 errdetail("Block %u held the first zeroed page.",
7521 first + first_off) : 0,
7522 errhint_plural("See server log for details about the other %d invalid block.",
7523 "See server log for details about the other %d invalid blocks.",
7524 affected_count + checkfail_count - 1,
7525 affected_count + checkfail_count - 1));
7526 return;
7527 }
7528
7529 /*
7530 * The other messages are highly repetitive. To avoid duplicating a long
7531 * and complicated ereport(), gather the translated format strings
7532 * separately and then do one common ereport.
7533 */
7534 if (result.status == PGAIO_RS_ERROR)
7535 {
7536 Assert(!zeroed_any); /* can't have invalid pages when zeroing them */
7537 affected_count = zeroed_or_error_count;
7538 msg_one = _("invalid page in block %u of relation \"%s\"");
7539 msg_mult = _("%u invalid pages among blocks %u..%u of relation \"%s\"");
7540 det_mult = _("Block %u held the first invalid page.");
7541 hint_mult = _("See server log for the other %u invalid block(s).");
7542 }
7543 else if (zeroed_any && !ignored_any)
7544 {
7545 affected_count = zeroed_or_error_count;
7546 msg_one = _("invalid page in block %u of relation \"%s\"; zeroing out page");
7547 msg_mult = _("zeroing out %u invalid pages among blocks %u..%u of relation \"%s\"");
7548 det_mult = _("Block %u held the first zeroed page.");
7549 hint_mult = _("See server log for the other %u zeroed block(s).");
7550 }
7551 else if (!zeroed_any && ignored_any)
7552 {
7553 affected_count = checkfail_count;
7554 msg_one = _("ignoring checksum failure in block %u of relation \"%s\"");
7555 msg_mult = _("ignoring %u checksum failures among blocks %u..%u of relation \"%s\"");
7556 det_mult = _("Block %u held the first ignored page.");
7557 hint_mult = _("See server log for the other %u ignored block(s).");
7558 }
7559 else
7560 pg_unreachable();
7561
7562 ereport(elevel,
7563 errcode(ERRCODE_DATA_CORRUPTED),
7564 affected_count == 1 ?
7565 errmsg_internal(msg_one, first + first_off, rpath.str) :
7566 errmsg_internal(msg_mult, affected_count, first, last, rpath.str),
7567 affected_count > 1 ? errdetail_internal(det_mult, first + first_off) : 0,
7568 affected_count > 1 ? errhint_internal(hint_mult, affected_count - 1) : 0);
7569}
#define pg_unreachable()
Definition: c.h:336
int errmsg_internal(const char *fmt,...)
Definition: elog.c:1170
int errdetail_internal(const char *fmt,...)
Definition: elog.c:1243
int errhint_internal(const char *fmt,...)
Definition: elog.c:1352
int errhint_plural(const char *fmt_singular, const char *fmt_plural, unsigned long n,...)
Definition: elog.c:1373
#define _(x)
Definition: elog.c:91
#define ERRCODE_DATA_CORRUPTED
Definition: pg_basebackup.c:42
int ProcNumber
Definition: procnumber.h:24
#define relpathbackend(rlocator, backend, forknum)
Definition: relpath.h:141
char str[REL_PATH_STR_MAXLEN+1]
Definition: relpath.h:123
BlockNumber blockNum
Definition: aio_types.h:66
BlockNumber nblocks
Definition: aio_types.h:67
ForkNumber forkNum
Definition: aio_types.h:68

References _, Assert(), PgAioTargetData::blockNum, buffer_readv_decode_error(), ereport, errcode(), ERRCODE_DATA_CORRUPTED, errdetail(), errdetail_internal(), errhint_internal(), errhint_plural(), errmsg(), errmsg_internal(), PgAioTargetData::forkNum, INVALID_PROC_NUMBER, PgAioTargetData::is_temp, MyProcNumber, PgAioTargetData::nblocks, pg_unreachable, PGAIO_RS_ERROR, relpathbackend, PgAioTargetData::rlocator, PgAioTargetData::smgr, PgAioResult::status, and RelPathStr::str.

◆ buffer_stage_common()

static pg_attribute_always_inline void buffer_stage_common ( PgAioHandle ioh,
bool  is_write,
bool  is_temp 
)
static

Definition at line 6982 of file bufmgr.c.

6983{
6984 uint64 *io_data;
6985 uint8 handle_data_len;
6986 PgAioWaitRef io_ref;
6987 BufferTag first PG_USED_FOR_ASSERTS_ONLY = {0};
6988
6989 io_data = pgaio_io_get_handle_data(ioh, &handle_data_len);
6990
6991 pgaio_io_get_wref(ioh, &io_ref);
6992
6993 /* iterate over all buffers affected by the vectored readv/writev */
6994 for (int i = 0; i < handle_data_len; i++)
6995 {
6996 Buffer buffer = (Buffer) io_data[i];
6997 BufferDesc *buf_hdr = is_temp ?
6998 GetLocalBufferDescriptor(-buffer - 1)
6999 : GetBufferDescriptor(buffer - 1);
7000 uint32 buf_state;
7001
7002 /*
7003 * Check that all the buffers are actually ones that could conceivably
7004 * be done in one IO, i.e. are sequential. This is the last
7005 * buffer-aware code before IO is actually executed and confusion
7006 * about which buffers are targeted by IO can be hard to debug, making
7007 * it worth doing extra-paranoid checks.
7008 */
7009 if (i == 0)
7010 first = buf_hdr->tag;
7011 else
7012 {
7013 Assert(buf_hdr->tag.relNumber == first.relNumber);
7014 Assert(buf_hdr->tag.blockNum == first.blockNum + i);
7015 }
7016
7017 if (is_temp)
7018 buf_state = pg_atomic_read_u32(&buf_hdr->state);
7019 else
7020 buf_state = LockBufHdr(buf_hdr);
7021
7022 /* verify the buffer is in the expected state */
7023 Assert(buf_state & BM_TAG_VALID);
7024 if (is_write)
7025 {
7026 Assert(buf_state & BM_VALID);
7027 Assert(buf_state & BM_DIRTY);
7028 }
7029 else
7030 {
7031 Assert(!(buf_state & BM_VALID));
7032 Assert(!(buf_state & BM_DIRTY));
7033 }
7034
7035 /* temp buffers don't use BM_IO_IN_PROGRESS */
7036 if (!is_temp)
7037 Assert(buf_state & BM_IO_IN_PROGRESS);
7038
7039 Assert(BUF_STATE_GET_REFCOUNT(buf_state) >= 1);
7040
7041 /*
7042 * Reflect that the buffer is now owned by the AIO subsystem.
7043 *
7044 * For local buffers: This can't be done just via LocalRefCount, as
7045 * one might initially think, as this backend could error out while
7046 * AIO is still in progress, releasing all the pins by the backend
7047 * itself.
7048 *
7049 * This pin is released again in TerminateBufferIO().
7050 */
7051 buf_hdr->io_wref = io_ref;
7052
7053 if (is_temp)
7054 {
7055 buf_state += BUF_REFCOUNT_ONE;
7056 pg_atomic_unlocked_write_u32(&buf_hdr->state, buf_state);
7057 }
7058 else
7059 UnlockBufHdrExt(buf_hdr, buf_state, 0, 0, 1);
7060
7061 /*
7062 * Ensure the content lock that prevents buffer modifications while
7063 * the buffer is being written out is not released early due to an
7064 * error.
7065 */
7066 if (is_write && !is_temp)
7067 {
7068 LWLock *content_lock;
7069
7070 content_lock = BufferDescriptorGetContentLock(buf_hdr);
7071
7072 Assert(LWLockHeldByMe(content_lock));
7073
7074 /*
7075 * Lock is now owned by AIO subsystem.
7076 */
7077 LWLockDisown(content_lock);
7078 }
7079
7080 /*
7081 * Stop tracking this buffer via the resowner - the AIO system now
7082 * keeps track.
7083 */
7084 if (!is_temp)
7085 ResourceOwnerForgetBufferIO(CurrentResourceOwner, buffer);
7086 }
7087}
static void pg_atomic_unlocked_write_u32(volatile pg_atomic_uint32 *ptr, uint32 val)
Definition: atomics.h:293
#define BUF_REFCOUNT_ONE
Definition: buf_internals.h:51
static LWLock * BufferDescriptorGetContentLock(const BufferDesc *bdesc)
static uint32 UnlockBufHdrExt(BufferDesc *desc, uint32 old_buf_state, uint32 set_bits, uint32 unset_bits, int refcount_change)
static void ResourceOwnerForgetBufferIO(ResourceOwner owner, Buffer buffer)
#define BUF_STATE_GET_REFCOUNT(state)
Definition: buf_internals.h:59
#define PG_USED_FOR_ASSERTS_ONLY
Definition: c.h:228
bool LWLockHeldByMe(LWLock *lock)
Definition: lwlock.c:1977
void LWLockDisown(LWLock *lock)
Definition: lwlock.c:1883
PgAioWaitRef io_wref
Definition: lwlock.h:42

References Assert(), buftag::blockNum, BM_DIRTY, BM_IO_IN_PROGRESS, BM_TAG_VALID, BM_VALID, BUF_REFCOUNT_ONE, BUF_STATE_GET_REFCOUNT, PrivateRefCountEntry::buffer, BufferDescriptorGetContentLock(), CurrentResourceOwner, GetBufferDescriptor(), GetLocalBufferDescriptor(), i, BufferDesc::io_wref, LockBufHdr(), LWLockDisown(), LWLockHeldByMe(), pg_atomic_read_u32(), pg_atomic_unlocked_write_u32(), PG_USED_FOR_ASSERTS_ONLY, pgaio_io_get_handle_data(), pgaio_io_get_wref(), buftag::relNumber, ResourceOwnerForgetBufferIO(), BufferDesc::state, BufferDesc::tag, and UnlockBufHdrExt().

Referenced by local_buffer_readv_stage(), and shared_buffer_readv_stage().

◆ BufferAlloc()

static pg_attribute_always_inline BufferDesc * BufferAlloc ( SMgrRelation  smgr,
char  relpersistence,
ForkNumber  forkNum,
BlockNumber  blockNum,
BufferAccessStrategy  strategy,
bool *  foundPtr,
IOContext  io_context 
)
inlinestatic

Definition at line 1981 of file bufmgr.c.

1985{
1986 BufferTag newTag; /* identity of requested block */
1987 uint32 newHash; /* hash value for newTag */
1988 LWLock *newPartitionLock; /* buffer partition lock for it */
1989 int existing_buf_id;
1990 Buffer victim_buffer;
1991 BufferDesc *victim_buf_hdr;
1992 uint32 victim_buf_state;
1993 uint32 set_bits = 0;
1994
1995 /* Make sure we will have room to remember the buffer pin */
1996 ResourceOwnerEnlarge(CurrentResourceOwner);
1997 ReservePrivateRefCountEntry();
1998
1999 /* create a tag so we can lookup the buffer */
2000 InitBufferTag(&newTag, &smgr->smgr_rlocator.locator, forkNum, blockNum);
2001
2002 /* determine its hash code and partition lock ID */
2003 newHash = BufTableHashCode(&newTag);
2004 newPartitionLock = BufMappingPartitionLock(newHash);
2005
2006 /* see if the block is in the buffer pool already */
2007 LWLockAcquire(newPartitionLock, LW_SHARED);
2008 existing_buf_id = BufTableLookup(&newTag, newHash);
2009 if (existing_buf_id >= 0)
2010 {
2011 BufferDesc *buf;
2012 bool valid;
2013
2014 /*
2015 * Found it. Now, pin the buffer so no one can steal it from the
2016 * buffer pool, and check to see if the correct data has been loaded
2017 * into the buffer.
2018 */
2019 buf = GetBufferDescriptor(existing_buf_id);
2020
2021 valid = PinBuffer(buf, strategy, false);
2022
2023 /* Can release the mapping lock as soon as we've pinned it */
2024 LWLockRelease(newPartitionLock);
2025
2026 *foundPtr = true;
2027
2028 if (!valid)
2029 {
2030 /*
2031 * We can only get here if (a) someone else is still reading in
2032 * the page, (b) a previous read attempt failed, or (c) someone
2033 * called StartReadBuffers() but not yet WaitReadBuffers().
2034 */
2035 *foundPtr = false;
2036 }
2037
2038 return buf;
2039 }
2040
2041 /*
2042 * Didn't find it in the buffer pool. We'll have to initialize a new
2043 * buffer. Remember to unlock the mapping lock while doing the work.
2044 */
2045 LWLockRelease(newPartitionLock);
2046
2047 /*
2048 * Acquire a victim buffer. Somebody else might try to do the same, we
2049 * don't hold any conflicting locks. If so we'll have to undo our work
2050 * later.
2051 */
2052 victim_buffer = GetVictimBuffer(strategy, io_context);
2053 victim_buf_hdr = GetBufferDescriptor(victim_buffer - 1);
2054
2055 /*
2056 * Try to make a hashtable entry for the buffer under its new tag. If
2057 * somebody else inserted another buffer for the tag, we'll release the
2058 * victim buffer we acquired and use the already inserted one.
2059 */
2060 LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
2061 existing_buf_id = BufTableInsert(&newTag, newHash, victim_buf_hdr->buf_id);
2062 if (existing_buf_id >= 0)
2063 {
2064 BufferDesc *existing_buf_hdr;
2065 bool valid;
2066
2067 /*
2068 * Got a collision. Someone has already done what we were about to do.
2069 * We'll just handle this as if it were found in the buffer pool in
2070 * the first place. First, give up the buffer we were planning to
2071 * use.
2072 *
2073 * We could do this after releasing the partition lock, but then we'd
2074 * have to call ResourceOwnerEnlarge() & ReservePrivateRefCountEntry()
2075 * before acquiring the lock, for the rare case of such a collision.
2076 */
2077 UnpinBuffer(victim_buf_hdr);
2078
2079 /* remaining code should match code at top of routine */
2080
2081 existing_buf_hdr = GetBufferDescriptor(existing_buf_id);
2082
2083 valid = PinBuffer(existing_buf_hdr, strategy, false);
2084
2085 /* Can release the mapping lock as soon as we've pinned it */
2086 LWLockRelease(newPartitionLock);
2087
2088 *foundPtr = true;
2089
2090 if (!valid)
2091 {
2092 /*
2093 * We can only get here if (a) someone else is still reading in
2094 * the page, (b) a previous read attempt failed, or (c) someone
2095 * called StartReadBuffers() but not yet WaitReadBuffers().
2096 */
2097 *foundPtr = false;
2098 }
2099
2100 return existing_buf_hdr;
2101 }
2102
2103 /*
2104 * Need to lock the buffer header too in order to change its tag.
2105 */
2106 victim_buf_state = LockBufHdr(victim_buf_hdr);
2107
2108 /* some sanity checks while we hold the buffer header lock */
2109 Assert(BUF_STATE_GET_REFCOUNT(victim_buf_state) == 1);
2110 Assert(!(victim_buf_state & (BM_TAG_VALID | BM_VALID | BM_DIRTY | BM_IO_IN_PROGRESS)));
2111
2112 victim_buf_hdr->tag = newTag;
2113
2114 /*
2115 * Make sure BM_PERMANENT is set for buffers that must be written at every
2116 * checkpoint. Unlogged buffers only need to be written at shutdown
2117 * checkpoints, except for their "init" forks, which need to be treated
2118 * just like permanent relations.
2119 */
2120 set_bits |= BM_TAG_VALID | BUF_USAGECOUNT_ONE;
2121 if (relpersistence == RELPERSISTENCE_PERMANENT || forkNum == INIT_FORKNUM)
2122 set_bits |= BM_PERMANENT;
2123
2124 UnlockBufHdrExt(victim_buf_hdr, victim_buf_state,
2125 set_bits, 0, 0);
2126
2127 LWLockRelease(newPartitionLock);
2128
2129 /*
2130 * Buffer contents are currently invalid.
2131 */
2132 *foundPtr = false;
2133
2134 return victim_buf_hdr;
2135}
static void InitBufferTag(BufferTag *tag, const RelFileLocator *rlocator, ForkNumber forkNum, BlockNumber blockNum)
#define BM_PERMANENT
Definition: buf_internals.h:77
#define BUF_USAGECOUNT_ONE
Definition: buf_internals.h:54
static LWLock * BufMappingPartitionLock(uint32 hashcode)
int BufTableLookup(BufferTag *tagPtr, uint32 hashcode)
Definition: buf_table.c:90
uint32 BufTableHashCode(BufferTag *tagPtr)
Definition: buf_table.c:78
int BufTableInsert(BufferTag *tagPtr, uint32 hashcode, int buf_id)
Definition: buf_table.c:118
static Buffer GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context)
Definition: bufmgr.c:2320
static bool PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy, bool skip_if_not_valid)
Definition: bufmgr.c:3068
static void ReservePrivateRefCountEntry(void)
Definition: bufmgr.c:259
static void UnpinBuffer(BufferDesc *buf)
Definition: bufmgr.c:3247
bool LWLockAcquire(LWLock *lock, LWLockMode mode)
Definition: lwlock.c:1174
void LWLockRelease(LWLock *lock)
Definition: lwlock.c:1894
@ LW_SHARED
Definition: lwlock.h:113
@ LW_EXCLUSIVE
Definition: lwlock.h:112
@ INIT_FORKNUM
Definition: relpath.h:61
void ResourceOwnerEnlarge(ResourceOwner owner)
Definition: resowner.c:449

References Assert(), BM_DIRTY, BM_IO_IN_PROGRESS, BM_PERMANENT, BM_TAG_VALID, BM_VALID, buf, BufferDesc::buf_id, BUF_STATE_GET_REFCOUNT, BUF_USAGECOUNT_ONE, BufMappingPartitionLock(), BufTableHashCode(), BufTableInsert(), BufTableLookup(), CurrentResourceOwner, GetBufferDescriptor(), GetVictimBuffer(), INIT_FORKNUM, InitBufferTag(), RelFileLocatorBackend::locator, LockBufHdr(), LW_EXCLUSIVE, LW_SHARED, LWLockAcquire(), LWLockRelease(), PinBuffer(), ReservePrivateRefCountEntry(), ResourceOwnerEnlarge(), SMgrRelationData::smgr_rlocator, BufferDesc::tag, UnlockBufHdrExt(), and UnpinBuffer().

Referenced by PinBufferForBlock().

◆ BufferGetBlockNumber()

BlockNumber BufferGetBlockNumber ( Buffer  buffer)

Definition at line 4223 of file bufmgr.c.

4224{
4225 BufferDesc *bufHdr;
4226
4227 Assert(BufferIsPinned(buffer));
4228
4229 if (BufferIsLocal(buffer))
4230 bufHdr = GetLocalBufferDescriptor(-buffer - 1);
4231 else
4232 bufHdr = GetBufferDescriptor(buffer - 1);
4233
4234 /* pinned, so OK to read tag without spinlock */
4235 return bufHdr->tag.blockNum;
4236}
#define BufferIsLocal(buffer)
Definition: buf.h:37

References Assert(), buftag::blockNum, PrivateRefCountEntry::buffer, BufferIsLocal, BufferIsPinned, GetBufferDescriptor(), GetLocalBufferDescriptor(), and BufferDesc::tag.

Referenced by _bt_binsrch_insert(), _bt_bottomupdel_pass(), _bt_check_unique(), _bt_checkpage(), _bt_delitems_delete(), _bt_delitems_vacuum(), _bt_doinsert(), _bt_finish_split(), _bt_getroot(), _bt_insert_parent(), _bt_insertonpg(), _bt_mark_page_halfdead(), _bt_moveright(), _bt_newlevel(), _bt_pagedel(), _bt_readpage(), _bt_restore_meta(), _bt_search(), _bt_simpledel_pass(), _bt_split(), _bt_unlink_halfdead_page(), _hash_addovflpage(), _hash_checkpage(), _hash_doinsert(), _hash_first(), _hash_freeovflpage(), _hash_getnewbuf(), _hash_readnext(), _hash_readpage(), _hash_splitbucket(), allocNewBuffer(), AsyncReadBuffers(), BitmapHeapScanNextBlock(), blinsert(), BloomInitMetapage(), brin_doinsert(), brin_doupdate(), brin_getinsertbuffer(), brin_initialize_empty_new_buffer(), brin_page_cleanup(), brin_xlog_insert_update(), brinbuild(), brinGetTupleForHeapBlock(), btvacuumpage(), check_index_page(), CheckReadBuffersOperation(), collect_corrupt_items(), collectMatchBitmap(), createPostingTree(), dataBeginPlaceToPageLeaf(), dataPrepareDownlink(), doPickSplit(), entryPrepareDownlink(), fill_seq_fork_with_data(), ginEntryInsert(), ginFindParents(), ginFinishSplit(), ginPlaceToPage(), ginRedoDeleteListPages(), ginRedoUpdateMetapage(), ginScanToDelete(), gistbufferinginserttuples(), gistbuild(), gistcheckpage(), gistdeletepage(), gistformdownlink(), gistinserttuples(), gistMemorizeAllDownlinks(), gistplacetopage(), gistRelocateBuildBuffersOnSplit(), gistScanPage(), gistvacuumpage(), hash_xlog_add_ovfl_page(), heap_delete(), heap_fetch_next_buffer(), heap_hot_search_buffer(), heap_insert(), heap_multi_insert(), heap_page_would_be_all_visible(), heap_prepare_pagescan(), heap_update(), heap_xlog_confirm(), heap_xlog_lock(), heapam_scan_analyze_next_block(), heapgettup(), heapgettup_pagemode(), index_compute_xid_horizon_for_tuples(), lazy_scan_heap(), lazy_scan_noprune(), lazy_scan_prune(), lazy_vacuum_heap_rel(), makeSublist(), moveLeafs(), moveRightIfItNeeded(), pgstathashindex(), prune_freeze_plan(), read_stream_start_pending_read(), ReadBufferBI(), RelationAddBlocks(), RelationCopyStorageUsingBuffer(), RelationGetBufferForTuple(), RelationPutHeapTuple(), revmap_get_buffer(), revmap_physical_extend(), ScanSourceDatabasePgClassPage(), spgAddNodeAction(), spgbuild(), spgdoinsert(), SpGistSetLastUsedPage(), spgSplitNodeAction(), spgvacuumpage(), spgWalk(), StartReadBuffersImpl(), startScanEntry(), terminate_brin_buildstate(), vacuumLeafPage(), verify_heapam(), visibilitymap_clear(), visibilitymap_get_status(), visibilitymap_pin(), visibilitymap_pin_ok(), visibilitymap_set(), and visibilitymap_set_vmbits().
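
A hypothetical caller-side sketch (extension-style code, not from bufmgr.c); rel and blkno are assumed to be an already-open Relation and a valid block number:

/* The pin taken by ReadBuffer() is what makes BufferGetBlockNumber() legal. */
Buffer      buf = ReadBuffer(rel, blkno);

Assert(BufferGetBlockNumber(buf) == blkno);
ReleaseBuffer(buf);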

◆ BufferGetLSNAtomic()

XLogRecPtr BufferGetLSNAtomic ( Buffer  buffer)

Definition at line 4499 of file bufmgr.c.

4500{
4501 char *page = BufferGetPage(buffer);
4502 BufferDesc *bufHdr;
4503 XLogRecPtr lsn;
4504
4505 /*
4506 * If we don't need locking for correctness, fastpath out.
4507 */
4508 if (!XLogHintBitIsNeeded() || BufferIsLocal(buffer))
4509 return PageGetLSN(page);
4510
4511 /* Make sure we've got a real buffer, and that we hold a pin on it. */
4512 Assert(BufferIsValid(buffer));
4513 Assert(BufferIsPinned(buffer));
4514
4515 bufHdr = GetBufferDescriptor(buffer - 1);
4516 LockBufHdr(bufHdr);
4517 lsn = PageGetLSN(page);
4518 UnlockBufHdr(bufHdr);
4519
4520 return lsn;
4521}
static Page BufferGetPage(Buffer buffer)
Definition: bufmgr.h:436
static XLogRecPtr PageGetLSN(const PageData *page)
Definition: bufpage.h:385
#define XLogHintBitIsNeeded()
Definition: xlog.h:120
uint64 XLogRecPtr
Definition: xlogdefs.h:21

References Assert(), PrivateRefCountEntry::buffer, BufferGetPage(), BufferIsLocal, BufferIsPinned, BufferIsValid(), GetBufferDescriptor(), LockBufHdr(), PageGetLSN(), UnlockBufHdr(), and XLogHintBitIsNeeded.

Referenced by _bt_drop_lock_and_maybe_pin(), _bt_killitems(), gistdoinsert(), gistFindPath(), gistkillitems(), gistScanPage(), SetHintBits(), and XLogSaveBufferForHint().
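
A hypothetical sketch, assuming buf is a pinned shared buffer and reference_lsn is a previously noted XLogRecPtr:

/* Safe even when checksums/wal_log_hints force the locked slow path. */
XLogRecPtr  lsn = BufferGetLSNAtomic(buf);

if (lsn > reference_lsn)
    elog(DEBUG1, "page advanced to %X/%X", LSN_FORMAT_ARGS(lsn));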

◆ BufferGetTag()

void BufferGetTag ( Buffer  buffer,
RelFileLocator rlocator,
ForkNumber forknum,
BlockNumber blknum 
)

Definition at line 4244 of file bufmgr.c.

4246{
4247 BufferDesc *bufHdr;
4248
4249 /* Do the same checks as BufferGetBlockNumber. */
4250 Assert(BufferIsPinned(buffer));
4251
4252 if (BufferIsLocal(buffer))
4253 bufHdr = GetLocalBufferDescriptor(-buffer - 1);
4254 else
4255 bufHdr = GetBufferDescriptor(buffer - 1);
4256
4257 /* pinned, so OK to read tag without spinlock */
4258 *rlocator = BufTagGetRelFileLocator(&bufHdr->tag);
4259 *forknum = BufTagGetForkNum(&bufHdr->tag);
4260 *blknum = bufHdr->tag.blockNum;
4261}

References Assert(), buftag::blockNum, PrivateRefCountEntry::buffer, BufferIsLocal, BufferIsPinned, BufTagGetForkNum(), BufTagGetRelFileLocator(), GetBufferDescriptor(), GetLocalBufferDescriptor(), and BufferDesc::tag.

Referenced by fsm_search_avail(), ginRedoInsertEntry(), heap_inplace_update_and_unlock(), log_newpage_buffer(), ResolveCminCmaxDuringDecoding(), XLogRegisterBuffer(), and XLogSaveBufferForHint().
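
A hypothetical sketch, assuming buf is a pinned buffer: recover the page identity, e.g. for a diagnostic message:

RelFileLocator rlocator;
ForkNumber  forknum;
BlockNumber blknum;

BufferGetTag(buf, &rlocator, &forknum, &blknum);
elog(DEBUG1, "buffer %d holds block %u of fork %d (relNumber %u)",
     buf, blknum, (int) forknum, rlocator.relNumber);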

◆ BufferIsDirty()

bool BufferIsDirty ( Buffer  buffer)

Definition at line 2911 of file bufmgr.c.

2912{
2913 BufferDesc *bufHdr;
2914
2915 Assert(BufferIsPinned(buffer));
2916
2917 if (BufferIsLocal(buffer))
2918 {
2919 int bufid = -buffer - 1;
2920
2921 bufHdr = GetLocalBufferDescriptor(bufid);
2922 /* Content locks are not maintained for local buffers. */
2923 }
2924 else
2925 {
2926 bufHdr = GetBufferDescriptor(buffer - 1);
2927 Assert(BufferIsLockedByMeInMode(buffer, BUFFER_LOCK_EXCLUSIVE));
2928 }
2929
2930 return pg_atomic_read_u32(&bufHdr->state) & BM_DIRTY;
2931}
bool BufferIsLockedByMeInMode(Buffer buffer, BufferLockMode mode)
Definition: bufmgr.c:2869
@ BUFFER_LOCK_EXCLUSIVE
Definition: bufmgr.h:207

References Assert(), BM_DIRTY, PrivateRefCountEntry::buffer, BUFFER_LOCK_EXCLUSIVE, BufferIsLocal, BufferIsLockedByMeInMode(), BufferIsPinned, GetBufferDescriptor(), GetLocalBufferDescriptor(), pg_atomic_read_u32(), and BufferDesc::state.

Referenced by heap_multi_insert(), heap_xlog_multi_insert(), heap_xlog_prune_freeze(), log_heap_prune_and_freeze(), and XLogRegisterBuffer().
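
A hypothetical sketch, assuming buf is pinned and exclusively locked; callers that WAL-log a modification can double-check that the dirty flag was set first:

MarkBufferDirty(buf);
Assert(BufferIsDirty(buf));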

◆ BufferIsLockedByMe()

bool BufferIsLockedByMe ( Buffer  buffer)

Definition at line 2843 of file bufmgr.c.

2844{
2845 BufferDesc *bufHdr;
2846
2847 Assert(BufferIsPinned(buffer));
2848
2849 if (BufferIsLocal(buffer))
2850 {
2851 /* Content locks are not maintained for local buffers. */
2852 return true;
2853 }
2854 else
2855 {
2856 bufHdr = GetBufferDescriptor(buffer - 1);
2857 return LWLockHeldByMe(BufferDescriptorGetContentLock(bufHdr));
2858 }
2859}

References Assert(), PrivateRefCountEntry::buffer, BufferDescriptorGetContentLock(), BufferIsLocal, BufferIsPinned, GetBufferDescriptor(), and LWLockHeldByMe().

Referenced by FlushOneBuffer(), and MarkBufferDirtyHint().

◆ BufferIsLockedByMeInMode()

bool BufferIsLockedByMeInMode ( Buffer  buffer,
BufferLockMode  mode 
)

Definition at line 2869 of file bufmgr.c.

2870{
2871 BufferDesc *bufHdr;
2872
2873 Assert(BufferIsPinned(buffer));
2874
2875 if (BufferIsLocal(buffer))
2876 {
2877 /* Content locks are not maintained for local buffers. */
2878 return true;
2879 }
2880 else
2881 {
2882 LWLockMode lw_mode;
2883
2884 switch (mode)
2885 {
2886 case BUFFER_LOCK_EXCLUSIVE:
2887 lw_mode = LW_EXCLUSIVE;
2888 break;
2889 case BUFFER_LOCK_SHARE:
2890 lw_mode = LW_SHARED;
2891 break;
2892 default:
2893 pg_unreachable();
2894 }
2895
2896 bufHdr = GetBufferDescriptor(buffer - 1);
2897 return LWLockHeldByMeInMode(BufferDescriptorGetContentLock(bufHdr),
2898 lw_mode);
2899 }
2900}
@ BUFFER_LOCK_SHARE
Definition: bufmgr.h:206
bool LWLockHeldByMeInMode(LWLock *lock, LWLockMode mode)
Definition: lwlock.c:2021
LWLockMode
Definition: lwlock.h:111
static PgChecksumMode mode
Definition: pg_checksums.c:56

References Assert(), PrivateRefCountEntry::buffer, BUFFER_LOCK_EXCLUSIVE, BUFFER_LOCK_SHARE, BufferDescriptorGetContentLock(), BufferIsLocal, BufferIsPinned, GetBufferDescriptor(), LW_EXCLUSIVE, LW_SHARED, LWLockHeldByMeInMode(), mode, and pg_unreachable.

Referenced by BufferIsDirty(), IsBufferCleanupOK(), MarkBufferDirty(), visibilitymap_set(), visibilitymap_set_vmbits(), and XLogRegisterBuffer().
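
A hypothetical sketch: assert the content lock is held exclusively before scribbling on the page, mirroring what MarkBufferDirty() itself checks (buf is assumed pinned and locked):

Assert(BufferIsLockedByMeInMode(buf, BUFFER_LOCK_EXCLUSIVE));
Page        page = BufferGetPage(buf);
/* ... modify the page ... */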

◆ BufferIsPermanent()

bool BufferIsPermanent ( Buffer  buffer)

Definition at line 4469 of file bufmgr.c.

4470{
4471 BufferDesc *bufHdr;
4472
4473 /* Local buffers are used only for temp relations. */
4474 if (BufferIsLocal(buffer))
4475 return false;
4476
4477 /* Make sure we've got a real buffer, and that we hold a pin on it. */
4478 Assert(BufferIsValid(buffer));
4479 Assert(BufferIsPinned(buffer));
4480
4481 /*
4482 * BM_PERMANENT can't be changed while we hold a pin on the buffer, so we
4483 * need not bother with the buffer header spinlock. Even if someone else
4484 * changes the buffer header state while we're doing this, the state is
4485 * changed atomically, so we'll read the old value or the new value, but
4486 * not random garbage.
4487 */
4488 bufHdr = GetBufferDescriptor(buffer - 1);
4489 return (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT) != 0;
4490}

References Assert(), BM_PERMANENT, PrivateRefCountEntry::buffer, BufferIsLocal, BufferIsPinned, BufferIsValid(), GetBufferDescriptor(), pg_atomic_read_u32(), and BufferDesc::state.

Referenced by SetHintBits().

◆ BufferSync()

static void BufferSync ( int  flags)
static

Definition at line 3343 of file bufmgr.c.

3344{
3345 uint32 buf_state;
3346 int buf_id;
3347 int num_to_scan;
3348 int num_spaces;
3349 int num_processed;
3350 int num_written;
3351 CkptTsStatus *per_ts_stat = NULL;
3352 Oid last_tsid;
3353 binaryheap *ts_heap;
3354 int i;
3355 uint32 mask = BM_DIRTY;
3356 WritebackContext wb_context;
3357
3358 /*
3359 * Unless this is a shutdown checkpoint or we have been explicitly told,
3360 * we write only permanent, dirty buffers. But at shutdown or end of
3361 * recovery, we write all dirty buffers.
3362 */
3363 if (!((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY |
3364 CHECKPOINT_FLUSH_UNLOGGED))))
3365 mask |= BM_PERMANENT;
3366
3367 /*
3368 * Loop over all buffers, and mark the ones that need to be written with
3369 * BM_CHECKPOINT_NEEDED. Count them as we go (num_to_scan), so that we
3370 * can estimate how much work needs to be done.
3371 *
3372 * This allows us to write only those pages that were dirty when the
3373 * checkpoint began, and not those that get dirtied while it proceeds.
3374 * Whenever a page with BM_CHECKPOINT_NEEDED is written out, either by us
3375 * later in this function, or by normal backends or the bgwriter cleaning
3376 * scan, the flag is cleared. Any buffer dirtied after this point won't
3377 * have the flag set.
3378 *
3379 * Note that if we fail to write some buffer, we may leave buffers with
3380 * BM_CHECKPOINT_NEEDED still set. This is OK since any such buffer would
3381 * certainly need to be written for the next checkpoint attempt, too.
3382 */
3383 num_to_scan = 0;
3384 for (buf_id = 0; buf_id < NBuffers; buf_id++)
3385 {
3386 BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
3387 uint32 set_bits = 0;
3388
3389 /*
3390 * Header spinlock is enough to examine BM_DIRTY, see comment in
3391 * SyncOneBuffer.
3392 */
3393 buf_state = LockBufHdr(bufHdr);
3394
3395 if ((buf_state & mask) == mask)
3396 {
3397 CkptSortItem *item;
3398
3399 set_bits = BM_CHECKPOINT_NEEDED;
3400
3401 item = &CkptBufferIds[num_to_scan++];
3402 item->buf_id = buf_id;
3403 item->tsId = bufHdr->tag.spcOid;
3404 item->relNumber = BufTagGetRelNumber(&bufHdr->tag);
3405 item->forkNum = BufTagGetForkNum(&bufHdr->tag);
3406 item->blockNum = bufHdr->tag.blockNum;
3407 }
3408
3409 UnlockBufHdrExt(bufHdr, buf_state,
3410 set_bits, 0,
3411 0);
3412
3413 /* Check for barrier events in case NBuffers is large. */
3414 if (ProcSignalBarrierPending)
3415 ProcessProcSignalBarrier();
3416 }
3417
3418 if (num_to_scan == 0)
3419 return; /* nothing to do */
3420
3421 WritebackContextInit(&wb_context, &checkpoint_flush_after);
3422
3423 TRACE_POSTGRESQL_BUFFER_SYNC_START(NBuffers, num_to_scan);
3424
3425 /*
3426 * Sort buffers that need to be written to reduce the likelihood of random
3427 * IO. The sorting is also important for the implementation of balancing
3428 * writes between tablespaces. Without balancing writes we'd potentially
3429 * end up writing to the tablespaces one-by-one; possibly overloading the
3430 * underlying system.
3431 */
3432 sort_checkpoint_bufferids(CkptBufferIds, num_to_scan);
3433
3434 num_spaces = 0;
3435
3436 /*
3437 * Allocate progress status for each tablespace with buffers that need to
3438 * be flushed. This requires the to-be-flushed array to be sorted.
3439 */
3440 last_tsid = InvalidOid;
3441 for (i = 0; i < num_to_scan; i++)
3442 {
3443 CkptTsStatus *s;
3444 Oid cur_tsid;
3445
3446 cur_tsid = CkptBufferIds[i].tsId;
3447
3448 /*
3449 * Grow array of per-tablespace status structs, every time a new
3450 * tablespace is found.
3451 */
3452 if (last_tsid == InvalidOid || last_tsid != cur_tsid)
3453 {
3454 Size sz;
3455
3456 num_spaces++;
3457
3458 /*
3459 * Not worth adding grow-by-power-of-2 logic here - even with a
3460 * few hundred tablespaces this should be fine.
3461 */
3462 sz = sizeof(CkptTsStatus) * num_spaces;
3463
3464 if (per_ts_stat == NULL)
3465 per_ts_stat = (CkptTsStatus *) palloc(sz);
3466 else
3467 per_ts_stat = (CkptTsStatus *) repalloc(per_ts_stat, sz);
3468
3469 s = &per_ts_stat[num_spaces - 1];
3470 memset(s, 0, sizeof(*s));
3471 s->tsId = cur_tsid;
3472
3473 /*
3474 * The first buffer in this tablespace. As CkptBufferIds is sorted
3475 * by tablespace all (s->num_to_scan) buffers in this tablespace
3476 * will follow afterwards.
3477 */
3478 s->index = i;
3479
3480 /*
3481 * progress_slice will be determined once we know how many buffers
3482 * are in each tablespace, i.e. after this loop.
3483 */
3484
3485 last_tsid = cur_tsid;
3486 }
3487 else
3488 {
3489 s = &per_ts_stat[num_spaces - 1];
3490 }
3491
3492 s->num_to_scan++;
3493
3494 /* Check for barrier events. */
3495 if (ProcSignalBarrierPending)
3496 ProcessProcSignalBarrier();
3497 }
3498
3499 Assert(num_spaces > 0);
3500
3501 /*
3502 * Build a min-heap over the write-progress in the individual tablespaces,
3503 * and compute how large a portion of the total progress a single
3504 * processed buffer is.
3505 */
3506 ts_heap = binaryheap_allocate(num_spaces,
3507 ts_ckpt_progress_comparator,
3508 NULL);
3509
3510 for (i = 0; i < num_spaces; i++)
3511 {
3512 CkptTsStatus *ts_stat = &per_ts_stat[i];
3513
3514 ts_stat->progress_slice = (float8) num_to_scan / ts_stat->num_to_scan;
3515
3516 binaryheap_add_unordered(ts_heap, PointerGetDatum(ts_stat));
3517 }
3518
3519 binaryheap_build(ts_heap);
3520
3521 /*
3522 * Iterate through to-be-checkpointed buffers and write the ones (still)
3523 * marked with BM_CHECKPOINT_NEEDED. The writes are balanced between
3524 * tablespaces; otherwise the sorting would lead to only one tablespace
3525 * receiving writes at a time, making inefficient use of the hardware.
3526 */
3527 num_processed = 0;
3528 num_written = 0;
3529 while (!binaryheap_empty(ts_heap))
3530 {
3531 BufferDesc *bufHdr = NULL;
3532 CkptTsStatus *ts_stat = (CkptTsStatus *)
3533 DatumGetPointer(binaryheap_first(ts_heap));
3534
3535 buf_id = CkptBufferIds[ts_stat->index].buf_id;
3536 Assert(buf_id != -1);
3537
3538 bufHdr = GetBufferDescriptor(buf_id);
3539
3540 num_processed++;
3541
3542 /*
3543 * We don't need to acquire the lock here, because we're only looking
3544 * at a single bit. It's possible that someone else writes the buffer
3545 * and clears the flag right after we check, but that doesn't matter
3546 * since SyncOneBuffer will then do nothing. However, there is a
3547 * further race condition: it's conceivable that between the time we
3548 * examine the bit here and the time SyncOneBuffer acquires the lock,
3549 * someone else not only wrote the buffer but replaced it with another
3550 * page and dirtied it. In that improbable case, SyncOneBuffer will
3551 * write the buffer though we didn't need to. It doesn't seem worth
3552 * guarding against this, though.
3553 */
3554 if (pg_atomic_read_u32(&bufHdr->state) & BM_CHECKPOINT_NEEDED)
3555 {
3556 if (SyncOneBuffer(buf_id, false, &wb_context) & BUF_WRITTEN)
3557 {
3558 TRACE_POSTGRESQL_BUFFER_SYNC_WRITTEN(buf_id);
3559 PendingCheckpointerStats.buffers_written++;
3560 num_written++;
3561 }
3562 }
3563
3564 /*
3565 * Measure progress independent of actually having to flush the buffer
3566 * - otherwise writing becomes unbalanced.
3567 */
3568 ts_stat->progress += ts_stat->progress_slice;
3569 ts_stat->num_scanned++;
3570 ts_stat->index++;
3571
3572 /* Have all the buffers from the tablespace been processed? */
3573 if (ts_stat->num_scanned == ts_stat->num_to_scan)
3574 {
3575 binaryheap_remove_first(ts_heap);
3576 }
3577 else
3578 {
3579 /* update heap with the new progress */
3580 binaryheap_replace_first(ts_heap, PointerGetDatum(ts_stat));
3581 }
3582
3583 /*
3584 * Sleep to throttle our I/O rate.
3585 *
3586 * (This will check for barrier events even if it doesn't sleep.)
3587 */
3588 CheckpointWriteDelay(flags, (double) num_processed / num_to_scan);
3589 }
3590
3591 /*
3592 * Issue all pending flushes. Only checkpointer calls BufferSync(), so
3593 * IOContext will always be IOCONTEXT_NORMAL.
3594 */
3595 IssuePendingWritebacks(&wb_context, IOCONTEXT_NORMAL);
3596
3597 pfree(per_ts_stat);
3598 per_ts_stat = NULL;
3599 binaryheap_free(ts_heap);
3600
3601 /*
3602 * Update checkpoint statistics. As noted above, this doesn't include
3603 * buffers written by other backends or bgwriter scan.
3604 */
3605 CheckpointStats.ckpt_bufs_written += num_written;
3606
3607 TRACE_POSTGRESQL_BUFFER_SYNC_DONE(NBuffers, num_written, num_to_scan);
3608}
void binaryheap_build(binaryheap *heap)
Definition: binaryheap.c:138
void binaryheap_replace_first(binaryheap *heap, bh_node_type d)
Definition: binaryheap.c:255
bh_node_type binaryheap_first(binaryheap *heap)
Definition: binaryheap.c:177
bh_node_type binaryheap_remove_first(binaryheap *heap)
Definition: binaryheap.c:192
void binaryheap_free(binaryheap *heap)
Definition: binaryheap.c:75
void binaryheap_add_unordered(binaryheap *heap, bh_node_type d)
Definition: binaryheap.c:116
binaryheap * binaryheap_allocate(int capacity, binaryheap_comparator compare, void *arg)
Definition: binaryheap.c:39
#define binaryheap_empty(h)
Definition: binaryheap.h:65
CkptSortItem * CkptBufferIds
Definition: buf_init.c:25
static RelFileNumber BufTagGetRelNumber(const BufferTag *tag)
#define BM_CHECKPOINT_NEEDED
Definition: buf_internals.h:76
static int ts_ckpt_progress_comparator(Datum a, Datum b, void *arg)
Definition: bufmgr.c:6383
int checkpoint_flush_after
Definition: bufmgr.c:178
void WritebackContextInit(WritebackContext *context, int *max_pending)
Definition: bufmgr.c:6406
void IssuePendingWritebacks(WritebackContext *wb_context, IOContext io_context)
Definition: bufmgr.c:6468
struct CkptTsStatus CkptTsStatus
double float8
Definition: c.h:638
size_t Size
Definition: c.h:613
void CheckpointWriteDelay(int flags, double progress)
Definition: checkpointer.c:785
volatile sig_atomic_t ProcSignalBarrierPending
Definition: globals.c:40
void * repalloc(void *pointer, Size size)
Definition: mcxt.c:1610
void pfree(void *pointer)
Definition: mcxt.c:1594
void * palloc(Size size)
Definition: mcxt.c:1365
PgStat_CheckpointerStats PendingCheckpointerStats
static Datum PointerGetDatum(const void *X)
Definition: postgres.h:332
static Pointer DatumGetPointer(Datum X)
Definition: postgres.h:322
#define InvalidOid
Definition: postgres_ext.h:37
unsigned int Oid
Definition: postgres_ext.h:32
void ProcessProcSignalBarrier(void)
Definition: procsignal.c:499
int ckpt_bufs_written
Definition: xlog.h:167
ForkNumber forkNum
RelFileNumber relNumber
BlockNumber blockNum
float8 progress_slice
Definition: bufmgr.c:119
int index
Definition: bufmgr.c:127
int num_scanned
Definition: bufmgr.c:124
float8 progress
Definition: bufmgr.c:118
int num_to_scan
Definition: bufmgr.c:122
Oid tsId
Definition: bufmgr.c:109
PgStat_Counter buffers_written
Definition: pgstat.h:266
CheckpointStatsData CheckpointStats
Definition: xlog.c:211
#define CHECKPOINT_FLUSH_UNLOGGED
Definition: xlog.h:143
#define CHECKPOINT_END_OF_RECOVERY
Definition: xlog.h:140
#define CHECKPOINT_IS_SHUTDOWN
Definition: xlog.h:139

References Assert(), binaryheap_add_unordered(), binaryheap_allocate(), binaryheap_build(), binaryheap_empty, binaryheap_first(), binaryheap_free(), binaryheap_remove_first(), binaryheap_replace_first(), buftag::blockNum, CkptSortItem::blockNum, BM_CHECKPOINT_NEEDED, BM_DIRTY, BM_PERMANENT, CkptSortItem::buf_id, BUF_WRITTEN, PgStat_CheckpointerStats::buffers_written, BufTagGetForkNum(), BufTagGetRelNumber(), CHECKPOINT_END_OF_RECOVERY, checkpoint_flush_after, CHECKPOINT_FLUSH_UNLOGGED, CHECKPOINT_IS_SHUTDOWN, CheckpointStats, CheckpointWriteDelay(), CheckpointStatsData::ckpt_bufs_written, CkptBufferIds, DatumGetPointer(), CkptSortItem::forkNum, GetBufferDescriptor(), i, CkptTsStatus::index, InvalidOid, IOCONTEXT_NORMAL, IssuePendingWritebacks(), LockBufHdr(), NBuffers, CkptTsStatus::num_scanned, CkptTsStatus::num_to_scan, palloc(), PendingCheckpointerStats, pfree(), pg_atomic_read_u32(), PointerGetDatum(), ProcessProcSignalBarrier(), ProcSignalBarrierPending, CkptTsStatus::progress, CkptTsStatus::progress_slice, CkptSortItem::relNumber, repalloc(), buftag::spcOid, BufferDesc::state, SyncOneBuffer(), BufferDesc::tag, ts_ckpt_progress_comparator(), CkptTsStatus::tsId, CkptSortItem::tsId, UnlockBufHdrExt(), and WritebackContextInit().

Referenced by CheckPointBuffers().
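
The balancing arithmetic: each tablespace's progress_slice is the total num_to_scan divided by that tablespace's own num_to_scan, so every written buffer advances its tablespace by an inversely proportional amount and all tablespaces hit the same final progress together. A standalone sketch with illustrative numbers (not PostgreSQL code):

#include <stdio.h>

int
main(void)
{
    /* illustrative: 1000 dirty buffers total, 900 in tablespace A, 100 in B */
    double      total = 1000.0;
    double      slice_a = total / 900.0;    /* ~1.11 progress per A write */
    double      slice_b = total / 100.0;    /* 10.0  progress per B write */

    /* The min-heap always picks the lagging tablespace, so roughly nine
     * A writes are interleaved with each B write. */
    printf("final A = %.0f, final B = %.0f\n",
           900 * slice_a, 100 * slice_b);   /* both reach 1000 */
    return 0;
}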

◆ buffertag_comparator()

static int buffertag_comparator ( const BufferTag ba,
const BufferTag bb 
)
inlinestatic

Definition at line 6318 of file bufmgr.c.

6319{
6320 int ret;
6321 RelFileLocator rlocatora;
6322 RelFileLocator rlocatorb;
6323
6324 rlocatora = BufTagGetRelFileLocator(ba);
6325 rlocatorb = BufTagGetRelFileLocator(bb);
6326
6327 ret = rlocator_comparator(&rlocatora, &rlocatorb);
6328
6329 if (ret != 0)
6330 return ret;
6331
6332 if (BufTagGetForkNum(ba) < BufTagGetForkNum(bb))
6333 return -1;
6334 if (BufTagGetForkNum(ba) > BufTagGetForkNum(bb))
6335 return 1;
6336
6337 if (ba->blockNum < bb->blockNum)
6338 return -1;
6339 if (ba->blockNum > bb->blockNum)
6340 return 1;
6341
6342 return 0;
6343}
static int rlocator_comparator(const void *p1, const void *p2)
Definition: bufmgr.c:6237

References buftag::blockNum, BufTagGetForkNum(), BufTagGetRelFileLocator(), and rlocator_comparator().

◆ CheckBufferIsPinnedOnce()

void CheckBufferIsPinnedOnce ( Buffer  buffer)

Definition at line 5651 of file bufmgr.c.

5652{
5653 if (BufferIsLocal(buffer))
5654 {
5655 if (LocalRefCount[-buffer - 1] != 1)
5656 elog(ERROR, "incorrect local pin count: %d",
5657 LocalRefCount[-buffer - 1]);
5658 }
5659 else
5660 {
5661 if (GetPrivateRefCount(buffer) != 1)
5662 elog(ERROR, "incorrect local pin count: %d",
5663 GetPrivateRefCount(buffer));
5664 }
5665}
#define ERROR
Definition: elog.h:39

References PrivateRefCountEntry::buffer, BufferIsLocal, elog, ERROR, GetPrivateRefCount(), and LocalRefCount.

Referenced by GetVictimBuffer(), lazy_scan_heap(), and LockBufferForCleanup().
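
A hypothetical sketch, assuming buf is a buffer this backend has pinned exactly once; cleanup-style operations verify that invariant before taking the cleanup lock:

CheckBufferIsPinnedOnce(buf);   /* ERRORs out if the pin count is not 1 */
LockBufferForCleanup(buf);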

◆ CheckForBufferLeaks()

static void CheckForBufferLeaks ( void  )
static

Definition at line 4059 of file bufmgr.c.

4060{
4061#ifdef USE_ASSERT_CHECKING
4062 int RefCountErrors = 0;
4063 PrivateRefCountEntry *res;
4064 int i;
4065 char *s;
4066
4067 /* check the array */
4068 for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
4069 {
4070 res = &PrivateRefCountArray[i];
4071
4072 if (res->buffer != InvalidBuffer)
4073 {
4074 s = DebugPrintBufferRefcount(res->buffer);
4075 elog(WARNING, "buffer refcount leak: %s", s);
4076 pfree(s);
4077
4078 RefCountErrors++;
4079 }
4080 }
4081
4082 /* if necessary search the hash */
4083 if (PrivateRefCountOverflowed)
4084 {
4085 HASH_SEQ_STATUS hstat;
4086
4087 hash_seq_init(&hstat, PrivateRefCountHash);
4088 while ((res = (PrivateRefCountEntry *) hash_seq_search(&hstat)) != NULL)
4089 {
4090 s = DebugPrintBufferRefcount(res->buffer);
4091 elog(WARNING, "buffer refcount leak: %s", s);
4092 pfree(s);
4093 RefCountErrors++;
4094 }
4095 }
4096
4097 Assert(RefCountErrors == 0);
4098#endif
4099}
#define InvalidBuffer
Definition: buf.h:25
char * DebugPrintBufferRefcount(Buffer buffer)
Definition: bufmgr.c:4166
#define REFCOUNT_ARRAY_ENTRIES
Definition: bufmgr.c:100
static struct PrivateRefCountEntry PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES]
Definition: bufmgr.c:215
static HTAB * PrivateRefCountHash
Definition: bufmgr.c:216
void * hash_seq_search(HASH_SEQ_STATUS *status)
Definition: dynahash.c:1415
void hash_seq_init(HASH_SEQ_STATUS *status, HTAB *hashp)
Definition: dynahash.c:1380

References Assert(), PrivateRefCountEntry::buffer, DebugPrintBufferRefcount(), elog, hash_seq_init(), hash_seq_search(), i, InvalidBuffer, pfree(), PrivateRefCountArray, PrivateRefCountHash, PrivateRefCountOverflowed, REFCOUNT_ARRAY_ENTRIES, and WARNING.

Referenced by AtEOXact_Buffers(), and AtProcExit_Buffers().

◆ CheckPointBuffers()

void CheckPointBuffers ( int  flags)

Definition at line 4209 of file bufmgr.c.

4210{
4211 BufferSync(flags);
4212}
static void BufferSync(int flags)
Definition: bufmgr.c:3343

References BufferSync().

Referenced by CheckPointGuts().

◆ CheckReadBuffersOperation()

static void CheckReadBuffersOperation ( ReadBuffersOperation operation,
bool  is_complete 
)
static

Definition at line 1508 of file bufmgr.c.

1509{
1510#ifdef USE_ASSERT_CHECKING
1511 Assert(operation->nblocks_done <= operation->nblocks);
1512 Assert(!is_complete || operation->nblocks == operation->nblocks_done);
1513
1514 for (int i = 0; i < operation->nblocks; i++)
1515 {
1516 Buffer buffer = operation->buffers[i];
1517 BufferDesc *buf_hdr = BufferIsLocal(buffer) ?
1518 GetLocalBufferDescriptor(-buffer - 1) :
1519 GetBufferDescriptor(buffer - 1);
1520
1521 Assert(BufferGetBlockNumber(buffer) == operation->blocknum + i);
1522 Assert(pg_atomic_read_u32(&buf_hdr->state) & BM_TAG_VALID);
1523
1524 if (i < operation->nblocks_done)
1525 Assert(pg_atomic_read_u32(&buf_hdr->state) & BM_VALID);
1526 }
1527#endif
1528}

References Assert(), ReadBuffersOperation::blocknum, BM_TAG_VALID, BM_VALID, PrivateRefCountEntry::buffer, BufferGetBlockNumber(), BufferIsLocal, ReadBuffersOperation::buffers, GetBufferDescriptor(), GetLocalBufferDescriptor(), i, ReadBuffersOperation::nblocks, ReadBuffersOperation::nblocks_done, pg_atomic_read_u32(), and BufferDesc::state.

Referenced by StartReadBuffersImpl(), and WaitReadBuffers().

◆ ckpt_buforder_comparator()

static int ckpt_buforder_comparator ( const CkptSortItem a,
const CkptSortItem b 
)
inlinestatic

Definition at line 6352 of file bufmgr.c.

6353{
6354 /* compare tablespace */
6355 if (a->tsId < b->tsId)
6356 return -1;
6357 else if (a->tsId > b->tsId)
6358 return 1;
6359 /* compare relation */
6360 if (a->relNumber < b->relNumber)
6361 return -1;
6362 else if (a->relNumber > b->relNumber)
6363 return 1;
6364 /* compare fork */
6365 else if (a->forkNum < b->forkNum)
6366 return -1;
6367 else if (a->forkNum > b->forkNum)
6368 return 1;
6369 /* compare block number */
6370 else if (a->blockNum < b->blockNum)
6371 return -1;
6372 else if (a->blockNum > b->blockNum)
6373 return 1;
6374 /* equal page IDs are unlikely, but not impossible */
6375 return 0;
6376}
int b
Definition: isn.c:74
int a
Definition: isn.c:73

References a, and b.

◆ ConditionalLockBuffer()

bool ConditionalLockBuffer ( Buffer  buffer)

◆ ConditionalLockBufferForCleanup()

bool ConditionalLockBufferForCleanup ( Buffer  buffer)

Definition at line 5857 of file bufmgr.c.

5858{
5859 BufferDesc *bufHdr;
5860 uint32 buf_state,
5861 refcount;
5862
5863 Assert(BufferIsValid(buffer));
5864
5865 /* see AIO related comment in LockBufferForCleanup() */
5866
5867 if (BufferIsLocal(buffer))
5868 {
5869 refcount = LocalRefCount[-buffer - 1];
5870 /* There should be exactly one pin */
5871 Assert(refcount > 0);
5872 if (refcount != 1)
5873 return false;
5874 /* Nobody else to wait for */
5875 return true;
5876 }
5877
5878 /* There should be exactly one local pin */
5879 refcount = GetPrivateRefCount(buffer);
5880 Assert(refcount);
5881 if (refcount != 1)
5882 return false;
5883
5884 /* Try to acquire lock */
5885 if (!ConditionalLockBuffer(buffer))
5886 return false;
5887
5888 bufHdr = GetBufferDescriptor(buffer - 1);
5889 buf_state = LockBufHdr(bufHdr);
5890 refcount = BUF_STATE_GET_REFCOUNT(buf_state);
5891
5892 Assert(refcount > 0);
5893 if (refcount == 1)
5894 {
5895 /* Successfully acquired exclusive lock with pincount 1 */
5896 UnlockBufHdr(bufHdr);
5897 return true;
5898 }
5899
5900 /* Failed, so release the lock */
5901 UnlockBufHdr(bufHdr);
5902 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
5903 return false;
5904}
void LockBuffer(Buffer buffer, BufferLockMode mode)
Definition: bufmgr.c:5604
bool ConditionalLockBuffer(Buffer buffer)
Definition: bufmgr.c:5630
@ BUFFER_LOCK_UNLOCK
Definition: bufmgr.h:205

References Assert(), BUF_STATE_GET_REFCOUNT, PrivateRefCountEntry::buffer, BUFFER_LOCK_UNLOCK, BufferIsLocal, BufferIsValid(), ConditionalLockBuffer(), GetBufferDescriptor(), GetPrivateRefCount(), LocalRefCount, LockBuffer(), LockBufHdr(), PrivateRefCountEntry::refcount, and UnlockBufHdr().

Referenced by _hash_finish_split(), _hash_getbuf_with_condlock_cleanup(), heap_page_prune_opt(), and lazy_scan_heap().
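
A hypothetical sketch of the opportunistic pattern used by callers such as heap_page_prune_opt(): skip the page rather than wait when other backends hold pins (buf is assumed pinned once by this backend):

if (ConditionalLockBufferForCleanup(buf))
{
    /* exclusive lock acquired with pin count 1: safe to prune/defragment */
    LockBuffer(buf, BUFFER_LOCK_UNLOCK);
}
/* else: another backend holds a pin; retry later or give up */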

◆ CreateAndCopyRelationData()

void CreateAndCopyRelationData ( RelFileLocator  src_rlocator,
RelFileLocator  dst_rlocator,
bool  permanent 
)

Definition at line 5242 of file bufmgr.c.

5244{
5245 char relpersistence;
5246 SMgrRelation src_rel;
5247 SMgrRelation dst_rel;
5248
5249 /* Set the relpersistence. */
5250 relpersistence = permanent ?
5251 RELPERSISTENCE_PERMANENT : RELPERSISTENCE_UNLOGGED;
5252
5253 src_rel = smgropen(src_rlocator, INVALID_PROC_NUMBER);
5254 dst_rel = smgropen(dst_rlocator, INVALID_PROC_NUMBER);
5255
5256 /*
5257 * Create and copy all forks of the relation. During create database we
5258 * have a separate cleanup mechanism which deletes complete database
5259 * directory. Therefore, each individual relation doesn't need to be
5260 * registered for cleanup.
5261 */
5262 RelationCreateStorage(dst_rlocator, relpersistence, false);
5263
5264 /* copy main fork. */
5265 RelationCopyStorageUsingBuffer(src_rlocator, dst_rlocator, MAIN_FORKNUM,
5266 permanent);
5267
5268 /* copy those extra forks that exist */
5269 for (ForkNumber forkNum = MAIN_FORKNUM + 1;
5270 forkNum <= MAX_FORKNUM; forkNum++)
5271 {
5272 if (smgrexists(src_rel, forkNum))
5273 {
5274 smgrcreate(dst_rel, forkNum, false);
5275
5276 /*
5277 * WAL log creation if the relation is persistent, or this is the
5278 * init fork of an unlogged relation.
5279 */
5280 if (permanent || forkNum == INIT_FORKNUM)
5281 log_smgrcreate(&dst_rlocator, forkNum);
5282
5283 /* Copy a fork's data, block by block. */
5284 RelationCopyStorageUsingBuffer(src_rlocator, dst_rlocator, forkNum,
5285 permanent);
5286 }
5287 }
5288}
static void RelationCopyStorageUsingBuffer(RelFileLocator srclocator, RelFileLocator dstlocator, ForkNumber forkNum, bool permanent)
Definition: bufmgr.c:5128
@ MAIN_FORKNUM
Definition: relpath.h:58
#define MAX_FORKNUM
Definition: relpath.h:70
SMgrRelation smgropen(RelFileLocator rlocator, ProcNumber backend)
Definition: smgr.c:240
void smgrcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo)
Definition: smgr.c:481
bool smgrexists(SMgrRelation reln, ForkNumber forknum)
Definition: smgr.c:462
SMgrRelation RelationCreateStorage(RelFileLocator rlocator, char relpersistence, bool register_delete)
Definition: storage.c:122
void log_smgrcreate(const RelFileLocator *rlocator, ForkNumber forkNum)
Definition: storage.c:187

References INIT_FORKNUM, INVALID_PROC_NUMBER, log_smgrcreate(), MAIN_FORKNUM, MAX_FORKNUM, RelationCopyStorageUsingBuffer(), RelationCreateStorage(), smgrcreate(), smgrexists(), and smgropen().

Referenced by CreateDatabaseUsingWalLog().
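
A hypothetical sketch, assuming src_rlocator and dst_rlocator are valid RelFileLocators; this is the per-relation copy step of CREATE DATABASE with STRATEGY = WAL_LOG:

/* copies all existing forks, WAL-logging the permanent ones */
CreateAndCopyRelationData(src_rlocator, dst_rlocator, true);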

◆ DebugPrintBufferRefcount()

char * DebugPrintBufferRefcount ( Buffer  buffer)

Definition at line 4166 of file bufmgr.c.

4167{
4168 BufferDesc *buf;
4169 int32 loccount;
4170 char *result;
4171 ProcNumber backend;
4172 uint32 buf_state;
4173
4174 Assert(BufferIsValid(buffer));
4175 if (BufferIsLocal(buffer))
4176 {
4177 buf = GetLocalBufferDescriptor(-buffer - 1);
4178 loccount = LocalRefCount[-buffer - 1];
4179 backend = MyProcNumber;
4180 }
4181 else
4182 {
4183 buf = GetBufferDescriptor(buffer - 1);
4184 loccount = GetPrivateRefCount(buffer);
4185 backend = INVALID_PROC_NUMBER;
4186 }
4187
4188 /* theoretically we should lock the bufhdr here */
4189 buf_state = pg_atomic_read_u32(&buf->state);
4190
4191 result = psprintf("[%03d] (rel=%s, blockNum=%u, flags=0x%x, refcount=%u %d)",
4192 buffer,
4193 relpathbackend(BufTagGetRelFileLocator(&buf->tag), backend,
4194 BufTagGetForkNum(&buf->tag)).str,
4195 buf->tag.blockNum, buf_state & BUF_FLAG_MASK,
4196 BUF_STATE_GET_REFCOUNT(buf_state), loccount);
4197 return result;
4198}
#define BUF_FLAG_MASK
Definition: buf_internals.h:56
char * psprintf(const char *fmt,...)
Definition: psprintf.c:43

References Assert(), buf, BUF_FLAG_MASK, BUF_STATE_GET_REFCOUNT, PrivateRefCountEntry::buffer, BufferIsLocal, BufferIsValid(), BufTagGetForkNum(), BufTagGetRelFileLocator(), GetBufferDescriptor(), GetLocalBufferDescriptor(), GetPrivateRefCount(), INVALID_PROC_NUMBER, LocalRefCount, MyProcNumber, pg_atomic_read_u32(), psprintf(), and relpathbackend.

Referenced by buffer_call_start_io(), buffer_call_terminate_io(), CheckForBufferLeaks(), CheckForLocalBufferLeaks(), and ResOwnerPrintBufferPin().
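
A hypothetical sketch, assuming buf is a valid buffer; the returned string is palloc'd, so release it after logging:

char       *s = DebugPrintBufferRefcount(buf);

elog(WARNING, "buffer state: %s", s);
pfree(s);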

◆ DropDatabaseBuffers()

void DropDatabaseBuffers ( Oid  dbid)

Definition at line 4895 of file bufmgr.c.

4896{
4897 int i;
4898
4899 /*
4900 * We needn't consider local buffers, since by assumption the target
4901 * database isn't our own.
4902 */
4903
4904 for (i = 0; i < NBuffers; i++)
4905 {
4906 BufferDesc *bufHdr = GetBufferDescriptor(i);
4907
4908 /*
4909 * As in DropRelationBuffers, an unlocked precheck should be safe and
4910 * saves some cycles.
4911 */
4912 if (bufHdr->tag.dbOid != dbid)
4913 continue;
4914
4915 LockBufHdr(bufHdr);
4916 if (bufHdr->tag.dbOid == dbid)
4917 InvalidateBuffer(bufHdr); /* releases spinlock */
4918 else
4919 UnlockBufHdr(bufHdr);
4920 }
4921}
static void InvalidateBuffer(BufferDesc *buf)
Definition: bufmgr.c:2154

References buftag::dbOid, GetBufferDescriptor(), i, InvalidateBuffer(), LockBufHdr(), NBuffers, BufferDesc::tag, and UnlockBufHdr().

Referenced by createdb_failure_callback(), dbase_redo(), dropdb(), and movedb().

◆ DropRelationBuffers()

void DropRelationBuffers ( SMgrRelation  smgr_reln,
ForkNumber forkNum,
int  nforks,
BlockNumber firstDelBlock 
)

Definition at line 4545 of file bufmgr.c.

4547{
4548 int i;
4549 int j;
4550 RelFileLocatorBackend rlocator;
4551 BlockNumber nForkBlock[MAX_FORKNUM];
4552 uint64 nBlocksToInvalidate = 0;
4553
4554 rlocator = smgr_reln->smgr_rlocator;
4555
4556 /* If it's a local relation, it's localbuf.c's problem. */
4557 if (RelFileLocatorBackendIsTemp(rlocator))
4558 {
4559 if (rlocator.backend == MyProcNumber)
4560 DropRelationLocalBuffers(rlocator.locator, forkNum, nforks,
4561 firstDelBlock);
4562
4563 return;
4564 }
4565
4566 /*
4567 * To remove all the pages of the specified relation forks from the buffer
4568 * pool, we need to scan the entire buffer pool but we can optimize it by
4569 * finding the buffers from BufMapping table provided we know the exact
4570 * size of each fork of the relation. The exact size is required to ensure
4571 * that we don't leave any buffer for the relation being dropped as
4572 * otherwise the background writer or checkpointer can lead to a PANIC
4573 * error while flushing buffers corresponding to files that don't exist.
4574 *
4575 * To know the exact size, we rely on the size cached for each fork by us
4576 * during recovery which limits the optimization to recovery and on
4577 * standbys but we can easily extend it once we have shared cache for
4578 * relation size.
4579 *
4580 * In recovery, we cache the value returned by the first lseek(SEEK_END)
4581 * and the future writes keeps the cached value up-to-date. See
4582 * smgrextend. It is possible that the value of the first lseek is smaller
4583 * than the actual number of existing blocks in the file due to buggy
4584 * Linux kernels that might not have accounted for the recent write. But
4585 * that should be fine because there must not be any buffers after that
4586 * file size.
4587 */
4588 for (i = 0; i < nforks; i++)
4589 {
4590 /* Get the number of blocks for a relation's fork */
4591 nForkBlock[i] = smgrnblocks_cached(smgr_reln, forkNum[i]);
4592
4593 if (nForkBlock[i] == InvalidBlockNumber)
4594 {
4595 nBlocksToInvalidate = InvalidBlockNumber;
4596 break;
4597 }
4598
4599 /* calculate the number of blocks to be invalidated */
4600 nBlocksToInvalidate += (nForkBlock[i] - firstDelBlock[i]);
4601 }
4602
4603 /*
4604 * We apply the optimization iff the total number of blocks to invalidate
4605 * is below the BUF_DROP_FULL_SCAN_THRESHOLD.
4606 */
4607 if (BlockNumberIsValid(nBlocksToInvalidate) &&
4608 nBlocksToInvalidate < BUF_DROP_FULL_SCAN_THRESHOLD)
4609 {
4610 for (j = 0; j < nforks; j++)
4611 FindAndDropRelationBuffers(rlocator.locator, forkNum[j],
4612 nForkBlock[j], firstDelBlock[j]);
4613 return;
4614 }
4615
4616 for (i = 0; i < NBuffers; i++)
4617 {
4618 BufferDesc *bufHdr = GetBufferDescriptor(i);
4619
4620 /*
4621 * We can make this a tad faster by prechecking the buffer tag before
4622 * we attempt to lock the buffer; this saves a lot of lock
4623 * acquisitions in typical cases. It should be safe because the
4624 * caller must have AccessExclusiveLock on the relation, or some other
4625 * reason to be certain that no one is loading new pages of the rel
4626 * into the buffer pool. (Otherwise we might well miss such pages
4627 * entirely.) Therefore, while the tag might be changing while we
4628 * look at it, it can't be changing *to* a value we care about, only
4629 * *away* from such a value. So false negatives are impossible, and
4630 * false positives are safe because we'll recheck after getting the
4631 * buffer lock.
4632 *
4633 * We could check forkNum and blockNum as well as the rlocator, but
4634 * the incremental win from doing so seems small.
4635 */
4636 if (!BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator.locator))
4637 continue;
4638
4639 LockBufHdr(bufHdr);
4640
4641 for (j = 0; j < nforks; j++)
4642 {
4643 if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator.locator) &&
4644 BufTagGetForkNum(&bufHdr->tag) == forkNum[j] &&
4645 bufHdr->tag.blockNum >= firstDelBlock[j])
4646 {
4647 InvalidateBuffer(bufHdr); /* releases spinlock */
4648 break;
4649 }
4650 }
4651 if (j >= nforks)
4652 UnlockBufHdr(bufHdr);
4653 }
4654}
#define InvalidBlockNumber
Definition: block.h:33
static bool BlockNumberIsValid(BlockNumber blockNumber)
Definition: block.h:71
static bool BufTagMatchesRelFileLocator(const BufferTag *tag, const RelFileLocator *rlocator)
#define BUF_DROP_FULL_SCAN_THRESHOLD
Definition: bufmgr.c:91
static void FindAndDropRelationBuffers(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber nForkBlock, BlockNumber firstDelBlock)
Definition: bufmgr.c:4835
void DropRelationLocalBuffers(RelFileLocator rlocator, ForkNumber *forkNum, int nforks, BlockNumber *firstDelBlock)
Definition: localbuf.c:665
#define RelFileLocatorBackendIsTemp(rlocator)
BlockNumber smgrnblocks_cached(SMgrRelation reln, ForkNumber forknum)
Definition: smgr.c:847

References RelFileLocatorBackend::backend, buftag::blockNum, BlockNumberIsValid(), BUF_DROP_FULL_SCAN_THRESHOLD, BufTagGetForkNum(), BufTagMatchesRelFileLocator(), DropRelationLocalBuffers(), FindAndDropRelationBuffers(), GetBufferDescriptor(), i, InvalidateBuffer(), InvalidBlockNumber, j, RelFileLocatorBackend::locator, LockBufHdr(), MAX_FORKNUM, MyProcNumber, NBuffers, RelFileLocatorBackendIsTemp, SMgrRelationData::smgr_rlocator, smgrnblocks_cached(), BufferDesc::tag, and UnlockBufHdr().

Referenced by smgrtruncate().
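
To get a feel for the threshold, here is a worked example (illustrative numbers, not taken from the source):

    /*
     * With the default 8 kB block size, shared_buffers = 1GB gives
     * NBuffers = 131072, so BUF_DROP_FULL_SCAN_THRESHOLD = 131072 / 32 =
     * 4096 blocks (32 MB). A truncation that removes fewer than 4096
     * blocks (and whose fork sizes are all cached) probes the BufMapping
     * table per block; anything larger, or any fork whose cached size is
     * unknown, falls back to scanning all 131072 buffer headers.
     */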

◆ DropRelationsAllBuffers()

void DropRelationsAllBuffers ( SMgrRelation smgr_reln,
int  nlocators 
)

Definition at line 4665 of file bufmgr.c.

4666{
4667 int i;
4668 int n = 0;
4669 SMgrRelation *rels;
4670 BlockNumber (*block)[MAX_FORKNUM + 1];
4671 uint64 nBlocksToInvalidate = 0;
4672 RelFileLocator *locators;
4673 bool cached = true;
4674 bool use_bsearch;
4675
4676 if (nlocators == 0)
4677 return;
4678
4679 rels = palloc(sizeof(SMgrRelation) * nlocators); /* non-local relations */
4680
4681 /* If it's a local relation, it's localbuf.c's problem. */
4682 for (i = 0; i < nlocators; i++)
4683 {
4684 if (RelFileLocatorBackendIsTemp(smgr_reln[i]->smgr_rlocator))
4685 {
4686 if (smgr_reln[i]->smgr_rlocator.backend == MyProcNumber)
4687 DropRelationAllLocalBuffers(smgr_reln[i]->smgr_rlocator.locator);
4688 }
4689 else
4690 rels[n++] = smgr_reln[i];
4691 }
4692
4693 /*
4694 * If there are no non-local relations, then we're done. Release the
4695 * memory and return.
4696 */
4697 if (n == 0)
4698 {
4699 pfree(rels);
4700 return;
4701 }
4702
4703 /*
4704 * This is used to remember the number of blocks for all the relation
4705 * forks.
4706 */
4707 block = (BlockNumber (*)[MAX_FORKNUM + 1])
4708 palloc(sizeof(BlockNumber) * n * (MAX_FORKNUM + 1));
4709
4710 /*
4711 * We can avoid scanning the entire buffer pool if we know the exact size
4712 * of each of the given relation forks. See DropRelationBuffers.
4713 */
4714 for (i = 0; i < n && cached; i++)
4715 {
4716 for (int j = 0; j <= MAX_FORKNUM; j++)
4717 {
4718 /* Get the number of blocks for a relation's fork. */
4719 block[i][j] = smgrnblocks_cached(rels[i], j);
4720
4721 /* We only need to consider the relation forks that exist. */
4722 if (block[i][j] == InvalidBlockNumber)
4723 {
4724 if (!smgrexists(rels[i], j))
4725 continue;
4726 cached = false;
4727 break;
4728 }
4729
4730 /* calculate the total number of blocks to be invalidated */
4731 nBlocksToInvalidate += block[i][j];
4732 }
4733 }
4734
4735 /*
4736 * We apply the optimization iff the total number of blocks to invalidate
4737 * is below the BUF_DROP_FULL_SCAN_THRESHOLD.
4738 */
4739 if (cached && nBlocksToInvalidate < BUF_DROP_FULL_SCAN_THRESHOLD)
4740 {
4741 for (i = 0; i < n; i++)
4742 {
4743 for (int j = 0; j <= MAX_FORKNUM; j++)
4744 {
4745 /* ignore relation forks that don't exist */
4746 if (!BlockNumberIsValid(block[i][j]))
4747 continue;
4748
4749 /* drop all the buffers for a particular relation fork */
4750 FindAndDropRelationBuffers(rels[i]->smgr_rlocator.locator,
4751 j, block[i][j], 0);
4752 }
4753 }
4754
4755 pfree(block);
4756 pfree(rels);
4757 return;
4758 }
4759
4760 pfree(block);
4761 locators = palloc(sizeof(RelFileLocator) * n); /* non-local relations */
4762 for (i = 0; i < n; i++)
4763 locators[i] = rels[i]->smgr_rlocator.locator;
4764
4765 /*
4766 * For a small number of relations to drop, just use a simple walk-through
4767 * to save the bsearch overhead. The threshold is more a guess than an
4768 * exactly determined value, as it depends on many factors (CPU and RAM
4769 * speeds, amount of shared buffers etc.).
4770 */
4771 use_bsearch = n > RELS_BSEARCH_THRESHOLD;
4772
4773 /* sort the list of rlocators if necessary */
4774 if (use_bsearch)
4775 qsort(locators, n, sizeof(RelFileLocator), rlocator_comparator);
4776
4777 for (i = 0; i < NBuffers; i++)
4778 {
4779 RelFileLocator *rlocator = NULL;
4780 BufferDesc *bufHdr = GetBufferDescriptor(i);
4781
4782 /*
4783 * As in DropRelationBuffers, an unlocked precheck should be safe and
4784 * saves some cycles.
4785 */
4786
4787 if (!use_bsearch)
4788 {
4789 int j;
4790
4791 for (j = 0; j < n; j++)
4792 {
4793 if (BufTagMatchesRelFileLocator(&bufHdr->tag, &locators[j]))
4794 {
4795 rlocator = &locators[j];
4796 break;
4797 }
4798 }
4799 }
4800 else
4801 {
4802 RelFileLocator locator;
4803
4804 locator = BufTagGetRelFileLocator(&bufHdr->tag);
4805 rlocator = bsearch(&locator,
4806 locators, n, sizeof(RelFileLocator),
4807 rlocator_comparator);
4808 }
4809
4810 /* buffer doesn't belong to any of the given relfilelocators; skip it */
4811 if (rlocator == NULL)
4812 continue;
4813
4814 LockBufHdr(bufHdr);
4815 if (BufTagMatchesRelFileLocator(&bufHdr->tag, rlocator))
4816 InvalidateBuffer(bufHdr); /* releases spinlock */
4817 else
4818 UnlockBufHdr(bufHdr);
4819 }
4820
4821 pfree(locators);
4822 pfree(rels);
4823}
#define RELS_BSEARCH_THRESHOLD
Definition: bufmgr.c:83
void DropRelationAllLocalBuffers(RelFileLocator rlocator)
Definition: localbuf.c:702
#define qsort(a, b, c, d)
Definition: port.h:500

References BlockNumberIsValid(), BUF_DROP_FULL_SCAN_THRESHOLD, BufTagGetRelFileLocator(), BufTagMatchesRelFileLocator(), DropRelationAllLocalBuffers(), FindAndDropRelationBuffers(), GetBufferDescriptor(), i, if(), InvalidateBuffer(), InvalidBlockNumber, j, LockBufHdr(), MAX_FORKNUM, MyProcNumber, NBuffers, palloc(), pfree(), qsort, RelFileLocatorBackendIsTemp, RELS_BSEARCH_THRESHOLD, rlocator_comparator(), smgrexists(), smgrnblocks_cached(), BufferDesc::tag, and UnlockBufHdr().

Referenced by smgrdounlinkall().
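
A hedged condensation of the per-buffer membership test used above: below RELS_BSEARCH_THRESHOLD the unsorted locator array is scanned linearly; above it the array has been qsort'ed once and each of the NBuffers iterations does a bsearch instead. Variable names follow the listing.

    RelFileLocator *match = NULL;

    if (!use_bsearch)
    {
        /* few relations: a linear scan beats sorting plus bsearch */
        for (int j = 0; j < n; j++)
            if (BufTagMatchesRelFileLocator(&bufHdr->tag, &locators[j]))
            {
                match = &locators[j];
                break;
            }
    }
    else
    {
        /* many relations: binary search the sorted locator array */
        RelFileLocator key = BufTagGetRelFileLocator(&bufHdr->tag);

        match = bsearch(&key, locators, n, sizeof(RelFileLocator),
                        rlocator_comparator);
    }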

◆ EvictAllUnpinnedBuffers()

void EvictAllUnpinnedBuffers ( int32 buffers_evicted,
int32 buffers_flushed,
int32 buffers_skipped 
)

Definition at line 6683 of file bufmgr.c.

6685{
6686 *buffers_evicted = 0;
6687 *buffers_skipped = 0;
6688 *buffers_flushed = 0;
6689
6690 for (int buf = 1; buf <= NBuffers; buf++)
6691 {
6692 BufferDesc *desc = GetBufferDescriptor(buf - 1);
6693 uint32 buf_state;
6694 bool buffer_flushed;
6695
6697 CHECK_FOR_INTERRUPTS();
6698 buf_state = pg_atomic_read_u32(&desc->state);
6699 if (!(buf_state & BM_VALID))
6700 continue;
6701
6702 ResourceOwnerEnlarge(CurrentResourceOwner);
6703 ReservePrivateRefCountEntry();
6704
6705 LockBufHdr(desc);
6706
6707 if (EvictUnpinnedBufferInternal(desc, &buffer_flushed))
6708 (*buffers_evicted)++;
6709 else
6710 (*buffers_skipped)++;
6711
6712 if (buffer_flushed)
6713 (*buffers_flushed)++;
6714 }
6715}
static bool EvictUnpinnedBufferInternal(BufferDesc *desc, bool *buffer_flushed)
Definition: bufmgr.c:6592
#define CHECK_FOR_INTERRUPTS()
Definition: miscadmin.h:123

References BM_VALID, buf, CHECK_FOR_INTERRUPTS, CurrentResourceOwner, EvictUnpinnedBufferInternal(), GetBufferDescriptor(), LockBufHdr(), NBuffers, pg_atomic_read_u32(), ReservePrivateRefCountEntry(), ResourceOwnerEnlarge(), and BufferDesc::state.

Referenced by pg_buffercache_evict_all().
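
A hedged usage sketch, shaped like the pg_buffercache_evict_all() caller (the actual SQL-callable wrapper lives in contrib/pg_buffercache and is not reproduced here):

    int32   buffers_evicted;
    int32   buffers_flushed;
    int32   buffers_skipped;

    EvictAllUnpinnedBuffers(&buffers_evicted, &buffers_flushed,
                            &buffers_skipped);

    elog(INFO, "evicted %d buffers (%d needed flushing), skipped %d",
         buffers_evicted, buffers_flushed, buffers_skipped);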

◆ EvictRelUnpinnedBuffers()

void EvictRelUnpinnedBuffers ( Relation  rel,
int32 buffers_evicted,
int32 buffers_flushed,
int32 buffers_skipped 
)

Definition at line 6733 of file bufmgr.c.

6735{
6736 Assert(!RelationUsesLocalBuffers(rel));
6737
6738 *buffers_skipped = 0;
6739 *buffers_evicted = 0;
6740 *buffers_flushed = 0;
6741
6742 for (int buf = 1; buf <= NBuffers; buf++)
6743 {
6744 BufferDesc *desc = GetBufferDescriptor(buf - 1);
6745 uint32 buf_state = pg_atomic_read_u32(&(desc->state));
6746 bool buffer_flushed;
6747
6749
6750 /* An unlocked precheck should be safe and saves some cycles. */
6751 if ((buf_state & BM_VALID) == 0 ||
6752 !BufTagMatchesRelFileLocator(&desc->tag, &rel->rd_locator))
6753 continue;
6754
6755 /* Make sure we can pin the buffer. */
6756 ResourceOwnerEnlarge(CurrentResourceOwner);
6757 ReservePrivateRefCountEntry();
6758
6759 buf_state = LockBufHdr(desc);
6760
6761 /* recheck, could have changed without the lock */
6762 if ((buf_state & BM_VALID) == 0 ||
6763 !BufTagMatchesRelFileLocator(&desc->tag, &rel->rd_locator))
6764 {
6765 UnlockBufHdr(desc);
6766 continue;
6767 }
6768
6769 if (EvictUnpinnedBufferInternal(desc, &buffer_flushed))
6770 (*buffers_evicted)++;
6771 else
6772 (*buffers_skipped)++;
6773
6774 if (buffer_flushed)
6775 (*buffers_flushed)++;
6776 }
6777}
#define RelationUsesLocalBuffers(relation)
Definition: rel.h:647
RelFileLocator rd_locator
Definition: rel.h:57

References Assert(), BM_VALID, buf, BufTagMatchesRelFileLocator(), CHECK_FOR_INTERRUPTS, CurrentResourceOwner, EvictUnpinnedBufferInternal(), GetBufferDescriptor(), LockBufHdr(), NBuffers, pg_atomic_read_u32(), RelationData::rd_locator, RelationUsesLocalBuffers, ReservePrivateRefCountEntry(), ResourceOwnerEnlarge(), BufferDesc::state, BufferDesc::tag, and UnlockBufHdr().

Referenced by pg_buffercache_evict_relation().

◆ EvictUnpinnedBuffer()

bool EvictUnpinnedBuffer ( Buffer  buf,
bool *  buffer_flushed 
)

Definition at line 6654 of file bufmgr.c.

6655{
6656 BufferDesc *desc;
6657
6658 Assert(BufferIsValid(buf) && !BufferIsLocal(buf));
6659
6660 /* Make sure we can pin the buffer. */
6661 ResourceOwnerEnlarge(CurrentResourceOwner);
6662 ReservePrivateRefCountEntry();
6663
6664 desc = GetBufferDescriptor(buf - 1);
6665 LockBufHdr(desc);
6666
6667 return EvictUnpinnedBufferInternal(desc, buffer_flushed);
6668}

References Assert(), buf, BufferIsLocal, BufferIsValid(), CurrentResourceOwner, EvictUnpinnedBufferInternal(), GetBufferDescriptor(), LockBufHdr(), ReservePrivateRefCountEntry(), and ResourceOwnerEnlarge().

Referenced by invalidate_rel_block(), modify_rel_block(), and pg_buffercache_evict().
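
A hedged single-buffer sketch in the spirit of pg_buffercache_evict(); `buffer` is assumed to be a valid shared (non-local) buffer that this backend does not itself have pinned:

    bool    buffer_flushed;

    if (EvictUnpinnedBuffer(buffer, &buffer_flushed))
        elog(DEBUG1, "buffer %d evicted%s", buffer,
             buffer_flushed ? " (was dirty, flushed first)" : "");
    else
        elog(DEBUG1, "buffer %d pinned or invalid, not evicted", buffer);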

◆ EvictUnpinnedBufferInternal()

static bool EvictUnpinnedBufferInternal ( BufferDesc desc,
bool *  buffer_flushed 
)
static

Definition at line 6592 of file bufmgr.c.

6593{
6594 uint32 buf_state;
6595 bool result;
6596
6597 *buffer_flushed = false;
6598
6599 buf_state = pg_atomic_read_u32(&(desc->state));
6600 Assert(buf_state & BM_LOCKED);
6601
6602 if ((buf_state & BM_VALID) == 0)
6603 {
6604 UnlockBufHdr(desc);
6605 return false;
6606 }
6607
6608 /* Check that it's not pinned already. */
6609 if (BUF_STATE_GET_REFCOUNT(buf_state) > 0)
6610 {
6611 UnlockBufHdr(desc);
6612 return false;
6613 }
6614
6615 PinBuffer_Locked(desc); /* releases spinlock */
6616
6617 /* If it was dirty, try to clean it once. */
6618 if (buf_state & BM_DIRTY)
6619 {
6620 FlushUnlockedBuffer(desc, NULL, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
6621 *buffer_flushed = true;
6622 }
6623
6624 /* This will return false if it becomes dirty or someone else pins it. */
6625 result = InvalidateVictimBuffer(desc);
6626
6627 UnpinBuffer(desc);
6628
6629 return result;
6630}
#define BM_LOCKED
Definition: buf_internals.h:68
static void FlushUnlockedBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object, IOContext io_context)
Definition: bufmgr.c:4420
static void PinBuffer_Locked(BufferDesc *buf)
Definition: bufmgr.c:3179
static bool InvalidateVictimBuffer(BufferDesc *buf_hdr)
Definition: bufmgr.c:2249

References Assert(), BM_DIRTY, BM_LOCKED, BM_VALID, BUF_STATE_GET_REFCOUNT, FlushUnlockedBuffer(), InvalidateVictimBuffer(), IOCONTEXT_NORMAL, IOOBJECT_RELATION, pg_atomic_read_u32(), PinBuffer_Locked(), BufferDesc::state, UnlockBufHdr(), and UnpinBuffer().

Referenced by EvictAllUnpinnedBuffers(), EvictRelUnpinnedBuffers(), and EvictUnpinnedBuffer().

◆ ExtendBufferedRel()

Buffer ExtendBufferedRel ( BufferManagerRelation  bmr,
ForkNumber  forkNum,
BufferAccessStrategy  strategy,
uint32  flags 
)

Definition at line 845 of file bufmgr.c.

849{
850 Buffer buf;
851 uint32 extend_by = 1;
852
853 ExtendBufferedRelBy(bmr, forkNum, strategy, flags, extend_by,
854 &buf, &extend_by);
855
856 return buf;
857}
BlockNumber ExtendBufferedRelBy(BufferManagerRelation bmr, ForkNumber fork, BufferAccessStrategy strategy, uint32 flags, uint32 extend_by, Buffer *buffers, uint32 *extended_by)
Definition: bufmgr.c:877

References buf, and ExtendBufferedRelBy().

Referenced by _bt_allocbuf(), _hash_getnewbuf(), BloomNewBuffer(), brinbuild(), brinbuildempty(), fill_seq_fork_with_data(), ginbuildempty(), GinNewBuffer(), gistbuildempty(), gistNewBuffer(), ReadBuffer_common(), revmap_physical_extend(), and SpGistNewBuffer().
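
A hedged sketch of the common caller pattern (compare _hash_getnewbuf() or BloomNewBuffer()): extend by one block, get the new buffer back already exclusively locked via EB_LOCK_FIRST, and initialize the page before anyone else can see it. WAL-logging and critical sections are omitted here.

    Buffer  buf;
    Page    page;

    buf = ExtendBufferedRel(BMR_REL(rel), MAIN_FORKNUM, NULL, EB_LOCK_FIRST);
    page = BufferGetPage(buf);
    PageInit(page, BufferGetPageSize(buf), 0);

    /* ... fill in the page (and WAL-log it in a real caller) ... */
    MarkBufferDirty(buf);
    UnlockReleaseBuffer(buf);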

◆ ExtendBufferedRelBy()

BlockNumber ExtendBufferedRelBy ( BufferManagerRelation  bmr,
ForkNumber  fork,
BufferAccessStrategy  strategy,
uint32  flags,
uint32  extend_by,
Buffer buffers,
uint32 extended_by 
)

Definition at line 877 of file bufmgr.c.

884{
885 Assert((bmr.rel != NULL) != (bmr.smgr != NULL));
886 Assert(bmr.smgr == NULL || bmr.relpersistence != '\0');
887 Assert(extend_by > 0);
888
889 if (bmr.relpersistence == '\0')
890 bmr.relpersistence = bmr.rel->rd_rel->relpersistence;
891
892 return ExtendBufferedRelCommon(bmr, fork, strategy, flags,
893 extend_by, InvalidBlockNumber,
894 buffers, extended_by);
895}
static BlockNumber ExtendBufferedRelCommon(BufferManagerRelation bmr, ForkNumber fork, BufferAccessStrategy strategy, uint32 flags, uint32 extend_by, BlockNumber extend_upto, Buffer *buffers, uint32 *extended_by)
Definition: bufmgr.c:2531
SMgrRelation smgr
Definition: bufmgr.h:110
Form_pg_class rd_rel
Definition: rel.h:111

References Assert(), ExtendBufferedRelCommon(), InvalidBlockNumber, RelationData::rd_rel, BufferManagerRelation::rel, BufferManagerRelation::relpersistence, and BufferManagerRelation::smgr.

Referenced by ExtendBufferedRel(), grow_rel(), and RelationAddBlocks().
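
A hedged bulk-extension sketch, loosely following RelationAddBlocks(): request several blocks at once, keep the first for immediate use, and release the rest (they stay in shared buffers for other backends to find). The count of 16 is illustrative.

    Buffer      new_buffers[16];
    uint32      extended_by = 0;
    BlockNumber first_block;

    first_block = ExtendBufferedRelBy(BMR_REL(rel), MAIN_FORKNUM,
                                      NULL /* strategy */, 0 /* flags */,
                                      16, new_buffers, &extended_by);

    /* extended_by may be less than 16, e.g. due to the pin limit */
    for (uint32 i = 1; i < extended_by; i++)
        ReleaseBuffer(new_buffers[i]);
    /* new_buffers[0] (block first_block) remains pinned for our use */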

◆ ExtendBufferedRelCommon()

static BlockNumber ExtendBufferedRelCommon ( BufferManagerRelation  bmr,
ForkNumber  fork,
BufferAccessStrategy  strategy,
uint32  flags,
uint32  extend_by,
BlockNumber  extend_upto,
Buffer buffers,
uint32 extended_by 
)
static

Definition at line 2531 of file bufmgr.c.

2539{
2540 BlockNumber first_block;
2541
2542 TRACE_POSTGRESQL_BUFFER_EXTEND_START(fork,
2543 BMR_GET_SMGR(bmr)->smgr_rlocator.locator.spcOid,
2544 BMR_GET_SMGR(bmr)->smgr_rlocator.locator.dbOid,
2545 BMR_GET_SMGR(bmr)->smgr_rlocator.locator.relNumber,
2546 BMR_GET_SMGR(bmr)->smgr_rlocator.backend,
2547 extend_by);
2548
2549 if (bmr.relpersistence == RELPERSISTENCE_TEMP)
2550 first_block = ExtendBufferedRelLocal(bmr, fork, flags,
2551 extend_by, extend_upto,
2552 buffers, &extend_by);
2553 else
2554 first_block = ExtendBufferedRelShared(bmr, fork, strategy, flags,
2555 extend_by, extend_upto,
2556 buffers, &extend_by);
2557 *extended_by = extend_by;
2558
2559 TRACE_POSTGRESQL_BUFFER_EXTEND_DONE(fork,
2560 BMR_GET_SMGR(bmr)->smgr_rlocator.locator.spcOid,
2561 BMR_GET_SMGR(bmr)->smgr_rlocator.locator.dbOid,
2562 BMR_GET_SMGR(bmr)->smgr_rlocator.locator.relNumber,
2563 BMR_GET_SMGR(bmr)->smgr_rlocator.backend,
2564 *extended_by,
2565 first_block);
2566
2567 return first_block;
2568}
static BlockNumber ExtendBufferedRelShared(BufferManagerRelation bmr, ForkNumber fork, BufferAccessStrategy strategy, uint32 flags, uint32 extend_by, BlockNumber extend_upto, Buffer *buffers, uint32 *extended_by)
Definition: bufmgr.c:2575
#define BMR_GET_SMGR(bmr)
Definition: bufmgr.h:118
BlockNumber ExtendBufferedRelLocal(BufferManagerRelation bmr, ForkNumber fork, uint32 flags, uint32 extend_by, BlockNumber extend_upto, Buffer *buffers, uint32 *extended_by)
Definition: localbuf.c:346

References BMR_GET_SMGR, ExtendBufferedRelLocal(), ExtendBufferedRelShared(), and BufferManagerRelation::relpersistence.

Referenced by ExtendBufferedRelBy(), and ExtendBufferedRelTo().

◆ ExtendBufferedRelShared()

static BlockNumber ExtendBufferedRelShared ( BufferManagerRelation  bmr,
ForkNumber  fork,
BufferAccessStrategy  strategy,
uint32  flags,
uint32  extend_by,
BlockNumber  extend_upto,
Buffer buffers,
uint32 extended_by 
)
static

Definition at line 2575 of file bufmgr.c.

2583{
2584 BlockNumber first_block;
2585 IOContext io_context = IOContextForStrategy(strategy);
2586 instr_time io_start;
2587
2588 LimitAdditionalPins(&extend_by);
2589
2590 /*
2591 * Acquire victim buffers for extension without holding extension lock.
2592 * Writing out victim buffers is the most expensive part of extending the
2593 * relation, particularly when doing so requires WAL flushes. Zeroing out
2594 * the buffers is also quite expensive, so do that before holding the
2595 * extension lock as well.
2596 *
2597 * These pages are pinned by us and not valid. While we hold the pin they
2598 * can't be acquired as victim buffers by another backend.
2599 */
2600 for (uint32 i = 0; i < extend_by; i++)
2601 {
2602 Block buf_block;
2603
2604 buffers[i] = GetVictimBuffer(strategy, io_context);
2605 buf_block = BufHdrGetBlock(GetBufferDescriptor(buffers[i] - 1));
2606
2607 /* new buffers are zero-filled */
2608 MemSet(buf_block, 0, BLCKSZ);
2609 }
2610
2611 /*
2612 * Lock relation against concurrent extensions, unless requested not to.
2613 *
2614 * We use the same extension lock for all forks. That's unnecessarily
2615 * restrictive, but currently extensions for forks don't happen often
2616 * enough to make it worth locking more granularly.
2617 *
2618 * Note that another backend might have extended the relation by the time
2619 * we get the lock.
2620 */
2621 if (!(flags & EB_SKIP_EXTENSION_LOCK))
2622 LockRelationForExtension(bmr.rel, ExclusiveLock);
2623
2624 /*
2625 * If requested, invalidate size cache, so that smgrnblocks asks the
2626 * kernel.
2627 */
2628 if (flags & EB_CLEAR_SIZE_CACHE)
2629 BMR_GET_SMGR(bmr)->smgr_cached_nblocks[fork] = InvalidBlockNumber;
2630
2631 first_block = smgrnblocks(BMR_GET_SMGR(bmr), fork);
2632
2633 /*
2634 * Now that we have the accurate relation size, check if the caller wants
2635 * us to extend to only up to a specific size. If there were concurrent
2636 * extensions, we might have acquired too many buffers and need to release
2637 * them.
2638 */
2639 if (extend_upto != InvalidBlockNumber)
2640 {
2641 uint32 orig_extend_by = extend_by;
2642
2643 if (first_block > extend_upto)
2644 extend_by = 0;
2645 else if ((uint64) first_block + extend_by > extend_upto)
2646 extend_by = extend_upto - first_block;
2647
2648 for (uint32 i = extend_by; i < orig_extend_by; i++)
2649 {
2650 BufferDesc *buf_hdr = GetBufferDescriptor(buffers[i] - 1);
2651
2652 UnpinBuffer(buf_hdr);
2653 }
2654
2655 if (extend_by == 0)
2656 {
2657 if (!(flags & EB_SKIP_EXTENSION_LOCK))
2658 UnlockRelationForExtension(bmr.rel, ExclusiveLock);
2659 *extended_by = extend_by;
2660 return first_block;
2661 }
2662 }
2663
2664 /* Fail if relation is already at maximum possible length */
2665 if ((uint64) first_block + extend_by >= MaxBlockNumber)
2666 ereport(ERROR,
2667 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
2668 errmsg("cannot extend relation %s beyond %u blocks",
2669 relpath(BMR_GET_SMGR(bmr)->smgr_rlocator, fork).str,
2670 MaxBlockNumber)));
2671
2672 /*
2673 * Insert buffers into buffer table, mark as IO_IN_PROGRESS.
2674 *
2675 * This needs to happen before we extend the relation, because as soon as
2676 * we do, other backends can start to read in those pages.
2677 */
2678 for (uint32 i = 0; i < extend_by; i++)
2679 {
2680 Buffer victim_buf = buffers[i];
2681 BufferDesc *victim_buf_hdr = GetBufferDescriptor(victim_buf - 1);
2682 BufferTag tag;
2683 uint32 hash;
2684 LWLock *partition_lock;
2685 int existing_id;
2686
2687 /* in case we need to pin an existing buffer below */
2688 ResourceOwnerEnlarge(CurrentResourceOwner);
2689 ReservePrivateRefCountEntry();
2690
2691 InitBufferTag(&tag, &BMR_GET_SMGR(bmr)->smgr_rlocator.locator, fork,
2692 first_block + i);
2693 hash = BufTableHashCode(&tag);
2694 partition_lock = BufMappingPartitionLock(hash);
2695
2696 LWLockAcquire(partition_lock, LW_EXCLUSIVE);
2697
2698 existing_id = BufTableInsert(&tag, hash, victim_buf_hdr->buf_id);
2699
2700 /*
2701 * We get here only in the corner case where we are trying to extend
2702 * the relation but we found a pre-existing buffer. This can happen
2703 * because a prior attempt at extending the relation failed, and
2704 * because mdread doesn't complain about reads beyond EOF (when
2705 * zero_damaged_pages is ON) and so a previous attempt to read a block
2706 * beyond EOF could have left a "valid" zero-filled buffer.
2707 *
2708 * This has also been observed when the relation was overwritten by an
2709 * external process. Since the legitimate cases should always have
2710 * left a zero-filled buffer, complain if not PageIsNew.
2711 */
2712 if (existing_id >= 0)
2713 {
2714 BufferDesc *existing_hdr = GetBufferDescriptor(existing_id);
2715 Block buf_block;
2716 bool valid;
2717
2718 /*
2719 * Pin the existing buffer before releasing the partition lock,
2720 * preventing it from being evicted.
2721 */
2722 valid = PinBuffer(existing_hdr, strategy, false);
2723
2724 LWLockRelease(partition_lock);
2725 UnpinBuffer(victim_buf_hdr);
2726
2727 buffers[i] = BufferDescriptorGetBuffer(existing_hdr);
2728 buf_block = BufHdrGetBlock(existing_hdr);
2729
2730 if (valid && !PageIsNew((Page) buf_block))
2731 ereport(ERROR,
2732 (errmsg("unexpected data beyond EOF in block %u of relation \"%s\"",
2733 existing_hdr->tag.blockNum,
2734 relpath(BMR_GET_SMGR(bmr)->smgr_rlocator, fork).str)));
2735
2736 /*
2737 * We *must* do smgr[zero]extend before succeeding, else the page
2738 * will not be reserved by the kernel, and the next P_NEW call
2739 * will decide to return the same page. Clear the BM_VALID bit,
2740 * do StartBufferIO() and proceed.
2741 *
2742 * Loop to handle the very small possibility that someone re-sets
2743 * BM_VALID between our clearing it and StartBufferIO inspecting
2744 * it.
2745 */
2746 do
2747 {
2748 pg_atomic_fetch_and_u32(&existing_hdr->state, ~BM_VALID);
2749 } while (!StartBufferIO(existing_hdr, true, false));
2750 }
2751 else
2752 {
2753 uint32 buf_state;
2754 uint32 set_bits = 0;
2755
2756 buf_state = LockBufHdr(victim_buf_hdr);
2757
2758 /* some sanity checks while we hold the buffer header lock */
2759 Assert(!(buf_state & (BM_VALID | BM_TAG_VALID | BM_DIRTY | BM_JUST_DIRTIED)));
2760 Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 1);
2761
2762 victim_buf_hdr->tag = tag;
2763
2764 set_bits |= BM_TAG_VALID | BUF_USAGECOUNT_ONE;
2765 if (bmr.relpersistence == RELPERSISTENCE_PERMANENT || fork == INIT_FORKNUM)
2766 set_bits |= BM_PERMANENT;
2767
2768 UnlockBufHdrExt(victim_buf_hdr, buf_state,
2769 set_bits, 0,
2770 0);
2771
2772 LWLockRelease(partition_lock);
2773
2774 /* XXX: could combine the locked operations in it with the above */
2775 StartBufferIO(victim_buf_hdr, true, false);
2776 }
2777 }
2778
2779 io_start = pgstat_prepare_io_time(track_io_timing);
2780
2781 /*
2782 * Note: if smgrzeroextend fails, we will end up with buffers that are
2783 * allocated but not marked BM_VALID. The next relation extension will
2784 * still select the same block number (because the relation didn't get any
2785 * longer on disk) and so future attempts to extend the relation will find
2786 * the same buffers (if they have not been recycled) but come right back
2787 * here to try smgrzeroextend again.
2788 *
2789 * We don't need to set checksum for all-zero pages.
2790 */
2791 smgrzeroextend(BMR_GET_SMGR(bmr), fork, first_block, extend_by, false);
2792
2793 /*
2794 * Release the file-extension lock; it's now OK for someone else to extend
2795 * the relation some more.
2796 *
2797 * We remove IO_IN_PROGRESS after this, as waking up waiting backends can
2798 * take noticeable time.
2799 */
2800 if (!(flags & EB_SKIP_EXTENSION_LOCK))
2801 UnlockRelationForExtension(bmr.rel, ExclusiveLock);
2802
2803 pgstat_count_io_op_time(IOOBJECT_RELATION, io_context, IOOP_EXTEND,
2804 io_start, 1, extend_by * BLCKSZ);
2805
2806 /* Set BM_VALID, terminate IO, and wake up any waiters */
2807 for (uint32 i = 0; i < extend_by; i++)
2808 {
2809 Buffer buf = buffers[i];
2810 BufferDesc *buf_hdr = GetBufferDescriptor(buf - 1);
2811 bool lock = false;
2812
2813 if (flags & EB_LOCK_FIRST && i == 0)
2814 lock = true;
2815 else if (flags & EB_LOCK_TARGET)
2816 {
2817 Assert(extend_upto != InvalidBlockNumber);
2818 if (first_block + i + 1 == extend_upto)
2819 lock = true;
2820 }
2821
2822 if (lock)
2823 LockBuffer(BufferDescriptorGetBuffer(buf_hdr), BUFFER_LOCK_EXCLUSIVE);
2824
2825 TerminateBufferIO(buf_hdr, false, BM_VALID, true, false);
2826 }
2827
2828 pgBufferUsage.shared_blks_written += extend_by;
2829
2830 *extended_by = extend_by;
2831
2832 return first_block;
2833}
static uint32 pg_atomic_fetch_and_u32(volatile pg_atomic_uint32 *ptr, uint32 and_)
Definition: atomics.h:394
#define MaxBlockNumber
Definition: block.h:35
#define BM_JUST_DIRTIED
Definition: buf_internals.h:74
static Buffer BufferDescriptorGetBuffer(const BufferDesc *bdesc)
#define BufHdrGetBlock(bufHdr)
Definition: bufmgr.c:72
void LimitAdditionalPins(uint32 *additional_pins)
Definition: bufmgr.c:2513
bool StartBufferIO(BufferDesc *buf, bool forInput, bool nowait)
Definition: bufmgr.c:6046
void * Block
Definition: bufmgr.h:26
@ EB_LOCK_TARGET
Definition: bufmgr.h:93
@ EB_CLEAR_SIZE_CACHE
Definition: bufmgr.h:90
@ EB_SKIP_EXTENSION_LOCK
Definition: bufmgr.h:75
@ EB_LOCK_FIRST
Definition: bufmgr.h:87
static bool PageIsNew(const PageData *page)
Definition: bufpage.h:233
#define MemSet(start, val, len)
Definition: c.h:1022
const char * str
void LockRelationForExtension(Relation relation, LOCKMODE lockmode)
Definition: lmgr.c:424
void UnlockRelationForExtension(Relation relation, LOCKMODE lockmode)
Definition: lmgr.c:474
#define ExclusiveLock
Definition: lockdefs.h:42
@ IOOP_EXTEND
Definition: pgstat.h:314
#define relpath(rlocator, forknum)
Definition: relpath.h:150
BlockNumber smgrnblocks(SMgrRelation reln, ForkNumber forknum)
Definition: smgr.c:819
void smgrzeroextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, int nblocks, bool skipFsync)
Definition: smgr.c:649
int64 shared_blks_written
Definition: instrument.h:29

References Assert(), buftag::blockNum, BM_DIRTY, BM_JUST_DIRTIED, BM_PERMANENT, BM_TAG_VALID, BM_VALID, BMR_GET_SMGR, buf, BufferDesc::buf_id, BUF_STATE_GET_REFCOUNT, BUF_USAGECOUNT_ONE, BUFFER_LOCK_EXCLUSIVE, BufferDescriptorGetBuffer(), BufHdrGetBlock, BufMappingPartitionLock(), BufTableHashCode(), BufTableInsert(), CurrentResourceOwner, EB_CLEAR_SIZE_CACHE, EB_LOCK_FIRST, EB_LOCK_TARGET, EB_SKIP_EXTENSION_LOCK, ereport, errcode(), errmsg(), ERROR, ExclusiveLock, GetBufferDescriptor(), GetVictimBuffer(), hash(), i, INIT_FORKNUM, InitBufferTag(), InvalidBlockNumber, IOContextForStrategy(), IOOBJECT_RELATION, IOOP_EXTEND, LimitAdditionalPins(), LockBuffer(), LockBufHdr(), LockRelationForExtension(), LW_EXCLUSIVE, LWLockAcquire(), LWLockRelease(), MaxBlockNumber, MemSet, PageIsNew(), pg_atomic_fetch_and_u32(), pgBufferUsage, pgstat_count_io_op_time(), pgstat_prepare_io_time(), PinBuffer(), BufferManagerRelation::rel, relpath, BufferManagerRelation::relpersistence, ReservePrivateRefCountEntry(), ResourceOwnerEnlarge(), BufferUsage::shared_blks_written, smgrnblocks(), smgrzeroextend(), StartBufferIO(), BufferDesc::state, str, BufferDesc::tag, TerminateBufferIO(), track_io_timing, UnlockBufHdrExt(), UnlockRelationForExtension(), and UnpinBuffer().

Referenced by ExtendBufferedRelCommon().

◆ ExtendBufferedRelTo()

Buffer ExtendBufferedRelTo ( BufferManagerRelation  bmr,
ForkNumber  fork,
BufferAccessStrategy  strategy,
uint32  flags,
BlockNumber  extend_to,
ReadBufferMode  mode 
)

Definition at line 906 of file bufmgr.c.

912{
913 BlockNumber current_size;
914 uint32 extended_by = 0;
915 Buffer buffer = InvalidBuffer;
916 Buffer buffers[64];
917
918 Assert((bmr.rel != NULL) != (bmr.smgr != NULL));
919 Assert(bmr.smgr == NULL || bmr.relpersistence != '\0');
920 Assert(extend_to != InvalidBlockNumber && extend_to > 0);
921
922 if (bmr.relpersistence == '\0')
923 bmr.relpersistence = bmr.rel->rd_rel->relpersistence;
924
925 /*
926 * If desired, create the file if it doesn't exist. If
927 * smgr_cached_nblocks[fork] is positive then it must exist, no need for
928 * an smgrexists call.
929 */
930 if ((flags & EB_CREATE_FORK_IF_NEEDED) &&
931 (BMR_GET_SMGR(bmr)->smgr_cached_nblocks[fork] == 0 ||
932 BMR_GET_SMGR(bmr)->smgr_cached_nblocks[fork] == InvalidBlockNumber) &&
933 !smgrexists(BMR_GET_SMGR(bmr), fork))
934 {
935 LockRelationForExtension(bmr.rel, ExclusiveLock);
936
937 /* recheck, fork might have been created concurrently */
938 if (!smgrexists(BMR_GET_SMGR(bmr), fork))
939 smgrcreate(BMR_GET_SMGR(bmr), fork, flags & EB_PERFORMING_RECOVERY);
940
941 UnlockRelationForExtension(bmr.rel, ExclusiveLock);
942 }
943
944 /*
945 * If requested, invalidate size cache, so that smgrnblocks asks the
946 * kernel.
947 */
948 if (flags & EB_CLEAR_SIZE_CACHE)
949 BMR_GET_SMGR(bmr)->smgr_cached_nblocks[fork] = InvalidBlockNumber;
950
951 /*
952 * Estimate how many pages we'll need to extend by. This avoids acquiring
953 * unnecessarily many victim buffers.
954 */
955 current_size = smgrnblocks(BMR_GET_SMGR(bmr), fork);
956
957 /*
958 * Since no-one else can be looking at the page contents yet, there is no
959 * difference between an exclusive lock and a cleanup-strength lock. Note
960 * that we pass the original mode to ReadBuffer_common() below, when
961 * falling back to reading the buffer due to a concurrent relation extension.
962 */
963 if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK)
964 flags |= EB_LOCK_TARGET;
965
966 while (current_size < extend_to)
967 {
968 uint32 num_pages = lengthof(buffers);
969 BlockNumber first_block;
970
971 if ((uint64) current_size + num_pages > extend_to)
972 num_pages = extend_to - current_size;
973
974 first_block = ExtendBufferedRelCommon(bmr, fork, strategy, flags,
975 num_pages, extend_to,
976 buffers, &extended_by);
977
978 current_size = first_block + extended_by;
979 Assert(num_pages != 0 || current_size >= extend_to);
980
981 for (uint32 i = 0; i < extended_by; i++)
982 {
983 if (first_block + i != extend_to - 1)
984 ReleaseBuffer(buffers[i]);
985 else
986 buffer = buffers[i];
987 }
988 }
989
990 /*
991 * It's possible that another backend concurrently extended the relation.
992 * In that case read the buffer.
993 *
994 * XXX: Should we control this via a flag?
995 */
996 if (buffer == InvalidBuffer)
997 {
998 Assert(extended_by == 0);
999 buffer = ReadBuffer_common(bmr.rel, BMR_GET_SMGR(bmr), bmr.relpersistence,
1000 fork, extend_to - 1, mode, strategy);
1001 }
1002
1003 return buffer;
1004}
static Buffer ReadBuffer_common(Relation rel, SMgrRelation smgr, char smgr_persistence, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy)
Definition: bufmgr.c:1174
void ReleaseBuffer(Buffer buffer)
Definition: bufmgr.c:5366
@ EB_PERFORMING_RECOVERY
Definition: bufmgr.h:78
@ EB_CREATE_FORK_IF_NEEDED
Definition: bufmgr.h:84
@ RBM_ZERO_AND_CLEANUP_LOCK
Definition: bufmgr.h:49
@ RBM_ZERO_AND_LOCK
Definition: bufmgr.h:47
#define lengthof(array)
Definition: c.h:790

References Assert(), BMR_GET_SMGR, PrivateRefCountEntry::buffer, current_size, EB_CLEAR_SIZE_CACHE, EB_CREATE_FORK_IF_NEEDED, EB_LOCK_TARGET, EB_PERFORMING_RECOVERY, ExclusiveLock, ExtendBufferedRelCommon(), i, InvalidBlockNumber, InvalidBuffer, lengthof, LockRelationForExtension(), mode, RBM_ZERO_AND_CLEANUP_LOCK, RBM_ZERO_AND_LOCK, RelationData::rd_rel, ReadBuffer_common(), BufferManagerRelation::rel, ReleaseBuffer(), BufferManagerRelation::relpersistence, BufferManagerRelation::smgr, smgrcreate(), smgrexists(), smgrnblocks(), and UnlockRelationForExtension().

Referenced by fsm_extend(), vm_extend(), and XLogReadBufferExtended().
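
A hedged usage sketch in the spirit of vm_extend()/fsm_extend(): grow a fork to at least `nblocks` blocks (an illustrative variable), creating the fork first if it does not exist, and get the last target block back zeroed and exclusively locked:

    Buffer  buf;

    buf = ExtendBufferedRelTo(BMR_REL(rel), VISIBILITYMAP_FORKNUM, NULL,
                              EB_CREATE_FORK_IF_NEEDED | EB_CLEAR_SIZE_CACHE,
                              nblocks, RBM_ZERO_AND_LOCK);

    /* block nblocks - 1 is now pinned and locked, even if another
     * backend extended the fork concurrently */
    UnlockReleaseBuffer(buf);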

◆ FindAndDropRelationBuffers()

static void FindAndDropRelationBuffers ( RelFileLocator  rlocator,
ForkNumber  forkNum,
BlockNumber  nForkBlock,
BlockNumber  firstDelBlock 
)
static

Definition at line 4835 of file bufmgr.c.

4838{
4839 BlockNumber curBlock;
4840
4841 for (curBlock = firstDelBlock; curBlock < nForkBlock; curBlock++)
4842 {
4843 uint32 bufHash; /* hash value for tag */
4844 BufferTag bufTag; /* identity of requested block */
4845 LWLock *bufPartitionLock; /* buffer partition lock for it */
4846 int buf_id;
4847 BufferDesc *bufHdr;
4848
4849 /* create a tag so we can lookup the buffer */
4850 InitBufferTag(&bufTag, &rlocator, forkNum, curBlock);
4851
4852 /* determine its hash code and partition lock ID */
4853 bufHash = BufTableHashCode(&bufTag);
4854 bufPartitionLock = BufMappingPartitionLock(bufHash);
4855
4856 /* Check that it is in the buffer pool. If not, do nothing. */
4857 LWLockAcquire(bufPartitionLock, LW_SHARED);
4858 buf_id = BufTableLookup(&bufTag, bufHash);
4859 LWLockRelease(bufPartitionLock);
4860
4861 if (buf_id < 0)
4862 continue;
4863
4864 bufHdr = GetBufferDescriptor(buf_id);
4865
4866 /*
4867 * We need to lock the buffer header and recheck if the buffer is
4868 * still associated with the same block because the buffer could be
4869 * evicted by some other backend loading blocks for a different
4870 * relation after we release lock on the BufMapping table.
4871 */
4872 LockBufHdr(bufHdr);
4873
4874 if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator) &&
4875 BufTagGetForkNum(&bufHdr->tag) == forkNum &&
4876 bufHdr->tag.blockNum >= firstDelBlock)
4877 InvalidateBuffer(bufHdr); /* releases spinlock */
4878 else
4879 UnlockBufHdr(bufHdr);
4880 }
4881}

References buftag::blockNum, BufMappingPartitionLock(), BufTableHashCode(), BufTableLookup(), BufTagGetForkNum(), BufTagMatchesRelFileLocator(), GetBufferDescriptor(), InitBufferTag(), InvalidateBuffer(), LockBufHdr(), LW_SHARED, LWLockAcquire(), LWLockRelease(), BufferDesc::tag, and UnlockBufHdr().

Referenced by DropRelationBuffers(), and DropRelationsAllBuffers().

◆ FlushBuffer()

static void FlushBuffer ( BufferDesc buf,
SMgrRelation  reln,
IOObject  io_object,
IOContext  io_context 
)
static

Definition at line 4283 of file bufmgr.c.

4285{
4286 XLogRecPtr recptr;
4287 ErrorContextCallback errcallback;
4288 instr_time io_start;
4289 Block bufBlock;
4290 char *bufToWrite;
4291 uint32 buf_state;
4292
4293 /*
4294 * Try to start an I/O operation. If StartBufferIO returns false, then
4295 * someone else flushed the buffer before we could, so we need not do
4296 * anything.
4297 */
4298 if (!StartBufferIO(buf, false, false))
4299 return;
4300
4301 /* Setup error traceback support for ereport() */
4302 errcallback.callback = shared_buffer_write_error_callback;
4303 errcallback.arg = buf;
4304 errcallback.previous = error_context_stack;
4305 error_context_stack = &errcallback;
4306
4307 /* Find smgr relation for buffer */
4308 if (reln == NULL)
4309 reln = smgropen(BufTagGetRelFileLocator(&buf->tag), INVALID_PROC_NUMBER);
4310
4311 TRACE_POSTGRESQL_BUFFER_FLUSH_START(BufTagGetForkNum(&buf->tag),
4312 buf->tag.blockNum,
4313 reln->smgr_rlocator.locator.spcOid,
4314 reln->smgr_rlocator.locator.dbOid,
4315 reln->smgr_rlocator.locator.relNumber);
4316
4317 buf_state = LockBufHdr(buf);
4318
4319 /*
4320 * Run PageGetLSN while holding header lock, since we don't have the
4321 * buffer locked exclusively in all cases.
4322 */
4323 recptr = BufferGetLSN(buf);
4324
4325 /* To check if block content changes while flushing. - vadim 01/17/97 */
4326 UnlockBufHdrExt(buf, buf_state,
4327 0, BM_JUST_DIRTIED,
4328 0);
4329
4330 /*
4331 * Force XLOG flush up to buffer's LSN. This implements the basic WAL
4332 * rule that log updates must hit disk before any of the data-file changes
4333 * they describe do.
4334 *
4335 * However, this rule does not apply to unlogged relations, which will be
4336 * lost after a crash anyway. Most unlogged relation pages do not bear
4337 * LSNs since we never emit WAL records for them, and therefore flushing
4338 * up through the buffer LSN would be useless, but harmless. However,
4339 * GiST indexes use LSNs internally to track page-splits, and therefore
4340 * unlogged GiST pages bear "fake" LSNs generated by
4341 * GetFakeLSNForUnloggedRel. It is unlikely but possible that the fake
4342 * LSN counter could advance past the WAL insertion point; and if it did
4343 * happen, attempting to flush WAL through that location would fail, with
4344 * disastrous system-wide consequences. To make sure that can't happen,
4345 * skip the flush if the buffer isn't permanent.
4346 */
4347 if (buf_state & BM_PERMANENT)
4348 XLogFlush(recptr);
4349
4350 /*
4351 * Now it's safe to write the buffer to disk. Note that no one else should
4352 * have been able to write it, while we were busy with log flushing,
4353 * because we got the exclusive right to perform I/O by setting the
4354 * BM_IO_IN_PROGRESS bit.
4355 */
4356 bufBlock = BufHdrGetBlock(buf);
4357
4358 /*
4359 * Update page checksum if desired. Since we have only shared lock on the
4360 * buffer, other processes might be updating hint bits in it, so we must
4361 * copy the page to private storage if we do checksumming.
4362 */
4363 bufToWrite = PageSetChecksumCopy((Page) bufBlock, buf->tag.blockNum);
4364
4365 io_start = pgstat_prepare_io_time(track_io_timing);
4366
4367 /*
4368 * bufToWrite is either the shared buffer or a copy, as appropriate.
4369 */
4370 smgrwrite(reln,
4371 BufTagGetForkNum(&buf->tag),
4372 buf->tag.blockNum,
4373 bufToWrite,
4374 false);
4375
4376 /*
4377 * When a strategy is in use, only flushes of dirty buffers already in the
4378 * strategy ring are counted as strategy writes (IOCONTEXT
4379 * [BULKREAD|BULKWRITE|VACUUM] IOOP_WRITE) for the purpose of IO
4380 * statistics tracking.
4381 *
4382 * If a shared buffer initially added to the ring must be flushed before
4383 * being used, this is counted as an IOCONTEXT_NORMAL IOOP_WRITE.
4384 *
4385 * If a shared buffer which was added to the ring later because the
4386 * current strategy buffer is pinned or in use or because all strategy
4387 * buffers were dirty and rejected (for BAS_BULKREAD operations only)
4388 * requires flushing, this is counted as an IOCONTEXT_NORMAL IOOP_WRITE
4389 * (from_ring will be false).
4390 *
4391 * When a strategy is not in use, the write can only be a "regular" write
4392 * of a dirty shared buffer (IOCONTEXT_NORMAL IOOP_WRITE).
4393 */
4394 pgstat_count_io_op_time(IOOBJECT_RELATION, io_object, io_context,
4395 IOOP_WRITE, io_start, 1, BLCKSZ);
4396
4397 pgBufferUsage.shared_blks_written++;
4398
4399 /*
4400 * Mark the buffer as clean (unless BM_JUST_DIRTIED has become set) and
4401 * end the BM_IO_IN_PROGRESS state.
4402 */
4403 TerminateBufferIO(buf, true, 0, true, false);
4404
4405 TRACE_POSTGRESQL_BUFFER_FLUSH_DONE(BufTagGetForkNum(&buf->tag),
4406 buf->tag.blockNum,
4407 reln->smgr_rlocator.locator.spcOid,
4408 reln->smgr_rlocator.locator.dbOid,
4409 reln->smgr_rlocator.locator.relNumber);
4410
4411 /* Pop the error context stack */
4412 error_context_stack = errcallback.previous;
4413}
#define BufferGetLSN(bufHdr)
Definition: bufmgr.c:73
static void shared_buffer_write_error_callback(void *arg)
Definition: bufmgr.c:6205
char * PageSetChecksumCopy(Page page, BlockNumber blkno)
Definition: bufpage.c:1509
ErrorContextCallback * error_context_stack
Definition: elog.c:95
@ IOOP_WRITE
Definition: pgstat.h:316
static void smgrwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const void *buffer, bool skipFsync)
Definition: smgr.h:131
struct ErrorContextCallback * previous
Definition: elog.h:297
void(* callback)(void *arg)
Definition: elog.h:298
void XLogFlush(XLogRecPtr record)
Definition: xlog.c:2783

References ErrorContextCallback::arg, BM_JUST_DIRTIED, BM_PERMANENT, buf, BufferGetLSN, BufHdrGetBlock, BufTagGetForkNum(), BufTagGetRelFileLocator(), ErrorContextCallback::callback, RelFileLocator::dbOid, error_context_stack, INVALID_PROC_NUMBER, IOOBJECT_RELATION, IOOP_WRITE, RelFileLocatorBackend::locator, LockBufHdr(), PageSetChecksumCopy(), pgBufferUsage, pgstat_count_io_op_time(), pgstat_prepare_io_time(), ErrorContextCallback::previous, RelFileLocator::relNumber, BufferUsage::shared_blks_written, shared_buffer_write_error_callback(), SMgrRelationData::smgr_rlocator, smgropen(), smgrwrite(), RelFileLocator::spcOid, StartBufferIO(), TerminateBufferIO(), track_io_timing, UnlockBufHdrExt(), and XLogFlush().

Referenced by FlushOneBuffer(), FlushUnlockedBuffer(), and GetVictimBuffer().
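
The ordering enforced around XLogFlush() is the heart of this function; a hedged distillation:

    /*
     * WAL-before-data, as FlushBuffer implements it:
     *
     *   recptr = BufferGetLSN(buf);     read under the buffer header lock
     *   if (buf_state & BM_PERMANENT)
     *       XLogFlush(recptr);          log reaches disk first ...
     *   smgrwrite(...);                 ... only then the data page
     *
     * Writing the page before flushing WAL up to its LSN could persist
     * data whose describing WAL records were never flushed, which would
     * break crash recovery.
     */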

◆ FlushDatabaseBuffers()

void FlushDatabaseBuffers ( Oid  dbid)

Definition at line 5306 of file bufmgr.c.

5307{
5308 int i;
5309 BufferDesc *bufHdr;
5310
5311 for (i = 0; i < NBuffers; i++)
5312 {
5313 uint32 buf_state;
5314
5315 bufHdr = GetBufferDescriptor(i);
5316
5317 /*
5318 * As in DropRelationBuffers, an unlocked precheck should be safe and
5319 * saves some cycles.
5320 */
5321 if (bufHdr->tag.dbOid != dbid)
5322 continue;
5323
5324 /* Make sure we can handle the pin */
5325 ReservePrivateRefCountEntry();
5326 ResourceOwnerEnlarge(CurrentResourceOwner);
5327
5328 buf_state = LockBufHdr(bufHdr);
5329 if (bufHdr->tag.dbOid == dbid &&
5330 (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
5331 {
5332 PinBuffer_Locked(bufHdr);
5333 FlushUnlockedBuffer(bufHdr, NULL, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
5334 UnpinBuffer(bufHdr);
5335 }
5336 else
5337 UnlockBufHdr(bufHdr);
5338 }
5339}

References BM_DIRTY, BM_VALID, CurrentResourceOwner, buftag::dbOid, FlushUnlockedBuffer(), GetBufferDescriptor(), i, IOCONTEXT_NORMAL, IOOBJECT_RELATION, LockBufHdr(), NBuffers, PinBuffer_Locked(), ReservePrivateRefCountEntry(), ResourceOwnerEnlarge(), BufferDesc::tag, UnlockBufHdr(), and UnpinBuffer().

Referenced by dbase_redo().

◆ FlushOneBuffer()

void FlushOneBuffer ( Buffer  buffer)

Definition at line 5346 of file bufmgr.c.

5347{
5348 BufferDesc *bufHdr;
5349
5350 /* currently not needed, but no fundamental reason not to support */
5351 Assert(!BufferIsLocal(buffer));
5352
5353 Assert(BufferIsPinned(buffer));
5354
5355 bufHdr = GetBufferDescriptor(buffer - 1);
5356
5357 Assert(BufferIsLockedByMe(buffer));
5358
5359 FlushBuffer(bufHdr, NULL, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
5360}
static void FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object, IOContext io_context)
Definition: bufmgr.c:4283
bool BufferIsLockedByMe(Buffer buffer)
Definition: bufmgr.c:2843

References Assert(), PrivateRefCountEntry::buffer, BufferIsLocal, BufferIsLockedByMe(), BufferIsPinned, FlushBuffer(), GetBufferDescriptor(), IOCONTEXT_NORMAL, and IOOBJECT_RELATION.

Referenced by hash_xlog_init_bitmap_page(), hash_xlog_init_meta_page(), invalidate_rel_block(), and XLogReadBufferForRedoExtended().

◆ FlushRelationBuffers()

void FlushRelationBuffers ( Relation  rel)

Definition at line 4942 of file bufmgr.c.

4943{
4944 int i;
4945 BufferDesc *bufHdr;
4946 SMgrRelation srel = RelationGetSmgr(rel);
4947
4948 if (RelationUsesLocalBuffers(rel))
4949 {
4950 for (i = 0; i < NLocBuffer; i++)
4951 {
4952 uint32 buf_state;
4953
4954 bufHdr = GetLocalBufferDescriptor(i);
4955 if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rel->rd_locator) &&
4956 ((buf_state = pg_atomic_read_u32(&bufHdr->state)) &
4957 (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
4958 {
4959 ErrorContextCallback errcallback;
4960
4961 /* Setup error traceback support for ereport() */
4962 errcallback.callback = local_buffer_write_error_callback;
4963 errcallback.arg = bufHdr;
4964 errcallback.previous = error_context_stack;
4965 error_context_stack = &errcallback;
4966
4967 /* Make sure we can handle the pin */
4968 ReservePrivateRefCountEntry();
4969 ResourceOwnerEnlarge(CurrentResourceOwner);
4970
4971 /*
4972 * Pin/unpin mostly to make valgrind work, but it also seems
4973 * like the right thing to do.
4974 */
4975 PinLocalBuffer(bufHdr, false);
4976
4977
4978 FlushLocalBuffer(bufHdr, srel);
4979
4980 UnpinLocalBuffer(BufferDescriptorGetBuffer(bufHdr));
4981
4982 /* Pop the error context stack */
4983 error_context_stack = errcallback.previous;
4984 }
4985 }
4986
4987 return;
4988 }
4989
4990 for (i = 0; i < NBuffers; i++)
4991 {
4992 uint32 buf_state;
4993
4994 bufHdr = GetBufferDescriptor(i);
4995
4996 /*
4997 * As in DropRelationBuffers, an unlocked precheck should be safe and
4998 * saves some cycles.
4999 */
5000 if (!BufTagMatchesRelFileLocator(&bufHdr->tag, &rel->rd_locator))
5001 continue;
5002
5003 /* Make sure we can handle the pin */
5004 ReservePrivateRefCountEntry();
5005 ResourceOwnerEnlarge(CurrentResourceOwner);
5006
5007 buf_state = LockBufHdr(bufHdr);
5008 if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rel->rd_locator) &&
5009 (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
5010 {
5011 PinBuffer_Locked(bufHdr);
5012 FlushUnlockedBuffer(bufHdr, srel, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
5013 UnpinBuffer(bufHdr);
5014 }
5015 else
5016 UnlockBufHdr(bufHdr);
5017 }
5018}
static void local_buffer_write_error_callback(void *arg)
Definition: bufmgr.c:6221
void FlushLocalBuffer(BufferDesc *bufHdr, SMgrRelation reln)
Definition: localbuf.c:183
void UnpinLocalBuffer(Buffer buffer)
Definition: localbuf.c:841
bool PinLocalBuffer(BufferDesc *buf_hdr, bool adjust_usagecount)
Definition: localbuf.c:805
int NLocBuffer
Definition: localbuf.c:45
static SMgrRelation RelationGetSmgr(Relation rel)
Definition: rel.h:577

References ErrorContextCallback::arg, BM_DIRTY, BM_VALID, BufferDescriptorGetBuffer(), BufTagMatchesRelFileLocator(), ErrorContextCallback::callback, CurrentResourceOwner, error_context_stack, FlushLocalBuffer(), FlushUnlockedBuffer(), GetBufferDescriptor(), GetLocalBufferDescriptor(), i, IOCONTEXT_NORMAL, IOOBJECT_RELATION, local_buffer_write_error_callback(), LockBufHdr(), NBuffers, NLocBuffer, pg_atomic_read_u32(), PinBuffer_Locked(), PinLocalBuffer(), ErrorContextCallback::previous, RelationData::rd_locator, RelationGetSmgr(), RelationUsesLocalBuffers, ReservePrivateRefCountEntry(), ResourceOwnerEnlarge(), BufferDesc::state, BufferDesc::tag, UnlockBufHdr(), UnpinBuffer(), and UnpinLocalBuffer().

Referenced by fill_seq_with_data(), heapam_relation_copy_data(), and index_copy_data().
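
A hedged sketch of the pattern its callers follow (compare index_copy_data()): flush the relation's dirty buffers before doing a block-level copy through smgr, so the copy reads current data. `dst_smgr` is an assumed destination SMgrRelation, not something bufmgr.c provides.

    /* push all dirty pages of the source relation down to storage */
    FlushRelationBuffers(src_rel);

    /* the smgr-level copy now sees up-to-date blocks */
    RelationCopyStorage(RelationGetSmgr(src_rel), dst_smgr,
                        MAIN_FORKNUM, src_rel->rd_rel->relpersistence);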

◆ FlushRelationsAllBuffers()

void FlushRelationsAllBuffers ( SMgrRelation smgrs,
int  nrels 
)

Definition at line 5030 of file bufmgr.c.

5031{
5032 int i;
5033 SMgrSortArray *srels;
5034 bool use_bsearch;
5035
5036 if (nrels == 0)
5037 return;
5038
5039 /* fill-in array for qsort */
5040 srels = palloc(sizeof(SMgrSortArray) * nrels);
5041
5042 for (i = 0; i < nrels; i++)
5043 {
5044 Assert(!RelFileLocatorBackendIsTemp(smgrs[i]->smgr_rlocator));
5045
5046 srels[i].rlocator = smgrs[i]->smgr_rlocator.locator;
5047 srels[i].srel = smgrs[i];
5048 }
5049
5050 /*
5051 * Save the bsearch overhead for low number of relations to sync. See
5052 * DropRelationsAllBuffers for details.
5053 */
5054 use_bsearch = nrels > RELS_BSEARCH_THRESHOLD;
5055
5056 /* sort the list of SMgrRelations if necessary */
5057 if (use_bsearch)
5058 qsort(srels, nrels, sizeof(SMgrSortArray), rlocator_comparator);
5059
5060 for (i = 0; i < NBuffers; i++)
5061 {
5062 SMgrSortArray *srelent = NULL;
5063 BufferDesc *bufHdr = GetBufferDescriptor(i);
5064 uint32 buf_state;
5065
5066 /*
5067 * As in DropRelationBuffers, an unlocked precheck should be safe and
5068 * saves some cycles.
5069 */
5070
5071 if (!use_bsearch)
5072 {
5073 int j;
5074
5075 for (j = 0; j < nrels; j++)
5076 {
5077 if (BufTagMatchesRelFileLocator(&bufHdr->tag, &srels[j].rlocator))
5078 {
5079 srelent = &srels[j];
5080 break;
5081 }
5082 }
5083 }
5084 else
5085 {
5086 RelFileLocator rlocator;
5087
5088 rlocator = BufTagGetRelFileLocator(&bufHdr->tag);
5089 srelent = bsearch(&rlocator,
5090 srels, nrels, sizeof(SMgrSortArray),
5091 rlocator_comparator);
5092 }
5093
5094 /* buffer doesn't belong to any of the given relfilelocators; skip it */
5095 if (srelent == NULL)
5096 continue;
5097
5098 /* Make sure we can handle the pin */
5099 ReservePrivateRefCountEntry();
5100 ResourceOwnerEnlarge(CurrentResourceOwner);
5101
5102 buf_state = LockBufHdr(bufHdr);
5103 if (BufTagMatchesRelFileLocator(&bufHdr->tag, &srelent->rlocator) &&
5104 (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
5105 {
5106 PinBuffer_Locked(bufHdr);
5107 FlushUnlockedBuffer(bufHdr, srelent->srel, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
5108 UnpinBuffer(bufHdr);
5109 }
5110 else
5111 UnlockBufHdr(bufHdr);
5112 }
5113
5114 pfree(srels);
5115}
SMgrRelation srel
Definition: bufmgr.c:140
RelFileLocator rlocator
Definition: bufmgr.c:139

References Assert(), BM_DIRTY, BM_VALID, BufTagGetRelFileLocator(), BufTagMatchesRelFileLocator(), CurrentResourceOwner, FlushUnlockedBuffer(), GetBufferDescriptor(), i, IOCONTEXT_NORMAL, IOOBJECT_RELATION, j, RelFileLocatorBackend::locator, LockBufHdr(), NBuffers, palloc(), pfree(), PinBuffer_Locked(), qsort, RelFileLocatorBackendIsTemp, RELS_BSEARCH_THRESHOLD, ReservePrivateRefCountEntry(), ResourceOwnerEnlarge(), SMgrSortArray::rlocator, rlocator_comparator(), SMgrRelationData::smgr_rlocator, SMgrSortArray::srel, BufferDesc::tag, UnlockBufHdr(), and UnpinBuffer().

Referenced by smgrdosyncall().

◆ FlushUnlockedBuffer()

◆ ForgetPrivateRefCountEntry()

static void ForgetPrivateRefCountEntry ( PrivateRefCountEntry ref)
static

Definition at line 448 of file bufmgr.c.

449{
450 Assert(ref->refcount == 0);
451
452 if (ref >= &PrivateRefCountArray[0] &&
453 ref < &PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES])
454 {
455 ref->buffer = InvalidBuffer;
456
457 /*
458 * Mark the just used entry as reserved - in many scenarios that
459 * allows us to avoid ever having to search the array/hash for free
460 * entries.
461 */
462 ReservedRefCountEntry = ref;
463 }
464 else
465 {
466 bool found;
467 Buffer buffer = ref->buffer;
468
469 hash_search(PrivateRefCountHash, &buffer, HASH_REMOVE, &found);
470 Assert(found);
471 Assert(PrivateRefCountOverflowed > 0);
472 PrivateRefCountOverflowed--;
473 }
474}
static PrivateRefCountEntry * ReservedRefCountEntry
Definition: bufmgr.c:219
void * hash_search(HTAB *hashp, const void *keyPtr, HASHACTION action, bool *foundPtr)
Definition: dynahash.c:952
@ HASH_REMOVE
Definition: hsearch.h:115

References Assert(), PrivateRefCountEntry::buffer, HASH_REMOVE, hash_search(), InvalidBuffer, PrivateRefCountArray, PrivateRefCountHash, PrivateRefCountOverflowed, PrivateRefCountEntry::refcount, REFCOUNT_ARRAY_ENTRIES, and ReservedRefCountEntry.

Referenced by UnpinBufferNoOwner().

◆ GetAdditionalPinLimit()

uint32 GetAdditionalPinLimit ( void  )

Definition at line 2487 of file bufmgr.c.

2488{
2489 uint32 estimated_pins_held;
2490
2491 /*
2492 * We get the number of "overflowed" pins for free, but don't know the
2493 * number of pins in PrivateRefCountArray. The cost of calculating that
2494 * exactly doesn't seem worth it, so just assume the max.
2495 */
2496 estimated_pins_held = PrivateRefCountOverflowed + REFCOUNT_ARRAY_ENTRIES;
2497
2498 /* Is this backend already holding more than its fair share? */
2499 if (estimated_pins_held > MaxProportionalPins)
2500 return 0;
2501
2502 return MaxProportionalPins - estimated_pins_held;
2503}
static uint32 MaxProportionalPins
Definition: bufmgr.c:221

References MaxProportionalPins, PrivateRefCountOverflowed, and REFCOUNT_ARRAY_ENTRIES.

Referenced by LimitAdditionalPins(), and read_stream_start_pending_read().
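
A hedged sketch of how a consumer such as the read-stream code can clamp its lookahead with this function (variable names illustrative):

    uint32  extra = GetAdditionalPinLimit();

    /* never plan to hold more pins than our fair share, but always
     * allow at least one so we can make progress */
    if (pins_wanted > extra)
        pins_wanted = Max(extra, 1);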

◆ GetPinLimit()

uint32 GetPinLimit ( void  )

Definition at line 2475 of file bufmgr.c.

2476{
2477 return MaxProportionalPins;
2478}

References MaxProportionalPins.

Referenced by GetAccessStrategy(), and read_stream_begin_impl().

◆ GetPrivateRefCount()

static int32 GetPrivateRefCount ( Buffer  buffer)
inlinestatic

Definition at line 425 of file bufmgr.c.

426{
427 PrivateRefCountEntry *ref;
428
429 Assert(BufferIsValid(buffer));
430 Assert(!BufferIsLocal(buffer));
431
432 /*
433 * Not moving the entry - that's ok for the current users, but we might
434 * want to change this one day.
435 */
436 ref = GetPrivateRefCountEntry(buffer, false);
437
438 if (ref == NULL)
439 return 0;
440 return ref->refcount;
441}
static PrivateRefCountEntry * GetPrivateRefCountEntry(Buffer buffer, bool do_move)
Definition: bufmgr.c:351

References Assert(), PrivateRefCountEntry::buffer, BufferIsLocal, BufferIsValid(), GetPrivateRefCountEntry(), and PrivateRefCountEntry::refcount.

Referenced by CheckBufferIsPinnedOnce(), ConditionalLockBufferForCleanup(), DebugPrintBufferRefcount(), HoldingBufferPinThatDelaysRecovery(), InvalidateBuffer(), InvalidateVictimBuffer(), IsBufferCleanupOK(), and MarkBufferDirtyHint().

◆ GetPrivateRefCountEntry()

static PrivateRefCountEntry * GetPrivateRefCountEntry ( Buffer  buffer,
bool  do_move 
)
static

Definition at line 351 of file bufmgr.c.

352{
353 PrivateRefCountEntry *res;
354 int i;
355
356 Assert(BufferIsValid(buffer));
357 Assert(!BufferIsLocal(buffer));
358
359 /*
360 * First search for references in the array; that'll be sufficient in the
361 * majority of cases.
362 */
363 for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
364 {
365 res = &PrivateRefCountArray[i];
366
367 if (res->buffer == buffer)
368 return res;
369 }
370
371 /*
372 * By here we know that the buffer, if already pinned, isn't residing in
373 * the array.
374 *
375 * Only look up the buffer in the hashtable if we've previously overflowed
376 * into it.
377 */
378 if (PrivateRefCountOverflowed == 0)
379 return NULL;
380
381 res = hash_search(PrivateRefCountHash, &buffer, HASH_FIND, NULL);
382
383 if (res == NULL)
384 return NULL;
385 else if (!do_move)
386 {
387 /* caller doesn't want us to move the hash entry into the array */
388 return res;
389 }
390 else
391 {
392 /* move buffer from hashtable into the free array slot */
393 bool found;
394 PrivateRefCountEntry *free;
395
396 /* Ensure there's a free array slot */
397 ReservePrivateRefCountEntry();
398
399 /* Use up the reserved slot */
400 Assert(ReservedRefCountEntry != NULL);
401 free = ReservedRefCountEntry;
402 ReservedRefCountEntry = NULL;
403 Assert(free->buffer == InvalidBuffer);
404
405 /* and fill it */
406 free->buffer = buffer;
407 free->refcount = res->refcount;
408
409 /* delete from hashtable */
410 hash_search(PrivateRefCountHash, &buffer, HASH_REMOVE, &found);
411 Assert(found);
412 Assert(PrivateRefCountOverflowed > 0);
413 PrivateRefCountOverflowed--;
414
415 return free;
416 }
417}
@ HASH_FIND
Definition: hsearch.h:113

References Assert(), PrivateRefCountEntry::buffer, BufferIsLocal, BufferIsValid(), free, HASH_FIND, HASH_REMOVE, hash_search(), i, InvalidBuffer, PrivateRefCountArray, PrivateRefCountHash, PrivateRefCountOverflowed, PrivateRefCountEntry::refcount, REFCOUNT_ARRAY_ENTRIES, ReservedRefCountEntry, and ReservePrivateRefCountEntry().

Referenced by GetPrivateRefCount(), IncrBufferRefCount(), PinBuffer(), PinBuffer_Locked(), and UnpinBufferNoOwner().

◆ GetVictimBuffer()

static Buffer GetVictimBuffer ( BufferAccessStrategy  strategy,
IOContext  io_context 
)
static

Definition at line 2320 of file bufmgr.c.

2321{
2322 BufferDesc *buf_hdr;
2323 Buffer buf;
2324 uint32 buf_state;
2325 bool from_ring;
2326
2327 /*
2328 * Ensure, before we pin a victim buffer, that there's a free refcount
2329 * entry and resource owner slot for the pin.
2330 */
2331 ReservePrivateRefCountEntry();
2332 ResourceOwnerEnlarge(CurrentResourceOwner);
2333
2334 /* we return here if a prospective victim buffer gets used concurrently */
2335again:
2336
2337 /*
2338 * Select a victim buffer. The buffer is returned pinned and owned by
2339 * this backend.
2340 */
2341 buf_hdr = StrategyGetBuffer(strategy, &buf_state, &from_ring);
2342 buf = BufferDescriptorGetBuffer(buf_hdr);
2343
2344 /*
2345 * We shouldn't have any other pins for this buffer.
2346 */
2347 CheckBufferIsPinnedOnce(buf);
2348
2349 /*
2350 * If the buffer was dirty, try to write it out. There is a race
2351 * condition here, in that someone might dirty it after we released the
2352 * buffer header lock above, or even while we are writing it out (since
2353 * our share-lock won't prevent hint-bit updates). We will recheck the
2354 * dirty bit after re-locking the buffer header.
2355 */
2356 if (buf_state & BM_DIRTY)
2357 {
2358 LWLock *content_lock;
2359
2360 Assert(buf_state & BM_TAG_VALID);
2361 Assert(buf_state & BM_VALID);
2362
2363 /*
2364 * We need a share-lock on the buffer contents to write it out (else
2365 * we might write invalid data, eg because someone else is compacting
2366 * the page contents while we write). We must use a conditional lock
2367 * acquisition here to avoid deadlock. Even though the buffer was not
2368 * pinned (and therefore surely not locked) when StrategyGetBuffer
2369 * returned it, someone else could have pinned and exclusive-locked it
2370 * by the time we get here. If we try to get the lock unconditionally,
2371 * we'd block waiting for them; if they later block waiting for us,
2372 * deadlock ensues. (This has been observed to happen when two
2373 * backends are both trying to split btree index pages, and the second
2374 * one just happens to be trying to split the page the first one got
2375 * from StrategyGetBuffer.)
2376 */
2377 content_lock = BufferDescriptorGetContentLock(buf_hdr);
2378 if (!LWLockConditionalAcquire(content_lock, LW_SHARED))
2379 {
2380 /*
2381 * Someone else has locked the buffer, so give it up and loop back
2382 * to get another one.
2383 */
2384 UnpinBuffer(buf_hdr);
2385 goto again;
2386 }
2387
2388 /*
2389 * If using a nondefault strategy, and writing the buffer would
2390 * require a WAL flush, let the strategy decide whether to go ahead
2391 * and write/reuse the buffer or to choose another victim. We need a
2392 * lock to inspect the page LSN, so this can't be done inside
2393 * StrategyGetBuffer.
2394 */
2395 if (strategy != NULL)
2396 {
2397 XLogRecPtr lsn;
2398
2399 /* Read the LSN while holding buffer header lock */
2400 buf_state = LockBufHdr(buf_hdr);
2401 lsn = BufferGetLSN(buf_hdr);
2402 UnlockBufHdr(buf_hdr);
2403
2404 if (XLogNeedsFlush(lsn)
2405 && StrategyRejectBuffer(strategy, buf_hdr, from_ring))
2406 {
2407 LWLockRelease(content_lock);
2408 UnpinBuffer(buf_hdr);
2409 goto again;
2410 }
2411 }
2412
2413 /* OK, do the I/O */
2414 FlushBuffer(buf_hdr, NULL, IOOBJECT_RELATION, io_context);
2415 LWLockRelease(content_lock);
2416
2417 ScheduleBufferTagForWriteback(&BackendWritebackContext, io_context,
2418 &buf_hdr->tag);
2419 }
2420
2421
2422 if (buf_state & BM_VALID)
2423 {
2424 /*
2425 * When a BufferAccessStrategy is in use, blocks evicted from shared
2426 * buffers are counted as IOOP_EVICT in the corresponding context
2427 * (e.g. IOCONTEXT_BULKWRITE). Shared buffers are evicted by a
2428 * strategy in two cases: 1) while initially claiming buffers for the
2429 * strategy ring 2) to replace an existing strategy ring buffer
2430 * because it is pinned or in use and cannot be reused.
2431 *
2432 * Blocks evicted from buffers already in the strategy ring are
2433 * counted as IOOP_REUSE in the corresponding strategy context.
2434 *
2435 * At this point, we can accurately count evictions and reuses,
2436 * because we have successfully claimed the valid buffer. Previously,
2437 * we may have been forced to release the buffer due to concurrent
2438 * pinners or erroring out.
2439 */
2440 pgstat_count_io_op(IOOBJECT_RELATION, io_context,
2441 from_ring ? IOOP_REUSE : IOOP_EVICT, 1, 0);
2442 }
2443
2444 /*
2445 * If the buffer has an entry in the buffer mapping table, delete it. This
2446 * can fail because another backend could have pinned or dirtied the
2447 * buffer.
2448 */
2449 if ((buf_state & BM_TAG_VALID) && !InvalidateVictimBuffer(buf_hdr))
2450 {
2451 UnpinBuffer(buf_hdr);
2452 goto again;
2453 }
2454
2455 /* a final set of sanity checks */
2456#ifdef USE_ASSERT_CHECKING
2457 buf_state = pg_atomic_read_u32(&buf_hdr->state);
2458
2459 Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 1);
2460 Assert(!(buf_state & (BM_TAG_VALID | BM_VALID | BM_DIRTY)));
2461
2462 CheckBufferIsPinnedOnce(buf);
2463#endif
2464
2465 return buf;
2466}
WritebackContext BackendWritebackContext
Definition: buf_init.c:24
void CheckBufferIsPinnedOnce(Buffer buffer)
Definition: bufmgr.c:5651
void ScheduleBufferTagForWriteback(WritebackContext *wb_context, IOContext io_context, BufferTag *tag)
Definition: bufmgr.c:6418
BufferDesc * StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state, bool *from_ring)
Definition: freelist.c:174
bool StrategyRejectBuffer(BufferAccessStrategy strategy, BufferDesc *buf, bool from_ring)
Definition: freelist.c:787
@ IOOP_EVICT
Definition: pgstat.h:307
@ IOOP_REUSE
Definition: pgstat.h:310
bool XLogNeedsFlush(XLogRecPtr record)
Definition: xlog.c:3127

References Assert(), BackendWritebackContext, BM_DIRTY, BM_TAG_VALID, BM_VALID, buf, BUF_STATE_GET_REFCOUNT, BufferDescriptorGetBuffer(), BufferDescriptorGetContentLock(), BufferGetLSN, CheckBufferIsPinnedOnce(), CurrentResourceOwner, FlushBuffer(), InvalidateVictimBuffer(), IOOBJECT_RELATION, IOOP_EVICT, IOOP_REUSE, LockBufHdr(), LW_SHARED, LWLockConditionalAcquire(), LWLockRelease(), pg_atomic_read_u32(), pgstat_count_io_op(), ReservePrivateRefCountEntry(), ResourceOwnerEnlarge(), ScheduleBufferTagForWriteback(), BufferDesc::state, StrategyGetBuffer(), StrategyRejectBuffer(), BufferDesc::tag, UnlockBufHdr(), UnpinBuffer(), and XLogNeedsFlush().

Referenced by BufferAlloc(), and ExtendBufferedRelShared().
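
The conditional-lock step above is the part worth imitating: blocking on a content lock while holding a victim candidate can deadlock against a backend that, in turn, waits on us. A minimal sketch of the pattern with a hypothetical helper name (UnpinBuffer() and FlushBuffer() are bufmgr.c internals, so this only makes sense inside bufmgr.c):

    static bool
    try_write_victim(BufferDesc *victim, IOContext io_context)
    {
        LWLock *content_lock = BufferDescriptorGetContentLock(victim);

        /* never wait: the current lock holder might be waiting on us */
        if (!LWLockConditionalAcquire(content_lock, LW_SHARED))
        {
            UnpinBuffer(victim);    /* give it up; caller picks another victim */
            return false;
        }
        FlushBuffer(victim, NULL, IOOBJECT_RELATION, io_context);
        LWLockRelease(content_lock);
        return true;
    }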

◆ HoldingBufferPinThatDelaysRecovery()

bool HoldingBufferPinThatDelaysRecovery ( void  )

Definition at line 5831 of file bufmgr.c.

5832{
5833 int bufid = GetStartupBufferPinWaitBufId();
5834
5835 /*
5836 * If we get woken slowly then it's possible that the Startup process was
5837 * already woken by other backends before we got here. Also possible that
5838 * we get here by multiple interrupts or interrupts at inappropriate
5839 * times, so make sure we do nothing if the bufid is not set.
5840 */
5841 if (bufid < 0)
5842 return false;
5843
5844 if (GetPrivateRefCount(bufid + 1) > 0)
5845 return true;
5846
5847 return false;
5848}
int GetStartupBufferPinWaitBufId(void)
Definition: proc.c:771

References GetPrivateRefCount(), and GetStartupBufferPinWaitBufId().

Referenced by CheckRecoveryConflictDeadlock(), and ProcessRecoveryConflictInterrupt().
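
Sketch of the intended call shape (modeled loosely on ProcessRecoveryConflictInterrupt(); the real error handling there differs): only a backend whose own pin is what the startup process waits on needs to give way.

    if (HoldingBufferPinThatDelaysRecovery())
        ereport(ERROR,
                (errcode(ERRCODE_T_R_DEADLOCK_DETECTED),
                 errmsg("canceling statement due to conflict with recovery")));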

◆ IncrBufferRefCount()

void IncrBufferRefCount ( Buffer  buffer)

◆ InitBufferManagerAccess()

void InitBufferManagerAccess ( void  )

Definition at line 4007 of file bufmgr.c.

4008{
4009 HASHCTL hash_ctl;
4010
4011 /*
4012 * An advisory limit on the number of pins each backend should hold, based
4013 * on shared_buffers and the maximum number of connections possible.
4014 * That's very pessimistic, but outside toy-sized shared_buffers it should
4015 * allow plenty of pins. LimitAdditionalPins() and
4016 * GetAdditionalPinLimit() can be used to check the remaining balance.
4017 */
4018 MaxProportionalPins = NBuffers / (MaxBackends + NUM_AUXILIARY_PROCS);
4019
4020 memset(&PrivateRefCountArray, 0, sizeof(PrivateRefCountArray));
4021
4022 hash_ctl.keysize = sizeof(Buffer);
4023 hash_ctl.entrysize = sizeof(PrivateRefCountEntry);
4024
4025 PrivateRefCountHash = hash_create("PrivateRefCount", 100, &hash_ctl,
4026 HASH_ELEM | HASH_BLOBS);
4027
4028 /*
4029 * AtProcExit_Buffers needs LWLock access, and thereby has to be called at
4030 * the corresponding phase of backend shutdown.
4031 */
4032 Assert(MyProc != NULL);
4033 on_shmem_exit(AtProcExit_Buffers, 0);
4034}
static void AtProcExit_Buffers(int code, Datum arg)
Definition: bufmgr.c:4041
struct PrivateRefCountEntry PrivateRefCountEntry
HTAB * hash_create(const char *tabname, int64 nelem, const HASHCTL *info, int flags)
Definition: dynahash.c:358
int MaxBackends
Definition: globals.c:146
#define HASH_ELEM
Definition: hsearch.h:95
#define HASH_BLOBS
Definition: hsearch.h:97
void on_shmem_exit(pg_on_exit_callback function, Datum arg)
Definition: ipc.c:365
#define NUM_AUXILIARY_PROCS
Definition: proc.h:463
PGPROC * MyProc
Definition: proc.c:67
Size keysize
Definition: hsearch.h:75
Size entrysize
Definition: hsearch.h:76

References Assert(), AtProcExit_Buffers(), HASHCTL::entrysize, HASH_BLOBS, hash_create(), HASH_ELEM, HASHCTL::keysize, MaxBackends, MaxProportionalPins, MyProc, NBuffers, NUM_AUXILIARY_PROCS, on_shmem_exit(), PrivateRefCountArray, and PrivateRefCountHash.

Referenced by BaseInit().

◆ InvalidateBuffer()

static void InvalidateBuffer ( BufferDesc buf)
static

Definition at line 2154 of file bufmgr.c.

2155{
2156 BufferTag oldTag;
2157 uint32 oldHash; /* hash value for oldTag */
2158 LWLock *oldPartitionLock; /* buffer partition lock for it */
2159 uint32 oldFlags;
2160 uint32 buf_state;
2161
2162 /* Save the original buffer tag before dropping the spinlock */
2163 oldTag = buf->tag;
2164
2165 UnlockBufHdr(buf);
2166
2167 /*
2168 * Need to compute the old tag's hashcode and partition lock ID. XXX is it
2169 * worth storing the hashcode in BufferDesc so we need not recompute it
2170 * here? Probably not.
2171 */
2172 oldHash = BufTableHashCode(&oldTag);
2173 oldPartitionLock = BufMappingPartitionLock(oldHash);
2174
2175retry:
2176
2177 /*
2178 * Acquire exclusive mapping lock in preparation for changing the buffer's
2179 * association.
2180 */
2181 LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
2182
2183 /* Re-lock the buffer header */
2184 buf_state = LockBufHdr(buf);
2185
2186 /* If it's changed while we were waiting for lock, do nothing */
2187 if (!BufferTagsEqual(&buf->tag, &oldTag))
2188 {
2189 UnlockBufHdr(buf);
2190 LWLockRelease(oldPartitionLock);
2191 return;
2192 }
2193
2194 /*
2195 * We assume the reason for it to be pinned is that either we were
2196 * asynchronously reading the page in before erroring out or someone else
2197 * is flushing the page out. Wait for the IO to finish. (This could be
2198 * an infinite loop if the refcount is messed up... it would be nice to
2199 * time out after awhile, but there seems no way to be sure how many loops
2200 * may be needed. Note that if the other guy has pinned the buffer but
2201 * not yet done StartBufferIO, WaitIO will fall through and we'll
2202 * effectively be busy-looping here.)
2203 */
2204 if (BUF_STATE_GET_REFCOUNT(buf_state) != 0)
2205 {
2206 UnlockBufHdr(buf);
2207 LWLockRelease(oldPartitionLock);
2208 /* safety check: should definitely not be our *own* pin */
2209 if (GetPrivateRefCount(BufferDescriptorGetBuffer(buf)) > 0)
2210 elog(ERROR, "buffer is pinned in InvalidateBuffer");
2211 WaitIO(buf);
2212 goto retry;
2213 }
2214
2215 /*
2216 * Clear out the buffer's tag and flags. We must do this to ensure that
2217 * linear scans of the buffer array don't think the buffer is valid.
2218 */
2219 oldFlags = buf_state & BUF_FLAG_MASK;
2220 ClearBufferTag(&buf->tag);
2221
2222 UnlockBufHdrExt(buf, buf_state,
2223 0,
2224 BUF_FLAG_MASK | BUF_USAGECOUNT_MASK,
2225 0);
2226
2227 /*
2228 * Remove the buffer from the lookup hashtable, if it was in there.
2229 */
2230 if (oldFlags & BM_TAG_VALID)
2231 BufTableDelete(&oldTag, oldHash);
2232
2233 /*
2234 * Done with mapping lock.
2235 */
2236 LWLockRelease(oldPartitionLock);
2237}
#define BUF_USAGECOUNT_MASK
Definition: buf_internals.h:53
static bool BufferTagsEqual(const BufferTag *tag1, const BufferTag *tag2)
static void ClearBufferTag(BufferTag *tag)
void BufTableDelete(BufferTag *tagPtr, uint32 hashcode)
Definition: buf_table.c:148
static void WaitIO(BufferDesc *buf)
Definition: bufmgr.c:5967

References BM_TAG_VALID, buf, BUF_FLAG_MASK, BUF_STATE_GET_REFCOUNT, BUF_USAGECOUNT_MASK, BufferDescriptorGetBuffer(), BufferTagsEqual(), BufMappingPartitionLock(), BufTableDelete(), BufTableHashCode(), ClearBufferTag(), elog, ERROR, GetPrivateRefCount(), LockBufHdr(), LW_EXCLUSIVE, LWLockAcquire(), LWLockRelease(), UnlockBufHdr(), UnlockBufHdrExt(), and WaitIO().

Referenced by DropDatabaseBuffers(), DropRelationBuffers(), DropRelationsAllBuffers(), and FindAndDropRelationBuffers().

◆ InvalidateVictimBuffer()

static bool InvalidateVictimBuffer ( BufferDesc buf_hdr)
static

Definition at line 2249 of file bufmgr.c.

2250{
2251 uint32 buf_state;
2252 uint32 hash;
2253 LWLock *partition_lock;
2254 BufferTag tag;
2255
2256 Assert(GetPrivateRefCount(BufferDescriptorGetBuffer(buf_hdr)) == 1);
2257
2258 /* have buffer pinned, so it's safe to read tag without lock */
2259 tag = buf_hdr->tag;
2260
2261 hash = BufTableHashCode(&tag);
2262 partition_lock = BufMappingPartitionLock(hash);
2263
2264 LWLockAcquire(partition_lock, LW_EXCLUSIVE);
2265
2266 /* lock the buffer header */
2267 buf_state = LockBufHdr(buf_hdr);
2268
2269 /*
2270 * We have the buffer pinned, so nobody else should have been able to
2271 * unset this concurrently.
2272 */
2273 Assert(buf_state & BM_TAG_VALID);
2274 Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
2275 Assert(BufferTagsEqual(&buf_hdr->tag, &tag));
2276
2277 /*
2278 * If somebody else pinned the buffer since, or even worse, dirtied it,
2279 * give up on this buffer: It's clearly in use.
2280 */
2281 if (BUF_STATE_GET_REFCOUNT(buf_state) != 1 || (buf_state & BM_DIRTY))
2282 {
2283 Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
2284
2285 UnlockBufHdr(buf_hdr);
2286 LWLockRelease(partition_lock);
2287
2288 return false;
2289 }
2290
2291 /*
2292 * Clear out the buffer's tag and flags and usagecount. This is not
2293 * strictly required, as BM_TAG_VALID/BM_VALID needs to be checked before
2294 * doing anything with the buffer. But currently it's beneficial, as the
2295 * cheaper pre-check for several linear scans of shared buffers uses the
2296 * tag (see e.g. FlushDatabaseBuffers()).
2297 */
2298 ClearBufferTag(&buf_hdr->tag);
2299 UnlockBufHdrExt(buf_hdr, buf_state,
2300 0,
2301 BUF_FLAG_MASK | BUF_USAGECOUNT_MASK,
2302 0);
2303
2304 Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
2305
2306 /* finally delete buffer from the buffer mapping table */
2307 BufTableDelete(&tag, hash);
2308
2309 LWLockRelease(partition_lock);
2310
2311 buf_state = pg_atomic_read_u32(&buf_hdr->state);
2312 Assert(!(buf_state & (BM_DIRTY | BM_VALID | BM_TAG_VALID)));
2313 Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
2315
2316 return true;
2317}

References Assert(), BM_DIRTY, BM_TAG_VALID, BM_VALID, BUF_FLAG_MASK, BUF_STATE_GET_REFCOUNT, BUF_USAGECOUNT_MASK, BufferDescriptorGetBuffer(), BufferTagsEqual(), BufMappingPartitionLock(), BufTableDelete(), BufTableHashCode(), ClearBufferTag(), GetPrivateRefCount(), hash(), LockBufHdr(), LW_EXCLUSIVE, LWLockAcquire(), LWLockRelease(), pg_atomic_read_u32(), BufferDesc::state, BufferDesc::tag, UnlockBufHdr(), and UnlockBufHdrExt().

Referenced by EvictUnpinnedBufferInternal(), and GetVictimBuffer().

◆ IsBufferCleanupOK()

bool IsBufferCleanupOK ( Buffer  buffer)

Definition at line 5915 of file bufmgr.c.

5916{
5917 BufferDesc *bufHdr;
5918 uint32 buf_state;
5919
5920 Assert(BufferIsValid(buffer));
5921
5922 /* see AIO related comment in LockBufferForCleanup() */
5923
5924 if (BufferIsLocal(buffer))
5925 {
5926 /* There should be exactly one pin */
5927 if (LocalRefCount[-buffer - 1] != 1)
5928 return false;
5929 /* Nobody else to wait for */
5930 return true;
5931 }
5932
5933 /* There should be exactly one local pin */
5934 if (GetPrivateRefCount(buffer) != 1)
5935 return false;
5936
5937 bufHdr = GetBufferDescriptor(buffer - 1);
5938
5939 /* caller must hold exclusive lock on buffer */
5940 Assert(BufferIsLockedByMeInMode(buffer, BUFFER_LOCK_EXCLUSIVE));
5941
5942 buf_state = LockBufHdr(bufHdr);
5943
5944 Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
5945 if (BUF_STATE_GET_REFCOUNT(buf_state) == 1)
5946 {
5947 /* pincount is OK. */
5948 UnlockBufHdr(bufHdr);
5949 return true;
5950 }
5951
5952 UnlockBufHdr(bufHdr);
5953 return false;
5954}

References Assert(), BUF_STATE_GET_REFCOUNT, PrivateRefCountEntry::buffer, BUFFER_LOCK_EXCLUSIVE, BufferIsLocal, BufferIsLockedByMeInMode(), BufferIsValid(), GetBufferDescriptor(), GetPrivateRefCount(), LocalRefCount, LockBufHdr(), and UnlockBufHdr().

Referenced by _hash_doinsert(), _hash_expandtable(), _hash_splitbucket(), and hashbucketcleanup().
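
A usage sketch in the style of the hash-index callers (illustrative): the exclusive lock is taken normally, and IsBufferCleanupOK() then reports, without waiting, whether that lock is effectively cleanup-grade.

    LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
    if (IsBufferCleanupOK(buf))
    {
        /* sole pin + exclusive lock: the page layout may be rearranged */
    }
    LockBuffer(buf, BUFFER_LOCK_UNLOCK);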

◆ IssuePendingWritebacks()

void IssuePendingWritebacks ( WritebackContext wb_context,
IOContext  io_context 
)

Definition at line 6468 of file bufmgr.c.

6469{
6470 instr_time io_start;
6471 int i;
6472
6473 if (wb_context->nr_pending == 0)
6474 return;
6475
6476 /*
6477 * Executing the writes in-order can make them a lot faster, and allows
6478 * merging writeback requests to consecutive blocks into larger writebacks.
6479 */
6480 sort_pending_writebacks(wb_context->pending_writebacks,
6481 wb_context->nr_pending);
6482
6483 io_start = pgstat_prepare_io_time(track_io_timing);
6484
6485 /*
6486 * Coalesce neighbouring writes, but nothing else. For that we iterate
6487 * through the, now sorted, array of pending flushes, and look forward to
6488 * find all neighbouring (or identical) writes.
6489 */
6490 for (i = 0; i < wb_context->nr_pending; i++)
6491 {
6492 PendingWriteback *cur;
6493 PendingWriteback *next;
6494 SMgrRelation reln;
6495 int ahead;
6496 BufferTag tag;
6497 RelFileLocator currlocator;
6498 Size nblocks = 1;
6499
6500 cur = &wb_context->pending_writebacks[i];
6501 tag = cur->tag;
6502 currlocator = BufTagGetRelFileLocator(&tag);
6503
6504 /*
6505 * Peek ahead, into following writeback requests, to see if they can
6506 * be combined with the current one.
6507 */
6508 for (ahead = 0; i + ahead + 1 < wb_context->nr_pending; ahead++)
6509 {
6510
6511 next = &wb_context->pending_writebacks[i + ahead + 1];
6512
6513 /* different file, stop */
6514 if (!RelFileLocatorEquals(currlocator,
6515 BufTagGetRelFileLocator(&next->tag)) ||
6516 BufTagGetForkNum(&cur->tag) != BufTagGetForkNum(&next->tag))
6517 break;
6518
6519 /* ok, block queued twice, skip */
6520 if (cur->tag.blockNum == next->tag.blockNum)
6521 continue;
6522
6523 /* only merge consecutive writes */
6524 if (cur->tag.blockNum + 1 != next->tag.blockNum)
6525 break;
6526
6527 nblocks++;
6528 cur = next;
6529 }
6530
6531 i += ahead;
6532
6533 /* and finally tell the kernel to write the data to storage */
6534 reln = smgropen(currlocator, INVALID_PROC_NUMBER);
6535 smgrwriteback(reln, BufTagGetForkNum(&tag), tag.blockNum, nblocks);
6536 }
6537
6538 /*
6539 * Assume that writeback requests are only issued for buffers containing
6540 * blocks of permanent relations.
6541 */
6542 pgstat_count_io_op_time(IOOBJECT_RELATION, io_context,
6543 IOOP_WRITEBACK, io_start, wb_context->nr_pending, 0);
6544
6545 wb_context->nr_pending = 0;
6546}
static int32 next
Definition: blutils.c:224
struct cursor * cur
Definition: ecpg.c:29
@ IOOP_WRITEBACK
Definition: pgstat.h:311
#define RelFileLocatorEquals(locator1, locator2)
void smgrwriteback(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, BlockNumber nblocks)
Definition: smgr.c:805
PendingWriteback pending_writebacks[WRITEBACK_MAX_PENDING_FLUSHES]

References buftag::blockNum, BufTagGetForkNum(), BufTagGetRelFileLocator(), cur, i, INVALID_PROC_NUMBER, IOOBJECT_RELATION, IOOP_WRITEBACK, next, WritebackContext::nr_pending, WritebackContext::pending_writebacks, pgstat_count_io_op_time(), pgstat_prepare_io_time(), RelFileLocatorEquals, smgropen(), smgrwriteback(), and track_io_timing.

Referenced by BufferSync(), and ScheduleBufferTagForWriteback().
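
The merge rule in the inner loop can be isolated: identical neighbours are skipped, consecutive blocks extend the current request, and anything else starts a new one. A self-contained illustration on plain block numbers (a hypothetical helper, not the BufferTag-based code above):

    #include <stdio.h>

    static void
    coalesce_blocks(const int *blk, int n)      /* blk must be sorted */
    {
        for (int i = 0; i < n; i++)
        {
            int start = blk[i];
            int nblocks = 1;

            while (i + 1 < n &&
                   (blk[i + 1] == blk[i] || blk[i + 1] == blk[i] + 1))
            {
                if (blk[i + 1] == blk[i] + 1)
                    nblocks++;              /* consecutive: merge */
                i++;                        /* identical: just skip */
            }
            printf("writeback start=%d nblocks=%d\n", start, nblocks);
        }
    }

Calling coalesce_blocks((int[]){10, 11, 11, 12, 20}, 5) would report two requests: start=10 nblocks=3 and start=20 nblocks=1.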

◆ LimitAdditionalPins()

void LimitAdditionalPins ( uint32 additional_pins)

Definition at line 2513 of file bufmgr.c.

2514{
2515 uint32 limit;
2516
2517 if (*additional_pins <= 1)
2518 return;
2519
2520 limit = GetAdditionalPinLimit();
2521 limit = Max(limit, 1);
2522 if (limit < *additional_pins)
2523 *additional_pins = limit;
2524}
uint32 GetAdditionalPinLimit(void)
Definition: bufmgr.c:2487
#define Max(x, y)
Definition: c.h:1000

References GetAdditionalPinLimit(), and Max.

Referenced by ExtendBufferedRelShared().
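
Sketch of the intended call shape, modeled on the ExtendBufferedRelShared() caller (the variable name is hypothetical): the request is clamped in place and never reduced below one.

    uint32      extend_by = 64;     /* blocks we would like to pin at once */

    LimitAdditionalPins(&extend_by);
    Assert(extend_by >= 1 && extend_by <= 64);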

◆ local_buffer_readv_complete()

static PgAioResult local_buffer_readv_complete ( PgAioHandle ioh,
PgAioResult  prior_result,
uint8  cb_data 
)
static

Definition at line 7629 of file bufmgr.c.

7631{
7632 return buffer_readv_complete(ioh, prior_result, cb_data, true);
7633}
static pg_attribute_always_inline PgAioResult buffer_readv_complete(PgAioHandle *ioh, PgAioResult prior_result, uint8 cb_data, bool is_temp)
Definition: bufmgr.c:7374

References buffer_readv_complete().

◆ local_buffer_readv_stage()

static void local_buffer_readv_stage ( PgAioHandle ioh,
uint8  cb_data 
)
static

Definition at line 7623 of file bufmgr.c.

7624{
7625 buffer_stage_common(ioh, false, true);
7626}
static pg_attribute_always_inline void buffer_stage_common(PgAioHandle *ioh, bool is_write, bool is_temp)
Definition: bufmgr.c:6982

References buffer_stage_common().

◆ local_buffer_write_error_callback()

static void local_buffer_write_error_callback ( void *  arg)
static

Definition at line 6221 of file bufmgr.c.

6222{
6223 BufferDesc *bufHdr = (BufferDesc *) arg;
6224
6225 if (bufHdr != NULL)
6226 errcontext("writing block %u of relation \"%s\"",
6227 bufHdr->tag.blockNum,
6230 BufTagGetForkNum(&bufHdr->tag)).str);
6231}
#define errcontext
Definition: elog.h:198
void * arg

References arg, buftag::blockNum, BufTagGetForkNum(), BufTagGetRelFileLocator(), errcontext, MyProcNumber, relpathbackend, and BufferDesc::tag.

Referenced by FlushRelationBuffers().

◆ LockBuffer()

void LockBuffer ( Buffer  buffer,
BufferLockMode  mode 
)

Definition at line 5604 of file bufmgr.c.

5605{
5606 BufferDesc *buf;
5607
5608 Assert(BufferIsPinned(buffer));
5609 if (BufferIsLocal(buffer))
5610 return; /* local buffers need no lock */
5611
5612 buf = GetBufferDescriptor(buffer - 1);
5613
5614 if (mode == BUFFER_LOCK_UNLOCK)
5615 LWLockRelease(BufferDescriptorGetContentLock(buf));
5616 else if (mode == BUFFER_LOCK_SHARE)
5617 LWLockAcquire(BufferDescriptorGetContentLock(buf), LW_SHARED);
5618 else if (mode == BUFFER_LOCK_EXCLUSIVE)
5619 LWLockAcquire(BufferDescriptorGetContentLock(buf), LW_EXCLUSIVE);
5620 else
5621 elog(ERROR, "unrecognized buffer lock mode: %d", mode);
5622}

References Assert(), buf, PrivateRefCountEntry::buffer, BUFFER_LOCK_EXCLUSIVE, BUFFER_LOCK_SHARE, BUFFER_LOCK_UNLOCK, BufferDescriptorGetContentLock(), BufferIsLocal, BufferIsPinned, elog, ERROR, GetBufferDescriptor(), LW_EXCLUSIVE, LW_SHARED, LWLockAcquire(), LWLockRelease(), and mode.

Referenced by _bt_lockbuf(), _bt_unlockbuf(), _bt_upgradelockbufcleanup(), _hash_addovflpage(), _hash_doinsert(), _hash_expandtable(), _hash_finish_split(), _hash_first(), _hash_freeovflpage(), _hash_getbuf(), _hash_getbuf_with_strategy(), _hash_getcachedmetap(), _hash_init(), _hash_kill_items(), _hash_readnext(), _hash_readpage(), _hash_readprev(), _hash_splitbucket(), _hash_squeezebucket(), _hash_vacuum_one_page(), BitmapHeapScanNextBlock(), blbulkdelete(), blgetbitmap(), blinsert(), BloomInitMetapage(), BloomNewBuffer(), blvacuumcleanup(), brin_doinsert(), brin_doupdate(), brin_evacuate_page(), brin_getinsertbuffer(), brin_page_cleanup(), bringetbitmap(), brinGetStats(), brinGetTupleForHeapBlock(), brininsert(), brinLockRevmapPageForUpdate(), brinRevmapDesummarizeRange(), brinRevmapInitialize(), brinsummarize(), bt_metap(), bt_multi_page_stats(), bt_page_items_internal(), bt_page_stats_internal(), bt_recheck_sibling_links(), collect_corrupt_items(), collect_visibility_data(), collectMatchBitmap(), ConditionalLockBufferForCleanup(), count_nondeletable_pages(), create_toy_buffer(), entryLoadMoreItems(), ExtendBufferedRelShared(), FreeSpaceMapPrepareTruncateRel(), fsm_readbuf(), fsm_search(), fsm_search_avail(), fsm_set_and_search(), fsm_vacuum_page(), get_raw_page_internal(), GetVisibilityMapPins(), gin_check_parent_keys_consistency(), gin_check_posting_tree_parent_keys_consistency(), gin_refind_parent(), ginbulkdelete(), ginEntryInsert(), ginFindLeafPage(), ginFindParents(), ginFinishOldSplit(), ginFinishSplit(), ginGetStats(), ginHeapTupleFastInsert(), ginInsertCleanup(), ginInsertValue(), GinNewBuffer(), ginScanToDelete(), ginStepRight(), ginTraverseLock(), ginUpdateStats(), ginvacuumcleanup(), ginVacuumPostingTreeLeaves(), gistBufferingFindCorrectParent(), gistbufferinginserttuples(), gistdoinsert(), gistFindCorrectParent(), gistFindPath(), gistfinishsplit(), gistfixsplit(), gistformdownlink(), gistGetMaxLevel(), gistinserttuples(), gistkillitems(), gistNewBuffer(), gistProcessItup(), gistScanPage(), gistvacuum_delete_empty_pages(), gistvacuumpage(), hashbucketcleanup(), hashbulkdelete(), heap_abort_speculative(), heap_delete(), heap_fetch(), heap_finish_speculative(), heap_get_latest_tid(), heap_index_delete_tuples(), heap_inplace_lock(), heap_inplace_unlock(), heap_inplace_update_and_unlock(), heap_lock_tuple(), heap_lock_updated_tuple_rec(), heap_multi_insert(), heap_page_prune_opt(), heap_prepare_pagescan(), heap_update(), heap_xlog_visible(), heapam_index_build_range_scan(), heapam_index_fetch_tuple(), heapam_index_validate_scan(), heapam_relation_copy_for_cluster(), heapam_scan_analyze_next_block(), heapam_scan_sample_next_tuple(), heapam_tuple_satisfies_snapshot(), heapgettup(), initBloomState(), invalidate_rel_block(), lazy_scan_heap(), lazy_scan_new_or_empty(), lazy_vacuum_heap_page(), lazy_vacuum_heap_rel(), LockBufferForCleanup(), log_newpage_range(), modify_rel_block(), palloc_btree_page(), pg_visibility(), pgrowlocks(), pgstat_btree_page(), pgstat_gist_page(), pgstat_hash_page(), pgstat_heap(), pgstatginindex_internal(), pgstathashindex(), pgstatindex_impl(), read_seq_tuple(), RelationAddBlocks(), RelationCopyStorageUsingBuffer(), RelationGetBufferForTuple(), revmap_physical_extend(), scanGetCandidate(), scanPendingInsert(), ScanSourceDatabasePgClass(), shiftList(), spgdoinsert(), spgGetCache(), SpGistNewBuffer(), spgprocesspending(), spgvacuumpage(), spgWalk(), startScanEntry(), statapprox_heap(), summarize_range(), UnlockReleaseBuffer(), 
update_most_recent_deletion_info(), verify_heapam(), verifyBackupPageConsistency(), visibilitymap_clear(), visibilitymap_prepare_truncate(), visibilitymap_set(), vm_readbuf(), XLogReadBufferForRedoExtended(), XLogRecordPageWithFreeSpace(), and ZeroAndLockBuffer().
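
A minimal usage sketch (not from bufmgr.c; rel and blkno stand for a valid relation and block number): a pin, here taken by ReadBuffer(), must already be held before the content lock is acquired.

    Buffer      buf = ReadBuffer(rel, blkno);

    LockBuffer(buf, BUFFER_LOCK_SHARE);
    /* ... inspect BufferGetPage(buf) under the shared content lock ... */
    LockBuffer(buf, BUFFER_LOCK_UNLOCK);
    ReleaseBuffer(buf);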

◆ LockBufferForCleanup()

void LockBufferForCleanup ( Buffer  buffer)

Definition at line 5684 of file bufmgr.c.

5685{
5686 BufferDesc *bufHdr;
5687 TimestampTz waitStart = 0;
5688 bool waiting = false;
5689 bool logged_recovery_conflict = false;
5690
5691 Assert(BufferIsPinned(buffer));
5692 Assert(PinCountWaitBuf == NULL);
5693
5694 CheckBufferIsPinnedOnce(buffer);
5695
5696 /*
5697 * We do not yet need to be worried about in-progress AIOs holding a pin,
5698 * as we, so far, only support doing reads via AIO and this function can
5699 * only be called once the buffer is valid (i.e. no read can be in
5700 * flight).
5701 */
5702
5703 /* Nobody else to wait for */
5704 if (BufferIsLocal(buffer))
5705 return;
5706
5707 bufHdr = GetBufferDescriptor(buffer - 1);
5708
5709 for (;;)
5710 {
5711 uint32 buf_state;
5712 uint32 unset_bits = 0;
5713
5714 /* Try to acquire lock */
5715 LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
5716 buf_state = LockBufHdr(bufHdr);
5717
5718 Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
5719 if (BUF_STATE_GET_REFCOUNT(buf_state) == 1)
5720 {
5721 /* Successfully acquired exclusive lock with pincount 1 */
5722 UnlockBufHdr(bufHdr);
5723
5724 /*
5725 * Emit the log message if recovery conflict on buffer pin was
5726 * resolved but the startup process waited longer than
5727 * deadlock_timeout for it.
5728 */
5729 if (logged_recovery_conflict)
5730 LogRecoveryConflict(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN,
5731 waitStart, GetCurrentTimestamp(),
5732 NULL, false);
5733
5734 if (waiting)
5735 {
5736 /* reset ps display to remove the suffix if we added one */
5737 set_ps_display_remove_suffix();
5738 waiting = false;
5739 }
5740 return;
5741 }
5742 /* Failed, so mark myself as waiting for pincount 1 */
5743 if (buf_state & BM_PIN_COUNT_WAITER)
5744 {
5745 UnlockBufHdr(bufHdr);
5746 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
5747 elog(ERROR, "multiple backends attempting to wait for pincount 1");
5748 }
5749 bufHdr->wait_backend_pgprocno = MyProcNumber;
5750 PinCountWaitBuf = bufHdr;
5751 UnlockBufHdrExt(bufHdr, buf_state,
5752 BM_PIN_COUNT_WAITER, 0,
5753 0);
5754 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
5755
5756 /* Wait to be signaled by UnpinBuffer() */
5757 if (InHotStandby)
5758 {
5759 if (!waiting)
5760 {
5761 /* adjust the process title to indicate that it's waiting */
5762 set_ps_display_suffix("waiting");
5763 waiting = true;
5764 }
5765
5766 /*
5767 * Emit the log message if the startup process is waiting longer
5768 * than deadlock_timeout for recovery conflict on buffer pin.
5769 *
5770 * Skip this if first time through because the startup process has
5771 * not started waiting yet in this case. So, the wait start
5772 * timestamp is set after this logic.
5773 */
5774 if (waitStart != 0 && !logged_recovery_conflict)
5775 {
5776 TimestampTz now = GetCurrentTimestamp();
5777
5778 if (TimestampDifferenceExceeds(waitStart, now,
5779 DeadlockTimeout))
5780 {
5781 LogRecoveryConflict(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN,
5782 waitStart, now, NULL, true);
5783 logged_recovery_conflict = true;
5784 }
5785 }
5786
5787 /*
5788 * Set the wait start timestamp if logging is enabled and first
5789 * time through.
5790 */
5791 if (log_recovery_conflict_waits && waitStart == 0)
5792 waitStart = GetCurrentTimestamp();
5793
5794 /* Publish the bufid that Startup process waits on */
5795 SetStartupBufferPinWaitBufId(buffer - 1);
5796 /* Set alarm and then wait to be signaled by UnpinBuffer() */
5797 ResolveRecoveryConflictWithBufferPin();
5798 /* Reset the published bufid */
5799 SetStartupBufferPinWaitBufId(-1);
5800 }
5801 else
5802 ProcWaitForSignal(WAIT_EVENT_BUFFER_CLEANUP);
5803
5804 /*
5805 * Remove flag marking us as waiter. Normally this will not be set
5806 * anymore, but ProcWaitForSignal() can return for other signals as
5807 * well. We take care to only reset the flag if we're the waiter, as
5808 * theoretically another backend could have started waiting. That's
5809 * impossible with the current usages due to table level locking, but
5810 * better be safe.
5811 */
5812 buf_state = LockBufHdr(bufHdr);
5813 if ((buf_state & BM_PIN_COUNT_WAITER) != 0 &&
5814 bufHdr->wait_backend_pgprocno == MyProcNumber)
5815 unset_bits |= BM_PIN_COUNT_WAITER;
5816
5817 UnlockBufHdrExt(bufHdr, buf_state,
5818 0, unset_bits,
5819 0);
5820
5821 PinCountWaitBuf = NULL;
5822 /* Loop back and try again */
5823 }
5824}
bool TimestampDifferenceExceeds(TimestampTz start_time, TimestampTz stop_time, int msec)
Definition: timestamp.c:1781
TimestampTz GetCurrentTimestamp(void)
Definition: timestamp.c:1645
Datum now(PG_FUNCTION_ARGS)
Definition: timestamp.c:1609
#define BM_PIN_COUNT_WAITER
Definition: buf_internals.h:75
static BufferDesc * PinCountWaitBuf
Definition: bufmgr.c:183
int64 TimestampTz
Definition: timestamp.h:39
@ PROCSIG_RECOVERY_CONFLICT_BUFFERPIN
Definition: procsignal.h:47
void set_ps_display_remove_suffix(void)
Definition: ps_status.c:439
void set_ps_display_suffix(const char *suffix)
Definition: ps_status.c:387
int DeadlockTimeout
Definition: proc.c:58
void SetStartupBufferPinWaitBufId(int bufid)
Definition: proc.c:759
void ProcWaitForSignal(uint32 wait_event_info)
Definition: proc.c:1984
void ResolveRecoveryConflictWithBufferPin(void)
Definition: standby.c:793
bool log_recovery_conflict_waits
Definition: standby.c:42
void LogRecoveryConflict(ProcSignalReason reason, TimestampTz wait_start, TimestampTz now, VirtualTransactionId *wait_list, bool still_waiting)
Definition: standby.c:274
int wait_backend_pgprocno
static volatile sig_atomic_t waiting
Definition: waiteventset.c:171
#define InHotStandby
Definition: xlogutils.h:60

References Assert(), BM_PIN_COUNT_WAITER, BUF_STATE_GET_REFCOUNT, PrivateRefCountEntry::buffer, BUFFER_LOCK_EXCLUSIVE, BUFFER_LOCK_UNLOCK, BufferIsLocal, BufferIsPinned, CheckBufferIsPinnedOnce(), DeadlockTimeout, elog, ERROR, GetBufferDescriptor(), GetCurrentTimestamp(), InHotStandby, LockBuffer(), LockBufHdr(), log_recovery_conflict_waits, LogRecoveryConflict(), MyProcNumber, now(), PinCountWaitBuf, PROCSIG_RECOVERY_CONFLICT_BUFFERPIN, ProcWaitForSignal(), ResolveRecoveryConflictWithBufferPin(), set_ps_display_remove_suffix(), set_ps_display_suffix(), SetStartupBufferPinWaitBufId(), TimestampDifferenceExceeds(), UnlockBufHdr(), UnlockBufHdrExt(), BufferDesc::wait_backend_pgprocno, and waiting.

Referenced by _bt_upgradelockbufcleanup(), ginVacuumPostingTree(), hashbulkdelete(), heap_force_common(), lazy_scan_heap(), XLogReadBufferForRedoExtended(), and ZeroAndLockBuffer().
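
A vacuum-style sketch (rel, blkno and vac_strategy are illustrative): the caller holds exactly one pin, and LockBufferForCleanup() returns with the content lock held exclusively and no other backend pinning the page.

    Buffer      buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno,
                                         RBM_NORMAL, vac_strategy);

    LockBufferForCleanup(buf);
    /* ... safe to defragment or move tuples: ours is the only pin ... */
    UnlockReleaseBuffer(buf);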

◆ LockBufHdr()

uint32 LockBufHdr ( BufferDesc desc)

Definition at line 6264 of file bufmgr.c.

6265{
6266 SpinDelayStatus delayStatus;
6267 uint32 old_buf_state;
6268
6269 Assert(!BufferIsLocal(BufferDescriptorGetBuffer(desc)));
6270
6271 init_local_spin_delay(&delayStatus);
6272
6273 while (true)
6274 {
6275 /* set BM_LOCKED flag */
6276 old_buf_state = pg_atomic_fetch_or_u32(&desc->state, BM_LOCKED);
6277 /* if it wasn't set before we're OK */
6278 if (!(old_buf_state & BM_LOCKED))
6279 break;
6280 perform_spin_delay(&delayStatus);
6281 }
6282 finish_spin_delay(&delayStatus);
6283 return old_buf_state | BM_LOCKED;
6284}
static uint32 pg_atomic_fetch_or_u32(volatile pg_atomic_uint32 *ptr, uint32 or_)
Definition: atomics.h:408
void perform_spin_delay(SpinDelayStatus *status)
Definition: s_lock.c:126
void finish_spin_delay(SpinDelayStatus *status)
Definition: s_lock.c:186
#define init_local_spin_delay(status)
Definition: s_lock.h:733

References Assert(), BM_LOCKED, BufferDescriptorGetBuffer(), BufferIsLocal, finish_spin_delay(), init_local_spin_delay, perform_spin_delay(), pg_atomic_fetch_or_u32(), and BufferDesc::state.

Referenced by AbortBufferIO(), apw_dump_now(), buffer_stage_common(), BufferAlloc(), BufferGetLSNAtomic(), BufferSync(), ConditionalLockBufferForCleanup(), create_toy_buffer(), DropDatabaseBuffers(), DropRelationBuffers(), DropRelationsAllBuffers(), EvictAllUnpinnedBuffers(), EvictRelUnpinnedBuffers(), EvictUnpinnedBuffer(), ExtendBufferedRelShared(), FindAndDropRelationBuffers(), FlushBuffer(), FlushDatabaseBuffers(), FlushRelationBuffers(), FlushRelationsAllBuffers(), GetVictimBuffer(), InvalidateBuffer(), InvalidateVictimBuffer(), IsBufferCleanupOK(), LockBufferForCleanup(), MarkBufferDirtyHint(), MarkDirtyAllUnpinnedBuffers(), MarkDirtyRelUnpinnedBuffers(), MarkDirtyUnpinnedBuffer(), pg_buffercache_os_pages_internal(), pg_buffercache_pages(), StartBufferIO(), SyncOneBuffer(), TerminateBufferIO(), UnlockBuffers(), WaitIO(), and WakePinCountWaiter().
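
Sketch of the header-spinlock protocol (hypothetical snippet): LockBufHdr() spins until it has set BM_LOCKED and returns the resulting state word; the critical section must stay short and must never sleep or take other locks.

    uint32      buf_state = LockBufHdr(bufHdr);

    if (BUF_STATE_GET_REFCOUNT(buf_state) == 0)
    {
        /* ... brief, non-blocking inspection of header fields ... */
    }
    UnlockBufHdr(bufHdr);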

◆ MarkBufferDirty()

void MarkBufferDirty ( Buffer  buffer)

Definition at line 2943 of file bufmgr.c.

2944{
2945 BufferDesc *bufHdr;
2946 uint32 buf_state;
2947 uint32 old_buf_state;
2948
2949 if (!BufferIsValid(buffer))
2950 elog(ERROR, "bad buffer ID: %d", buffer);
2951
2952 if (BufferIsLocal(buffer))
2953 {
2954 MarkLocalBufferDirty(buffer);
2955 return;
2956 }
2957
2958 bufHdr = GetBufferDescriptor(buffer - 1);
2959
2960 Assert(BufferIsPinned(buffer));
2961 Assert(BufferIsLockedByMeInMode(buffer, BUFFER_LOCK_EXCLUSIVE));
2962
2963 /*
2964 * NB: We have to wait for the buffer header spinlock to be not held, as
2965 * TerminateBufferIO() relies on the spinlock.
2966 */
2967 old_buf_state = pg_atomic_read_u32(&bufHdr->state);
2968 for (;;)
2969 {
2970 if (old_buf_state & BM_LOCKED)
2971 old_buf_state = WaitBufHdrUnlocked(bufHdr);
2972
2973 buf_state = old_buf_state;
2974
2975 Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
2976 buf_state |= BM_DIRTY | BM_JUST_DIRTIED;
2977
2978 if (pg_atomic_compare_exchange_u32(&bufHdr->state, &old_buf_state,
2979 buf_state))
2980 break;
2981 }
2982
2983 /*
2984 * If the buffer was not dirty already, do vacuum accounting.
2985 */
2986 if (!(old_buf_state & BM_DIRTY))
2987 {
2988 pgBufferUsage.shared_blks_dirtied++;
2989 if (VacuumCostActive)
2990 VacuumCostBalance += VacuumCostPageDirty;
2991 }
2992}
static bool pg_atomic_compare_exchange_u32(volatile pg_atomic_uint32 *ptr, uint32 *expected, uint32 newval)
Definition: atomics.h:347
pg_noinline uint32 WaitBufHdrUnlocked(BufferDesc *buf)
Definition: bufmgr.c:6294
int VacuumCostPageDirty
Definition: globals.c:153
void MarkLocalBufferDirty(Buffer buffer)
Definition: localbuf.c:491
int64 shared_blks_dirtied
Definition: instrument.h:28

References Assert(), BM_DIRTY, BM_JUST_DIRTIED, BM_LOCKED, BUF_STATE_GET_REFCOUNT, PrivateRefCountEntry::buffer, BUFFER_LOCK_EXCLUSIVE, BufferIsLocal, BufferIsLockedByMeInMode(), BufferIsPinned, BufferIsValid(), elog, ERROR, GetBufferDescriptor(), MarkLocalBufferDirty(), pg_atomic_compare_exchange_u32(), pg_atomic_read_u32(), pgBufferUsage, BufferUsage::shared_blks_dirtied, BufferDesc::state, VacuumCostActive, VacuumCostBalance, VacuumCostPageDirty, and WaitBufHdrUnlocked().

Referenced by _bt_clear_incomplete_split(), _bt_dedup_pass(), _bt_delitems_delete(), _bt_delitems_vacuum(), _bt_getroot(), _bt_insertonpg(), _bt_mark_page_halfdead(), _bt_newlevel(), _bt_restore_meta(), _bt_set_cleanup_info(), _bt_split(), _bt_unlink_halfdead_page(), _hash_addovflpage(), _hash_doinsert(), _hash_expandtable(), _hash_freeovflpage(), _hash_init(), _hash_splitbucket(), _hash_squeezebucket(), _hash_vacuum_one_page(), addLeafTuple(), brin_doinsert(), brin_doupdate(), brin_initialize_empty_new_buffer(), brin_xlog_createidx(), brin_xlog_desummarize_page(), brin_xlog_insert_update(), brin_xlog_revmap_extend(), brin_xlog_samepage_update(), brin_xlog_update(), brinbuild(), brinbuildempty(), brinRevmapDesummarizeRange(), btree_xlog_dedup(), btree_xlog_delete(), btree_xlog_insert(), btree_xlog_mark_page_halfdead(), btree_xlog_newroot(), btree_xlog_split(), btree_xlog_unlink_page(), btree_xlog_vacuum(), createPostingTree(), dataExecPlaceToPageInternal(), dataExecPlaceToPageLeaf(), doPickSplit(), entryExecPlaceToPage(), fill_seq_fork_with_data(), FreeSpaceMapPrepareTruncateRel(), generic_redo(), GenericXLogFinish(), ginbuild(), ginbuildempty(), ginbulkdelete(), ginDeletePage(), ginHeapTupleFastInsert(), ginPlaceToPage(), ginRedoClearIncompleteSplit(), ginRedoCreatePTree(), ginRedoDeleteListPages(), ginRedoDeletePage(), ginRedoInsert(), ginRedoInsertListPage(), ginRedoUpdateMetapage(), ginRedoVacuumDataLeafPage(), ginUpdateStats(), ginVacuumPostingTreeLeaf(), gistbuild(), gistbuildempty(), gistdeletepage(), gistplacetopage(), gistprunepage(), gistRedoClearFollowRight(), gistRedoDeleteRecord(), gistRedoPageDelete(), gistRedoPageSplitRecord(), gistRedoPageUpdateRecord(), gistvacuumpage(), hash_xlog_add_ovfl_page(), hash_xlog_delete(), hash_xlog_init_bitmap_page(), hash_xlog_init_meta_page(), hash_xlog_insert(), hash_xlog_move_page_contents(), hash_xlog_split_allocate_page(), hash_xlog_split_cleanup(), hash_xlog_split_complete(), hash_xlog_squeeze_page(), hash_xlog_update_meta_page(), hash_xlog_vacuum_one_page(), hashbucketcleanup(), hashbulkdelete(), heap_abort_speculative(), heap_delete(), heap_finish_speculative(), heap_force_common(), heap_inplace_update_and_unlock(), heap_insert(), heap_lock_tuple(), heap_lock_updated_tuple_rec(), heap_multi_insert(), heap_page_prune_and_freeze(), heap_update(), heap_xlog_confirm(), heap_xlog_delete(), heap_xlog_inplace(), heap_xlog_insert(), heap_xlog_lock(), heap_xlog_lock_updated(), heap_xlog_multi_insert(), heap_xlog_prune_freeze(), heap_xlog_update(), heap_xlog_visible(), lazy_scan_new_or_empty(), lazy_scan_prune(), lazy_vacuum_heap_page(), log_newpage_range(), MarkDirtyUnpinnedBufferInternal(), moveLeafs(), nextval_internal(), RelationAddBlocks(), RelationCopyStorageUsingBuffer(), RelationGetBufferForTuple(), revmap_physical_extend(), saveNodeLink(), seq_redo(), SetSequence(), shiftList(), spgAddNodeAction(), spgbuild(), SpGistUpdateMetaPage(), spgRedoAddLeaf(), spgRedoAddNode(), spgRedoMoveLeafs(), spgRedoPickSplit(), spgRedoSplitTuple(), spgRedoVacuumLeaf(), spgRedoVacuumRedirect(), spgRedoVacuumRoot(), spgSplitNodeAction(), vacuumLeafPage(), vacuumLeafRoot(), vacuumRedirectAndPlaceholder(), visibilitymap_clear(), visibilitymap_prepare_truncate(), visibilitymap_set(), visibilitymap_set_vmbits(), writeListPage(), and XLogReadBufferForRedoExtended().
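
The canonical calling pattern, distilled from the callers above (a sketch, not a template from bufmgr.c; WAL record construction is elided): the buffer is exclusively locked, and the dirty-marking and WAL insertion happen together inside one critical section.

    LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);

    START_CRIT_SECTION();
    /* ... modify BufferGetPage(buf) ... */
    MarkBufferDirty(buf);
    /* ... XLogInsert() the change, then PageSetLSN() the page ... */
    END_CRIT_SECTION();

    LockBuffer(buf, BUFFER_LOCK_UNLOCK);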

◆ MarkBufferDirtyHint()

void MarkBufferDirtyHint ( Buffer  buffer,
bool  buffer_std 
)

Definition at line 5430 of file bufmgr.c.

5431{
5432 BufferDesc *bufHdr;
5433 Page page = BufferGetPage(buffer);
5434
5435 if (!BufferIsValid(buffer))
5436 elog(ERROR, "bad buffer ID: %d", buffer);
5437
5438 if (BufferIsLocal(buffer))
5439 {
5440 MarkLocalBufferDirty(buffer);
5441 return;
5442 }
5443
5444 bufHdr = GetBufferDescriptor(buffer - 1);
5445
5446 Assert(GetPrivateRefCount(buffer) > 0);
5447 /* here, either share or exclusive lock is OK */
5448 Assert(BufferIsLockedByMe(buffer));
5449
5450 /*
5451 * This routine might get called many times on the same page, if we are
5452 * making the first scan after commit of an xact that added/deleted many
5453 * tuples. So, be as quick as we can if the buffer is already dirty. We
5454 * do this by not acquiring spinlock if it looks like the status bits are
5455 * already set. Since we make this test unlocked, there's a chance we
5456 * might fail to notice that the flags have just been cleared, and failed
5457 * to reset them, due to memory-ordering issues. But since this function
5458 * is only intended to be used in cases where failing to write out the
5459 * data would be harmless anyway, it doesn't really matter.
5460 */
5461 if ((pg_atomic_read_u32(&bufHdr->state) & (BM_DIRTY | BM_JUST_DIRTIED)) !=
5462 (BM_DIRTY | BM_JUST_DIRTIED))
5463 {
5464 XLogRecPtr lsn = InvalidXLogRecPtr;
5465 bool dirtied = false;
5466 bool delayChkptFlags = false;
5467 uint32 buf_state;
5468
5469 /*
5470 * If we need to protect hint bit updates from torn writes, WAL-log a
5471 * full page image of the page. This full page image is only necessary
5472 * if the hint bit update is the first change to the page since the
5473 * last checkpoint.
5474 *
5475 * We don't check full_page_writes here because that logic is included
5476 * when we call XLogInsert() since the value changes dynamically.
5477 */
5478 if (XLogHintBitIsNeeded() &&
5479 (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT))
5480 {
5481 /*
5482 * If we must not write WAL, due to a relfilelocator-specific
5483 * condition or being in recovery, don't dirty the page. We can
5484 * set the hint, just not dirty the page as a result so the hint
5485 * is lost when we evict the page or shutdown.
5486 *
5487 * See src/backend/storage/page/README for longer discussion.
5488 */
5489 if (RecoveryInProgress() ||
5490 RelFileLocatorSkippingWAL(BufTagGetRelFileLocator(&bufHdr->tag)))
5491 return;
5492
5493 /*
5494 * If the block is already dirty because we either made a change
5495 * or set a hint already, then we don't need to write a full page
5496 * image. Note that aggressive cleaning of blocks dirtied by hint
5497 * bit setting would increase the call rate. Bulk setting of hint
5498 * bits would reduce the call rate...
5499 *
5500 * We must issue the WAL record before we mark the buffer dirty.
5501 * Otherwise we might write the page before we write the WAL. That
5502 * causes a race condition, since a checkpoint might occur between
5503 * writing the WAL record and marking the buffer dirty. We solve
5504 * that with a kluge, but one that is already in use during
5505 * transaction commit to prevent race conditions. Basically, we
5506 * simply prevent the checkpoint WAL record from being written
5507 * until we have marked the buffer dirty. We don't start the
5508 * checkpoint flush until we have marked dirty, so our checkpoint
5509 * must flush the change to disk successfully or the checkpoint
5510 * never gets written, so crash recovery will fix.
5511 *
5512 * It's possible we may enter here without an xid, so it is
5513 * essential that CreateCheckPoint waits for virtual transactions
5514 * rather than full transactionids.
5515 */
5516 Assert((MyProc->delayChkptFlags & DELAY_CHKPT_START) == 0);
5517 MyProc->delayChkptFlags |= DELAY_CHKPT_START;
5518 delayChkptFlags = true;
5519 lsn = XLogSaveBufferForHint(buffer, buffer_std);
5520 }
5521
5522 buf_state = LockBufHdr(bufHdr);
5523
5524 Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
5525
5526 if (!(buf_state & BM_DIRTY))
5527 {
5528 dirtied = true; /* Means "will be dirtied by this action" */
5529
5530 /*
5531 * Set the page LSN if we wrote a backup block. We aren't supposed
5532 * to set this when only holding a share lock but as long as we
5533 * serialise it somehow we're OK. We choose to set LSN while
5534 * holding the buffer header lock, which causes any reader of an
5535 * LSN who holds only a share lock to also obtain a buffer header
5536 * lock before using PageGetLSN(), which is enforced in
5537 * BufferGetLSNAtomic().
5538 *
5539 * If checksums are enabled, you might think we should reset the
5540 * checksum here. That will happen when the page is written
5541 * sometime later in this checkpoint cycle.
5542 */
5543 if (XLogRecPtrIsValid(lsn))
5544 PageSetLSN(page, lsn);
5545 }
5546
5547 UnlockBufHdrExt(bufHdr, buf_state,
5548 BM_DIRTY | BM_JUST_DIRTIED,
5549 0, 0);
5550
5551 if (delayChkptFlags)
5552 MyProc->delayChkptFlags &= ~DELAY_CHKPT_START;
5553
5554 if (dirtied)
5555 {
5556 pgBufferUsage.shared_blks_dirtied++;
5557 if (VacuumCostActive)
5558 VacuumCostBalance += VacuumCostPageDirty;
5559 }
5560 }
5561}
static void PageSetLSN(Page page, XLogRecPtr lsn)
Definition: bufpage.h:390
#define DELAY_CHKPT_START
Definition: proc.h:135
bool RelFileLocatorSkippingWAL(RelFileLocator rlocator)
Definition: storage.c:573
int delayChkptFlags
Definition: proc.h:257
bool RecoveryInProgress(void)
Definition: xlog.c:6406
#define XLogRecPtrIsValid(r)
Definition: xlogdefs.h:29
#define InvalidXLogRecPtr
Definition: xlogdefs.h:28
XLogRecPtr XLogSaveBufferForHint(Buffer buffer, bool buffer_std)
Definition: xloginsert.c:1087

References Assert(), BM_DIRTY, BM_JUST_DIRTIED, BM_PERMANENT, BUF_STATE_GET_REFCOUNT, PrivateRefCountEntry::buffer, BufferGetPage(), BufferIsLocal, BufferIsLockedByMe(), BufferIsValid(), BufTagGetRelFileLocator(), DELAY_CHKPT_START, PGPROC::delayChkptFlags, elog, ERROR, GetBufferDescriptor(), GetPrivateRefCount(), InvalidXLogRecPtr, LockBufHdr(), MarkLocalBufferDirty(), MyProc, PageSetLSN(), pg_atomic_read_u32(), pgBufferUsage, RecoveryInProgress(), RelFileLocatorSkippingWAL(), BufferUsage::shared_blks_dirtied, BufferDesc::state, BufferDesc::tag, UnlockBufHdrExt(), VacuumCostActive, VacuumCostBalance, VacuumCostPageDirty, XLogHintBitIsNeeded, XLogRecPtrIsValid, and XLogSaveBufferForHint().

Referenced by _bt_check_unique(), _bt_killitems(), _hash_kill_items(), brin_start_evacuating_page(), btvacuumpage(), fsm_search(), fsm_search_avail(), fsm_set_and_search(), fsm_vacuum_page(), gistkillitems(), heap_page_prune_and_freeze(), read_seq_tuple(), SetHintBits(), and XLogRecordPageWithFreeSpace().
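
Sketch of a hint-bit update in the style of SetHintBits() (the tuple and flag are illustrative): only a share lock is required, which is why MarkBufferDirtyHint() is used rather than MarkBufferDirty().

    /* caller holds at least BUFFER_LOCK_SHARE on "buffer";
     * "tuple" is assumed to point into that buffer's page */
    tuple->t_infomask |= HEAP_XMIN_COMMITTED;   /* an illustrative hint bit */
    MarkBufferDirtyHint(buffer, true);          /* true = standard page layout */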

◆ MarkDirtyAllUnpinnedBuffers()

void MarkDirtyAllUnpinnedBuffers ( int32 buffers_dirtied,
int32 buffers_already_dirty,
int32 buffers_skipped 
)

Definition at line 6933 of file bufmgr.c.

6936{
6937 *buffers_dirtied = 0;
6938 *buffers_already_dirty = 0;
6939 *buffers_skipped = 0;
6940
6941 for (int buf = 1; buf <= NBuffers; buf++)
6942 {
6943 BufferDesc *desc = GetBufferDescriptor(buf - 1);
6944 uint32 buf_state;
6945 bool buffer_already_dirty;
6946
6947 CHECK_FOR_INTERRUPTS();
6948
6949 buf_state = pg_atomic_read_u32(&desc->state);
6950 if (!(buf_state & BM_VALID))
6951 continue;
6952
6953 ResourceOwnerEnlarge(CurrentResourceOwner);
6954 ReservePrivateRefCountEntry();
6955
6956 LockBufHdr(desc);
6957
6958 if (MarkDirtyUnpinnedBufferInternal(buf, desc, &buffer_already_dirty))
6959 (*buffers_dirtied)++;
6960 else if (buffer_already_dirty)
6961 (*buffers_already_dirty)++;
6962 else
6963 (*buffers_skipped)++;
6964 }
6965}
static bool MarkDirtyUnpinnedBufferInternal(Buffer buf, BufferDesc *desc, bool *buffer_already_dirty)
Definition: bufmgr.c:6784

References BM_VALID, buf, CHECK_FOR_INTERRUPTS, CurrentResourceOwner, GetBufferDescriptor(), LockBufHdr(), MarkDirtyUnpinnedBufferInternal(), NBuffers, pg_atomic_read_u32(), ReservePrivateRefCountEntry(), ResourceOwnerEnlarge(), and BufferDesc::state.

Referenced by pg_buffercache_mark_dirty_all().

◆ MarkDirtyRelUnpinnedBuffers()

void MarkDirtyRelUnpinnedBuffers ( Relation  rel,
int32 buffers_dirtied,
int32 buffers_already_dirty,
int32 buffers_skipped 
)

Definition at line 6876 of file bufmgr.c.

6880{
6881 Assert(!RelationUsesLocalBuffers(rel));
6882
6883 *buffers_dirtied = 0;
6884 *buffers_already_dirty = 0;
6885 *buffers_skipped = 0;
6886
6887 for (int buf = 1; buf <= NBuffers; buf++)
6888 {
6889 BufferDesc *desc = GetBufferDescriptor(buf - 1);
6890 uint32 buf_state = pg_atomic_read_u32(&(desc->state));
6891 bool buffer_already_dirty;
6892
6893 CHECK_FOR_INTERRUPTS();
6894
6895 /* An unlocked precheck should be safe and saves some cycles. */
6896 if ((buf_state & BM_VALID) == 0 ||
6897 !BufTagMatchesRelFileLocator(&desc->tag, &rel->rd_locator))
6898 continue;
6899
6900 /* Make sure we can pin the buffer. */
6901 ResourceOwnerEnlarge(CurrentResourceOwner);
6902 ReservePrivateRefCountEntry();
6903
6904 buf_state = LockBufHdr(desc);
6905
6906 /* recheck, could have changed without the lock */
6907 if ((buf_state & BM_VALID) == 0 ||
6908 !BufTagMatchesRelFileLocator(&desc->tag, &rel->rd_locator))
6909 {
6910 UnlockBufHdr(desc);
6911 continue;
6912 }
6913
6914 if (MarkDirtyUnpinnedBufferInternal(buf, desc, &buffer_already_dirty))
6915 (*buffers_dirtied)++;
6916 else if (buffer_already_dirty)
6917 (*buffers_already_dirty)++;
6918 else
6919 (*buffers_skipped)++;
6920 }
6921}

References Assert(), BM_VALID, buf, BufTagMatchesRelFileLocator(), CHECK_FOR_INTERRUPTS, CurrentResourceOwner, GetBufferDescriptor(), LockBufHdr(), MarkDirtyUnpinnedBufferInternal(), NBuffers, pg_atomic_read_u32(), RelationData::rd_locator, RelationUsesLocalBuffers, ReservePrivateRefCountEntry(), ResourceOwnerEnlarge(), BufferDesc::state, BufferDesc::tag, and UnlockBufHdr().

Referenced by pg_buffercache_mark_dirty_relation().

◆ MarkDirtyUnpinnedBuffer()

bool MarkDirtyUnpinnedBuffer ( Buffer  buf,
bool *  buffer_already_dirty 
)

Definition at line 6840 of file bufmgr.c.

6841{
6842 BufferDesc *desc;
6843 bool buffer_dirtied = false;
6844
6845 Assert(!BufferIsLocal(buf));
6846
6847 /* Make sure we can pin the buffer. */
6848 ResourceOwnerEnlarge(CurrentResourceOwner);
6849 ReservePrivateRefCountEntry();
6850
6851 desc = GetBufferDescriptor(buf - 1);
6852 LockBufHdr(desc);
6853
6854 buffer_dirtied = MarkDirtyUnpinnedBufferInternal(buf, desc, buffer_already_dirty);
6855 /* Both can not be true at the same time */
6856 Assert(!(buffer_dirtied && *buffer_already_dirty));
6857
6858 return buffer_dirtied;
6859}

References Assert(), buf, BufferIsLocal, CurrentResourceOwner, GetBufferDescriptor(), LockBufHdr(), MarkDirtyUnpinnedBufferInternal(), ReservePrivateRefCountEntry(), and ResourceOwnerEnlarge().

Referenced by pg_buffercache_mark_dirty().

◆ MarkDirtyUnpinnedBufferInternal()

static bool MarkDirtyUnpinnedBufferInternal ( Buffer  buf,
BufferDesc desc,
bool *  buffer_already_dirty 
)
static

Definition at line 6784 of file bufmgr.c.

6786{
6787 uint32 buf_state;
6788 bool result = false;
6789
6790 *buffer_already_dirty = false;
6791
6792 buf_state = pg_atomic_read_u32(&(desc->state));
6793 Assert(buf_state & BM_LOCKED);
6794
6795 if ((buf_state & BM_VALID) == 0)
6796 {
6797 UnlockBufHdr(desc);
6798 return false;
6799 }
6800
6801 /* Check that it's not pinned already. */
6802 if (BUF_STATE_GET_REFCOUNT(buf_state) > 0)
6803 {
6804 UnlockBufHdr(desc);
6805 return false;
6806 }
6807
6808 /* Pin the buffer and then release the buffer spinlock */
6809 PinBuffer_Locked(desc);
6810
6811 /* If it was not already dirty, mark it as dirty. */
6812 if (!(buf_state & BM_DIRTY))
6813 {
6814 LWLockAcquire(BufferDescriptorGetContentLock(desc), LW_EXCLUSIVE);
6815 MarkBufferDirty(buf);
6816 result = true;
6817 LWLockRelease(BufferDescriptorGetContentLock(desc));
6818 }
6819 else
6820 *buffer_already_dirty = true;
6821
6822 UnpinBuffer(desc);
6823
6824 return result;
6825}
void MarkBufferDirty(Buffer buffer)
Definition: bufmgr.c:2943

References Assert(), BM_DIRTY, BM_LOCKED, BM_VALID, buf, BUF_STATE_GET_REFCOUNT, BufferDescriptorGetContentLock(), LW_EXCLUSIVE, LWLockAcquire(), LWLockRelease(), MarkBufferDirty(), pg_atomic_read_u32(), PinBuffer_Locked(), BufferDesc::state, UnlockBufHdr(), and UnpinBuffer().

Referenced by MarkDirtyAllUnpinnedBuffers(), MarkDirtyRelUnpinnedBuffers(), and MarkDirtyUnpinnedBuffer().

◆ NewPrivateRefCountEntry()

static PrivateRefCountEntry * NewPrivateRefCountEntry ( Buffer  buffer)
static

Definition at line 325 of file bufmgr.c.

326{
327 PrivateRefCountEntry *res;
328
329 /* only allowed to be called when a reservation has been made */
330 Assert(ReservedRefCountEntry != NULL);
331
332 /* use up the reserved entry */
333 res = ReservedRefCountEntry;
334 ReservedRefCountEntry = NULL;
335
336 /* and fill it */
337 res->buffer = buffer;
338 res->refcount = 0;
339
340 return res;
341}

References Assert(), PrivateRefCountEntry::buffer, PrivateRefCountEntry::refcount, and ReservedRefCountEntry.

Referenced by TrackNewBufferPin().

◆ PinBuffer()

static bool PinBuffer ( BufferDesc buf,
BufferAccessStrategy  strategy,
bool  skip_if_not_valid 
)
static

Definition at line 3068 of file bufmgr.c.

3070{
3071 Buffer b = BufferDescriptorGetBuffer(buf);
3072 bool result;
3073 PrivateRefCountEntry *ref;
3074
3075 Assert(!BufferIsLocal(b));
3076 Assert(ReservedRefCountEntry != NULL);
3077
3078 ref = GetPrivateRefCountEntry(b, true);
3079
3080 if (ref == NULL)
3081 {
3082 uint32 buf_state;
3083 uint32 old_buf_state;
3084
3085 old_buf_state = pg_atomic_read_u32(&buf->state);
3086 for (;;)
3087 {
3088 if (unlikely(skip_if_not_valid && !(old_buf_state & BM_VALID)))
3089 return false;
3090
3091 /*
3092 * We're not allowed to increase the refcount while the buffer
3093 * header spinlock is held. Wait for the lock to be released.
3094 */
3095 if (old_buf_state & BM_LOCKED)
3096 old_buf_state = WaitBufHdrUnlocked(buf);
3097
3098 buf_state = old_buf_state;
3099
3100 /* increase refcount */
3101 buf_state += BUF_REFCOUNT_ONE;
3102
3103 if (strategy == NULL)
3104 {
3105 /* Default case: increase usagecount unless already max. */
3106 if (BUF_STATE_GET_USAGECOUNT(buf_state) != BM_MAX_USAGE_COUNT)
3107 buf_state += BUF_USAGECOUNT_ONE;
3108 }
3109 else
3110 {
3111 /*
3112 * Ring buffers shouldn't evict others from pool. Thus we
3113 * don't make usagecount more than 1.
3114 */
3115 if (BUF_STATE_GET_USAGECOUNT(buf_state) == 0)
3116 buf_state += BUF_USAGECOUNT_ONE;
3117 }
3118
3119 if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state,
3120 buf_state))
3121 {
3122 result = (buf_state & BM_VALID) != 0;
3123
3124 TrackNewBufferPin(b);
3125 break;
3126 }
3127 }
3128 }
3129 else
3130 {
3131 /*
3132 * If we previously pinned the buffer, it is likely to be valid, but
3133 * it may not be if StartReadBuffers() was called and
3134 * WaitReadBuffers() hasn't been called yet. We'll check by loading
3135 * the flags without locking. This is racy, but it's OK to return
3136 * false spuriously: when WaitReadBuffers() calls StartBufferIO(),
3137 * it'll see that it's now valid.
3138 *
3139 * Note: We deliberately avoid a Valgrind client request here.
3140 * Individual access methods can optionally superimpose buffer page
3141 * client requests on top of our client requests to enforce that
3142 * buffers are only accessed while locked (and pinned). It's possible
3143 * that the buffer page is legitimately non-accessible here. We
3144 * cannot meddle with that.
3145 */
3146 result = (pg_atomic_read_u32(&buf->state) & BM_VALID) != 0;
3147
3148 Assert(ref->refcount > 0);
3149 ref->refcount++;
3150 ResourceOwnerRememberBuffer(CurrentResourceOwner, b);
3151 }
3152
3153 return result;
3154}
#define BM_MAX_USAGE_COUNT
Definition: buf_internals.h:86
#define BUF_STATE_GET_USAGECOUNT(state)
Definition: buf_internals.h:60
void TrackNewBufferPin(Buffer buf)
Definition: bufmgr.c:3303

References Assert(), b, BM_LOCKED, BM_MAX_USAGE_COUNT, BM_VALID, buf, BUF_REFCOUNT_ONE, BUF_STATE_GET_USAGECOUNT, BUF_USAGECOUNT_ONE, BufferDescriptorGetBuffer(), BufferIsLocal, CurrentResourceOwner, GetPrivateRefCountEntry(), pg_atomic_compare_exchange_u32(), pg_atomic_read_u32(), PrivateRefCountEntry::refcount, ReservedRefCountEntry, ResourceOwnerRememberBuffer(), TrackNewBufferPin(), unlikely, and WaitBufHdrUnlocked().

Referenced by BufferAlloc(), ExtendBufferedRelShared(), and ReadRecentBuffer().

◆ PinBuffer_Locked()

static void PinBuffer_Locked ( BufferDesc buf)
static

Definition at line 3179 of file bufmgr.c.

3180{
3181 uint32 old_buf_state;
3182
3183 /*
3184 * As explained, we don't expect any preexisting pins. That allows us to
3185 * manipulate the PrivateRefCount after releasing the spinlock.
3186 */
3187 Assert(GetPrivateRefCountEntry(BufferDescriptorGetBuffer(buf), false) == NULL);
3188
3189 /*
3190 * Since we hold the buffer spinlock, we can update the buffer state and
3191 * release the lock in one operation.
3192 */
3193 old_buf_state = pg_atomic_read_u32(&buf->state);
3194
3195 UnlockBufHdrExt(buf, old_buf_state,
3196 0, 0, 1);
3197
3198 TrackNewBufferPin(BufferDescriptorGetBuffer(buf));
3199}

References Assert(), buf, BufferDescriptorGetBuffer(), GetPrivateRefCountEntry(), pg_atomic_read_u32(), TrackNewBufferPin(), and UnlockBufHdrExt().

Referenced by EvictUnpinnedBufferInternal(), FlushDatabaseBuffers(), FlushRelationBuffers(), FlushRelationsAllBuffers(), MarkDirtyUnpinnedBufferInternal(), and SyncOneBuffer().

◆ PinBufferForBlock()

static pg_attribute_always_inline Buffer PinBufferForBlock ( Relation  rel,
SMgrRelation  smgr,
char  persistence,
ForkNumber  forkNum,
BlockNumber  blockNum,
BufferAccessStrategy  strategy,
bool *  foundPtr 
)
static

Definition at line 1091 of file bufmgr.c.

1098{
1099 BufferDesc *bufHdr;
1100 IOContext io_context;
1101 IOObject io_object;
1102
1103 Assert(blockNum != P_NEW);
1104
1105 /* Persistence should be set before */
1106 Assert((persistence == RELPERSISTENCE_TEMP ||
1107 persistence == RELPERSISTENCE_PERMANENT ||
1108 persistence == RELPERSISTENCE_UNLOGGED));
1109
1110 if (persistence == RELPERSISTENCE_TEMP)
1111 {
1112 io_context = IOCONTEXT_NORMAL;
1113 io_object = IOOBJECT_TEMP_RELATION;
1114 }
1115 else
1116 {
1117 io_context = IOContextForStrategy(strategy);
1118 io_object = IOOBJECT_RELATION;
1119 }
1120
1121 TRACE_POSTGRESQL_BUFFER_READ_START(forkNum, blockNum,
1122 smgr->smgr_rlocator.locator.spcOid,
1123 smgr->smgr_rlocator.locator.dbOid,
1124 smgr->smgr_rlocator.locator.relNumber,
1125 smgr->smgr_rlocator.backend);
1126
1127 if (persistence == RELPERSISTENCE_TEMP)
1128 {
1129 bufHdr = LocalBufferAlloc(smgr, forkNum, blockNum, foundPtr);
1130 if (*foundPtr)
1131 pgBufferUsage.local_blks_hit++;
1132 }
1133 else
1134 {
1135 bufHdr = BufferAlloc(smgr, persistence, forkNum, blockNum,
1136 strategy, foundPtr, io_context);
1137 if (*foundPtr)
1138 pgBufferUsage.shared_blks_hit++;
1139 }
1140 if (rel)
1141 {
1142 /*
1143 * While pgBufferUsage's "read" counter isn't bumped unless we reach
1144 * WaitReadBuffers() (so, not for hits, and not for buffers that are
1145 * zeroed instead), the per-relation stats always count them.
1146 */
1147 pgstat_count_buffer_read(rel);
1148 if (*foundPtr)
1149 pgstat_count_buffer_hit(rel);
1150 }
1151 if (*foundPtr)
1152 {
1153 pgstat_count_io_op(io_object, io_context, IOOP_HIT, 1, 0);
1154 if (VacuumCostActive)
1155 VacuumCostBalance += VacuumCostPageHit;
1156
1157 TRACE_POSTGRESQL_BUFFER_READ_DONE(forkNum, blockNum,
1158 smgr->smgr_rlocator.locator.spcOid,
1159 smgr->smgr_rlocator.locator.dbOid,
1160 smgr->smgr_rlocator.locator.relNumber,
1161 smgr->smgr_rlocator.backend,
1162 true);
1163 }
1164
1165 return BufferDescriptorGetBuffer(bufHdr);
1166}
static BufferDesc * BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, BlockNumber blockNum, BufferAccessStrategy strategy, bool *foundPtr, IOContext io_context)
Definition: bufmgr.c:1981
#define P_NEW
Definition: bufmgr.h:198
BufferDesc * LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum, bool *foundPtr)
Definition: localbuf.c:119
#define pgstat_count_buffer_read(rel)
Definition: pgstat.h:715

References Assert(), RelFileLocatorBackend::backend, BufferAlloc(), BufferDescriptorGetBuffer(), RelFileLocator::dbOid, IOCONTEXT_NORMAL, IOContextForStrategy(), IOOBJECT_RELATION, IOOBJECT_TEMP_RELATION, IOOP_HIT, BufferUsage::local_blks_hit, LocalBufferAlloc(), RelFileLocatorBackend::locator, P_NEW, pgBufferUsage, pgstat_count_buffer_hit, pgstat_count_buffer_read, pgstat_count_io_op(), RelFileLocator::relNumber, BufferUsage::shared_blks_hit, SMgrRelationData::smgr_rlocator, RelFileLocator::spcOid, VacuumCostActive, VacuumCostBalance, and VacuumCostPageHit.

Referenced by ReadBuffer_common(), and StartReadBuffersImpl().

◆ PrefetchBuffer()

PrefetchBufferResult PrefetchBuffer ( Relation  reln,
ForkNumber  forkNum,
BlockNumber  blockNum 
)

Definition at line 653 of file bufmgr.c.

654{
655 Assert(RelationIsValid(reln));
656 Assert(BlockNumberIsValid(blockNum));
657
658 if (RelationUsesLocalBuffers(reln))
659 {
660 /* see comments in ReadBufferExtended */
661 if (RELATION_IS_OTHER_TEMP(reln))
662 ereport(ERROR,
663 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
664 errmsg("cannot access temporary tables of other sessions")));
665
666 /* pass it off to localbuf.c */
667 return PrefetchLocalBuffer(RelationGetSmgr(reln), forkNum, blockNum);
668 }
669 else
670 {
671 /* pass it to the shared buffer version */
672 return PrefetchSharedBuffer(RelationGetSmgr(reln), forkNum, blockNum);
673 }
674}
PrefetchBufferResult PrefetchSharedBuffer(SMgrRelation smgr_reln, ForkNumber forkNum, BlockNumber blockNum)
Definition: bufmgr.c:563
PrefetchBufferResult PrefetchLocalBuffer(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum)
Definition: localbuf.c:72
#define RELATION_IS_OTHER_TEMP(relation)
Definition: rel.h:668
#define RelationIsValid(relation)
Definition: rel.h:490

References Assert(), BlockNumberIsValid(), ereport, errcode(), errmsg(), ERROR, PrefetchLocalBuffer(), PrefetchSharedBuffer(), RELATION_IS_OTHER_TEMP, RelationGetSmgr(), RelationIsValid, and RelationUsesLocalBuffers.

Referenced by count_nondeletable_pages(), invalidate_rel_block(), and pg_prewarm().
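
Usage sketch in the style of pg_prewarm() (illustrative): issue the hint early, do other work, then perform the real read; recent_buffer is advisory only and must be revalidated if used.

    PrefetchBufferResult pr = PrefetchBuffer(rel, MAIN_FORKNUM, blkno);

    /* ... other work while the kernel may be reading ahead ... */

    Buffer      buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno,
                                         RBM_NORMAL, NULL);
    (void) pr;      /* pr.recent_buffer is a hint, not a pin */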

◆ PrefetchSharedBuffer()

PrefetchBufferResult PrefetchSharedBuffer ( SMgrRelation  smgr_reln,
ForkNumber  forkNum,
BlockNumber  blockNum 
)

Definition at line 563 of file bufmgr.c.

566{
567 PrefetchBufferResult result = {InvalidBuffer, false};
568 BufferTag newTag; /* identity of requested block */
569 uint32 newHash; /* hash value for newTag */
570 LWLock *newPartitionLock; /* buffer partition lock for it */
571 int buf_id;
572
573 Assert(BlockNumberIsValid(blockNum));
574
575 /* create a tag so we can lookup the buffer */
576 InitBufferTag(&newTag, &smgr_reln->smgr_rlocator.locator,
577 forkNum, blockNum);
578
579 /* determine its hash code and partition lock ID */
580 newHash = BufTableHashCode(&newTag);
581 newPartitionLock = BufMappingPartitionLock(newHash);
582
583 /* see if the block is in the buffer pool already */
584 LWLockAcquire(newPartitionLock, LW_SHARED);
585 buf_id = BufTableLookup(&newTag, newHash);
586 LWLockRelease(newPartitionLock);
587
588 /* If not in buffers, initiate prefetch */
589 if (buf_id < 0)
590 {
591#ifdef USE_PREFETCH
592 /*
593 * Try to initiate an asynchronous read. This returns false in
594 * recovery if the relation file doesn't exist.
595 */
596 if ((io_direct_flags & IO_DIRECT_DATA) == 0 &&
597 smgrprefetch(smgr_reln, forkNum, blockNum, 1))
598 {
599 result.initiated_io = true;
600 }
601#endif /* USE_PREFETCH */
602 }
603 else
604 {
605 /*
606 * Report the buffer it was in at that time. The caller may be able
607 * to avoid a buffer table lookup, but it's not pinned and it must be
608 * rechecked!
609 */
610 result.recent_buffer = buf_id + 1;
611 }
612
613 /*
614 * If the block *is* in buffers, we do nothing. This is not really ideal:
615 * the block might be just about to be evicted, which would be stupid
616 * since we know we are going to need it soon. But the only easy answer
617 * is to bump the usage_count, which does not seem like a great solution:
618 * when the caller does ultimately touch the block, usage_count would get
619 * bumped again, resulting in too much favoritism for blocks that are
620 * involved in a prefetch sequence. A real fix would involve some
621 * additional per-buffer state, and it's not clear that there's enough of
622 * a problem to justify that.
623 */
624
625 return result;
626}
int io_direct_flags
Definition: fd.c:168
#define IO_DIRECT_DATA
Definition: fd.h:54
bool smgrprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, int nblocks)
Definition: smgr.c:678
Buffer recent_buffer
Definition: bufmgr.h:61

References Assert(), BlockNumberIsValid(), BufMappingPartitionLock(), BufTableHashCode(), BufTableLookup(), InitBufferTag(), PrefetchBufferResult::initiated_io, InvalidBuffer, IO_DIRECT_DATA, io_direct_flags, RelFileLocatorBackend::locator, LW_SHARED, LWLockAcquire(), LWLockRelease(), PrefetchBufferResult::recent_buffer, SMgrRelationData::smgr_rlocator, and smgrprefetch().

Referenced by PrefetchBuffer(), and XLogPrefetcherNextBlock().
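
A minimal caller-side sketch (not part of bufmgr.c; smgr_reln, forkNum and blockNum are assumed to be supplied by the caller) of how the two result fields are typically consumed:

 PrefetchBufferResult pf = PrefetchSharedBuffer(smgr_reln, forkNum, blockNum);

 if (pf.initiated_io)
 {
     /* an asynchronous read was started; the block should arrive soon */
 }
 else if (BufferIsValid(pf.recent_buffer))
 {
     /*
      * The block was already cached. pf.recent_buffer is an unpinned hint
      * that must be revalidated, e.g. by passing it to ReadRecentBuffer().
      */
 }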

◆ ProcessReadBuffersResult()

static void ProcessReadBuffersResult ( ReadBuffersOperation operation)
static

Definition at line 1574 of file bufmgr.c.

1575{
1576 PgAioReturn *aio_ret = &operation->io_return;
1577 PgAioResultStatus rs = aio_ret->result.status;
1578 int newly_read_blocks = 0;
1579
1580 Assert(pgaio_wref_valid(&operation->io_wref));
1581 Assert(aio_ret->result.status != PGAIO_RS_UNKNOWN);
1582
1583 /*
1584 * SMGR reports the number of blocks successfully read as the result of
1585 * the IO operation. Thus we can simply add that to ->nblocks_done.
1586 */
1587
1588 if (likely(rs != PGAIO_RS_ERROR))
1589 newly_read_blocks = aio_ret->result.result;
1590
1591 if (rs == PGAIO_RS_ERROR || rs == PGAIO_RS_WARNING)
1592 pgaio_result_report(aio_ret->result, &aio_ret->target_data,
1593 rs == PGAIO_RS_ERROR ? ERROR : WARNING);
1594 else if (aio_ret->result.status == PGAIO_RS_PARTIAL)
1595 {
1596 /*
1597 * We'll retry, so we just emit a debug message to the server log (or
1598 * not even that in prod scenarios).
1599 */
1600 pgaio_result_report(aio_ret->result, &aio_ret->target_data, DEBUG1);
1601 elog(DEBUG3, "partial read, will retry");
1602 }
1603
1604 Assert(newly_read_blocks > 0);
1605 Assert(newly_read_blocks <= MAX_IO_COMBINE_LIMIT);
1606
1607 operation->nblocks_done += newly_read_blocks;
1608
1609 Assert(operation->nblocks_done <= operation->nblocks);
1610}
bool pgaio_wref_valid(PgAioWaitRef *iow)
Definition: aio.c:971
PgAioResultStatus
Definition: aio_types.h:79
@ PGAIO_RS_UNKNOWN
Definition: aio_types.h:80
@ PGAIO_RS_PARTIAL
Definition: aio_types.h:82
#define likely(x)
Definition: c.h:406
#define DEBUG3
Definition: elog.h:28
PgAioResult result
Definition: aio_types.h:132
PgAioTargetData target_data
Definition: aio_types.h:133

References Assert(), DEBUG1, DEBUG3, elog, ERROR, ReadBuffersOperation::io_return, ReadBuffersOperation::io_wref, likely, MAX_IO_COMBINE_LIMIT, ReadBuffersOperation::nblocks, ReadBuffersOperation::nblocks_done, pgaio_result_report(), PGAIO_RS_ERROR, PGAIO_RS_PARTIAL, PGAIO_RS_UNKNOWN, PGAIO_RS_WARNING, pgaio_wref_valid(), PgAioResult::result, PgAioReturn::result, PgAioResult::status, PgAioReturn::target_data, and WARNING.

Referenced by WaitReadBuffers().

◆ ReadBuffer()

Buffer ReadBuffer ( Relation  reln,
BlockNumber  blockNum 
)

Definition at line 745 of file bufmgr.c.

746{
747 return ReadBufferExtended(reln, MAIN_FORKNUM, blockNum, RBM_NORMAL, NULL);
748}
Buffer ReadBufferExtended(Relation reln, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy)
Definition: bufmgr.c:792
@ RBM_NORMAL
Definition: bufmgr.h:46

References MAIN_FORKNUM, RBM_NORMAL, and ReadBufferExtended().

Referenced by _bt_allocbuf(), _bt_getbuf(), _bt_search_insert(), _hash_getbuf(), _hash_getbuf_with_condlock_cleanup(), blbulkdelete(), blinsert(), BloomNewBuffer(), brin_getinsertbuffer(), brinGetStats(), brinGetTupleForHeapBlock(), brinRevmapDesummarizeRange(), brinRevmapInitialize(), bt_metap(), bt_multi_page_stats(), bt_page_items_internal(), bt_page_stats_internal(), ginFindLeafPage(), ginFindParents(), ginGetStats(), ginHeapTupleFastInsert(), ginInsertCleanup(), GinNewBuffer(), ginStepRight(), ginUpdateStats(), gistBufferingFindCorrectParent(), gistbufferinginserttuples(), gistdoinsert(), gistFindCorrectParent(), gistFindPath(), gistfixsplit(), gistGetMaxLevel(), gistkillitems(), gistNewBuffer(), gistProcessItup(), gistScanPage(), heap_abort_speculative(), heap_delete(), heap_fetch(), heap_finish_speculative(), heap_force_common(), heap_get_latest_tid(), heap_index_delete_tuples(), heap_lock_tuple(), heap_update(), initBloomState(), pg_visibility(), pgstatginindex_internal(), read_seq_tuple(), RelationGetBufferForTuple(), ReleaseAndReadBuffer(), revmap_get_buffer(), revmap_physical_extend(), scanGetCandidate(), scanPendingInsert(), shiftList(), spgdoinsert(), spgGetCache(), SpGistGetBuffer(), SpGistNewBuffer(), SpGistUpdateMetaPage(), and spgWalk().
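
A typical usage sketch (not part of bufmgr.c; rel and blkno are assumed): read a block, lock it while inspecting the page, then drop the lock and pin together:

 Buffer  buf = ReadBuffer(rel, blkno);
 Page    page;

 LockBuffer(buf, BUFFER_LOCK_SHARE);
 page = BufferGetPage(buf);
 /* ... examine the page contents ... */
 UnlockReleaseBuffer(buf);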

◆ ReadBuffer_common()

static pg_attribute_always_inline Buffer ReadBuffer_common ( Relation  rel,
SMgrRelation  smgr,
char  smgr_persistence,
ForkNumber  forkNum,
BlockNumber  blockNum,
ReadBufferMode  mode,
BufferAccessStrategy  strategy 
)
static

Definition at line 1174 of file bufmgr.c.

1178{
1179 ReadBuffersOperation operation;
1180 Buffer buffer;
1181 int flags;
1182 char persistence;
1183
1184 /*
1185 * Backward compatibility path, most code should use ExtendBufferedRel()
1186 * instead, as acquiring the extension lock inside ExtendBufferedRel()
1187 * scales a lot better.
1188 */
1189 if (unlikely(blockNum == P_NEW))
1190 {
1191 uint32 flags = EB_SKIP_EXTENSION_LOCK;
1192
1193 /*
1194 * Since no-one else can be looking at the page contents yet, there is
1195 * no difference between an exclusive lock and a cleanup-strength
1196 * lock.
1197 */
1198 if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK)
1199 flags |= EB_LOCK_FIRST;
1200
1201 return ExtendBufferedRel(BMR_REL(rel), forkNum, strategy, flags);
1202 }
1203
1204 if (rel)
1205 persistence = rel->rd_rel->relpersistence;
1206 else
1207 persistence = smgr_persistence;
1208
1209 if (mode == RBM_ZERO_AND_CLEANUP_LOCK ||
1210 mode == RBM_ZERO_AND_LOCK)
1211 {
1212 bool found;
1213
1214 buffer = PinBufferForBlock(rel, smgr, persistence,
1215 forkNum, blockNum, strategy, &found);
1216 ZeroAndLockBuffer(buffer, mode, found);
1217 return buffer;
1218 }
1219
1220 /*
1221 * Signal that we are going to immediately wait. If we're immediately
1222 * waiting, there is no benefit in actually executing the IO
1223 * asynchronously, it would just add dispatch overhead.
1224 */
1225 flags = READ_BUFFERS_SYNCHRONOUSLY;
1226 if (mode == RBM_ZERO_ON_ERROR)
1227 flags |= READ_BUFFERS_ZERO_ON_ERROR;
1228 operation.smgr = smgr;
1229 operation.rel = rel;
1230 operation.persistence = persistence;
1231 operation.forknum = forkNum;
1232 operation.strategy = strategy;
1233 if (StartReadBuffer(&operation,
1234 &buffer,
1235 blockNum,
1236 flags))
1237 WaitReadBuffers(&operation);
1238
1239 return buffer;
1240}
Buffer ExtendBufferedRel(BufferManagerRelation bmr, ForkNumber forkNum, BufferAccessStrategy strategy, uint32 flags)
Definition: bufmgr.c:845
static void ZeroAndLockBuffer(Buffer buffer, ReadBufferMode mode, bool already_valid)
Definition: bufmgr.c:1012
static pg_attribute_always_inline Buffer PinBufferForBlock(Relation rel, SMgrRelation smgr, char persistence, ForkNumber forkNum, BlockNumber blockNum, BufferAccessStrategy strategy, bool *foundPtr)
Definition: bufmgr.c:1091
void WaitReadBuffers(ReadBuffersOperation *operation)
Definition: bufmgr.c:1613
bool StartReadBuffer(ReadBuffersOperation *operation, Buffer *buffer, BlockNumber blocknum, int flags)
Definition: bufmgr.c:1489
@ RBM_ZERO_ON_ERROR
Definition: bufmgr.h:51
#define BMR_REL(p_rel)
Definition: bufmgr.h:114

References BMR_REL, PrivateRefCountEntry::buffer, EB_LOCK_FIRST, EB_SKIP_EXTENSION_LOCK, ExtendBufferedRel(), ReadBuffersOperation::forknum, mode, P_NEW, ReadBuffersOperation::persistence, PinBufferForBlock(), RBM_ZERO_AND_CLEANUP_LOCK, RBM_ZERO_AND_LOCK, RBM_ZERO_ON_ERROR, RelationData::rd_rel, READ_BUFFERS_SYNCHRONOUSLY, READ_BUFFERS_ZERO_ON_ERROR, ReadBuffersOperation::rel, ReadBuffersOperation::smgr, StartReadBuffer(), ReadBuffersOperation::strategy, unlikely, WaitReadBuffers(), and ZeroAndLockBuffer().

Referenced by ExtendBufferedRelTo(), ReadBufferExtended(), and ReadBufferWithoutRelcache().

◆ ReadBufferExtended()

Buffer ReadBufferExtended ( Relation  reln,
ForkNumber  forkNum,
BlockNumber  blockNum,
ReadBufferMode  mode,
BufferAccessStrategy  strategy 
)
inline

Definition at line 792 of file bufmgr.c.

794{
795 Buffer buf;
796
797 /*
798 * Reject attempts to read non-local temporary relations; we would be
799 * likely to get wrong data since we have no visibility into the owning
800 * session's local buffers.
801 */
802 if (RELATION_IS_OTHER_TEMP(reln))
803 ereport(ERROR,
804 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
805 errmsg("cannot access temporary tables of other sessions")));
806
807 /*
808 * Read the buffer, and update pgstat counters to reflect a cache hit or
809 * miss.
810 */
811 buf = ReadBuffer_common(reln, RelationGetSmgr(reln), 0,
812 forkNum, blockNum, mode, strategy);
813
814 return buf;
815}

References buf, ereport, errcode(), errmsg(), ERROR, mode, ReadBuffer_common(), RELATION_IS_OTHER_TEMP, and RelationGetSmgr().

Referenced by _hash_getbuf_with_strategy(), _hash_getinitbuf(), _hash_getnewbuf(), blbulkdelete(), blgetbitmap(), BloomInitMetapage(), blvacuumcleanup(), bt_recheck_sibling_links(), btvacuumpage(), count_nondeletable_pages(), create_toy_buffer(), fsm_readbuf(), get_raw_page_internal(), gin_check_parent_keys_consistency(), gin_check_posting_tree_parent_keys_consistency(), gin_refind_parent(), ginbulkdelete(), ginDeletePage(), ginScanToDelete(), ginvacuumcleanup(), ginVacuumPostingTree(), ginVacuumPostingTreeLeaves(), gistvacuum_delete_empty_pages(), gistvacuumpage(), hashbulkdelete(), heapam_scan_sample_next_block(), log_newpage_range(), modify_rel_block(), palloc_btree_page(), pgstat_btree_page(), pgstat_gist_page(), pgstat_hash_page(), pgstat_heap(), pgstathashindex(), pgstatindex_impl(), ReadBuffer(), ReadBufferBI(), spgprocesspending(), statapprox_heap(), and vm_readbuf().

◆ ReadBuffersCanStartIO()

static bool ReadBuffersCanStartIO ( Buffer  buffer,
bool  nowait 
)
inlinestatic

Definition at line 1545 of file bufmgr.c.

1546{
1547 /*
1548 * If this backend currently has staged IO, we need to submit the pending
1549 * IO before waiting for the right to issue IO, to avoid the potential for
1550 * deadlocks (and, more commonly, unnecessary delays for other backends).
1551 */
1552 if (!nowait && pgaio_have_staged())
1553 {
1554 if (ReadBuffersCanStartIOOnce(buffer, true))
1555 return true;
1556
1557 /*
1558 * Unfortunately StartBufferIO() returning false doesn't let us
1559 * distinguish between the buffer already being valid and IO already
1560 * being in progress. Since IO already being in progress is quite
1561 * rare, this approach seems fine.
1562 */
1563 pgaio_submit_staged();
1564 }
1565
1566 return ReadBuffersCanStartIOOnce(buffer, nowait);
1567}
bool pgaio_have_staged(void)
Definition: aio.c:1107
static bool ReadBuffersCanStartIOOnce(Buffer buffer, bool nowait)
Definition: bufmgr.c:1532

References PrivateRefCountEntry::buffer, pgaio_have_staged(), pgaio_submit_staged(), and ReadBuffersCanStartIOOnce().

Referenced by AsyncReadBuffers().

◆ ReadBuffersCanStartIOOnce()

static bool ReadBuffersCanStartIOOnce ( Buffer  buffer,
bool  nowait 
)
inlinestatic

Definition at line 1532 of file bufmgr.c.

1533{
1534 if (BufferIsLocal(buffer))
1535 return StartLocalBufferIO(GetLocalBufferDescriptor(-buffer - 1),
1536 true, nowait);
1537 else
1538 return StartBufferIO(GetBufferDescriptor(buffer - 1), true, nowait);
1539}
bool StartLocalBufferIO(BufferDesc *bufHdr, bool forInput, bool nowait)
Definition: localbuf.c:523

References PrivateRefCountEntry::buffer, BufferIsLocal, GetBufferDescriptor(), GetLocalBufferDescriptor(), StartBufferIO(), and StartLocalBufferIO().

Referenced by ReadBuffersCanStartIO().

◆ ReadBufferWithoutRelcache()

Buffer ReadBufferWithoutRelcache ( RelFileLocator  rlocator,
ForkNumber  forkNum,
BlockNumber  blockNum,
ReadBufferMode  mode,
BufferAccessStrategy  strategy,
bool  permanent 
)

Definition at line 829 of file bufmgr.c.

832{
833 SMgrRelation smgr = smgropen(rlocator, INVALID_PROC_NUMBER);
834
835 return ReadBuffer_common(NULL, smgr,
836 permanent ? RELPERSISTENCE_PERMANENT : RELPERSISTENCE_UNLOGGED,
837 forkNum, blockNum,
838 mode, strategy);
839}

References INVALID_PROC_NUMBER, mode, ReadBuffer_common(), and smgropen().

Referenced by RelationCopyStorageUsingBuffer(), ScanSourceDatabasePgClass(), and XLogReadBufferExtended().

◆ ReadRecentBuffer()

bool ReadRecentBuffer ( RelFileLocator  rlocator,
ForkNumber  forkNum,
BlockNumber  blockNum,
Buffer  recent_buffer 
)

Definition at line 684 of file bufmgr.c.

686{
687 BufferDesc *bufHdr;
688 BufferTag tag;
689 uint32 buf_state;
690
691 Assert(BufferIsValid(recent_buffer));
692
693 ResourceOwnerEnlarge(CurrentResourceOwner);
694 ReservePrivateRefCountEntry();
695 InitBufferTag(&tag, &rlocator, forkNum, blockNum);
696
697 if (BufferIsLocal(recent_buffer))
698 {
699 int b = -recent_buffer - 1;
700
701 bufHdr = GetLocalBufferDescriptor(b);
702 buf_state = pg_atomic_read_u32(&bufHdr->state);
703
704 /* Is it still valid and holding the right tag? */
705 if ((buf_state & BM_VALID) && BufferTagsEqual(&tag, &bufHdr->tag))
706 {
707 PinLocalBuffer(bufHdr, true);
708
709 pgBufferUsage.local_blks_hit++;
710
711 return true;
712 }
713 }
714 else
715 {
716 bufHdr = GetBufferDescriptor(recent_buffer - 1);
717
718 /*
719 * Is it still valid and holding the right tag? We do an unlocked tag
720 * comparison first, to make it unlikely that we'll increment the
721 * usage counter of the wrong buffer, if someone calls us with a very
722 * out of date recent_buffer. Then we'll check it again if we get the
723 * pin.
724 */
725 if (BufferTagsEqual(&tag, &bufHdr->tag) &&
726 PinBuffer(bufHdr, NULL, true))
727 {
728 if (BufferTagsEqual(&tag, &bufHdr->tag))
729 {
730 pgBufferUsage.shared_blks_hit++;
731 return true;
732 }
733 UnpinBuffer(bufHdr);
734 }
735 }
736
737 return false;
738}

References Assert(), b, BM_VALID, BufferIsLocal, BufferIsValid(), BufferTagsEqual(), CurrentResourceOwner, GetBufferDescriptor(), GetLocalBufferDescriptor(), InitBufferTag(), BufferUsage::local_blks_hit, pg_atomic_read_u32(), pgBufferUsage, PinBuffer(), PinLocalBuffer(), ReservePrivateRefCountEntry(), ResourceOwnerEnlarge(), BufferUsage::shared_blks_hit, BufferDesc::state, BufferDesc::tag, and UnpinBuffer().

Referenced by invalidate_rel_block(), and XLogReadBufferExtended().
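
A sketch (not part of bufmgr.c) of the intended pairing with a prefetch hint; rlocator, forkNum, blockNum and a recent_buffer value saved from an earlier PrefetchSharedBuffer() call are assumed:

 Buffer  buf;

 if (BufferIsValid(recent_buffer) &&
     ReadRecentBuffer(rlocator, forkNum, blockNum, recent_buffer))
     buf = recent_buffer;   /* hint was still valid; buffer is now pinned */
 else
     buf = ReadBufferWithoutRelcache(rlocator, forkNum, blockNum,
                                     RBM_NORMAL, NULL, true);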

◆ RelationCopyStorageUsingBuffer()

static void RelationCopyStorageUsingBuffer ( RelFileLocator  srclocator,
RelFileLocator  dstlocator,
ForkNumber  forkNum,
bool  permanent 
)
static

Definition at line 5128 of file bufmgr.c.

5131{
5132 Buffer srcBuf;
5133 Buffer dstBuf;
5134 Page srcPage;
5135 Page dstPage;
5136 bool use_wal;
5137 BlockNumber nblocks;
5138 BlockNumber blkno;
5139 PGIOAlignedBlock buf;
5140 BufferAccessStrategy bstrategy_src;
5141 BufferAccessStrategy bstrategy_dst;
5142 BlockRangeReadStreamPrivate p;
5143 ReadStream *src_stream;
5144 SMgrRelation src_smgr;
5145
5146 /*
5147 * In general, we want to write WAL whenever wal_level > 'minimal', but we
5148 * can skip it when copying any fork of an unlogged relation other than
5149 * the init fork.
5150 */
5151 use_wal = XLogIsNeeded() && (permanent || forkNum == INIT_FORKNUM);
5152
5153 /* Get number of blocks in the source relation. */
5154 nblocks = smgrnblocks(smgropen(srclocator, INVALID_PROC_NUMBER),
5155 forkNum);
5156
5157 /* Nothing to copy; just return. */
5158 if (nblocks == 0)
5159 return;
5160
5161 /*
5162 * Bulk extend the destination relation of the same size as the source
5163 * relation before starting to copy block by block.
5164 */
5165 memset(buf.data, 0, BLCKSZ);
5166 smgrextend(smgropen(dstlocator, INVALID_PROC_NUMBER), forkNum, nblocks - 1,
5167 buf.data, true);
5168
5169 /* This is a bulk operation, so use buffer access strategies. */
5170 bstrategy_src = GetAccessStrategy(BAS_BULKREAD);
5171 bstrategy_dst = GetAccessStrategy(BAS_BULKWRITE);
5172
5173 /* Initialize streaming read */
5174 p.current_blocknum = 0;
5175 p.last_exclusive = nblocks;
5176 src_smgr = smgropen(srclocator, INVALID_PROC_NUMBER);
5177
5178 /*
5179 * It is safe to use batchmode as block_range_read_stream_cb takes no
5180 * locks.
5181 */
5182 src_stream = read_stream_begin_smgr_relation(READ_STREAM_FULL |
5183 READ_STREAM_USE_BATCHING,
5184 bstrategy_src,
5185 src_smgr,
5186 permanent ? RELPERSISTENCE_PERMANENT : RELPERSISTENCE_UNLOGGED,
5187 forkNum,
5188 block_range_read_stream_cb,
5189 &p,
5190 0);
5191
5192 /* Iterate over each block of the source relation file. */
5193 for (blkno = 0; blkno < nblocks; blkno++)
5194 {
5195 CHECK_FOR_INTERRUPTS();
5196
5197 /* Read block from source relation. */
5198 srcBuf = read_stream_next_buffer(src_stream, NULL);
5199 LockBuffer(srcBuf, BUFFER_LOCK_SHARE);
5200 srcPage = BufferGetPage(srcBuf);
5201
5202 dstBuf = ReadBufferWithoutRelcache(dstlocator, forkNum,
5203 BufferGetBlockNumber(srcBuf),
5204 RBM_ZERO_AND_LOCK, bstrategy_dst,
5205 permanent);
5206 dstPage = BufferGetPage(dstBuf);
5207
5208 START_CRIT_SECTION();
5209
5210 /* Copy page data from the source to the destination. */
5211 memcpy(dstPage, srcPage, BLCKSZ);
5212 MarkBufferDirty(dstBuf);
5213
5214 /* WAL-log the copied page. */
5215 if (use_wal)
5216 log_newpage_buffer(dstBuf, true);
5217
5218 END_CRIT_SECTION();
5219
5220 UnlockReleaseBuffer(dstBuf);
5221 UnlockReleaseBuffer(srcBuf);
5222 }
5223 Assert(read_stream_next_buffer(src_stream, NULL) == InvalidBuffer);
5224 read_stream_end(src_stream);
5225
5226 FreeAccessStrategy(bstrategy_src);
5227 FreeAccessStrategy(bstrategy_dst);
5228}
void UnlockReleaseBuffer(Buffer buffer)
Definition: bufmgr.c:5383
Buffer ReadBufferWithoutRelcache(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy, bool permanent)
Definition: bufmgr.c:829
@ BAS_BULKREAD
Definition: bufmgr.h:37
@ BAS_BULKWRITE
Definition: bufmgr.h:39
BufferAccessStrategy GetAccessStrategy(BufferAccessStrategyType btype)
Definition: freelist.c:461
void FreeAccessStrategy(BufferAccessStrategy strategy)
Definition: freelist.c:643
#define START_CRIT_SECTION()
Definition: miscadmin.h:150
#define END_CRIT_SECTION()
Definition: miscadmin.h:152
ReadStream * read_stream_begin_smgr_relation(int flags, BufferAccessStrategy strategy, SMgrRelation smgr, char smgr_persistence, ForkNumber forknum, ReadStreamBlockNumberCB callback, void *callback_private_data, size_t per_buffer_data_size)
Definition: read_stream.c:761
Buffer read_stream_next_buffer(ReadStream *stream, void **per_buffer_data)
Definition: read_stream.c:791
void read_stream_end(ReadStream *stream)
Definition: read_stream.c:1089
BlockNumber block_range_read_stream_cb(ReadStream *stream, void *callback_private_data, void *per_buffer_data)
Definition: read_stream.c:162
#define READ_STREAM_USE_BATCHING
Definition: read_stream.h:64
#define READ_STREAM_FULL
Definition: read_stream.h:43
void smgrextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const void *buffer, bool skipFsync)
Definition: smgr.c:620
#define XLogIsNeeded()
Definition: xlog.h:109
XLogRecPtr log_newpage_buffer(Buffer buffer, bool page_std)
Definition: xloginsert.c:1259

References Assert(), BAS_BULKREAD, BAS_BULKWRITE, block_range_read_stream_cb(), buf, BUFFER_LOCK_SHARE, BufferGetBlockNumber(), BufferGetPage(), CHECK_FOR_INTERRUPTS, BlockRangeReadStreamPrivate::current_blocknum, END_CRIT_SECTION, FreeAccessStrategy(), GetAccessStrategy(), INIT_FORKNUM, INVALID_PROC_NUMBER, InvalidBuffer, BlockRangeReadStreamPrivate::last_exclusive, LockBuffer(), log_newpage_buffer(), MarkBufferDirty(), RBM_ZERO_AND_LOCK, read_stream_begin_smgr_relation(), read_stream_end(), READ_STREAM_FULL, read_stream_next_buffer(), READ_STREAM_USE_BATCHING, ReadBufferWithoutRelcache(), smgrextend(), smgrnblocks(), smgropen(), START_CRIT_SECTION, UnlockReleaseBuffer(), and XLogIsNeeded.

Referenced by CreateAndCopyRelationData().

◆ RelationGetNumberOfBlocksInFork()

BlockNumber RelationGetNumberOfBlocksInFork ( Relation  relation,
ForkNumber  forkNum 
)

Definition at line 4437 of file bufmgr.c.

4438{
4439 if (RELKIND_HAS_TABLE_AM(relation->rd_rel->relkind))
4440 {
4441 /*
4442 * Not every table AM uses BLCKSZ wide fixed size blocks. Therefore
4443 * tableam returns the size in bytes - but for the purpose of this
4444 * routine, we want the number of blocks. Therefore divide, rounding
4445 * up.
4446 */
4447 uint64 szbytes;
4448
4449 szbytes = table_relation_size(relation, forkNum);
4450
4451 return (szbytes + (BLCKSZ - 1)) / BLCKSZ;
4452 }
4453 else if (RELKIND_HAS_STORAGE(relation->rd_rel->relkind))
4454 {
4455 return smgrnblocks(RelationGetSmgr(relation), forkNum);
4456 }
4457 else
4458 Assert(false);
4459
4460 return 0; /* keep compiler quiet */
4461}
static uint64 table_relation_size(Relation rel, ForkNumber forkNumber)
Definition: tableam.h:1847

References Assert(), RelationData::rd_rel, RelationGetSmgr(), smgrnblocks(), and table_relation_size().

Referenced by _hash_getnewbuf(), _hash_init(), autoprewarm_database_main(), get_raw_page_internal(), and pg_prewarm().
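
For example, a table AM reporting szbytes = 8193 with the default BLCKSZ of 8192 yields (8193 + 8191) / 8192 = 2: the partial trailing block counts as a whole block.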

◆ ReleaseAndReadBuffer()

Buffer ReleaseAndReadBuffer ( Buffer  buffer,
Relation  relation,
BlockNumber  blockNum 
)

Definition at line 3008 of file bufmgr.c.

3011{
3012 ForkNumber forkNum = MAIN_FORKNUM;
3013 BufferDesc *bufHdr;
3014
3015 if (BufferIsValid(buffer))
3016 {
3017 Assert(BufferIsPinned(buffer));
3018 if (BufferIsLocal(buffer))
3019 {
3020 bufHdr = GetLocalBufferDescriptor(-buffer - 1);
3021 if (bufHdr->tag.blockNum == blockNum &&
3022 BufTagMatchesRelFileLocator(&bufHdr->tag, &relation->rd_locator) &&
3023 BufTagGetForkNum(&bufHdr->tag) == forkNum)
3024 return buffer;
3025 UnpinLocalBuffer(buffer);
3026 }
3027 else
3028 {
3029 bufHdr = GetBufferDescriptor(buffer - 1);
3030 /* we have pin, so it's ok to examine tag without spinlock */
3031 if (bufHdr->tag.blockNum == blockNum &&
3032 BufTagMatchesRelFileLocator(&bufHdr->tag, &relation->rd_locator) &&
3033 BufTagGetForkNum(&bufHdr->tag) == forkNum)
3034 return buffer;
3035 UnpinBuffer(bufHdr);
3036 }
3037 }
3038
3039 return ReadBuffer(relation, blockNum);
3040}
Buffer ReadBuffer(Relation reln, BlockNumber blockNum)
Definition: bufmgr.c:745

References Assert(), buftag::blockNum, PrivateRefCountEntry::buffer, BufferIsLocal, BufferIsPinned, BufferIsValid(), BufTagGetForkNum(), BufTagMatchesRelFileLocator(), GetBufferDescriptor(), GetLocalBufferDescriptor(), MAIN_FORKNUM, RelationData::rd_locator, ReadBuffer(), BufferDesc::tag, UnpinBuffer(), and UnpinLocalBuffer().

Referenced by _bt_relandgetbuf(), ginFindLeafPage(), and heapam_index_fetch_tuple().
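
A sketch (not part of bufmgr.c) of the chain-walking pattern this function supports, holding at most one pin at a time; rel and a page-derived next_blkno are assumed:

 Buffer  buf = InvalidBuffer;

 while (BlockNumberIsValid(next_blkno))
 {
     buf = ReleaseAndReadBuffer(buf, rel, next_blkno);
     LockBuffer(buf, BUFFER_LOCK_SHARE);
     /* ... read next_blkno from the page, or set it invalid to stop ... */
     LockBuffer(buf, BUFFER_LOCK_UNLOCK);
 }
 if (BufferIsValid(buf))
     ReleaseBuffer(buf);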

◆ ReleaseBuffer()

void ReleaseBuffer ( Buffer  buffer)

Definition at line 5366 of file bufmgr.c.

5367{
5368 if (!BufferIsValid(buffer))
5369 elog(ERROR, "bad buffer ID: %d", buffer);
5370
5371 if (BufferIsLocal(buffer))
5372 UnpinLocalBuffer(buffer);
5373 else
5374 UnpinBuffer(GetBufferDescriptor(buffer - 1));
5375}

References PrivateRefCountEntry::buffer, BufferIsLocal, BufferIsValid(), elog, ERROR, GetBufferDescriptor(), UnpinBuffer(), and UnpinLocalBuffer().

Referenced by _bt_allocbuf(), _bt_pagedel(), _bt_relbuf(), _bt_search_insert(), _bt_unlink_halfdead_page(), _hash_dropbuf(), _hash_getbuf_with_condlock_cleanup(), autoprewarm_database_main(), BitmapHeapScanNextBlock(), blinsert(), BloomNewBuffer(), brin_getinsertbuffer(), brin_vacuum_scan(), bringetbitmap(), brinGetTupleForHeapBlock(), brininsert(), brinRevmapTerminate(), brinsummarize(), buffer_create_toy(), collect_corrupt_items(), collect_visibility_data(), entryLoadMoreItems(), ExecEndIndexOnlyScan(), ExtendBufferedRelTo(), FreeBulkInsertState(), freeGinBtreeStack(), fsm_search(), fsm_vacuum_page(), get_actual_variable_endpoint(), get_raw_page_internal(), GetRecordedFreeSpace(), gin_check_parent_keys_consistency(), gin_check_posting_tree_parent_keys_consistency(), ginDeletePage(), ginFindParents(), ginFinishSplit(), ginFreeScanKeys(), ginInsertCleanup(), GinNewBuffer(), ginScanToDelete(), gistdoinsert(), gistFindCorrectParent(), gistNewBuffer(), gistvacuum_delete_empty_pages(), grow_rel(), heap_abort_speculative(), heap_delete(), heap_endscan(), heap_fetch(), heap_fetch_next_buffer(), heap_force_common(), heap_insert(), heap_lock_tuple(), heap_lock_updated_tuple_rec(), heap_multi_insert(), heap_rescan(), heap_update(), heap_vac_scan_next_block(), heap_xlog_delete(), heap_xlog_insert(), heap_xlog_lock(), heap_xlog_lock_updated(), heap_xlog_multi_insert(), heap_xlog_update(), heap_xlog_visible(), heapam_index_fetch_reset(), heapam_scan_sample_next_block(), heapam_tuple_lock(), heapgettup(), heapgettup_pagemode(), invalidate_rel_block(), lazy_scan_heap(), lazy_vacuum_heap_rel(), modify_rel_block(), pg_prewarm(), pg_visibility(), pg_visibility_map(), pg_visibility_map_summary(), pgstatindex_impl(), read_rel_block_ll(), read_stream_reset(), ReadBufferBI(), RelationAddBlocks(), RelationGetBufferForTuple(), ReleaseBulkInsertStatePin(), revmap_get_buffer(), spgdoinsert(), SpGistGetBuffer(), SpGistNewBuffer(), SpGistUpdateMetaPage(), statapprox_heap(), summarize_range(), terminate_brin_buildstate(), tts_buffer_heap_clear(), tts_buffer_heap_materialize(), tts_buffer_heap_store_tuple(), UnlockReleaseBuffer(), verify_heapam(), visibilitymap_count(), visibilitymap_get_status(), visibilitymap_pin(), and XLogReadBufferExtended().

◆ ReservePrivateRefCountEntry()

static void ReservePrivateRefCountEntry ( void  )
static

Definition at line 259 of file bufmgr.c.

260{
261 /* Already reserved (or freed), nothing to do */
262 if (ReservedRefCountEntry != NULL)
263 return;
264
265 /*
266 * First search for a free entry in the array; that'll be sufficient in the
267 * majority of cases.
268 */
269 {
270 int i;
271
272 for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
273 {
274 PrivateRefCountEntry *res;
275
276 res = &PrivateRefCountArray[i];
277
278 if (res->buffer == InvalidBuffer)
279 {
280 ReservedRefCountEntry = res;
281 return;
282 }
283 }
284 }
285
286 /*
287 * No luck. All array entries are full. Move one array entry into the hash
288 * table.
289 */
290 {
291 /*
292 * Move entry from the current clock position in the array into the
293 * hashtable. Use that slot.
294 */
295 PrivateRefCountEntry *hashent;
296 bool found;
297
298 /* select victim slot */
299 ReservedRefCountEntry =
300 &PrivateRefCountArray[PrivateRefCountClock++ % REFCOUNT_ARRAY_ENTRIES];
301
302 /* Better be used, otherwise we shouldn't get here. */
303 Assert(ReservedRefCountEntry->buffer != InvalidBuffer);
304
305 /* enter victim array entry into hashtable */
306 hashent = hash_search(PrivateRefCountHash,
307 &(ReservedRefCountEntry->buffer),
308 HASH_ENTER,
309 &found);
310 Assert(!found);
311 hashent->refcount = ReservedRefCountEntry->refcount;
312
313 /* clear the now free array slot */
314 ReservedRefCountEntry->buffer = InvalidBuffer;
315 ReservedRefCountEntry->refcount = 0;
316
317 PrivateRefCountOverflowed++;
318 }
319}
static uint32 PrivateRefCountClock
Definition: bufmgr.c:218
@ HASH_ENTER
Definition: hsearch.h:114

References Assert(), PrivateRefCountEntry::buffer, HASH_ENTER, hash_search(), i, InvalidBuffer, PrivateRefCountArray, PrivateRefCountClock, PrivateRefCountHash, PrivateRefCountOverflowed, PrivateRefCountEntry::refcount, REFCOUNT_ARRAY_ENTRIES, and ReservedRefCountEntry.

Referenced by BufferAlloc(), EvictAllUnpinnedBuffers(), EvictRelUnpinnedBuffers(), EvictUnpinnedBuffer(), ExtendBufferedRelShared(), FlushDatabaseBuffers(), FlushRelationBuffers(), FlushRelationsAllBuffers(), GetPrivateRefCountEntry(), GetVictimBuffer(), MarkDirtyAllUnpinnedBuffers(), MarkDirtyRelUnpinnedBuffers(), MarkDirtyUnpinnedBuffer(), ReadRecentBuffer(), and SyncOneBuffer().
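
A sketch (not part of bufmgr.c) of the reserve-before-pin pattern visible in the callers listed above: the entry is reserved while no buffer header spinlock is held, so the subsequent pin cannot fail partway through:

 ReservePrivateRefCountEntry();
 ResourceOwnerEnlarge(CurrentResourceOwner);

 buf_state = LockBufHdr(bufHdr);
 /* ... inspect buf_state and decide to keep this buffer ... */
 PinBuffer_Locked(bufHdr);  /* consumes the reservation, drops the header lock */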

◆ ResOwnerPrintBufferIO()

static char * ResOwnerPrintBufferIO ( Datum  res)
static

Definition at line 6559 of file bufmgr.c.

6560{
6561 Buffer buffer = DatumGetInt32(res);
6562
6563 return psprintf("lost track of buffer IO on buffer %d", buffer);
6564}
static int32 DatumGetInt32(Datum X)
Definition: postgres.h:212

References PrivateRefCountEntry::buffer, DatumGetInt32(), and psprintf().

◆ ResOwnerPrintBufferPin()

static char * ResOwnerPrintBufferPin ( Datum  res)
static

Definition at line 6582 of file bufmgr.c.

6583{
6585}

References DatumGetInt32(), and DebugPrintBufferRefcount().

◆ ResOwnerReleaseBufferIO()

static void ResOwnerReleaseBufferIO ( Datum  res)
static

Definition at line 6551 of file bufmgr.c.

6552{
6553 Buffer buffer = DatumGetInt32(res);
6554
6555 AbortBufferIO(buffer);
6556}
static void AbortBufferIO(Buffer buffer)
Definition: bufmgr.c:6166

References AbortBufferIO(), PrivateRefCountEntry::buffer, and DatumGetInt32().

◆ ResOwnerReleaseBufferPin()

static void ResOwnerReleaseBufferPin ( Datum  res)
static

Definition at line 6567 of file bufmgr.c.

6568{
6569 Buffer buffer = DatumGetInt32(res);
6570
6571 /* Like ReleaseBuffer, but don't call ResourceOwnerForgetBuffer */
6572 if (!BufferIsValid(buffer))
6573 elog(ERROR, "bad buffer ID: %d", buffer);
6574
6575 if (BufferIsLocal(buffer))
6576 UnpinLocalBufferNoOwner(buffer);
6577 else
6578 UnpinBufferNoOwner(GetBufferDescriptor(buffer - 1));
6579}
static void UnpinBufferNoOwner(BufferDesc *buf)
Definition: bufmgr.c:3256
void UnpinLocalBufferNoOwner(Buffer buffer)
Definition: localbuf.c:848

References PrivateRefCountEntry::buffer, BufferIsLocal, BufferIsValid(), DatumGetInt32(), elog, ERROR, GetBufferDescriptor(), UnpinBufferNoOwner(), and UnpinLocalBufferNoOwner().

◆ rlocator_comparator()

static int rlocator_comparator ( const void *  p1,
const void *  p2 
)
static

Definition at line 6237 of file bufmgr.c.

6238{
6239 RelFileLocator n1 = *(const RelFileLocator *) p1;
6240 RelFileLocator n2 = *(const RelFileLocator *) p2;
6241
6242 if (n1.relNumber < n2.relNumber)
6243 return -1;
6244 else if (n1.relNumber > n2.relNumber)
6245 return 1;
6246
6247 if (n1.dbOid < n2.dbOid)
6248 return -1;
6249 else if (n1.dbOid > n2.dbOid)
6250 return 1;
6251
6252 if (n1.spcOid < n2.spcOid)
6253 return -1;
6254 else if (n1.spcOid > n2.spcOid)
6255 return 1;
6256 else
6257 return 0;
6258}

References RelFileLocator::dbOid, RelFileLocator::relNumber, and RelFileLocator::spcOid.

Referenced by buffertag_comparator(), DropRelationsAllBuffers(), and FlushRelationsAllBuffers().
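
The comparator is qsort()-compatible; a sketch (not part of bufmgr.c; locators and nlocators are assumed) of the sort-then-bsearch usage in the callers above:

 qsort(locators, nlocators, sizeof(RelFileLocator), rlocator_comparator);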

◆ ScheduleBufferTagForWriteback()

void ScheduleBufferTagForWriteback ( WritebackContext wb_context,
IOContext  io_context,
BufferTag tag 
)

Definition at line 6418 of file bufmgr.c.

6420{
6421 PendingWriteback *pending;
6422
6423 /*
6424 * As pg_flush_data() doesn't do anything with fsync disabled, there's no
6425 * point in tracking in that case.
6426 */
6427 if (io_direct_flags & IO_DIRECT_DATA ||
6428 !enableFsync)
6429 return;
6430
6431 /*
6432 * Add buffer to the pending writeback array, unless writeback control is
6433 * disabled.
6434 */
6435 if (*wb_context->max_pending > 0)
6436 {
6437 Assert(*wb_context->max_pending <= WRITEBACK_MAX_PENDING_FLUSHES);
6438
6439 pending = &wb_context->pending_writebacks[wb_context->nr_pending++];
6440
6441 pending->tag = *tag;
6442 }
6443
6444 /*
6445 * Perform pending flushes if the writeback limit is exceeded. This
6446 * includes the case where previously an item has been added, but control
6447 * is now disabled.
6448 */
6449 if (wb_context->nr_pending >= *wb_context->max_pending)
6450 IssuePendingWritebacks(wb_context, io_context);
6451}
bool enableFsync
Definition: globals.c:129
#define WRITEBACK_MAX_PENDING_FLUSHES

References Assert(), enableFsync, IO_DIRECT_DATA, io_direct_flags, IssuePendingWritebacks(), WritebackContext::max_pending, WritebackContext::nr_pending, WritebackContext::pending_writebacks, PendingWriteback::tag, and WRITEBACK_MAX_PENDING_FLUSHES.

Referenced by GetVictimBuffer(), and SyncOneBuffer().
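
A call-site sketch (not part of bufmgr.c; wb_context and the tag of a just-written buffer are assumed), matching the pattern in the callers above:

 ScheduleBufferTagForWriteback(wb_context, IOCONTEXT_NORMAL, &tag);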

◆ shared_buffer_readv_complete()

static PgAioResult shared_buffer_readv_complete ( PgAioHandle ioh,
PgAioResult  prior_result,
uint8  cb_data 
)
static

Definition at line 7578 of file bufmgr.c.

7580{
7581 return buffer_readv_complete(ioh, prior_result, cb_data, false);
7582}

References buffer_readv_complete().

◆ shared_buffer_readv_complete_local()

static PgAioResult shared_buffer_readv_complete_local ( PgAioHandle ioh,
PgAioResult  prior_result,
uint8  cb_data 
)
static

Definition at line 7592 of file bufmgr.c.

7594{
7595 bool zeroed_any,
7596 ignored_any;
7597 uint8 zeroed_or_error_count,
7598 checkfail_count,
7599 first_off;
7600
7601 if (prior_result.status == PGAIO_RS_OK)
7602 return prior_result;
7603
7604 buffer_readv_decode_error(prior_result,
7605 &zeroed_any,
7606 &ignored_any,
7607 &zeroed_or_error_count,
7608 &checkfail_count,
7609 &first_off);
7610
7611 if (checkfail_count)
7612 {
7613 PgAioTargetData *td = pgaio_io_get_target_data(ioh);
7614
7615 pgstat_report_checksum_failures_in_db(td->smgr.rlocator.dbOid,
7616 checkfail_count);
7617 }
7618
7619 return prior_result;
7620}
@ PGAIO_RS_OK
Definition: aio_types.h:81

References buffer_readv_decode_error(), RelFileLocator::dbOid, pgaio_io_get_target_data(), PGAIO_RS_OK, pgstat_report_checksum_failures_in_db(), PgAioTargetData::rlocator, PgAioTargetData::smgr, and PgAioResult::status.

◆ shared_buffer_readv_stage()

static void shared_buffer_readv_stage ( PgAioHandle ioh,
uint8  cb_data 
)
static

Definition at line 7572 of file bufmgr.c.

7573{
7574 buffer_stage_common(ioh, false, false);
7575}

References buffer_stage_common().

◆ shared_buffer_write_error_callback()

static void shared_buffer_write_error_callback ( void *  arg)
static

Definition at line 6205 of file bufmgr.c.

6206{
6207 BufferDesc *bufHdr = (BufferDesc *) arg;
6208
6209 /* Buffer is pinned, so we can read the tag without locking the spinlock */
6210 if (bufHdr != NULL)
6211 errcontext("writing block %u of relation \"%s\"",
6212 bufHdr->tag.blockNum,
6213 relpathperm(BufTagGetRelFileLocator(&bufHdr->tag),
6214 BufTagGetForkNum(&bufHdr->tag)).str);
6215}

References arg, buftag::blockNum, BufTagGetForkNum(), BufTagGetRelFileLocator(), errcontext, relpathperm, and BufferDesc::tag.

Referenced by FlushBuffer().

◆ StartBufferIO()

bool StartBufferIO ( BufferDesc buf,
bool  forInput,
bool  nowait 
)

Definition at line 6046 of file bufmgr.c.

6047{
6048 uint32 buf_state;
6049
6050 ResourceOwnerEnlarge(CurrentResourceOwner);
6051
6052 for (;;)
6053 {
6054 buf_state = LockBufHdr(buf);
6055
6056 if (!(buf_state & BM_IO_IN_PROGRESS))
6057 break;
6058 UnlockBufHdr(buf);
6059 if (nowait)
6060 return false;
6061 WaitIO(buf);
6062 }
6063
6064 /* Once we get here, there is definitely no I/O active on this buffer */
6065
6066 /* Check if someone else already did the I/O */
6067 if (forInput ? (buf_state & BM_VALID) : !(buf_state & BM_DIRTY))
6068 {
6069 UnlockBufHdr(buf);
6070 return false;
6071 }
6072
6073 UnlockBufHdrExt(buf, buf_state,
6074 BM_IO_IN_PROGRESS, 0,
6075 0);
6076
6077 ResourceOwnerRememberBufferIO(CurrentResourceOwner,
6078 BufferDescriptorGetBuffer(buf));
6079
6080 return true;
6081}
static void ResourceOwnerRememberBufferIO(ResourceOwner owner, Buffer buffer)

References BM_DIRTY, BM_IO_IN_PROGRESS, BM_VALID, buf, BufferDescriptorGetBuffer(), CurrentResourceOwner, LockBufHdr(), ResourceOwnerEnlarge(), ResourceOwnerRememberBufferIO(), UnlockBufHdr(), UnlockBufHdrExt(), and WaitIO().

Referenced by buffer_call_start_io(), ExtendBufferedRelShared(), FlushBuffer(), read_rel_block_ll(), ReadBuffersCanStartIOOnce(), and ZeroAndLockBuffer().
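
A sketch (not part of bufmgr.c) of the input-side protocol built on this function; bufHdr is assumed to be pinned by the caller:

 if (StartBufferIO(bufHdr, true, false))
 {
     /* we won the right to do the read; issue smgrread()/smgrreadv() here */
     TerminateBufferIO(bufHdr, false, BM_VALID, true, false);
 }
 else
 {
     /* another backend already made the buffer valid; nothing to do */
 }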

◆ StartReadBuffer()

bool StartReadBuffer ( ReadBuffersOperation operation,
Buffer buffer,
BlockNumber  blocknum,
int  flags 
)

Definition at line 1489 of file bufmgr.c.

1493{
1494 int nblocks = 1;
1495 bool result;
1496
1497 result = StartReadBuffersImpl(operation, buffer, blocknum, &nblocks, flags,
1498 false /* single block, no forwarding */ );
1499 Assert(nblocks == 1); /* single block can't be short */
1500
1501 return result;
1502}
static pg_attribute_always_inline bool StartReadBuffersImpl(ReadBuffersOperation *operation, Buffer *buffers, BlockNumber blockNum, int *nblocks, int flags, bool allow_forwarding)
Definition: bufmgr.c:1243

References Assert(), PrivateRefCountEntry::buffer, and StartReadBuffersImpl().

Referenced by read_stream_next_buffer(), and ReadBuffer_common().
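
A sketch (not part of bufmgr.c) of the two-step read API, mirroring what ReadBuffer_common() does above; rel, forkNum and blockNum are assumed:

 ReadBuffersOperation operation;
 Buffer  buffer;

 operation.smgr = RelationGetSmgr(rel);
 operation.rel = rel;
 operation.persistence = rel->rd_rel->relpersistence;
 operation.forknum = forkNum;
 operation.strategy = NULL;

 if (StartReadBuffer(&operation, &buffer, blockNum,
                     READ_BUFFERS_SYNCHRONOUSLY))
     WaitReadBuffers(&operation);   /* only needed when IO was started */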

◆ StartReadBuffers()

bool StartReadBuffers ( ReadBuffersOperation operation,
Buffer buffers,
BlockNumber  blockNum,
int *  nblocks,
int  flags 
)

Definition at line 1470 of file bufmgr.c.

1475{
1476 return StartReadBuffersImpl(operation, buffers, blockNum, nblocks, flags,
1477 true /* expect forwarded buffers */ );
1478}

References StartReadBuffersImpl().

Referenced by read_stream_start_pending_read().

◆ StartReadBuffersImpl()

static pg_attribute_always_inline bool StartReadBuffersImpl ( ReadBuffersOperation operation,
Buffer buffers,
BlockNumber  blockNum,
int *  nblocks,
int  flags,
bool  allow_forwarding 
)
static

Definition at line 1243 of file bufmgr.c.

1249{
1250 int actual_nblocks = *nblocks;
1251 int maxcombine = 0;
1252 bool did_start_io;
1253
1254 Assert(*nblocks == 1 || allow_forwarding);
1255 Assert(*nblocks > 0);
1256 Assert(*nblocks <= MAX_IO_COMBINE_LIMIT);
1257
1258 for (int i = 0; i < actual_nblocks; ++i)
1259 {
1260 bool found;
1261
1262 if (allow_forwarding && buffers[i] != InvalidBuffer)
1263 {
1264 BufferDesc *bufHdr;
1265
1266 /*
1267 * This is a buffer that was pinned by an earlier call to
1268 * StartReadBuffers(), but couldn't be handled in one operation at
1269 * that time. The operation was split, and the caller has passed
1270 * an already pinned buffer back to us to handle the rest of the
1271 * operation. It must continue at the expected block number.
1272 */
1273 Assert(BufferGetBlockNumber(buffers[i]) == blockNum + i);
1274
1275 /*
1276 * It might be an already valid buffer (a hit) that followed the
1277 * final contiguous block of an earlier I/O (a miss) marking the
1278 * end of it, or a buffer that some other backend has since made
1279 * valid by performing the I/O for us, in which case we can handle
1280 * it as a hit now. It is safe to check for a BM_VALID flag with
1281 * a relaxed load, because we got a fresh view of it while pinning
1282 * it in the previous call.
1283 *
1284 * On the other hand if we don't see BM_VALID yet, it must be an
1285 * I/O that was split by the previous call and we need to try to
1286 * start a new I/O from this block. We're also racing against any
1287 * other backend that might start the I/O or even manage to mark
1288 * it BM_VALID after this check, but StartBufferIO() will handle
1289 * those cases.
1290 */
1291 if (BufferIsLocal(buffers[i]))
1292 bufHdr = GetLocalBufferDescriptor(-buffers[i] - 1);
1293 else
1294 bufHdr = GetBufferDescriptor(buffers[i] - 1);
1295 Assert(pg_atomic_read_u32(&bufHdr->state) & BM_TAG_VALID);
1296 found = pg_atomic_read_u32(&bufHdr->state) & BM_VALID;
1297 }
1298 else
1299 {
1300 buffers[i] = PinBufferForBlock(operation->rel,
1301 operation->smgr,
1302 operation->persistence,
1303 operation->forknum,
1304 blockNum + i,
1305 operation->strategy,
1306 &found);
1307 }
1308
1309 if (found)
1310 {
1311 /*
1312 * We have a hit. If it's the first block in the requested range,
1313 * we can return it immediately and report that WaitReadBuffers()
1314 * does not need to be called. If the initial value of *nblocks
1315 * was larger, the caller will have to call again for the rest.
1316 */
1317 if (i == 0)
1318 {
1319 *nblocks = 1;
1320
1321#ifdef USE_ASSERT_CHECKING
1322
1323 /*
1324 * Initialize enough of ReadBuffersOperation to make
1325 * CheckReadBuffersOperation() work. Outside of assertions
1326 * that's not necessary when no IO is issued.
1327 */
1328 operation->buffers = buffers;
1329 operation->blocknum = blockNum;
1330 operation->nblocks = 1;
1331 operation->nblocks_done = 1;
1332 CheckReadBuffersOperation(operation, true);
1333#endif
1334 return false;
1335 }
1336
1337 /*
1338 * Otherwise we already have an I/O to perform, but this block
1339 * can't be included as it is already valid. Split the I/O here.
1340 * There may or may not be more blocks requiring I/O after this
1341 * one, we haven't checked, but they can't be contiguous with this
1342 * one in the way. We'll leave this buffer pinned, forwarding it
1343 * to the next call, avoiding the need to unpin it here and re-pin
1344 * it in the next call.
1345 */
1346 actual_nblocks = i;
1347 break;
1348 }
1349 else
1350 {
1351 /*
1352 * Check how many blocks we can cover with the same IO. The smgr
1353 * implementation might e.g. be limited due to a segment boundary.
1354 */
1355 if (i == 0 && actual_nblocks > 1)
1356 {
1357 maxcombine = smgrmaxcombine(operation->smgr,
1358 operation->forknum,
1359 blockNum);
1360 if (unlikely(maxcombine < actual_nblocks))
1361 {
1362 elog(DEBUG2, "limiting nblocks at %u from %u to %u",
1363 blockNum, actual_nblocks, maxcombine);
1364 actual_nblocks = maxcombine;
1365 }
1366 }
1367 }
1368 }
1369 *nblocks = actual_nblocks;
1370
1371 /* Populate information needed for I/O. */
1372 operation->buffers = buffers;
1373 operation->blocknum = blockNum;
1374 operation->flags = flags;
1375 operation->nblocks = actual_nblocks;
1376 operation->nblocks_done = 0;
1377 pgaio_wref_clear(&operation->io_wref);
1378
1379 /*
1380 * When using AIO, start the IO in the background. If not, issue prefetch
1381 * requests if desired by the caller.
1382 *
1383 * The reason we have a dedicated path for IOMETHOD_SYNC here is to
1384 * de-risk the introduction of AIO somewhat. It's a large architectural
1385 * change, with lots of chances for unanticipated performance effects.
1386 *
1387 * Use of IOMETHOD_SYNC already leads to not actually performing IO
1388 * asynchronously, but without the check here we'd execute IO earlier than
1389 * we used to. Eventually this IOMETHOD_SYNC specific path should go away.
1390 */
1391 if (io_method != IOMETHOD_SYNC)
1392 {
1393 /*
1394 * Try to start IO asynchronously. It's possible that no IO needs to
1395 * be started, if another backend already performed the IO.
1396 *
1397 * Note that if an IO is started, it might not cover the entire
1398 * requested range, e.g. because an intermediary block has been read
1399 * in by another backend. In that case any "trailing" buffers we
1400 * already pinned above will be "forwarded" by read_stream.c to the
1401 * next call to StartReadBuffers().
1402 *
1403 * This is signalled to the caller by decrementing *nblocks *and*
1404 * reducing operation->nblocks. The latter is done here, but not below
1405 * WaitReadBuffers(), as in WaitReadBuffers() we can't "shorten" the
1406 * overall read size anymore, we need to retry until done in its
1407 * entirety or until failed.
1408 */
1409 did_start_io = AsyncReadBuffers(operation, nblocks);
1410
1411 operation->nblocks = *nblocks;
1412 }
1413 else
1414 {
1415 operation->flags |= READ_BUFFERS_SYNCHRONOUSLY;
1416
1417 if (flags & READ_BUFFERS_ISSUE_ADVICE)
1418 {
1419 /*
1420 * In theory we should only do this if PinBufferForBlock() had to
1421 * allocate new buffers above. That way, if two calls to
1422 * StartReadBuffers() were made for the same blocks before
1423 * WaitReadBuffers(), only the first would issue the advice.
1424 * That'd be a better simulation of true asynchronous I/O, which
1425 * would only start the I/O once, but isn't done here for
1426 * simplicity.
1427 */
1428 smgrprefetch(operation->smgr,
1429 operation->forknum,
1430 blockNum,
1431 actual_nblocks);
1432 }
1433
1434 /*
1435 * Indicate that WaitReadBuffers() should be called. WaitReadBuffers()
1436 * will initiate the necessary IO.
1437 */
1438 did_start_io = true;
1439 }
1440
1441 CheckReadBuffersOperation(operation, !did_start_io);
1442
1443 return did_start_io;
1444}
int io_method
Definition: aio.c:74
@ IOMETHOD_SYNC
Definition: aio.h:34
static void CheckReadBuffersOperation(ReadBuffersOperation *operation, bool is_complete)
Definition: bufmgr.c:1508
static bool AsyncReadBuffers(ReadBuffersOperation *operation, int *nblocks_progress)
Definition: bufmgr.c:1745
#define READ_BUFFERS_ISSUE_ADVICE
Definition: bufmgr.h:124
uint32 smgrmaxcombine(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
Definition: smgr.c:697

References Assert(), AsyncReadBuffers(), ReadBuffersOperation::blocknum, BM_TAG_VALID, BM_VALID, BufferGetBlockNumber(), BufferIsLocal, ReadBuffersOperation::buffers, CheckReadBuffersOperation(), DEBUG2, elog, ReadBuffersOperation::flags, ReadBuffersOperation::forknum, GetBufferDescriptor(), GetLocalBufferDescriptor(), i, InvalidBuffer, io_method, ReadBuffersOperation::io_wref, IOMETHOD_SYNC, MAX_IO_COMBINE_LIMIT, ReadBuffersOperation::nblocks, ReadBuffersOperation::nblocks_done, ReadBuffersOperation::persistence, pg_atomic_read_u32(), pgaio_wref_clear(), PinBufferForBlock(), READ_BUFFERS_ISSUE_ADVICE, READ_BUFFERS_SYNCHRONOUSLY, ReadBuffersOperation::rel, ReadBuffersOperation::smgr, smgrmaxcombine(), smgrprefetch(), BufferDesc::state, ReadBuffersOperation::strategy, and unlikely.

Referenced by StartReadBuffer(), and StartReadBuffers().

◆ SyncOneBuffer()

static int SyncOneBuffer ( int  buf_id,
bool  skip_recently_used,
WritebackContext wb_context 
)
static

Definition at line 3920 of file bufmgr.c.

3921{
3922 BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
3923 int result = 0;
3924 uint32 buf_state;
3925 BufferTag tag;
3926
3927 /* Make sure we can handle the pin */
3928 ReservePrivateRefCountEntry();
3929 ResourceOwnerEnlarge(CurrentResourceOwner);
3930
3931 /*
3932 * Check whether buffer needs writing.
3933 *
3934 * We can make this check without taking the buffer content lock so long
3935 * as we mark pages dirty in access methods *before* logging changes with
3936 * XLogInsert(): if someone marks the buffer dirty just after our check we
3937 * don't worry because our checkpoint.redo points before log record for
3938 * upcoming changes and so we are not required to write such dirty buffer.
3939 */
3940 buf_state = LockBufHdr(bufHdr);
3941
3942 if (BUF_STATE_GET_REFCOUNT(buf_state) == 0 &&
3943 BUF_STATE_GET_USAGECOUNT(buf_state) == 0)
3944 {
3945 result |= BUF_REUSABLE;
3946 }
3947 else if (skip_recently_used)
3948 {
3949 /* Caller told us not to write recently-used buffers */
3950 UnlockBufHdr(bufHdr);
3951 return result;
3952 }
3953
3954 if (!(buf_state & BM_VALID) || !(buf_state & BM_DIRTY))
3955 {
3956 /* It's clean, so nothing to do */
3957 UnlockBufHdr(bufHdr);
3958 return result;
3959 }
3960
3961 /*
3962 * Pin it, share-lock it, write it. (FlushBuffer will do nothing if the
3963 * buffer is clean by the time we've locked it.)
3964 */
3965 PinBuffer_Locked(bufHdr);
3966
3967 FlushUnlockedBuffer(bufHdr, NULL, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
3968
3969 tag = bufHdr->tag;
3970
3971 UnpinBuffer(bufHdr);
3972
3973 /*
3974 * SyncOneBuffer() is only called by checkpointer and bgwriter, so
3975 * IOContext will always be IOCONTEXT_NORMAL.
3976 */
3977 ScheduleBufferTagForWriteback(wb_context, IOCONTEXT_NORMAL, &tag);
3978
3979 return result | BUF_WRITTEN;
3980}

References BM_DIRTY, BM_VALID, BUF_REUSABLE, BUF_STATE_GET_REFCOUNT, BUF_STATE_GET_USAGECOUNT, BUF_WRITTEN, CurrentResourceOwner, FlushUnlockedBuffer(), GetBufferDescriptor(), IOCONTEXT_NORMAL, IOOBJECT_RELATION, LockBufHdr(), PinBuffer_Locked(), ReservePrivateRefCountEntry(), ResourceOwnerEnlarge(), ScheduleBufferTagForWriteback(), BufferDesc::tag, UnlockBufHdr(), and UnpinBuffer().

Referenced by BgBufferSync(), and BufferSync().
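
A sketch (not part of bufmgr.c) of how the result bits are interpreted; buf_id and wb_context are assumed:

 int     sync_state = SyncOneBuffer(buf_id, true, wb_context);

 if (sync_state & BUF_WRITTEN)
 {
     /* a dirty, not-recently-used buffer was flushed */
 }
 if (sync_state & BUF_REUSABLE)
 {
     /* pin and usage count were zero; a clock sweep could reuse it */
 }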

◆ TerminateBufferIO()

void TerminateBufferIO ( BufferDesc buf,
bool  clear_dirty,
uint32  set_flag_bits,
bool  forget_owner,
bool  release_aio 
)

Definition at line 6104 of file bufmgr.c.

6106{
6107 uint32 buf_state;
6108 uint32 unset_flag_bits = 0;
6109 int refcount_change = 0;
6110
6111 buf_state = LockBufHdr(buf);
6112
6113 Assert(buf_state & BM_IO_IN_PROGRESS);
6114 unset_flag_bits |= BM_IO_IN_PROGRESS;
6115
6116 /* Clear earlier errors, if this IO failed, it'll be marked again */
6117 unset_flag_bits |= BM_IO_ERROR;
6118
6119 if (clear_dirty && !(buf_state & BM_JUST_DIRTIED))
6120 unset_flag_bits |= BM_DIRTY | BM_CHECKPOINT_NEEDED;
6121
6122 if (release_aio)
6123 {
6124 /* release ownership by the AIO subsystem */
6125 Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
6126 refcount_change = -1;
6127 pgaio_wref_clear(&buf->io_wref);
6128 }
6129
6130 buf_state = UnlockBufHdrExt(buf, buf_state,
6131 set_flag_bits, unset_flag_bits,
6132 refcount_change);
6133
6134 if (forget_owner)
6135 ResourceOwnerForgetBufferIO(CurrentResourceOwner,
6136 BufferDescriptorGetBuffer(buf));
6137
6138 ConditionVariableBroadcast(BufferDescriptorGetIOCV(buf));
6139
6140 /*
6141 * Support LockBufferForCleanup()
6142 *
6143 * We may have just released the last pin other than the waiter's. In most
6144 * cases, this backend holds another pin on the buffer. But, if, for
6145 * example, this backend is completing an IO issued by another backend, it
6146 * may be time to wake the waiter.
6147 */
6148 if (release_aio && (buf_state & BM_PIN_COUNT_WAITER))
6149 WakePinCountWaiter(buf);
6150}
static ConditionVariable * BufferDescriptorGetIOCV(const BufferDesc *bdesc)
static void WakePinCountWaiter(BufferDesc *buf)
Definition: bufmgr.c:3211
void ConditionVariableBroadcast(ConditionVariable *cv)

References Assert(), BM_CHECKPOINT_NEEDED, BM_DIRTY, BM_IO_ERROR, BM_IO_IN_PROGRESS, BM_JUST_DIRTIED, BM_PIN_COUNT_WAITER, buf, BUF_STATE_GET_REFCOUNT, BufferDescriptorGetBuffer(), BufferDescriptorGetIOCV(), ConditionVariableBroadcast(), CurrentResourceOwner, LockBufHdr(), pgaio_wref_clear(), ResourceOwnerForgetBufferIO(), UnlockBufHdrExt(), and WakePinCountWaiter().

Referenced by AbortBufferIO(), buffer_call_terminate_io(), buffer_readv_complete_one(), ExtendBufferedRelShared(), FlushBuffer(), and ZeroAndLockBuffer().

◆ TrackNewBufferPin()

void TrackNewBufferPin ( Buffer  buf)
inline

Definition at line 3303 of file bufmgr.c.

3304{
3305 PrivateRefCountEntry *ref;
3306
3307 ref = NewPrivateRefCountEntry(buf);
3308 ref->refcount++;
3309
3310 ResourceOwnerRememberBuffer(CurrentResourceOwner, buf);
3311
3312 /*
3313 * This is the first pin for this page by this backend, mark its page as
3314 * defined to valgrind. While the page contents might not actually be
3315 * valid yet, we don't currently guarantee that such pages are marked
3316 * undefined or non-accessible.
3317 *
3318 * It's not necessarily the prettiest to do this here, but otherwise we'd
3319 * need this block of code in multiple places.
3320 */
3321 VALGRIND_MAKE_MEM_DEFINED(BufHdrGetBlock(GetBufferDescriptor(buf - 1)),
3322 BLCKSZ);
3323}
static PrivateRefCountEntry * NewPrivateRefCountEntry(Buffer buffer)
Definition: bufmgr.c:325

References buf, BufHdrGetBlock, CurrentResourceOwner, GetBufferDescriptor(), NewPrivateRefCountEntry(), PrivateRefCountEntry::refcount, ResourceOwnerRememberBuffer(), and VALGRIND_MAKE_MEM_DEFINED.

Referenced by GetBufferFromRing(), PinBuffer(), PinBuffer_Locked(), and StrategyGetBuffer().

◆ ts_ckpt_progress_comparator()

static int ts_ckpt_progress_comparator ( Datum  a,
Datum  b,
void *  arg 
)
static

Definition at line 6383 of file bufmgr.c.

6384{
6385 CkptTsStatus *sa = (CkptTsStatus *) DatumGetPointer(a);
6386 CkptTsStatus *sb = (CkptTsStatus *) DatumGetPointer(b);
6387
6388 /* we want a min-heap, so return 1 when a < b */
6389 if (sa->progress < sb->progress)
6390 return 1;
6391 else if (sa->progress == sb->progress)
6392 return 0;
6393 else
6394 return -1;
6395}

References a, b, DatumGetPointer(), and CkptTsStatus::progress.

Referenced by BufferSync().
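
Since binaryheap keeps the element that compares largest at the top, inverting the comparison sign turns the heap into a min-heap on progress: BufferSync() always pops the tablespace whose checkpoint writes are least far along, keeping the per-tablespace write streams balanced.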

◆ UnlockBuffers()

void UnlockBuffers ( void  )

Definition at line 5573 of file bufmgr.c.

5574{
5575 BufferDesc *buf = PinCountWaitBuf;
5576
5577 if (buf)
5578 {
5579 uint32 buf_state;
5580 uint32 unset_bits = 0;
5581
5582 buf_state = LockBufHdr(buf);
5583
5584 /*
5585 * Don't complain if flag bit not set; it could have been reset but we
5586 * got a cancel/die interrupt before getting the signal.
5587 */
5588 if ((buf_state & BM_PIN_COUNT_WAITER) != 0 &&
5589 buf->wait_backend_pgprocno == MyProcNumber)
5590 unset_bits = BM_PIN_COUNT_WAITER;
5591
5592 UnlockBufHdrExt(buf, buf_state,
5593 0, unset_bits,
5594 0);
5595
5596 PinCountWaitBuf = NULL;
5597 }
5598}

References BM_PIN_COUNT_WAITER, buf, LockBufHdr(), MyProcNumber, PinCountWaitBuf, and UnlockBufHdrExt().

Referenced by AbortSubTransaction(), AbortTransaction(), AtProcExit_Buffers(), AutoVacLauncherMain(), BackgroundWriterMain(), CheckpointerMain(), and WalWriterMain().

◆ UnlockReleaseBuffer()

void UnlockReleaseBuffer ( Buffer  buffer)

Definition at line 5383 of file bufmgr.c.

5384{
5386 ReleaseBuffer(buffer);
5387}

References PrivateRefCountEntry::buffer, BUFFER_LOCK_UNLOCK, LockBuffer(), and ReleaseBuffer().

Referenced by _bt_clear_incomplete_split(), _bt_restore_meta(), _hash_relbuf(), allocNewBuffer(), AlterSequence(), blbulkdelete(), blgetbitmap(), blinsert(), BloomInitMetapage(), blvacuumcleanup(), brin_doinsert(), brin_doupdate(), brin_evacuate_page(), brin_getinsertbuffer(), brin_xlog_createidx(), brin_xlog_desummarize_page(), brin_xlog_insert_update(), brin_xlog_revmap_extend(), brin_xlog_samepage_update(), brin_xlog_update(), brinbuild(), brinbuildempty(), brinGetStats(), brinRevmapDesummarizeRange(), bt_metap(), bt_multi_page_stats(), bt_page_items_internal(), bt_page_stats_internal(), bt_recheck_sibling_links(), btree_xlog_dedup(), btree_xlog_delete(), btree_xlog_insert(), btree_xlog_mark_page_halfdead(), btree_xlog_newroot(), btree_xlog_split(), btree_xlog_unlink_page(), btree_xlog_vacuum(), collect_corrupt_items(), collect_visibility_data(), count_nondeletable_pages(), createPostingTree(), doPickSplit(), entryLoadMoreItems(), fill_seq_fork_with_data(), flushCachedPage(), FreeSpaceMapPrepareTruncateRel(), fsm_search(), fsm_set_and_search(), generic_redo(), gin_refind_parent(), ginbuild(), ginbuildempty(), ginbulkdelete(), ginGetStats(), ginHeapTupleFastInsert(), ginInsertCleanup(), ginPlaceToPage(), ginRedoClearIncompleteSplit(), ginRedoCreatePTree(), ginRedoDeleteListPages(), ginRedoDeletePage(), ginRedoInsert(), ginRedoInsertListPage(), ginRedoSplit(), ginRedoUpdateMetapage(), ginRedoVacuumDataLeafPage(), ginRedoVacuumPage(), ginScanToDelete(), ginStepRight(), ginUpdateStats(), ginvacuumcleanup(), ginVacuumPostingTree(), ginVacuumPostingTreeLeaves(), gistbufferinginserttuples(), gistbuild(), gistbuildempty(), gistdoinsert(), gistFindCorrectParent(), gistFindPath(), gistGetMaxLevel(), gistinserttuples(), gistkillitems(), gistplacetopage(), gistProcessItup(), gistRedoClearFollowRight(), gistRedoDeleteRecord(), gistRedoPageDelete(), gistRedoPageSplitRecord(), gistRedoPageUpdateRecord(), gistScanPage(), gistvacuum_delete_empty_pages(), gistvacuumpage(), hash_xlog_add_ovfl_page(), hash_xlog_delete(), hash_xlog_init_bitmap_page(), hash_xlog_init_meta_page(), hash_xlog_insert(), hash_xlog_move_page_contents(), hash_xlog_split_allocate_page(), hash_xlog_split_cleanup(), hash_xlog_split_complete(), hash_xlog_split_page(), hash_xlog_squeeze_page(), hash_xlog_update_meta_page(), hash_xlog_vacuum_one_page(), heap_delete(), heap_finish_speculative(), heap_force_common(), heap_get_latest_tid(), heap_index_delete_tuples(), heap_insert(), heap_lock_updated_tuple_rec(), heap_multi_insert(), heap_update(), heap_xlog_confirm(), heap_xlog_delete(), heap_xlog_inplace(), heap_xlog_insert(), heap_xlog_lock(), heap_xlog_lock_updated(), heap_xlog_multi_insert(), heap_xlog_prune_freeze(), heap_xlog_update(), heap_xlog_visible(), heapam_scan_analyze_next_tuple(), initBloomState(), lazy_scan_heap(), lazy_scan_new_or_empty(), lazy_vacuum_heap_rel(), log_newpage_range(), moveLeafs(), nextval_internal(), palloc_btree_page(), pg_get_sequence_data(), pg_sequence_last_value(), pg_visibility(), pgstat_gist_page(), pgstat_heap(), pgstatginindex_internal(), pgstathashindex(), RelationCopyStorageUsingBuffer(), RelationGetBufferForTuple(), ResetSequence(), revmap_physical_extend(), scanGetCandidate(), scanPendingInsert(), scanPostingTree(), ScanSourceDatabasePgClass(), seq_redo(), SequenceChangePersistence(), SetSequence(), shiftList(), spgAddNodeAction(), spgbuild(), spgdoinsert(), spgGetCache(), SpGistGetBuffer(), SpGistUpdateMetaPage(), spgMatchNodeAction(), spgprocesspending(), spgRedoAddLeaf(), spgRedoAddNode(), 
spgRedoMoveLeafs(), spgRedoPickSplit(), spgRedoSplitTuple(), spgRedoVacuumLeaf(), spgRedoVacuumRedirect(), spgRedoVacuumRoot(), spgSplitNodeAction(), spgvacuumpage(), spgWalk(), statapprox_heap(), verify_heapam(), verifyBackupPageConsistency(), visibilitymap_prepare_truncate(), writeListPage(), xlog_redo(), and XLogRecordPageWithFreeSpace().

◆ UnpinBuffer()

static void UnpinBuffer ( BufferDesc *  buf)
static

◆ UnpinBufferNoOwner()

static void UnpinBufferNoOwner ( BufferDesc buf)
static

Definition at line 3256 of file bufmgr.c.

3257{
3258 PrivateRefCountEntry *ref;
3259 Buffer b = BufferDescriptorGetBuffer(buf);
3260
3261 Assert(!BufferIsLocal(b));
3262
3263 /* not moving as we're likely deleting it soon anyway */
3264 ref = GetPrivateRefCountEntry(b, false);
3265 Assert(ref != NULL);
3266 Assert(ref->refcount > 0);
3267 ref->refcount--;
3268 if (ref->refcount == 0)
3269 {
3270 uint32 old_buf_state;
3271
3272 /*
3273 * Mark buffer non-accessible to Valgrind.
3274 *
3275 * Note that the buffer may have already been marked non-accessible
3276 * within access method code that enforces that buffers are only
3277 * accessed while a buffer lock is held.
3278 */
3279 VALGRIND_MAKE_MEM_NOACCESS(BufHdrGetBlock(buf), BLCKSZ);
3280
3281 /*
3282 * I'd better not still hold the buffer content lock. Can't use
3283 * BufferIsLockedByMe(), as that asserts the buffer is pinned.
3284 */
3285 Assert(!LWLockHeldByMe(BufferDescriptorGetContentLock(buf)));
3286
3287 /* decrement the shared reference count */
3288 old_buf_state = pg_atomic_fetch_sub_u32(&buf->state, BUF_REFCOUNT_ONE);
3289
3290 /* Support LockBufferForCleanup() */
3291 if (old_buf_state & BM_PIN_COUNT_WAITER)
3292 WakePinCountWaiter(buf);
3293
3294 ForgetPrivateRefCountEntry(ref);
3295 }
3296}
static uint32 pg_atomic_fetch_sub_u32(volatile pg_atomic_uint32 *ptr, int32 sub_)
Definition: atomics.h:379
static void ForgetPrivateRefCountEntry(PrivateRefCountEntry *ref)
Definition: bufmgr.c:448

References Assert(), b, BM_PIN_COUNT_WAITER, buf, BUF_REFCOUNT_ONE, BufferDescriptorGetBuffer(), BufferDescriptorGetContentLock(), BufferIsLocal, BufHdrGetBlock, ForgetPrivateRefCountEntry(), GetPrivateRefCountEntry(), LWLockHeldByMe(), pg_atomic_fetch_sub_u32(), PrivateRefCountEntry::refcount, VALGRIND_MAKE_MEM_NOACCESS, and WakePinCountWaiter().

Referenced by ResOwnerReleaseBufferPin(), and UnpinBuffer().

◆ WaitBufHdrUnlocked()

pg_noinline uint32 WaitBufHdrUnlocked ( BufferDesc buf)

Definition at line 6294 of file bufmgr.c.

6295{
6296 SpinDelayStatus delayStatus;
6297 uint32 buf_state;
6298
6299 init_local_spin_delay(&delayStatus);
6300
6301 buf_state = pg_atomic_read_u32(&buf->state);
6302
6303 while (buf_state & BM_LOCKED)
6304 {
6305 perform_spin_delay(&delayStatus);
6306 buf_state = pg_atomic_read_u32(&buf->state);
6307 }
6308
6309 finish_spin_delay(&delayStatus);
6310
6311 return buf_state;
6312}

References BM_LOCKED, buf, finish_spin_delay(), init_local_spin_delay, perform_spin_delay(), and pg_atomic_read_u32().

Referenced by GetBufferFromRing(), MarkBufferDirty(), PinBuffer(), and StrategyGetBuffer().
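
A sketch (not part of bufmgr.c) of the caller pattern, e.g. in PinBuffer(): a compare-and-swap loop that backs off via WaitBufHdrUnlocked() whenever the header lock bit is set; buf and the intended state change are assumed:

 uint32  old_buf_state = pg_atomic_read_u32(&buf->state);
 uint32  buf_state;

 for (;;)
 {
     if (old_buf_state & BM_LOCKED)
         old_buf_state = WaitBufHdrUnlocked(buf);

     buf_state = old_buf_state + BUF_REFCOUNT_ONE;  /* desired new state */

     if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state,
                                        buf_state))
         break;
 }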

◆ WaitIO()

static void WaitIO ( BufferDesc buf)
static

Definition at line 5967 of file bufmgr.c.

5968{
5969 ConditionVariable *cv = BufferDescriptorGetIOCV(buf);
5970
5971 ConditionVariablePrepareToSleep(cv);
5972 for (;;)
5973 {
5974 uint32 buf_state;
5975 PgAioWaitRef iow;
5976
5977 /*
5978 * It may not be necessary to acquire the spinlock to check the flag
5979 * here, but since this test is essential for correctness, we'd better
5980 * play it safe.
5981 */
5982 buf_state = LockBufHdr(buf);
5983
5984 /*
5985 * Copy the wait reference while holding the spinlock. This protects
5986 * against a concurrent TerminateBufferIO() in another backend from
5987 * clearing the wref while it's being read.
5988 */
5989 iow = buf->io_wref;
5990 UnlockBufHdr(buf);
5991
5992 /* no IO in progress, we don't need to wait */
5993 if (!(buf_state & BM_IO_IN_PROGRESS))
5994 break;
5995
5996 /*
5997 * The buffer has asynchronous IO in progress, wait for it to
5998 * complete.
5999 */
6000 if (pgaio_wref_valid(&iow))
6001 {
6002 pgaio_wref_wait(&iow);
6003
6004 /*
6005 * The AIO subsystem internally uses condition variables and thus
6006 * might remove this backend from the BufferDesc's CV. While that
6007 * wouldn't cause a correctness issue (the first CV sleep just
6008 * immediately returns if not already registered), it seems worth
6009 * avoiding unnecessary loop iterations, given that we take care
6010 * to do so at the start of the function.
6011 */
6013 continue;
6014 }
6015
6016 /* wait on BufferDesc->cv, e.g. for concurrent synchronous IO */
6017 ConditionVariableSleep(cv, WAIT_EVENT_BUFFER_IO);
6018 }
6020}

References BM_IO_IN_PROGRESS, buf, BufferDescriptorGetIOCV(), ConditionVariableCancelSleep(), ConditionVariablePrepareToSleep(), ConditionVariableSleep(), LockBufHdr(), pgaio_wref_valid(), pgaio_wref_wait(), and UnlockBufHdr().

Referenced by InvalidateBuffer(), and StartBufferIO().
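The loop above is an instance of the standard condition-variable protocol (prepare, recheck, sleep, cancel); stripped of the buffer-header details, it reduces to the following shape, where io_in_progress() is a hypothetical stand-in for the BM_IO_IN_PROGRESS recheck:

ConditionVariablePrepareToSleep(cv);
while (io_in_progress())    /* recheck the condition each time we wake */
    ConditionVariableSleep(cv, WAIT_EVENT_BUFFER_IO);
ConditionVariableCancelSleep();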

◆ WaitReadBuffers()

void WaitReadBuffers ( ReadBuffersOperation *  operation )

Definition at line 1613 of file bufmgr.c.

1614{
1615 PgAioReturn *aio_ret = &operation->io_return;
1616 IOContext io_context;
1617 IOObject io_object;
1618
1619 if (operation->persistence == RELPERSISTENCE_TEMP)
1620 {
1621 io_context = IOCONTEXT_NORMAL;
1622 io_object = IOOBJECT_TEMP_RELATION;
1623 }
1624 else
1625 {
1626 io_context = IOContextForStrategy(operation->strategy);
1627 io_object = IOOBJECT_RELATION;
1628 }
1629
1630 /*
1631 * If we get here without an IO operation having been issued, the
1632 * io_method == IOMETHOD_SYNC path must have been used. Otherwise the
1633 * caller should not have called WaitReadBuffers().
1634 *
1635 * In the case of IOMETHOD_SYNC, we start the IO here in
1636 * WaitReadBuffers(), as we did before the introduction of AIO. This is
1637 * done as part of the retry logic below; no extra code is required.
1638 *
1639 * This path is expected to eventually go away.
1640 */
1641 if (!pgaio_wref_valid(&operation->io_wref) && io_method != IOMETHOD_SYNC)
1642 elog(ERROR, "waiting for read operation that didn't read");
1643
1644 /*
1645 * To handle partial reads, and IOMETHOD_SYNC, we re-issue IO until we're
1646 * done. We may need multiple retries, not just because we could get
1647 * multiple partial reads, but also because some of the remaining
1648 * to-be-read buffers may have been read in by other backends, limiting
1649 * the IO size.
1650 */
1651 while (true)
1652 {
1653 int ignored_nblocks_progress;
1654
1655 CheckReadBuffersOperation(operation, false);
1656
1657 /*
1658 * If there is an IO associated with the operation, we may need to
1659 * wait for it.
1660 */
1661 if (pgaio_wref_valid(&operation->io_wref))
1662 {
1663 /*
1664 * Track the time spent waiting for the IO to complete. As
1665 * tracking a wait even if we don't actually need to wait
1666 *
1667 * a) is not cheap, due to the timestamping overhead
1668 *
1669 * b) reports some time as waiting, even if we never waited
1670 *
1671 * we first check if we already know the IO is complete.
1672 */
1673 if (aio_ret->result.status == PGAIO_RS_UNKNOWN &&
1674 !pgaio_wref_check_done(&operation->io_wref))
1675 {
1676 instr_time io_start = pgstat_prepare_io_time(track_io_timing);
1677
1678 pgaio_wref_wait(&operation->io_wref);
1679
1680 /*
1681 * The IO operation itself was already counted earlier, in
1682 * AsyncReadBuffers(); this just accounts for the wait time.
1683 */
1684 pgstat_count_io_op_time(io_object, io_context, IOOP_READ,
1685 io_start, 0, 0);
1686 }
1687 else
1688 {
1689 Assert(pgaio_wref_check_done(&operation->io_wref));
1690 }
1691
1692 /*
1693 * We now are sure the IO completed. Check the results. This
1694 * includes reporting on errors if there were any.
1695 */
1696 ProcessReadBuffersResult(operation);
1697 }
1698
1699 /*
1700 * Most of the time, the one IO we already started will read in
1701 * everything. But we need to deal with partial reads and buffers not
1702 * needing IO anymore.
1703 */
1704 if (operation->nblocks_done == operation->nblocks)
1705 break;
1706
1707 CHECK_FOR_INTERRUPTS();
1708
1709 /*
1710 * This may only complete the IO partially, either because some
1711 * buffers were already valid, or because of a partial read.
1712 *
1713 * NB: In contrast to after the AsyncReadBuffers() call in
1714 * StartReadBuffers(), we do *not* reduce
1715 * ReadBuffersOperation->nblocks here, callers expect the full
1716 * operation to be completed at this point (as more operations may
1717 * have been queued).
1718 */
1719 AsyncReadBuffers(operation, &ignored_nblocks_progress);
1720 }
1721
1722 CheckReadBuffersOperation(operation, true);
1723
1724 /* NB: READ_DONE tracepoint was already executed in completion callback */
1725}

References Assert(), AsyncReadBuffers(), CHECK_FOR_INTERRUPTS, CheckReadBuffersOperation(), elog, ERROR, io_method, ReadBuffersOperation::io_return, ReadBuffersOperation::io_wref, IOCONTEXT_NORMAL, IOContextForStrategy(), IOMETHOD_SYNC, IOOBJECT_RELATION, IOOBJECT_TEMP_RELATION, IOOP_READ, ReadBuffersOperation::nblocks, ReadBuffersOperation::nblocks_done, ReadBuffersOperation::persistence, PGAIO_RS_UNKNOWN, pgaio_wref_check_done(), pgaio_wref_valid(), pgaio_wref_wait(), pgstat_count_io_op_time(), pgstat_prepare_io_time(), ProcessReadBuffersResult(), PgAioReturn::result, PgAioResult::status, ReadBuffersOperation::strategy, and track_io_timing.

Referenced by read_stream_next_buffer(), and ReadBuffer_common().
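A minimal sketch of the intended call pattern, modeled on ReadBuffer_common() (rel, forkNum and blockNum are assumed; persistence handling and error cases simplified): WaitReadBuffers() is needed only when StartReadBuffer() reports that an IO was actually started:

ReadBuffersOperation operation;
Buffer buffer;

operation.smgr = RelationGetSmgr(rel);
operation.rel = rel;
operation.persistence = rel->rd_rel->relpersistence;
operation.forknum = forkNum;
operation.strategy = NULL;

if (StartReadBuffer(&operation, &buffer, blockNum, 0))
    WaitReadBuffers(&operation);    /* also completes any partial reads */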

◆ WakePinCountWaiter()

static void WakePinCountWaiter ( BufferDesc *  buf )
static

Definition at line 3211 of file bufmgr.c.

3212{
3213 /*
3214 * Acquire the buffer header lock, re-check that there's a waiter. Another
3215 * backend could have unpinned this buffer, and already woken up the
3216 * waiter.
3217 *
3218 * There's no danger of the buffer being replaced after we unpinned it
3219 * above, as it's pinned by the waiter. The waiter removes
3220 * BM_PIN_COUNT_WAITER if it stops waiting for a reason other than this
3221 * backend waking it up.
3222 */
3223 uint32 buf_state = LockBufHdr(buf);
3224
3225 if ((buf_state & BM_PIN_COUNT_WAITER) &&
3226 BUF_STATE_GET_REFCOUNT(buf_state) == 1)
3227 {
3228 /* we just released the last pin other than the waiter's */
3229 int wait_backend_pgprocno = buf->wait_backend_pgprocno;
3230
3231 UnlockBufHdrExt(buf, buf_state,
3232 0, BM_PIN_COUNT_WAITER,
3233 0);
3234 ProcSendSignal(wait_backend_pgprocno);
3235 }
3236 else
3237 UnlockBufHdr(buf, buf_state);
3238}

References BM_PIN_COUNT_WAITER, buf, BUF_STATE_GET_REFCOUNT, LockBufHdr(), ProcSendSignal(), UnlockBufHdr(), and UnlockBufHdrExt().

Referenced by TerminateBufferIO(), and UnpinBufferNoOwner().
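WakePinCountWaiter() is the releasing half of the cleanup-lock handshake; the waiting half is entered through LockBufferForCleanup(). A caller-level sketch (rel and blkno assumed):

Buffer buf = ReadBuffer(rel, blkno);

LockBufferForCleanup(buf);  /* sets BM_PIN_COUNT_WAITER and sleeps in
                             * ProcWaitForSignal() until the last other
                             * unpinner calls WakePinCountWaiter() */
/* ... perform pruning or other work requiring a cleanup lock ... */
UnlockReleaseBuffer(buf);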

◆ WritebackContextInit()

void WritebackContextInit ( WritebackContext *  context,
int *  max_pending 
)

Definition at line 6406 of file bufmgr.c.

6407{
6408 Assert(*max_pending <= WRITEBACK_MAX_PENDING_FLUSHES);
6409
6410 context->max_pending = max_pending;
6411 context->nr_pending = 0;
6412}

References Assert(), WritebackContext::max_pending, WritebackContext::nr_pending, and WRITEBACK_MAX_PENDING_FLUSHES.

Referenced by BackgroundWriterMain(), BufferManagerShmemInit(), and BufferSync().
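A short usage sketch modeled on BackgroundWriterMain() (buf_hdr is an assumed BufferDesc pointer): because the context stores a pointer to the limit rather than its value, later changes to the GUC take effect without reinitialization, and each completed write is queued for eventual flushing:

WritebackContext wb_context;

WritebackContextInit(&wb_context, &bgwriter_flush_after);

/* ... after each buffer written out ... */
ScheduleBufferTagForWriteback(&wb_context, IOCONTEXT_NORMAL, &buf_hdr->tag);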

◆ ZeroAndLockBuffer()

static void ZeroAndLockBuffer ( Buffer  buffer,
ReadBufferMode  mode,
bool  already_valid 
)
static

Definition at line 1012 of file bufmgr.c.

1013{
1014 BufferDesc *bufHdr;
1015 bool need_to_zero;
1016 bool isLocalBuf = BufferIsLocal(buffer);
1017
1018 Assert(mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK);
1019
1020 if (already_valid)
1021 {
1022 /*
1023 * If the caller already knew the buffer was valid, we can skip some
1024 * header interaction. The caller just wants to lock the buffer.
1025 */
1026 need_to_zero = false;
1027 }
1028 else if (isLocalBuf)
1029 {
1030 /* Simple case for non-shared buffers. */
1031 bufHdr = GetLocalBufferDescriptor(-buffer - 1);
1032 need_to_zero = StartLocalBufferIO(bufHdr, true, false);
1033 }
1034 else
1035 {
1036 /*
1037 * Take BM_IO_IN_PROGRESS, or discover that BM_VALID has been set
1038 * concurrently. Even though we aren't doing I/O, that ensures that
1039 * we don't zero a page that someone else has pinned. An exclusive
1040 * content lock wouldn't be enough, because readers are allowed to
1041 * drop the content lock after determining that a tuple is visible
1042 * (see buffer access rules in README).
1043 */
1044 bufHdr = GetBufferDescriptor(buffer - 1);
1045 need_to_zero = StartBufferIO(bufHdr, true, false);
1046 }
1047
1048 if (need_to_zero)
1049 {
1050 memset(BufferGetPage(buffer), 0, BLCKSZ);
1051
1052 /*
1053 * Grab the buffer content lock before marking the page as valid, to
1054 * make sure that no other backend sees the zeroed page before the
1055 * caller has had a chance to initialize it.
1056 *
1057 * Since no-one else can be looking at the page contents yet, there is
1058 * no difference between an exclusive lock and a cleanup-strength
1059 * lock. (Note that we cannot use LockBuffer() or
1060 * LockBufferForCleanup() here, because they assert that the buffer is
1061 * already valid.)
1062 */
1063 if (!isLocalBuf)
1064 LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
1065
1066 /* Set BM_VALID, terminate IO, and wake up any waiters */
1067 if (isLocalBuf)
1068 TerminateLocalBufferIO(bufHdr, false, BM_VALID, false);
1069 else
1070 TerminateBufferIO(bufHdr, false, BM_VALID, true, false);
1071 }
1072 else if (!isLocalBuf)
1073 {
1074 /*
1075 * The buffer is valid, so we can't zero it. The caller still expects
1076 * the page to be locked on return.
1077 */
1078 if (mode == RBM_ZERO_AND_LOCK)
1079 LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
1080 else
1081 LockBufferForCleanup(buffer);
1082 }
1083}

References Assert(), BM_VALID, PrivateRefCountEntry::buffer, BUFFER_LOCK_EXCLUSIVE, BufferGetPage(), BufferIsLocal, GetBufferDescriptor(), GetLocalBufferDescriptor(), LockBuffer(), LockBufferForCleanup(), mode, RBM_ZERO_AND_CLEANUP_LOCK, RBM_ZERO_AND_LOCK, StartBufferIO(), StartLocalBufferIO(), TerminateBufferIO(), and TerminateLocalBufferIO().

Referenced by ReadBuffer_common().
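From a caller's point of view, this path is reached through the RBM_ZERO_AND_LOCK and RBM_ZERO_AND_CLEANUP_LOCK read modes; a minimal sketch for setting up a brand-new page (rel and blkno assumed):

Buffer buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno,
                                RBM_ZERO_AND_LOCK, NULL);

PageInit(BufferGetPage(buf), BufferGetPageSize(buf), 0);    /* caller initializes */
MarkBufferDirty(buf);
UnlockReleaseBuffer(buf);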

Variable Documentation

◆ aio_local_buffer_readv_cb

const PgAioHandleCallbacks aio_local_buffer_readv_cb
Initial value:
= {
.stage = local_buffer_readv_stage,
.complete_local = local_buffer_readv_complete,
.report = buffer_readv_report,
}

Definition at line 7645 of file bufmgr.c.

◆ aio_shared_buffer_readv_cb

const PgAioHandleCallbacks aio_shared_buffer_readv_cb
Initial value:
= {
.stage = shared_buffer_readv_stage,
.complete_shared = shared_buffer_readv_complete,
.complete_local = shared_buffer_readv_complete_local,
.report = buffer_readv_report,
}

Definition at line 7636 of file bufmgr.c.

◆ backend_flush_after

int backend_flush_after = DEFAULT_BACKEND_FLUSH_AFTER

Definition at line 180 of file bufmgr.c.

Referenced by BufferManagerShmemInit().

◆ bgwriter_flush_after

int bgwriter_flush_after = DEFAULT_BGWRITER_FLUSH_AFTER

Definition at line 179 of file bufmgr.c.

Referenced by BackgroundWriterMain().

◆ bgwriter_lru_maxpages

int bgwriter_lru_maxpages = 100

Definition at line 145 of file bufmgr.c.

Referenced by BgBufferSync().

◆ bgwriter_lru_multiplier

double bgwriter_lru_multiplier = 2.0

Definition at line 146 of file bufmgr.c.

Referenced by BgBufferSync().

◆ buffer_io_resowner_desc

const ResourceOwnerDesc buffer_io_resowner_desc
Initial value:
=
{
.name = "buffer io",
.release_phase = RESOURCE_RELEASE_BEFORE_LOCKS,
.release_priority = RELEASE_PRIO_BUFFER_IOS,
.ReleaseResource = ResOwnerReleaseBufferIO,
.DebugPrint = ResOwnerPrintBufferIO
}

Definition at line 235 of file bufmgr.c.

Referenced by ResourceOwnerForgetBufferIO(), and ResourceOwnerRememberBufferIO().

◆ buffer_pin_resowner_desc

const ResourceOwnerDesc buffer_pin_resowner_desc
Initial value:
=
{
.name = "buffer pin",
.release_phase = RESOURCE_RELEASE_BEFORE_LOCKS,
.release_priority = RELEASE_PRIO_BUFFER_PINS,
.ReleaseResource = ResOwnerReleaseBufferPin,
.DebugPrint = ResOwnerPrintBufferPin
}

Definition at line 244 of file bufmgr.c.

Referenced by ResourceOwnerForgetBuffer(), and ResourceOwnerRememberBuffer().

◆ checkpoint_flush_after

int checkpoint_flush_after = DEFAULT_CHECKPOINT_FLUSH_AFTER

Definition at line 178 of file bufmgr.c.

Referenced by BufferSync().

◆ effective_io_concurrency

◆ io_combine_limit

◆ io_combine_limit_guc

int io_combine_limit_guc = DEFAULT_IO_COMBINE_LIMIT

Definition at line 171 of file bufmgr.c.

Referenced by assign_io_max_combine_limit().

◆ io_max_combine_limit

◆ maintenance_io_concurrency

◆ MaxProportionalPins

uint32 MaxProportionalPins
static

Definition at line 221 of file bufmgr.c.

Referenced by GetAdditionalPinLimit(), GetPinLimit(), and InitBufferManagerAccess().

◆ PinCountWaitBuf

BufferDesc* PinCountWaitBuf = NULL
static

Definition at line 183 of file bufmgr.c.

Referenced by LockBufferForCleanup(), and UnlockBuffers().

◆ PrivateRefCountArray

◆ PrivateRefCountClock

uint32 PrivateRefCountClock = 0
static

Definition at line 218 of file bufmgr.c.

Referenced by ReservePrivateRefCountEntry().

◆ PrivateRefCountHash

◆ PrivateRefCountOverflowed

◆ ReservedRefCountEntry

◆ track_io_timing

◆ zero_damaged_pages

bool zero_damaged_pages = false

Definition at line 144 of file bufmgr.c.

Referenced by AsyncReadBuffers(), mdreadv(), and read_rel_block_ll().