PostgreSQL Source Code git master
bufmgr.c File Reference
#include "postgres.h"
#include <sys/file.h>
#include <unistd.h>
#include "access/tableam.h"
#include "access/xloginsert.h"
#include "access/xlogutils.h"
#include "catalog/storage.h"
#include "catalog/storage_xlog.h"
#include "executor/instrument.h"
#include "lib/binaryheap.h"
#include "miscadmin.h"
#include "pg_trace.h"
#include "pgstat.h"
#include "postmaster/bgwriter.h"
#include "storage/aio.h"
#include "storage/buf_internals.h"
#include "storage/bufmgr.h"
#include "storage/fd.h"
#include "storage/ipc.h"
#include "storage/lmgr.h"
#include "storage/proc.h"
#include "storage/read_stream.h"
#include "storage/smgr.h"
#include "storage/standby.h"
#include "utils/memdebug.h"
#include "utils/ps_status.h"
#include "utils/rel.h"
#include "utils/resowner.h"
#include "utils/timestamp.h"
#include "lib/sort_template.h"
Include dependency graph for bufmgr.c: (graph omitted)


Data Structures

struct  PrivateRefCountEntry
 
struct  CkptTsStatus
 
struct  SMgrSortArray
 

Macros

#define BufHdrGetBlock(bufHdr)   ((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ))
 
#define BufferGetLSN(bufHdr)   (PageGetLSN(BufHdrGetBlock(bufHdr)))
 
#define LocalBufHdrGetBlock(bufHdr)    LocalBufferBlockPointers[-((bufHdr)->buf_id + 2)]
 
#define BUF_WRITTEN   0x01
 
#define BUF_REUSABLE   0x02
 
#define RELS_BSEARCH_THRESHOLD   20
 
#define BUF_DROP_FULL_SCAN_THRESHOLD   (uint64) (NBuffers / 32)
 
#define REFCOUNT_ARRAY_ENTRIES   8
 
#define BufferIsPinned(bufnum)
 
#define ST_SORT   sort_checkpoint_bufferids
 
#define ST_ELEMENT_TYPE   CkptSortItem
 
#define ST_COMPARE(a, b)   ckpt_buforder_comparator(a, b)
 
#define ST_SCOPE   static
 
#define ST_DEFINE
 
#define ST_SORT   sort_pending_writebacks
 
#define ST_ELEMENT_TYPE   PendingWriteback
 
#define ST_COMPARE(a, b)   buffertag_comparator(&a->tag, &b->tag)
 
#define ST_SCOPE   static
 
#define ST_DEFINE
 
#define READV_COUNT_BITS   7
 
#define READV_COUNT_MASK   ((1 << READV_COUNT_BITS) - 1)
 

Typedefs

typedef struct PrivateRefCountEntry PrivateRefCountEntry
 
typedef struct CkptTsStatus CkptTsStatus
 
typedef struct SMgrSortArray SMgrSortArray
 

Functions

static void ReservePrivateRefCountEntry (void)
 
static PrivateRefCountEntry * NewPrivateRefCountEntry (Buffer buffer)
 
static PrivateRefCountEntry * GetPrivateRefCountEntry (Buffer buffer, bool do_move)
 
static int32 GetPrivateRefCount (Buffer buffer)
 
static void ForgetPrivateRefCountEntry (PrivateRefCountEntry *ref)
 
static void ResOwnerReleaseBufferIO (Datum res)
 
static char * ResOwnerPrintBufferIO (Datum res)
 
static void ResOwnerReleaseBufferPin (Datum res)
 
static char * ResOwnerPrintBufferPin (Datum res)
 
static Buffer ReadBuffer_common (Relation rel, SMgrRelation smgr, char smgr_persistence, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy)
 
static BlockNumber ExtendBufferedRelCommon (BufferManagerRelation bmr, ForkNumber fork, BufferAccessStrategy strategy, uint32 flags, uint32 extend_by, BlockNumber extend_upto, Buffer *buffers, uint32 *extended_by)
 
static BlockNumber ExtendBufferedRelShared (BufferManagerRelation bmr, ForkNumber fork, BufferAccessStrategy strategy, uint32 flags, uint32 extend_by, BlockNumber extend_upto, Buffer *buffers, uint32 *extended_by)
 
static bool PinBuffer (BufferDesc *buf, BufferAccessStrategy strategy)
 
static void PinBuffer_Locked (BufferDesc *buf)
 
static void UnpinBuffer (BufferDesc *buf)
 
static void UnpinBufferNoOwner (BufferDesc *buf)
 
static void BufferSync (int flags)
 
static uint32 WaitBufHdrUnlocked (BufferDesc *buf)
 
static int SyncOneBuffer (int buf_id, bool skip_recently_used, WritebackContext *wb_context)
 
static void WaitIO (BufferDesc *buf)
 
static void AbortBufferIO (Buffer buffer)
 
static void shared_buffer_write_error_callback (void *arg)
 
static void local_buffer_write_error_callback (void *arg)
 
static BufferDesc * BufferAlloc (SMgrRelation smgr, char relpersistence, ForkNumber forkNum, BlockNumber blockNum, BufferAccessStrategy strategy, bool *foundPtr, IOContext io_context)
 
static bool AsyncReadBuffers (ReadBuffersOperation *operation, int *nblocks_progress)
 
static void CheckReadBuffersOperation (ReadBuffersOperation *operation, bool is_complete)
 
static Buffer GetVictimBuffer (BufferAccessStrategy strategy, IOContext io_context)
 
static void FlushBuffer (BufferDesc *buf, SMgrRelation reln, IOObject io_object, IOContext io_context)
 
static void FindAndDropRelationBuffers (RelFileLocator rlocator, ForkNumber forkNum, BlockNumber nForkBlock, BlockNumber firstDelBlock)
 
static void RelationCopyStorageUsingBuffer (RelFileLocator srclocator, RelFileLocator dstlocator, ForkNumber forkNum, bool permanent)
 
static void AtProcExit_Buffers (int code, Datum arg)
 
static void CheckForBufferLeaks (void)
 
static int rlocator_comparator (const void *p1, const void *p2)
 
static int buffertag_comparator (const BufferTag *ba, const BufferTag *bb)
 
static int ckpt_buforder_comparator (const CkptSortItem *a, const CkptSortItem *b)
 
static int ts_ckpt_progress_comparator (Datum a, Datum b, void *arg)
 
PrefetchBufferResult PrefetchSharedBuffer (SMgrRelation smgr_reln, ForkNumber forkNum, BlockNumber blockNum)
 
PrefetchBufferResult PrefetchBuffer (Relation reln, ForkNumber forkNum, BlockNumber blockNum)
 
bool ReadRecentBuffer (RelFileLocator rlocator, ForkNumber forkNum, BlockNumber blockNum, Buffer recent_buffer)
 
Buffer ReadBuffer (Relation reln, BlockNumber blockNum)
 
Buffer ReadBufferExtended (Relation reln, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy)
 
Buffer ReadBufferWithoutRelcache (RelFileLocator rlocator, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy, bool permanent)
 
Buffer ExtendBufferedRel (BufferManagerRelation bmr, ForkNumber forkNum, BufferAccessStrategy strategy, uint32 flags)
 
BlockNumber ExtendBufferedRelBy (BufferManagerRelation bmr, ForkNumber fork, BufferAccessStrategy strategy, uint32 flags, uint32 extend_by, Buffer *buffers, uint32 *extended_by)
 
Buffer ExtendBufferedRelTo (BufferManagerRelation bmr, ForkNumber fork, BufferAccessStrategy strategy, uint32 flags, BlockNumber extend_to, ReadBufferMode mode)
 
static void ZeroAndLockBuffer (Buffer buffer, ReadBufferMode mode, bool already_valid)
 
static pg_attribute_always_inline Buffer PinBufferForBlock (Relation rel, SMgrRelation smgr, char persistence, ForkNumber forkNum, BlockNumber blockNum, BufferAccessStrategy strategy, bool *foundPtr)
 
static pg_attribute_always_inline bool StartReadBuffersImpl (ReadBuffersOperation *operation, Buffer *buffers, BlockNumber blockNum, int *nblocks, int flags, bool allow_forwarding)
 
bool StartReadBuffers (ReadBuffersOperation *operation, Buffer *buffers, BlockNumber blockNum, int *nblocks, int flags)
 
bool StartReadBuffer (ReadBuffersOperation *operation, Buffer *buffer, BlockNumber blocknum, int flags)
 
static bool ReadBuffersCanStartIOOnce (Buffer buffer, bool nowait)
 
static bool ReadBuffersCanStartIO (Buffer buffer, bool nowait)
 
static void ProcessReadBuffersResult (ReadBuffersOperation *operation)
 
void WaitReadBuffers (ReadBuffersOperation *operation)
 
static void InvalidateBuffer (BufferDesc *buf)
 
static bool InvalidateVictimBuffer (BufferDesc *buf_hdr)
 
uint32 GetPinLimit (void)
 
uint32 GetAdditionalPinLimit (void)
 
void LimitAdditionalPins (uint32 *additional_pins)
 
bool BufferIsExclusiveLocked (Buffer buffer)
 
bool BufferIsDirty (Buffer buffer)
 
void MarkBufferDirty (Buffer buffer)
 
Buffer ReleaseAndReadBuffer (Buffer buffer, Relation relation, BlockNumber blockNum)
 
static void WakePinCountWaiter (BufferDesc *buf)
 
bool BgBufferSync (WritebackContext *wb_context)
 
void AtEOXact_Buffers (bool isCommit)
 
void InitBufferManagerAccess (void)
 
char * DebugPrintBufferRefcount (Buffer buffer)
 
void CheckPointBuffers (int flags)
 
BlockNumber BufferGetBlockNumber (Buffer buffer)
 
void BufferGetTag (Buffer buffer, RelFileLocator *rlocator, ForkNumber *forknum, BlockNumber *blknum)
 
BlockNumber RelationGetNumberOfBlocksInFork (Relation relation, ForkNumber forkNum)
 
bool BufferIsPermanent (Buffer buffer)
 
XLogRecPtr BufferGetLSNAtomic (Buffer buffer)
 
void DropRelationBuffers (SMgrRelation smgr_reln, ForkNumber *forkNum, int nforks, BlockNumber *firstDelBlock)
 
void DropRelationsAllBuffers (SMgrRelation *smgr_reln, int nlocators)
 
void DropDatabaseBuffers (Oid dbid)
 
void FlushRelationBuffers (Relation rel)
 
void FlushRelationsAllBuffers (SMgrRelation *smgrs, int nrels)
 
void CreateAndCopyRelationData (RelFileLocator src_rlocator, RelFileLocator dst_rlocator, bool permanent)
 
void FlushDatabaseBuffers (Oid dbid)
 
void FlushOneBuffer (Buffer buffer)
 
void ReleaseBuffer (Buffer buffer)
 
void UnlockReleaseBuffer (Buffer buffer)
 
void IncrBufferRefCount (Buffer buffer)
 
void MarkBufferDirtyHint (Buffer buffer, bool buffer_std)
 
void UnlockBuffers (void)
 
void LockBuffer (Buffer buffer, int mode)
 
bool ConditionalLockBuffer (Buffer buffer)
 
void CheckBufferIsPinnedOnce (Buffer buffer)
 
void LockBufferForCleanup (Buffer buffer)
 
bool HoldingBufferPinThatDelaysRecovery (void)
 
bool ConditionalLockBufferForCleanup (Buffer buffer)
 
bool IsBufferCleanupOK (Buffer buffer)
 
bool StartBufferIO (BufferDesc *buf, bool forInput, bool nowait)
 
void TerminateBufferIO (BufferDesc *buf, bool clear_dirty, uint32 set_flag_bits, bool forget_owner, bool release_aio)
 
uint32 LockBufHdr (BufferDesc *desc)
 
void WritebackContextInit (WritebackContext *context, int *max_pending)
 
void ScheduleBufferTagForWriteback (WritebackContext *wb_context, IOContext io_context, BufferTag *tag)
 
void IssuePendingWritebacks (WritebackContext *wb_context, IOContext io_context)
 
static bool EvictUnpinnedBufferInternal (BufferDesc *desc, bool *buffer_flushed)
 
bool EvictUnpinnedBuffer (Buffer buf, bool *buffer_flushed)
 
void EvictAllUnpinnedBuffers (int32 *buffers_evicted, int32 *buffers_flushed, int32 *buffers_skipped)
 
void EvictRelUnpinnedBuffers (Relation rel, int32 *buffers_evicted, int32 *buffers_flushed, int32 *buffers_skipped)
 
static pg_attribute_always_inline void buffer_stage_common (PgAioHandle *ioh, bool is_write, bool is_temp)
 
static void buffer_readv_decode_error (PgAioResult result, bool *zeroed_any, bool *ignored_any, uint8 *zeroed_or_error_count, uint8 *checkfail_count, uint8 *first_off)
 
static void buffer_readv_encode_error (PgAioResult *result, bool is_temp, bool zeroed_any, bool ignored_any, uint8 error_count, uint8 zeroed_count, uint8 checkfail_count, uint8 first_error_off, uint8 first_zeroed_off, uint8 first_ignored_off)
 
static pg_attribute_always_inline void buffer_readv_complete_one (PgAioTargetData *td, uint8 buf_off, Buffer buffer, uint8 flags, bool failed, bool is_temp, bool *buffer_invalid, bool *failed_checksum, bool *ignored_checksum, bool *zeroed_buffer)
 
static pg_attribute_always_inline PgAioResult buffer_readv_complete (PgAioHandle *ioh, PgAioResult prior_result, uint8 cb_data, bool is_temp)
 
static void buffer_readv_report (PgAioResult result, const PgAioTargetData *td, int elevel)
 
static void shared_buffer_readv_stage (PgAioHandle *ioh, uint8 cb_data)
 
static PgAioResult shared_buffer_readv_complete (PgAioHandle *ioh, PgAioResult prior_result, uint8 cb_data)
 
static PgAioResult shared_buffer_readv_complete_local (PgAioHandle *ioh, PgAioResult prior_result, uint8 cb_data)
 
static void local_buffer_readv_stage (PgAioHandle *ioh, uint8 cb_data)
 
static PgAioResult local_buffer_readv_complete (PgAioHandle *ioh, PgAioResult prior_result, uint8 cb_data)
 

Variables

bool zero_damaged_pages = false
 
int bgwriter_lru_maxpages = 100
 
double bgwriter_lru_multiplier = 2.0
 
bool track_io_timing = false
 
int effective_io_concurrency = DEFAULT_EFFECTIVE_IO_CONCURRENCY
 
int maintenance_io_concurrency = DEFAULT_MAINTENANCE_IO_CONCURRENCY
 
int io_combine_limit = DEFAULT_IO_COMBINE_LIMIT
 
int io_combine_limit_guc = DEFAULT_IO_COMBINE_LIMIT
 
int io_max_combine_limit = DEFAULT_IO_COMBINE_LIMIT
 
int checkpoint_flush_after = DEFAULT_CHECKPOINT_FLUSH_AFTER
 
int bgwriter_flush_after = DEFAULT_BGWRITER_FLUSH_AFTER
 
int backend_flush_after = DEFAULT_BACKEND_FLUSH_AFTER
 
static BufferDesc * PinCountWaitBuf = NULL
 
static struct PrivateRefCountEntry PrivateRefCountArray [REFCOUNT_ARRAY_ENTRIES]
 
static HTAB * PrivateRefCountHash = NULL
 
static int32 PrivateRefCountOverflowed = 0
 
static uint32 PrivateRefCountClock = 0
 
static PrivateRefCountEntry * ReservedRefCountEntry = NULL
 
static uint32 MaxProportionalPins
 
const ResourceOwnerDesc buffer_io_resowner_desc
 
const ResourceOwnerDesc buffer_pin_resowner_desc
 
const PgAioHandleCallbacks aio_shared_buffer_readv_cb
 
const PgAioHandleCallbacks aio_local_buffer_readv_cb
 

Macro Definition Documentation

◆ BUF_DROP_FULL_SCAN_THRESHOLD

#define BUF_DROP_FULL_SCAN_THRESHOLD   (uint64) (NBuffers / 32)

Definition at line 88 of file bufmgr.c.

◆ BUF_REUSABLE

#define BUF_REUSABLE   0x02

Definition at line 78 of file bufmgr.c.

◆ BUF_WRITTEN

#define BUF_WRITTEN   0x01

Definition at line 77 of file bufmgr.c.

◆ BufferGetLSN

#define BufferGetLSN (   bufHdr)    (PageGetLSN(BufHdrGetBlock(bufHdr)))

Definition at line 70 of file bufmgr.c.

◆ BufferIsPinned

#define BufferIsPinned (   bufnum)
Value:
( \
!BufferIsValid(bufnum) ? \
false \
: \
BufferIsLocal(bufnum) ? \
(LocalRefCount[-(bufnum) - 1] > 0) \
: \
(GetPrivateRefCount(bufnum) > 0) \
)
static int32 GetPrivateRefCount(Buffer buffer)
Definition: bufmgr.c:422
static bool BufferIsValid(Buffer bufnum)
Definition: bufmgr.h:365
int32 * LocalRefCount
Definition: localbuf.c:48

Definition at line 480 of file bufmgr.c.
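
A hedged usage sketch (caller invented for illustration): the macro is used in assertions to check that this backend holds a pin before a buffer's contents are touched. It dispatches on BufferIsLocal(), consulting LocalRefCount[] for local buffers and GetPrivateRefCount() for shared ones.

/* Hypothetical caller; mirrors the assertion pattern used throughout bufmgr.c */
static void
examine_page(Buffer buf)
{
    Page        page;

    Assert(BufferIsPinned(buf));    /* handles both local and shared buffers */
    page = BufferGetPage(buf);
    /* ... read-only inspection of the pinned page ... */
    (void) page;
}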

◆ BufHdrGetBlock

#define BufHdrGetBlock (   bufHdr)    ((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ))

Definition at line 69 of file bufmgr.c.

◆ LocalBufHdrGetBlock

#define LocalBufHdrGetBlock (   bufHdr)     LocalBufferBlockPointers[-((bufHdr)->buf_id + 2)]

Definition at line 73 of file bufmgr.c.
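
The negative index works because localbuf.c initializes local buffer descriptors with buf_id = -i - 2 for array slot i, so that BufferDescriptorGetBuffer(), which returns buf_id + 1, yields the negative Buffer number -i - 1. A minimal check of that arithmetic (helper name invented):

/* Arithmetic behind LocalBufHdrGetBlock(); hypothetical helper */
static int
local_array_slot(int buf_id)
{
    /* local descriptor in slot i carries buf_id = -i - 2; invert the mapping */
    return -(buf_id + 2);       /* buf_id -2 -> slot 0, -3 -> slot 1, ... */
}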

◆ READV_COUNT_BITS

#define READV_COUNT_BITS   7

◆ READV_COUNT_MASK

#define READV_COUNT_MASK   ((1 << READV_COUNT_BITS) - 1)

◆ REFCOUNT_ARRAY_ENTRIES

#define REFCOUNT_ARRAY_ENTRIES   8

Definition at line 97 of file bufmgr.c.

◆ RELS_BSEARCH_THRESHOLD

#define RELS_BSEARCH_THRESHOLD   20

Definition at line 80 of file bufmgr.c.
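
This threshold gates the lookup strategy when many relations' buffers are dropped or flushed at once: for a small array a linear scan over the relation locators is cheaper, while above the threshold a sorted array plus bsearch() wins. A simplified sketch of that choice (hypothetical helper; the real logic lives in DropRelationsAllBuffers() and FlushRelationsAllBuffers(), using rlocator_comparator()):

/* Hypothetical helper illustrating the linear-vs-bsearch decision */
static bool
locator_in_set(RelFileLocator *locators, int nlocators, RelFileLocator key)
{
    if (nlocators > RELS_BSEARCH_THRESHOLD)
        return bsearch(&key, locators, nlocators,
                       sizeof(RelFileLocator), rlocator_comparator) != NULL;

    /* small set: a simple linear scan beats sorting plus binary search */
    for (int i = 0; i < nlocators; i++)
        if (RelFileLocatorEquals(locators[i], key))
            return true;
    return false;
}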

◆ ST_COMPARE [1/2]

#define ST_COMPARE (   a,
  b 
)    ckpt_buforder_comparator(a, b)

Definition at line 6380 of file bufmgr.c.

◆ ST_COMPARE [2/2]

#define ST_COMPARE (   a,
  b 
)    buffertag_comparator(&a->tag, &b->tag)

Definition at line 6380 of file bufmgr.c.

◆ ST_DEFINE [1/2]

#define ST_DEFINE

Definition at line 6382 of file bufmgr.c.

◆ ST_DEFINE [2/2]

#define ST_DEFINE

Definition at line 6382 of file bufmgr.c.

◆ ST_ELEMENT_TYPE [1/2]

#define ST_ELEMENT_TYPE   CkptSortItem

Definition at line 6379 of file bufmgr.c.

◆ ST_ELEMENT_TYPE [2/2]

#define ST_ELEMENT_TYPE   PendingWriteback

Definition at line 6379 of file bufmgr.c.

◆ ST_SCOPE [1/2]

#define ST_SCOPE   static

Definition at line 6381 of file bufmgr.c.

◆ ST_SCOPE [2/2]

#define ST_SCOPE   static

Definition at line 6381 of file bufmgr.c.

◆ ST_SORT [1/2]

#define ST_SORT   sort_checkpoint_bufferids

Definition at line 6378 of file bufmgr.c.

◆ ST_SORT [2/2]

#define ST_SORT   sort_pending_writebacks

Definition at line 6378 of file bufmgr.c.

Typedef Documentation

◆ CkptTsStatus

typedef struct CkptTsStatus CkptTsStatus

◆ PrivateRefCountEntry

typedef struct PrivateRefCountEntry PrivateRefCountEntry

◆ SMgrSortArray

typedef struct SMgrSortArray SMgrSortArray

Function Documentation

◆ AbortBufferIO()

static void AbortBufferIO ( Buffer  buffer)
static

Definition at line 6091 of file bufmgr.c.

6092{
6093 BufferDesc *buf_hdr = GetBufferDescriptor(buffer - 1);
6094 uint32 buf_state;
6095
6096 buf_state = LockBufHdr(buf_hdr);
6097 Assert(buf_state & (BM_IO_IN_PROGRESS | BM_TAG_VALID));
6098
6099 if (!(buf_state & BM_VALID))
6100 {
6101 Assert(!(buf_state & BM_DIRTY));
6102 UnlockBufHdr(buf_hdr, buf_state);
6103 }
6104 else
6105 {
6106 Assert(buf_state & BM_DIRTY);
6107 UnlockBufHdr(buf_hdr, buf_state);
6108
6109 /* Issue notice if this is not the first failure... */
6110 if (buf_state & BM_IO_ERROR)
6111 {
6112 /* Buffer is pinned, so we can read tag without spinlock */
6113 ereport(WARNING,
6114 (errcode(ERRCODE_IO_ERROR),
6115 errmsg("could not write block %u of %s",
6116 buf_hdr->tag.blockNum,
6117 relpathperm(BufTagGetRelFileLocator(&buf_hdr->tag),
6118 BufTagGetForkNum(&buf_hdr->tag)).str),
6119 errdetail("Multiple failures --- write error might be permanent.")));
6120 }
6121 }
6122
6123 TerminateBufferIO(buf_hdr, false, BM_IO_ERROR, false, false);
6124}
#define BM_TAG_VALID
Definition: buf_internals.h:71
static ForkNumber BufTagGetForkNum(const BufferTag *tag)
static void UnlockBufHdr(BufferDesc *desc, uint32 buf_state)
#define BM_DIRTY
Definition: buf_internals.h:69
#define BM_IO_IN_PROGRESS
Definition: buf_internals.h:72
static RelFileLocator BufTagGetRelFileLocator(const BufferTag *tag)
#define BM_VALID
Definition: buf_internals.h:70
#define BM_IO_ERROR
Definition: buf_internals.h:73
static BufferDesc * GetBufferDescriptor(uint32 id)
void TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint32 set_flag_bits, bool forget_owner, bool release_aio)
Definition: bufmgr.c:6032
uint32 LockBufHdr(BufferDesc *desc)
Definition: bufmgr.c:6189
uint32_t uint32
Definition: c.h:502
int errdetail(const char *fmt,...)
Definition: elog.c:1204
int errcode(int sqlerrcode)
Definition: elog.c:854
int errmsg(const char *fmt,...)
Definition: elog.c:1071
#define WARNING
Definition: elog.h:36
#define ereport(elevel,...)
Definition: elog.h:149
Assert(PointerIsAligned(start, uint64))
#define relpathperm(rlocator, forknum)
Definition: relpath.h:146
BufferTag tag
BlockNumber blockNum

References Assert(), buftag::blockNum, BM_DIRTY, BM_IO_ERROR, BM_IO_IN_PROGRESS, BM_TAG_VALID, BM_VALID, PrivateRefCountEntry::buffer, BufTagGetForkNum(), BufTagGetRelFileLocator(), ereport, errcode(), errdetail(), errmsg(), GetBufferDescriptor(), LockBufHdr(), relpathperm, BufferDesc::tag, TerminateBufferIO(), UnlockBufHdr(), and WARNING.

Referenced by ResOwnerReleaseBufferIO().

◆ AsyncReadBuffers()

static bool AsyncReadBuffers ( ReadBuffersOperation *  operation,
int *  nblocks_progress 
)
static

Definition at line 1762 of file bufmgr.c.

1763{
1764 Buffer *buffers = &operation->buffers[0];
1765 int flags = operation->flags;
1766 BlockNumber blocknum = operation->blocknum;
1767 ForkNumber forknum = operation->forknum;
1768 char persistence = operation->persistence;
1769 int16 nblocks_done = operation->nblocks_done;
1770 Buffer *io_buffers = &operation->buffers[nblocks_done];
1771 int io_buffers_len = 0;
1772 PgAioHandle *ioh;
1773 uint32 ioh_flags = 0;
1774 void *io_pages[MAX_IO_COMBINE_LIMIT];
1775 IOContext io_context;
1776 IOObject io_object;
1777 bool did_start_io;
1778
1779 /*
1780 * When this IO is executed synchronously, either because the caller will
1781 * immediately block waiting for the IO or because IOMETHOD_SYNC is used,
1782 * the AIO subsystem needs to know.
1783 */
1784 if (flags & READ_BUFFERS_SYNCHRONOUSLY)
1785 ioh_flags |= PGAIO_HF_SYNCHRONOUS;
1786
1787 if (persistence == RELPERSISTENCE_TEMP)
1788 {
1789 io_context = IOCONTEXT_NORMAL;
1790 io_object = IOOBJECT_TEMP_RELATION;
1791 ioh_flags |= PGAIO_HF_REFERENCES_LOCAL;
1792 }
1793 else
1794 {
1795 io_context = IOContextForStrategy(operation->strategy);
1796 io_object = IOOBJECT_RELATION;
1797 }
1798
1799 /*
1800 * If zero_damaged_pages is enabled, add the READ_BUFFERS_ZERO_ON_ERROR
1801 * flag. The reason for that is that, hopefully, zero_damaged_pages isn't
1802 * set globally, but on a per-session basis. The completion callback,
1803 * which may be run in other processes, e.g. in IO workers, may have a
1804 * different value of the zero_damaged_pages GUC.
1805 *
1806 * XXX: We probably should eventually use a different flag for
1807 * zero_damaged_pages, so we can report different log levels / error codes
1808 * for zero_damaged_pages and ZERO_ON_ERROR.
1809 */
1810 if (zero_damaged_pages)
1811 flags |= READ_BUFFERS_ZERO_ON_ERROR;
1812
1813 /*
1814 * For the same reason as with zero_damaged_pages we need to use this
1815 * backend's ignore_checksum_failure value.
1816 */
1817 if (ignore_checksum_failure)
1818 flags |= READ_BUFFERS_IGNORE_CHECKSUM_FAILURES;
1819
1820
1821 /*
1822 * To be allowed to report stats in the local completion callback we need
1823 * to prepare to report stats now. This ensures we can safely report the
1824 * checksum failure even in a critical section.
1825 */
1826 pgstat_prepare_report_checksum_failure(operation->smgr->smgr_rlocator.locator.dbOid);
1827
1828 /*
1829 * Get IO handle before ReadBuffersCanStartIO(), as pgaio_io_acquire()
1830 * might block, which we don't want after setting IO_IN_PROGRESS.
1831 *
1832 * If we need to wait for IO before we can get a handle, submit
1833 * already-staged IO first, so that other backends don't need to wait.
1834 * There wouldn't be a deadlock risk, as pgaio_io_acquire() just needs to
1835 * wait for already submitted IO, which doesn't require additional locks,
1836 * but it could still cause undesirable waits.
1837 *
1838 * A secondary benefit is that this would allow us to measure the time in
1839 * pgaio_io_acquire() without causing undue timer overhead in the common,
1840 * non-blocking, case. However, currently the pgstats infrastructure
1841 * doesn't really allow that, as it a) asserts that an operation can't
1842 * have time without operations b) doesn't have an API to report
1843 * "accumulated" time.
1844 */
1845 ioh = pgaio_io_acquire_nb(CurrentResourceOwner, &operation->io_return);
1846 if (unlikely(!ioh))
1847 {
1848 pgaio_submit_staged();
1849
1850 ioh = pgaio_io_acquire(CurrentResourceOwner, &operation->io_return);
1851 }
1852
1853 /*
1854 * Check if we can start IO on the first to-be-read buffer.
1855 *
1856 * If an I/O is already in progress in another backend, we want to wait
1857 * for the outcome: either done, or something went wrong and we will
1858 * retry.
1859 */
1860 if (!ReadBuffersCanStartIO(buffers[nblocks_done], false))
1861 {
1862 /*
1863 * Someone else has already completed this block, we're done.
1864 *
1865 * When IO is necessary, ->nblocks_done is updated in
1866 * ProcessReadBuffersResult(), but that is not called if no IO is
1867 * necessary. Thus update here.
1868 */
1869 operation->nblocks_done += 1;
1870 *nblocks_progress = 1;
1871
1872 pgaio_io_release(ioh);
1873 pgaio_wref_clear(&operation->io_wref);
1874 did_start_io = false;
1875
1876 /*
1877 * Report and track this as a 'hit' for this backend, even though it
1878 * must have started out as a miss in PinBufferForBlock(). The other
1879 * backend will track this as a 'read'.
1880 */
1881 TRACE_POSTGRESQL_BUFFER_READ_DONE(forknum, blocknum + operation->nblocks_done,
1882 operation->smgr->smgr_rlocator.locator.spcOid,
1883 operation->smgr->smgr_rlocator.locator.dbOid,
1884 operation->smgr->smgr_rlocator.locator.relNumber,
1885 operation->smgr->smgr_rlocator.backend,
1886 true);
1887
1888 if (persistence == RELPERSISTENCE_TEMP)
1889 pgBufferUsage.local_blks_hit += 1;
1890 else
1891 pgBufferUsage.shared_blks_hit += 1;
1892
1893 if (operation->rel)
1894 pgstat_count_buffer_hit(operation->rel);
1895
1896 pgstat_count_io_op(io_object, io_context, IOOP_HIT, 1, 0);
1897
1898 if (VacuumCostActive)
1899 VacuumCostBalance += VacuumCostPageHit;
1900 }
1901 else
1902 {
1903 instr_time io_start;
1904
1905 /* We found a buffer that we need to read in. */
1906 Assert(io_buffers[0] == buffers[nblocks_done]);
1907 io_pages[0] = BufferGetBlock(buffers[nblocks_done]);
1908 io_buffers_len = 1;
1909
1910 /*
1911 * How many neighboring-on-disk blocks can we scatter-read into other
1912 * buffers at the same time? In this case we don't wait if we see an
1913 * I/O already in progress. We already set BM_IO_IN_PROGRESS for the
1914 * head block, so we should get on with that I/O as soon as possible.
1915 */
1916 for (int i = nblocks_done + 1; i < operation->nblocks; i++)
1917 {
1918 if (!ReadBuffersCanStartIO(buffers[i], true))
1919 break;
1920 /* Must be consecutive block numbers. */
1921 Assert(BufferGetBlockNumber(buffers[i - 1]) ==
1922 BufferGetBlockNumber(buffers[i]) - 1);
1923 Assert(io_buffers[io_buffers_len] == buffers[i]);
1924
1925 io_pages[io_buffers_len++] = BufferGetBlock(buffers[i]);
1926 }
1927
1928 /* get a reference to wait for in WaitReadBuffers() */
1929 pgaio_io_get_wref(ioh, &operation->io_wref);
1930
1931 /* provide the list of buffers to the completion callbacks */
1932 pgaio_io_set_handle_data_32(ioh, (uint32 *) io_buffers, io_buffers_len);
1933
1934 pgaio_io_register_callbacks(ioh,
1935 persistence == RELPERSISTENCE_TEMP ?
1936 PGAIO_HCB_LOCAL_BUFFER_READV :
1937 PGAIO_HCB_SHARED_BUFFER_READV,
1938 flags);
1939
1940 pgaio_io_set_flag(ioh, ioh_flags);
1941
1942 /* ---
1943 * Even though we're trying to issue IO asynchronously, track the time
1944 * in smgrstartreadv():
1945 * - if io_method == IOMETHOD_SYNC, we will always perform the IO
1946 * immediately
1947 * - the io method might not support the IO (e.g. worker IO for a temp
1948 * table)
1949 * ---
1950 */
1951 io_start = pgstat_prepare_io_time(track_io_timing);
1952 smgrstartreadv(ioh, operation->smgr, forknum,
1953 blocknum + nblocks_done,
1954 io_pages, io_buffers_len);
1955 pgstat_count_io_op_time(io_object, io_context, IOOP_READ,
1956 io_start, 1, io_buffers_len * BLCKSZ);
1957
1958 if (persistence == RELPERSISTENCE_TEMP)
1959 pgBufferUsage.local_blks_read += io_buffers_len;
1960 else
1961 pgBufferUsage.shared_blks_read += io_buffers_len;
1962
1963 /*
1964 * Track vacuum cost when issuing IO, not after waiting for it.
1965 * Otherwise we could end up issuing a lot of IO in a short timespan,
1966 * despite a low cost limit.
1967 */
1968 if (VacuumCostActive)
1969 VacuumCostBalance += VacuumCostPageMiss * io_buffers_len;
1970
1971 *nblocks_progress = io_buffers_len;
1972 did_start_io = true;
1973 }
1974
1975 return did_start_io;
1976}
PgAioHandle * pgaio_io_acquire(struct ResourceOwnerData *resowner, PgAioReturn *ret)
Definition: aio.c:173
void pgaio_wref_clear(PgAioWaitRef *iow)
Definition: aio.c:866
void pgaio_io_get_wref(PgAioHandle *ioh, PgAioWaitRef *iow)
Definition: aio.c:354
void pgaio_io_set_flag(PgAioHandle *ioh, PgAioHandleFlags flag)
Definition: aio.c:318
void pgaio_submit_staged(void)
Definition: aio.c:1020
void pgaio_io_release(PgAioHandle *ioh)
Definition: aio.c:242
PgAioHandle * pgaio_io_acquire_nb(struct ResourceOwnerData *resowner, PgAioReturn *ret)
Definition: aio.c:199
@ PGAIO_HCB_LOCAL_BUFFER_READV
Definition: aio.h:200
@ PGAIO_HCB_SHARED_BUFFER_READV
Definition: aio.h:198
@ PGAIO_HF_SYNCHRONOUS
Definition: aio.h:70
@ PGAIO_HF_REFERENCES_LOCAL
Definition: aio.h:60
void pgaio_io_set_handle_data_32(PgAioHandle *ioh, uint32 *data, uint8 len)
Definition: aio_callback.c:139
void pgaio_io_register_callbacks(PgAioHandle *ioh, PgAioHandleCallbackID cb_id, uint8 cb_data)
Definition: aio_callback.c:86
uint32 BlockNumber
Definition: block.h:31
int Buffer
Definition: buf.h:23
bool track_io_timing
Definition: bufmgr.c:144
BlockNumber BufferGetBlockNumber(Buffer buffer)
Definition: bufmgr.c:4161
static bool ReadBuffersCanStartIO(Buffer buffer, bool nowait)
Definition: bufmgr.c:1562
bool zero_damaged_pages
Definition: bufmgr.c:141
#define READ_BUFFERS_ZERO_ON_ERROR
Definition: bufmgr.h:112
static Block BufferGetBlock(Buffer buffer)
Definition: bufmgr.h:381
#define MAX_IO_COMBINE_LIMIT
Definition: bufmgr.h:166
#define READ_BUFFERS_IGNORE_CHECKSUM_FAILURES
Definition: bufmgr.h:116
#define READ_BUFFERS_SYNCHRONOUSLY
Definition: bufmgr.h:118
bool ignore_checksum_failure
Definition: bufpage.c:27
int16_t int16
Definition: c.h:497
#define unlikely(x)
Definition: c.h:347
IOContext IOContextForStrategy(BufferAccessStrategy strategy)
Definition: freelist.c:800
int VacuumCostPageMiss
Definition: globals.c:153
bool VacuumCostActive
Definition: globals.c:159
int VacuumCostBalance
Definition: globals.c:158
int VacuumCostPageHit
Definition: globals.c:152
BufferUsage pgBufferUsage
Definition: instrument.c:20
int i
Definition: isn.c:77
IOObject
Definition: pgstat.h:273
@ IOOBJECT_RELATION
Definition: pgstat.h:274
@ IOOBJECT_TEMP_RELATION
Definition: pgstat.h:275
IOContext
Definition: pgstat.h:282
@ IOCONTEXT_NORMAL
Definition: pgstat.h:286
@ IOOP_READ
Definition: pgstat.h:312
@ IOOP_HIT
Definition: pgstat.h:306
#define pgstat_count_buffer_hit(rel)
Definition: pgstat.h:709
void pgstat_prepare_report_checksum_failure(Oid dboid)
instr_time pgstat_prepare_io_time(bool track_io_guc)
Definition: pgstat_io.c:90
void pgstat_count_io_op(IOObject io_object, IOContext io_context, IOOp io_op, uint32 cnt, uint64 bytes)
Definition: pgstat_io.c:68
void pgstat_count_io_op_time(IOObject io_object, IOContext io_context, IOOp io_op, instr_time start_time, uint32 cnt, uint64 bytes)
Definition: pgstat_io.c:121
ForkNumber
Definition: relpath.h:56
ResourceOwner CurrentResourceOwner
Definition: resowner.c:173
void smgrstartreadv(PgAioHandle *ioh, SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, void **buffers, BlockNumber nblocks)
Definition: smgr.c:753
int64 local_blks_hit
Definition: instrument.h:30
int64 shared_blks_read
Definition: instrument.h:27
int64 local_blks_read
Definition: instrument.h:31
int64 shared_blks_hit
Definition: instrument.h:26
ForkNumber forknum
Definition: bufmgr.h:127
PgAioWaitRef io_wref
Definition: bufmgr.h:140
Buffer * buffers
Definition: bufmgr.h:135
BufferAccessStrategy strategy
Definition: bufmgr.h:128
BlockNumber blocknum
Definition: bufmgr.h:136
PgAioReturn io_return
Definition: bufmgr.h:141
struct SMgrRelationData * smgr
Definition: bufmgr.h:125
RelFileLocator locator
RelFileNumber relNumber
RelFileLocatorBackend smgr_rlocator
Definition: smgr.h:38

References Assert(), RelFileLocatorBackend::backend, ReadBuffersOperation::blocknum, BufferGetBlock(), BufferGetBlockNumber(), ReadBuffersOperation::buffers, CurrentResourceOwner, RelFileLocator::dbOid, ReadBuffersOperation::flags, ReadBuffersOperation::forknum, i, ignore_checksum_failure, ReadBuffersOperation::io_return, ReadBuffersOperation::io_wref, IOCONTEXT_NORMAL, IOContextForStrategy(), IOOBJECT_RELATION, IOOBJECT_TEMP_RELATION, IOOP_HIT, IOOP_READ, BufferUsage::local_blks_hit, BufferUsage::local_blks_read, RelFileLocatorBackend::locator, MAX_IO_COMBINE_LIMIT, ReadBuffersOperation::nblocks, ReadBuffersOperation::nblocks_done, ReadBuffersOperation::persistence, PGAIO_HCB_LOCAL_BUFFER_READV, PGAIO_HCB_SHARED_BUFFER_READV, PGAIO_HF_REFERENCES_LOCAL, PGAIO_HF_SYNCHRONOUS, pgaio_io_acquire(), pgaio_io_acquire_nb(), pgaio_io_get_wref(), pgaio_io_register_callbacks(), pgaio_io_release(), pgaio_io_set_flag(), pgaio_io_set_handle_data_32(), pgaio_submit_staged(), pgaio_wref_clear(), pgBufferUsage, pgstat_count_buffer_hit, pgstat_count_io_op(), pgstat_count_io_op_time(), pgstat_prepare_io_time(), pgstat_prepare_report_checksum_failure(), READ_BUFFERS_IGNORE_CHECKSUM_FAILURES, READ_BUFFERS_SYNCHRONOUSLY, READ_BUFFERS_ZERO_ON_ERROR, ReadBuffersCanStartIO(), ReadBuffersOperation::rel, RelFileLocator::relNumber, BufferUsage::shared_blks_hit, BufferUsage::shared_blks_read, ReadBuffersOperation::smgr, SMgrRelationData::smgr_rlocator, smgrstartreadv(), RelFileLocator::spcOid, ReadBuffersOperation::strategy, track_io_timing, unlikely, VacuumCostActive, VacuumCostBalance, VacuumCostPageHit, VacuumCostPageMiss, and zero_damaged_pages.

Referenced by StartReadBuffersImpl(), and WaitReadBuffers().
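
AsyncReadBuffers() is internal; external callers use the two-phase StartReadBuffers()/WaitReadBuffers() API documented elsewhere in this file. A rough sketch of that calling pattern, with the operation-field initialization modeled on what read_stream.c does (field names taken from the References above; details may differ across versions):

/* Hedged sketch of the two-phase read API that AsyncReadBuffers() serves */
static void
read_consecutive_blocks(Relation rel, BlockNumber blkno)
{
    ReadBuffersOperation op = {0};
    Buffer      bufs[2];
    int         nblocks = 2;

    op.rel = rel;
    op.smgr = RelationGetSmgr(rel);
    op.persistence = rel->rd_rel->relpersistence;
    op.forknum = MAIN_FORKNUM;
    op.strategy = NULL;

    /* Pins the buffers; returns true iff an I/O had to be started. */
    if (StartReadBuffers(&op, bufs, blkno, &nblocks, 0))
        WaitReadBuffers(&op);   /* blocks until all nblocks buffers are valid */

    /* bufs[0 .. nblocks - 1] are now pinned and valid; ReleaseBuffer() each
     * of them when done. */
}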

◆ AtEOXact_Buffers()

void AtEOXact_Buffers ( bool  isCommit)

Definition at line 3989 of file bufmgr.c.

3990{
3991 CheckForBufferLeaks();
3992
3993 AtEOXact_LocalBuffers(isCommit);
3994
3995 Assert(PrivateRefCountOverflowed == 0);
3996}
static void CheckForBufferLeaks(void)
Definition: bufmgr.c:4058
static int32 PrivateRefCountOverflowed
Definition: bufmgr.c:214
void AtEOXact_LocalBuffers(bool isCommit)
Definition: localbuf.c:993

References Assert(), AtEOXact_LocalBuffers(), CheckForBufferLeaks(), and PrivateRefCountOverflowed.

Referenced by AbortTransaction(), AutoVacLauncherMain(), BackgroundWriterMain(), CheckpointerMain(), CommitTransaction(), PrepareTransaction(), and WalWriterMain().

◆ AtProcExit_Buffers()

static void AtProcExit_Buffers ( int  code,
Datum  arg 
)
static

Definition at line 4040 of file bufmgr.c.

4041{
4042 UnlockBuffers();
4043
4044 CheckForBufferLeaks();
4045
4046 /* localbuf.c needs a chance too */
4047 AtProcExit_LocalBuffers();
4048}
void UnlockBuffers(void)
Definition: bufmgr.c:5509
void AtProcExit_LocalBuffers(void)
Definition: localbuf.c:1004

References AtProcExit_LocalBuffers(), CheckForBufferLeaks(), and UnlockBuffers().

Referenced by InitBufferManagerAccess().

◆ BgBufferSync()

bool BgBufferSync ( WritebackContext *  wb_context)

Definition at line 3618 of file bufmgr.c.

3619{
3620 /* info obtained from freelist.c */
3621 int strategy_buf_id;
3622 uint32 strategy_passes;
3623 uint32 recent_alloc;
3624
3625 /*
3626 * Information saved between calls so we can determine the strategy
3627 * point's advance rate and avoid scanning already-cleaned buffers.
3628 */
3629 static bool saved_info_valid = false;
3630 static int prev_strategy_buf_id;
3631 static uint32 prev_strategy_passes;
3632 static int next_to_clean;
3633 static uint32 next_passes;
3634
3635 /* Moving averages of allocation rate and clean-buffer density */
3636 static float smoothed_alloc = 0;
3637 static float smoothed_density = 10.0;
3638
3639 /* Potentially these could be tunables, but for now, not */
3640 float smoothing_samples = 16;
3641 float scan_whole_pool_milliseconds = 120000.0;
3642
3643 /* Used to compute how far we scan ahead */
3644 long strategy_delta;
3645 int bufs_to_lap;
3646 int bufs_ahead;
3647 float scans_per_alloc;
3648 int reusable_buffers_est;
3649 int upcoming_alloc_est;
3650 int min_scan_buffers;
3651
3652 /* Variables for the scanning loop proper */
3653 int num_to_scan;
3654 int num_written;
3655 int reusable_buffers;
3656
3657 /* Variables for final smoothed_density update */
3658 long new_strategy_delta;
3659 uint32 new_recent_alloc;
3660
3661 /*
3662 * Find out where the freelist clock sweep currently is, and how many
3663 * buffer allocations have happened since our last call.
3664 */
3665 strategy_buf_id = StrategySyncStart(&strategy_passes, &recent_alloc);
3666
3667 /* Report buffer alloc counts to pgstat */
3668 PendingBgWriterStats.buf_alloc += recent_alloc;
3669
3670 /*
3671 * If we're not running the LRU scan, just stop after doing the stats
3672 * stuff. We mark the saved state invalid so that we can recover sanely
3673 * if LRU scan is turned back on later.
3674 */
3675 if (bgwriter_lru_maxpages <= 0)
3676 {
3677 saved_info_valid = false;
3678 return true;
3679 }
3680
3681 /*
3682 * Compute strategy_delta = how many buffers have been scanned by the
3683 * clock sweep since last time. If first time through, assume none. Then
3684 * see if we are still ahead of the clock sweep, and if so, how many
3685 * buffers we could scan before we'd catch up with it and "lap" it. Note:
3686 * weird-looking coding of xxx_passes comparisons are to avoid bogus
3687 * behavior when the passes counts wrap around.
3688 */
3689 if (saved_info_valid)
3690 {
3691 int32 passes_delta = strategy_passes - prev_strategy_passes;
3692
3693 strategy_delta = strategy_buf_id - prev_strategy_buf_id;
3694 strategy_delta += (long) passes_delta * NBuffers;
3695
3696 Assert(strategy_delta >= 0);
3697
3698 if ((int32) (next_passes - strategy_passes) > 0)
3699 {
3700 /* we're one pass ahead of the strategy point */
3701 bufs_to_lap = strategy_buf_id - next_to_clean;
3702#ifdef BGW_DEBUG
3703 elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
3704 next_passes, next_to_clean,
3705 strategy_passes, strategy_buf_id,
3706 strategy_delta, bufs_to_lap);
3707#endif
3708 }
3709 else if (next_passes == strategy_passes &&
3710 next_to_clean >= strategy_buf_id)
3711 {
3712 /* on same pass, but ahead or at least not behind */
3713 bufs_to_lap = NBuffers - (next_to_clean - strategy_buf_id);
3714#ifdef BGW_DEBUG
3715 elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
3716 next_passes, next_to_clean,
3717 strategy_passes, strategy_buf_id,
3718 strategy_delta, bufs_to_lap);
3719#endif
3720 }
3721 else
3722 {
3723 /*
3724 * We're behind, so skip forward to the strategy point and start
3725 * cleaning from there.
3726 */
3727#ifdef BGW_DEBUG
3728 elog(DEBUG2, "bgwriter behind: bgw %u-%u strategy %u-%u delta=%ld",
3729 next_passes, next_to_clean,
3730 strategy_passes, strategy_buf_id,
3731 strategy_delta);
3732#endif
3733 next_to_clean = strategy_buf_id;
3734 next_passes = strategy_passes;
3735 bufs_to_lap = NBuffers;
3736 }
3737 }
3738 else
3739 {
3740 /*
3741 * Initializing at startup or after LRU scanning had been off. Always
3742 * start at the strategy point.
3743 */
3744#ifdef BGW_DEBUG
3745 elog(DEBUG2, "bgwriter initializing: strategy %u-%u",
3746 strategy_passes, strategy_buf_id);
3747#endif
3748 strategy_delta = 0;
3749 next_to_clean = strategy_buf_id;
3750 next_passes = strategy_passes;
3751 bufs_to_lap = NBuffers;
3752 }
3753
3754 /* Update saved info for next time */
3755 prev_strategy_buf_id = strategy_buf_id;
3756 prev_strategy_passes = strategy_passes;
3757 saved_info_valid = true;
3758
3759 /*
3760 * Compute how many buffers had to be scanned for each new allocation, ie,
3761 * 1/density of reusable buffers, and track a moving average of that.
3762 *
3763 * If the strategy point didn't move, we don't update the density estimate
3764 */
3765 if (strategy_delta > 0 && recent_alloc > 0)
3766 {
3767 scans_per_alloc = (float) strategy_delta / (float) recent_alloc;
3768 smoothed_density += (scans_per_alloc - smoothed_density) /
3769 smoothing_samples;
3770 }
3771
3772 /*
3773 * Estimate how many reusable buffers there are between the current
3774 * strategy point and where we've scanned ahead to, based on the smoothed
3775 * density estimate.
3776 */
3777 bufs_ahead = NBuffers - bufs_to_lap;
3778 reusable_buffers_est = (float) bufs_ahead / smoothed_density;
3779
3780 /*
3781 * Track a moving average of recent buffer allocations. Here, rather than
3782 * a true average we want a fast-attack, slow-decline behavior: we
3783 * immediately follow any increase.
3784 */
3785 if (smoothed_alloc <= (float) recent_alloc)
3786 smoothed_alloc = recent_alloc;
3787 else
3788 smoothed_alloc += ((float) recent_alloc - smoothed_alloc) /
3789 smoothing_samples;
3790
3791 /* Scale the estimate by a GUC to allow more aggressive tuning. */
3792 upcoming_alloc_est = (int) (smoothed_alloc * bgwriter_lru_multiplier);
3793
3794 /*
3795 * If recent_alloc remains at zero for many cycles, smoothed_alloc will
3796 * eventually underflow to zero, and the underflows produce annoying
3797 * kernel warnings on some platforms. Once upcoming_alloc_est has gone to
3798 * zero, there's no point in tracking smaller and smaller values of
3799 * smoothed_alloc, so just reset it to exactly zero to avoid this
3800 * syndrome. It will pop back up as soon as recent_alloc increases.
3801 */
3802 if (upcoming_alloc_est == 0)
3803 smoothed_alloc = 0;
3804
3805 /*
3806 * Even in cases where there's been little or no buffer allocation
3807 * activity, we want to make a small amount of progress through the buffer
3808 * cache so that as many reusable buffers as possible are clean after an
3809 * idle period.
3810 *
3811 * (scan_whole_pool_milliseconds / BgWriterDelay) computes how many times
3812 * the BGW will be called during the scan_whole_pool time; slice the
3813 * buffer pool into that many sections.
3814 */
3815 min_scan_buffers = (int) (NBuffers / (scan_whole_pool_milliseconds / BgWriterDelay));
3816
3817 if (upcoming_alloc_est < (min_scan_buffers + reusable_buffers_est))
3818 {
3819#ifdef BGW_DEBUG
3820 elog(DEBUG2, "bgwriter: alloc_est=%d too small, using min=%d + reusable_est=%d",
3821 upcoming_alloc_est, min_scan_buffers, reusable_buffers_est);
3822#endif
3823 upcoming_alloc_est = min_scan_buffers + reusable_buffers_est;
3824 }
3825
3826 /*
3827 * Now write out dirty reusable buffers, working forward from the
3828 * next_to_clean point, until we have lapped the strategy scan, or cleaned
3829 * enough buffers to match our estimate of the next cycle's allocation
3830 * requirements, or hit the bgwriter_lru_maxpages limit.
3831 */
3832
3833 num_to_scan = bufs_to_lap;
3834 num_written = 0;
3835 reusable_buffers = reusable_buffers_est;
3836
3837 /* Execute the LRU scan */
3838 while (num_to_scan > 0 && reusable_buffers < upcoming_alloc_est)
3839 {
3840 int sync_state = SyncOneBuffer(next_to_clean, true,
3841 wb_context);
3842
3843 if (++next_to_clean >= NBuffers)
3844 {
3845 next_to_clean = 0;
3846 next_passes++;
3847 }
3848 num_to_scan--;
3849
3850 if (sync_state & BUF_WRITTEN)
3851 {
3852 reusable_buffers++;
3853 if (++num_written >= bgwriter_lru_maxpages)
3854 {
3855 PendingBgWriterStats.maxwritten_clean++;
3856 break;
3857 }
3858 }
3859 else if (sync_state & BUF_REUSABLE)
3860 reusable_buffers++;
3861 }
3862
3863 PendingBgWriterStats.buf_written_clean += num_written;
3864
3865#ifdef BGW_DEBUG
3866 elog(DEBUG1, "bgwriter: recent_alloc=%u smoothed=%.2f delta=%ld ahead=%d density=%.2f reusable_est=%d upcoming_est=%d scanned=%d wrote=%d reusable=%d",
3867 recent_alloc, smoothed_alloc, strategy_delta, bufs_ahead,
3868 smoothed_density, reusable_buffers_est, upcoming_alloc_est,
3869 bufs_to_lap - num_to_scan,
3870 num_written,
3871 reusable_buffers - reusable_buffers_est);
3872#endif
3873
3874 /*
3875 * Consider the above scan as being like a new allocation scan.
3876 * Characterize its density and update the smoothed one based on it. This
3877 * effectively halves the moving average period in cases where both the
3878 * strategy and the background writer are doing some useful scanning,
3879 * which is helpful because a long memory isn't as desirable on the
3880 * density estimates.
3881 */
3882 new_strategy_delta = bufs_to_lap - num_to_scan;
3883 new_recent_alloc = reusable_buffers - reusable_buffers_est;
3884 if (new_strategy_delta > 0 && new_recent_alloc > 0)
3885 {
3886 scans_per_alloc = (float) new_strategy_delta / (float) new_recent_alloc;
3887 smoothed_density += (scans_per_alloc - smoothed_density) /
3888 smoothing_samples;
3889
3890#ifdef BGW_DEBUG
3891 elog(DEBUG2, "bgwriter: cleaner density alloc=%u scan=%ld density=%.2f new smoothed=%.2f",
3892 new_recent_alloc, new_strategy_delta,
3893 scans_per_alloc, smoothed_density);
3894#endif
3895 }
3896
3897 /* Return true if OK to hibernate */
3898 return (bufs_to_lap == 0 && recent_alloc == 0);
3899}
int BgWriterDelay
Definition: bgwriter.c:58
#define BUF_REUSABLE
Definition: bufmgr.c:78
double bgwriter_lru_multiplier
Definition: bufmgr.c:143
static int SyncOneBuffer(int buf_id, bool skip_recently_used, WritebackContext *wb_context)
Definition: bufmgr.c:3916
int bgwriter_lru_maxpages
Definition: bufmgr.c:142
#define BUF_WRITTEN
Definition: bufmgr.c:77
int32_t int32
Definition: c.h:498
#define DEBUG2
Definition: elog.h:29
#define DEBUG1
Definition: elog.h:30
#define elog(elevel,...)
Definition: elog.h:226
int StrategySyncStart(uint32 *complete_passes, uint32 *num_buf_alloc)
Definition: freelist.c:394
int NBuffers
Definition: globals.c:143
PgStat_BgWriterStats PendingBgWriterStats
PgStat_Counter buf_written_clean
Definition: pgstat.h:239
PgStat_Counter maxwritten_clean
Definition: pgstat.h:240
PgStat_Counter buf_alloc
Definition: pgstat.h:241

References Assert(), bgwriter_lru_maxpages, bgwriter_lru_multiplier, BgWriterDelay, PgStat_BgWriterStats::buf_alloc, BUF_REUSABLE, BUF_WRITTEN, PgStat_BgWriterStats::buf_written_clean, DEBUG1, DEBUG2, elog, PgStat_BgWriterStats::maxwritten_clean, NBuffers, PendingBgWriterStats, StrategySyncStart(), and SyncOneBuffer().

Referenced by BackgroundWriterMain().
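
The fast-attack, slow-decline smoothing used above for smoothed_alloc reduces to a few lines. A standalone restatement with invented names (smoothed_density uses the same exponential form without the fast-attack branch):

/* Restatement of the allocation-rate smoothing rule in BgBufferSync() */
static float
smooth_allocation_rate(float smoothed, float recent, float smoothing_samples)
{
    if (smoothed <= recent)
        return recent;          /* fast attack: immediately follow any increase */
    /* slow decline: move 1/smoothing_samples of the way toward 'recent' */
    return smoothed + (recent - smoothed) / smoothing_samples;
}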

◆ buffer_readv_complete()

static pg_attribute_always_inline PgAioResult buffer_readv_complete ( PgAioHandle *  ioh,
PgAioResult  prior_result,
uint8  cb_data,
bool  is_temp 
)
static

Definition at line 7107 of file bufmgr.c.

7109{
7110 PgAioResult result = prior_result;
7111 PgAioTargetData *td = pgaio_io_get_target_data(ioh);
7112 uint8 first_error_off = 0;
7113 uint8 first_zeroed_off = 0;
7114 uint8 first_ignored_off = 0;
7115 uint8 error_count = 0;
7116 uint8 zeroed_count = 0;
7117 uint8 ignored_count = 0;
7118 uint8 checkfail_count = 0;
7119 uint64 *io_data;
7120 uint8 handle_data_len;
7121
7122 if (is_temp)
7123 {
7124 Assert(td->smgr.is_temp);
7125 Assert(pgaio_io_get_owner(ioh) == MyProcNumber);
7126 }
7127 else
7128 Assert(!td->smgr.is_temp);
7129
7130 /*
7131 * Iterate over all the buffers affected by this IO and call the
7132 * per-buffer completion function for each buffer.
7133 */
7134 io_data = pgaio_io_get_handle_data(ioh, &handle_data_len);
7135 for (uint8 buf_off = 0; buf_off < handle_data_len; buf_off++)
7136 {
7137 Buffer buf = io_data[buf_off];
7138 bool failed;
7139 bool failed_verification = false;
7140 bool failed_checksum = false;
7141 bool zeroed_buffer = false;
7142 bool ignored_checksum = false;
7143
7144 Assert(BufferIsValid(buf));
7145
7146 /*
7147 * If the entire I/O failed on a lower-level, each buffer needs to be
7148 * marked as failed. In case of a partial read, the first few buffers
7149 * may be ok.
7150 */
7151 failed =
7152 prior_result.status == PGAIO_RS_ERROR
7153 || prior_result.result <= buf_off;
7154
7155 buffer_readv_complete_one(td, buf_off, buf, cb_data, failed, is_temp,
7156 &failed_verification,
7157 &failed_checksum,
7158 &ignored_checksum,
7159 &zeroed_buffer);
7160
7161 /*
7162 * Track information about the number of different kinds of error
7163 * conditions across all pages, as there can be multiple pages failing
7164 * verification as part of one IO.
7165 */
7166 if (failed_verification && !zeroed_buffer && error_count++ == 0)
7167 first_error_off = buf_off;
7168 if (zeroed_buffer && zeroed_count++ == 0)
7169 first_zeroed_off = buf_off;
7170 if (ignored_checksum && ignored_count++ == 0)
7171 first_ignored_off = buf_off;
7172 if (failed_checksum)
7173 checkfail_count++;
7174 }
7175
7176 /*
7177 * If the smgr read succeeded [partially] and page verification failed for
7178 * some of the pages, adjust the IO's result state appropriately.
7179 */
7180 if (prior_result.status != PGAIO_RS_ERROR &&
7181 (error_count > 0 || ignored_count > 0 || zeroed_count > 0))
7182 {
7183 buffer_readv_encode_error(&result, is_temp,
7184 zeroed_count > 0, ignored_count > 0,
7185 error_count, zeroed_count, checkfail_count,
7186 first_error_off, first_zeroed_off,
7187 first_ignored_off);
7188 pgaio_result_report(result, td, DEBUG1);
7189 }
7190
7191 /*
7192 * For shared relations this reporting is done in
7193 * shared_buffer_readv_complete_local().
7194 */
7195 if (is_temp && checkfail_count > 0)
7196 pgstat_report_checksum_failures_in_db(td->smgr.rlocator.dbOid,
7197 checkfail_count);
7198
7199 return result;
7200}
ProcNumber pgaio_io_get_owner(PgAioHandle *ioh)
Definition: aio.c:343
uint64 * pgaio_io_get_handle_data(PgAioHandle *ioh, uint8 *len)
Definition: aio_callback.c:154
void pgaio_result_report(PgAioResult result, const PgAioTargetData *target_data, int elevel)
Definition: aio_callback.c:171
PgAioTargetData * pgaio_io_get_target_data(PgAioHandle *ioh)
Definition: aio_target.c:72
@ PGAIO_RS_ERROR
Definition: aio_types.h:84
static pg_attribute_always_inline void buffer_readv_complete_one(PgAioTargetData *td, uint8 buf_off, Buffer buffer, uint8 flags, bool failed, bool is_temp, bool *buffer_invalid, bool *failed_checksum, bool *ignored_checksum, bool *zeroed_buffer)
Definition: bufmgr.c:6963
static void buffer_readv_encode_error(PgAioResult *result, bool is_temp, bool zeroed_any, bool ignored_any, uint8 error_count, uint8 zeroed_count, uint8 checkfail_count, uint8 first_error_off, uint8 first_zeroed_off, uint8 first_ignored_off)
Definition: bufmgr.c:6868
uint8_t uint8
Definition: c.h:500
uint64_t uint64
Definition: c.h:503
ProcNumber MyProcNumber
Definition: globals.c:91
static char * buf
Definition: pg_test_fsync.c:72
void pgstat_report_checksum_failures_in_db(Oid dboid, int failurecount)
uint32 status
Definition: aio_types.h:108
int32 result
Definition: aio_types.h:113
RelFileLocator rlocator
Definition: aio_types.h:65
struct PgAioTargetData::@124 smgr

References Assert(), buf, buffer_readv_complete_one(), buffer_readv_encode_error(), BufferIsValid(), RelFileLocator::dbOid, DEBUG1, PgAioTargetData::is_temp, MyProcNumber, pgaio_io_get_handle_data(), pgaio_io_get_owner(), pgaio_io_get_target_data(), pgaio_result_report(), PGAIO_RS_ERROR, pgstat_report_checksum_failures_in_db(), PgAioResult::result, PgAioTargetData::rlocator, PgAioTargetData::smgr, and PgAioResult::status.

Referenced by local_buffer_readv_complete(), and shared_buffer_readv_complete().

◆ buffer_readv_complete_one()

static pg_attribute_always_inline void buffer_readv_complete_one ( PgAioTargetData *  td,
uint8  buf_off,
Buffer  buffer,
uint8  flags,
bool  failed,
bool  is_temp,
bool *  buffer_invalid,
bool *  failed_checksum,
bool *  ignored_checksum,
bool *  zeroed_buffer 
)
static

Definition at line 6963 of file bufmgr.c.

6969{
6970 BufferDesc *buf_hdr = is_temp ?
6971 GetLocalBufferDescriptor(-buffer - 1)
6972 : GetBufferDescriptor(buffer - 1);
6973 BufferTag tag = buf_hdr->tag;
6974 char *bufdata = BufferGetBlock(buffer);
6975 uint32 set_flag_bits;
6976 int piv_flags;
6977
6978 /* check that the buffer is in the expected state for a read */
6979#ifdef USE_ASSERT_CHECKING
6980 {
6981 uint32 buf_state = pg_atomic_read_u32(&buf_hdr->state);
6982
6983 Assert(buf_state & BM_TAG_VALID);
6984 Assert(!(buf_state & BM_VALID));
6985 /* temp buffers don't use BM_IO_IN_PROGRESS */
6986 if (!is_temp)
6987 Assert(buf_state & BM_IO_IN_PROGRESS);
6988 Assert(!(buf_state & BM_DIRTY));
6989 }
6990#endif
6991
6992 *buffer_invalid = false;
6993 *failed_checksum = false;
6994 *ignored_checksum = false;
6995 *zeroed_buffer = false;
6996
6997 /*
6998 * We ask PageIsVerified() to only log the message about checksum errors,
6999 * as the completion might be run in any backend (or IO workers). We will
7000 * report checksum errors in buffer_readv_report().
7001 */
7002 piv_flags = PIV_LOG_LOG;
7003
7004 /* the local zero_damaged_pages may differ from the definer's */
7005 if (flags & READ_BUFFERS_IGNORE_CHECKSUM_FAILURES)
7006 piv_flags |= PIV_IGNORE_CHECKSUM_FAILURE;
7007
7008 /* Check for garbage data. */
7009 if (!failed)
7010 {
7011 /*
7012 * If the buffer is not currently pinned by this backend, e.g. because
7013 * we're completing this IO after an error, the buffer data will have
7014 * been marked as inaccessible when the buffer was unpinned. The AIO
7015 * subsystem holds a pin, but that doesn't prevent the buffer from
7016 * having been marked as inaccessible. The completion might also be
7017 * executed in a different process.
7018 */
7019#ifdef USE_VALGRIND
7020 if (!BufferIsPinned(buffer))
7021 VALGRIND_MAKE_MEM_DEFINED(bufdata, BLCKSZ);
7022#endif
7023
7024 if (!PageIsVerified((Page) bufdata, tag.blockNum, piv_flags,
7025 failed_checksum))
7026 {
7027 if (flags & READ_BUFFERS_ZERO_ON_ERROR)
7028 {
7029 memset(bufdata, 0, BLCKSZ);
7030 *zeroed_buffer = true;
7031 }
7032 else
7033 {
7034 *buffer_invalid = true;
7035 /* mark buffer as having failed */
7036 failed = true;
7037 }
7038 }
7039 else if (*failed_checksum)
7040 *ignored_checksum = true;
7041
7042 /* undo what we did above */
7043#ifdef USE_VALGRIND
7044 if (!BufferIsPinned(buffer))
7045 VALGRIND_MAKE_MEM_NOACCESS(bufdata, BLCKSZ);
7046#endif
7047
7048 /*
7049 * Immediately log a message about the invalid page, but only to the
7050 * server log. The reason to do so immediately is that this may be
7051 * executed in a different backend than the one that originated the
7052 * request. The reason to do so at all is that the originator
7053 * might not process the query result immediately (because it is busy
7054 * doing another part of query processing) or at all (e.g. if it was
7055 * cancelled or errored out due to another IO also failing). The
7056 * definer of the IO will emit an ERROR or WARNING when processing the
7057 * IO's results
7058 *
7059 * To avoid duplicating the code to emit these log messages, we reuse
7060 * buffer_readv_report().
7061 */
7062 if (*buffer_invalid || *failed_checksum || *zeroed_buffer)
7063 {
7064 PgAioResult result_one = {0};
7065
7066 buffer_readv_encode_error(&result_one, is_temp,
7067 *zeroed_buffer,
7068 *ignored_checksum,
7069 *buffer_invalid,
7070 *zeroed_buffer ? 1 : 0,
7071 *failed_checksum ? 1 : 0,
7072 buf_off, buf_off, buf_off);
7073 pgaio_result_report(result_one, td, LOG_SERVER_ONLY);
7074 }
7075 }
7076
7077 /* Terminate I/O and set BM_VALID. */
7078 set_flag_bits = failed ? BM_IO_ERROR : BM_VALID;
7079 if (is_temp)
7080 TerminateLocalBufferIO(buf_hdr, false, set_flag_bits, true);
7081 else
7082 TerminateBufferIO(buf_hdr, false, set_flag_bits, false, true);
7083
7084 /*
7085 * Call the BUFFER_READ_DONE tracepoint in the callback, even though the
7086 * callback may not be executed in the same backend that called
7087 * BUFFER_READ_START. The alternative would be to defer calling the
7088 * tracepoint to a later point (e.g. the local completion callback for
7089 * shared buffer reads), which seems even less helpful.
7090 */
7091 TRACE_POSTGRESQL_BUFFER_READ_DONE(tag.forkNum,
7092 tag.blockNum,
7093 tag.spcOid,
7094 tag.dbOid,
7095 tag.relNumber,
7096 is_temp ? MyProcNumber : INVALID_PROC_NUMBER,
7097 false);
7098}
static uint32 pg_atomic_read_u32(volatile pg_atomic_uint32 *ptr)
Definition: atomics.h:239
static BufferDesc * GetLocalBufferDescriptor(uint32 id)
#define BufferIsPinned(bufnum)
Definition: bufmgr.c:480
bool PageIsVerified(PageData *page, BlockNumber blkno, int flags, bool *checksum_failure_p)
Definition: bufpage.c:94
#define PIV_LOG_LOG
Definition: bufpage.h:469
PageData * Page
Definition: bufpage.h:82
#define PIV_IGNORE_CHECKSUM_FAILURE
Definition: bufpage.h:470
#define LOG_SERVER_ONLY
Definition: elog.h:32
void TerminateLocalBufferIO(BufferDesc *bufHdr, bool clear_dirty, uint32 set_flag_bits, bool release_aio)
Definition: localbuf.c:560
#define VALGRIND_MAKE_MEM_DEFINED(addr, size)
Definition: memdebug.h:26
#define VALGRIND_MAKE_MEM_NOACCESS(addr, size)
Definition: memdebug.h:27
#define INVALID_PROC_NUMBER
Definition: procnumber.h:26
pg_atomic_uint32 state
RelFileNumber relNumber
ForkNumber forkNum
Oid spcOid

References Assert(), buftag::blockNum, BM_DIRTY, BM_IO_ERROR, BM_IO_IN_PROGRESS, BM_TAG_VALID, BM_VALID, PrivateRefCountEntry::buffer, buffer_readv_encode_error(), BufferGetBlock(), BufferIsPinned, buftag::dbOid, buftag::forkNum, GetBufferDescriptor(), GetLocalBufferDescriptor(), INVALID_PROC_NUMBER, LOG_SERVER_ONLY, MyProcNumber, PageIsVerified(), pg_atomic_read_u32(), pgaio_result_report(), PIV_IGNORE_CHECKSUM_FAILURE, PIV_LOG_LOG, READ_BUFFERS_IGNORE_CHECKSUM_FAILURES, READ_BUFFERS_ZERO_ON_ERROR, buftag::relNumber, buftag::spcOid, BufferDesc::state, BufferDesc::tag, TerminateBufferIO(), TerminateLocalBufferIO(), VALGRIND_MAKE_MEM_DEFINED, and VALGRIND_MAKE_MEM_NOACCESS.

Referenced by buffer_readv_complete().

◆ buffer_readv_decode_error()

static void buffer_readv_decode_error ( PgAioResult  result,
bool *  zeroed_any,
bool *  ignored_any,
uint8 *  zeroed_or_error_count,
uint8 *  checkfail_count,
uint8 *  first_off 
)
inline static

Definition at line 6826 of file bufmgr.c.

6832{
6833 uint32 rem_error = result.error_data;
6834
6835 /* see static asserts in buffer_readv_encode_error */
6836#define READV_COUNT_BITS 7
6837#define READV_COUNT_MASK ((1 << READV_COUNT_BITS) - 1)
6838
6839 *zeroed_any = rem_error & 1;
6840 rem_error >>= 1;
6841
6842 *ignored_any = rem_error & 1;
6843 rem_error >>= 1;
6844
6845 *zeroed_or_error_count = rem_error & READV_COUNT_MASK;
6846 rem_error >>= READV_COUNT_BITS;
6847
6848 *checkfail_count = rem_error & READV_COUNT_MASK;
6849 rem_error >>= READV_COUNT_BITS;
6850
6851 *first_off = rem_error & READV_COUNT_MASK;
6852 rem_error >>= READV_COUNT_BITS;
6853}
#define READV_COUNT_BITS
#define READV_COUNT_MASK
uint32 error_data
Definition: aio_types.h:111

References PgAioResult::error_data, READV_COUNT_BITS, and READV_COUNT_MASK.

Referenced by buffer_readv_encode_error(), buffer_readv_report(), and shared_buffer_readv_complete_local().

◆ buffer_readv_encode_error()

static void buffer_readv_encode_error ( PgAioResult *  result,
bool  is_temp,
bool  zeroed_any,
bool  ignored_any,
uint8  error_count,
uint8  zeroed_count,
uint8  checkfail_count,
uint8  first_error_off,
uint8  first_zeroed_off,
uint8  first_ignored_off 
)
inline static

Definition at line 6868 of file bufmgr.c.

6878{
6879
6880 uint8 shift = 0;
6881 uint8 zeroed_or_error_count =
6882 error_count > 0 ? error_count : zeroed_count;
6883 uint8 first_off;
6884
6885 StaticAssertStmt(PG_IOV_MAX <= READV_COUNT_MASK,
6886 "PG_IOV_MAX is bigger than reserved space for error data");
6888 "PGAIO_RESULT_ERROR_BITS is insufficient for buffer_readv");
6889
6890 /*
6891 * We only have space to encode one offset - but luckily that's good
6892 * enough. If there is an error, the error is the interesting offset, same
6893 * with a zeroed buffer vs an ignored buffer.
6894 */
6895 if (error_count > 0)
6896 first_off = first_error_off;
6897 else if (zeroed_count > 0)
6898 first_off = first_zeroed_off;
6899 else
6900 first_off = first_ignored_off;
6901
6902 Assert(!zeroed_any || error_count == 0);
6903
6904 result->error_data = 0;
6905
6906 result->error_data |= zeroed_any << shift;
6907 shift += 1;
6908
6909 result->error_data |= ignored_any << shift;
6910 shift += 1;
6911
6912 result->error_data |= ((uint32) zeroed_or_error_count) << shift;
6913 shift += READV_COUNT_BITS;
6914
6915 result->error_data |= ((uint32) checkfail_count) << shift;
6916 shift += READV_COUNT_BITS;
6917
6918 result->error_data |= ((uint32) first_off) << shift;
6919 shift += READV_COUNT_BITS;
6920
6921 result->id = is_temp ? PGAIO_HCB_LOCAL_BUFFER_READV :
6922 PGAIO_HCB_SHARED_BUFFER_READV;
6923
6924 if (error_count > 0)
6925 result->status = PGAIO_RS_ERROR;
6926 else
6927 result->status = PGAIO_RS_WARNING;
6928
6929 /*
6930 * The encoding is complicated enough to warrant cross-checking it against
6931 * the decode function.
6932 */
6933#ifdef USE_ASSERT_CHECKING
6934 {
6935 bool zeroed_any_2,
6936 ignored_any_2;
6937 uint8 zeroed_or_error_count_2,
6938 checkfail_count_2,
6939 first_off_2;
6940
6941 buffer_readv_decode_error(*result,
6942 &zeroed_any_2, &ignored_any_2,
6943 &zeroed_or_error_count_2,
6944 &checkfail_count_2,
6945 &first_off_2);
6946 Assert(zeroed_any == zeroed_any_2);
6947 Assert(ignored_any == ignored_any_2);
6948 Assert(zeroed_or_error_count == zeroed_or_error_count_2);
6949 Assert(checkfail_count == checkfail_count_2);
6950 Assert(first_off == first_off_2);
6951 }
6952#endif
6953
6954#undef READV_COUNT_BITS
6955#undef READV_COUNT_MASK
6956}
#define PGAIO_RESULT_ERROR_BITS
Definition: aio_types.h:98
@ PGAIO_RS_WARNING
Definition: aio_types.h:83
static void buffer_readv_decode_error(PgAioResult result, bool *zeroed_any, bool *ignored_any, uint8 *zeroed_or_error_count, uint8 *checkfail_count, uint8 *first_off)
Definition: bufmgr.c:6826
#define StaticAssertStmt(condition, errmessage)
Definition: c.h:909
#define PG_IOV_MAX
Definition: pg_iovec.h:41
uint32 id
Definition: aio_types.h:105

References Assert(), buffer_readv_decode_error(), PgAioResult::error_data, PgAioResult::id, PG_IOV_MAX, PGAIO_HCB_LOCAL_BUFFER_READV, PGAIO_HCB_SHARED_BUFFER_READV, PGAIO_RESULT_ERROR_BITS, PGAIO_RS_ERROR, PGAIO_RS_WARNING, READV_COUNT_BITS, StaticAssertStmt, and PgAioResult::status.

Referenced by buffer_readv_complete(), and buffer_readv_complete_one().
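
Taken together, the encode/decode pair defines a 23-bit layout inside PgAioResult.error_data: bit 0 is zeroed_any, bit 1 is ignored_any, followed by three READV_COUNT_BITS-wide (7-bit) fields holding zeroed_or_error_count, checkfail_count and first_off. The following standalone program is an editor's sketch of that layout, re-implemented with plain stdint types purely for illustration; it is not part of bufmgr.c, and the names COUNT_BITS/COUNT_MASK are local stand-ins for the macros above.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define COUNT_BITS 7
#define COUNT_MASK ((1u << COUNT_BITS) - 1)

int
main(void)
{
    uint32_t    error_data = 0;
    uint8_t     shift = 0;

    /* encode: zeroed_any set, three zeroed pages, first at offset 2 */
    error_data |= 1u << shift;          /* zeroed_any */
    shift += 1;
    error_data |= 0u << shift;          /* ignored_any */
    shift += 1;
    error_data |= 3u << shift;          /* zeroed_or_error_count */
    shift += COUNT_BITS;
    error_data |= 0u << shift;          /* checkfail_count */
    shift += COUNT_BITS;
    error_data |= 2u << shift;          /* first_off */

    /* decode, mirroring buffer_readv_decode_error() */
    uint32_t    rem = error_data;
    int         zeroed_any = rem & 1;

    rem >>= 1;
    int         ignored_any = rem & 1;

    rem >>= 1;
    unsigned    zeroed_or_error_count = rem & COUNT_MASK;

    rem >>= COUNT_BITS;
    unsigned    checkfail_count = rem & COUNT_MASK;

    rem >>= COUNT_BITS;
    unsigned    first_off = rem & COUNT_MASK;

    assert(zeroed_any && !ignored_any);
    assert(zeroed_or_error_count == 3 && checkfail_count == 0 && first_off == 2);
    printf("zeroed_any=%d count=%u first_off=%u\n",
           zeroed_any, zeroed_or_error_count, first_off);
    return 0;
}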

◆ buffer_readv_report()

static void buffer_readv_report ( PgAioResult  result,
const PgAioTargetData td,
int  elevel 
)
static

Definition at line 7210 of file bufmgr.c.

7212{
7213 int nblocks = td->smgr.nblocks;
7214 BlockNumber first = td->smgr.blockNum;
7215 BlockNumber last = first + nblocks - 1;
7216 ProcNumber errProc =
7217 td->smgr.is_temp ? MyProcNumber : INVALID_PROC_NUMBER;
7218 RelPathStr rpath =
7219 relpathbackend(td->smgr.rlocator, errProc, td->smgr.forkNum);
7220 bool zeroed_any,
7221 ignored_any;
7222 uint8 zeroed_or_error_count,
7223 checkfail_count,
7224 first_off;
7225 uint8 affected_count;
7226 const char *msg_one,
7227 *msg_mult,
7228 *det_mult,
7229 *hint_mult;
7230
7231 buffer_readv_decode_error(result, &zeroed_any, &ignored_any,
7232 &zeroed_or_error_count,
7233 &checkfail_count,
7234 &first_off);
7235
7236 /*
7237 * Treat a read that had both zeroed buffers *and* ignored checksums as a
7238 * special case, it's too irregular to be emitted the same way as the
7239 * other cases.
7240 */
7241 if (zeroed_any && ignored_any)
7242 {
7243 Assert(zeroed_any && ignored_any);
7244 Assert(nblocks > 1); /* same block can't be both zeroed and ignored */
7245 Assert(result.status != PGAIO_RS_ERROR);
7246 affected_count = zeroed_or_error_count;
7247
7248 ereport(elevel,
7250 errmsg("zeroing %u page(s) and ignoring %u checksum failure(s) among blocks %u..%u of relation %s",
7251 affected_count, checkfail_count, first, last, rpath.str),
7252 affected_count > 1 ?
7253 errdetail("Block %u held first zeroed page.",
7254 first + first_off) : 0,
7255 errhint("See server log for details about the other %u invalid block(s).",
7256 affected_count + checkfail_count - 1));
7257 return;
7258 }
7259
7260 /*
7261 * The other messages are highly repetitive. To avoid duplicating a long
7262 * and complicated ereport(), gather the translated format strings
7263 * separately and then do one common ereport.
7264 */
7265 if (result.status == PGAIO_RS_ERROR)
7266 {
7267 Assert(!zeroed_any); /* can't have invalid pages when zeroing them */
7268 affected_count = zeroed_or_error_count;
7269 msg_one = _("invalid page in block %u of relation %s");
7270 msg_mult = _("%u invalid pages among blocks %u..%u of relation %s");
7271 det_mult = _("Block %u held first invalid page.");
7272 hint_mult = _("See server log for the other %u invalid block(s).");
7273 }
7274 else if (zeroed_any && !ignored_any)
7275 {
7276 affected_count = zeroed_or_error_count;
7277 msg_one = _("invalid page in block %u of relation %s; zeroing out page");
7278 msg_mult = _("zeroing out %u invalid pages among blocks %u..%u of relation %s");
7279 det_mult = _("Block %u held first zeroed page.");
7280 hint_mult = _("See server log for the other %u zeroed block(s).");
7281 }
7282 else if (!zeroed_any && ignored_any)
7283 {
7284 affected_count = checkfail_count;
7285 msg_one = _("ignoring checksum failure in block %u of relation %s");
7286 msg_mult = _("ignoring %u checksum failures among blocks %u..%u of relation %s");
7287 det_mult = _("Block %u held first ignored page.");
7288 hint_mult = _("See server log for the other %u ignored block(s).");
7289 }
7290 else
7291 pg_unreachable();
7292
7293 ereport(elevel,
7294 errcode(ERRCODE_DATA_CORRUPTED),
7295 affected_count == 1 ?
7296 errmsg_internal(msg_one, first + first_off, rpath.str) :
7297 errmsg_internal(msg_mult, affected_count, first, last, rpath.str),
7298 affected_count > 1 ? errdetail_internal(det_mult, first + first_off) : 0,
7299 affected_count > 1 ? errhint_internal(hint_mult, affected_count - 1) : 0);
7300}
#define pg_unreachable()
Definition: c.h:332
int errmsg_internal(const char *fmt,...)
Definition: elog.c:1158
int errdetail_internal(const char *fmt,...)
Definition: elog.c:1231
int errhint_internal(const char *fmt,...)
Definition: elog.c:1340
int errhint(const char *fmt,...)
Definition: elog.c:1318
#define _(x)
Definition: elog.c:91
#define ERRCODE_DATA_CORRUPTED
Definition: pg_basebackup.c:41
int ProcNumber
Definition: procnumber.h:24
#define relpathbackend(rlocator, backend, forknum)
Definition: relpath.h:141
char str[REL_PATH_STR_MAXLEN+1]
Definition: relpath.h:123
BlockNumber blockNum
Definition: aio_types.h:66
BlockNumber nblocks
Definition: aio_types.h:67
ForkNumber forkNum
Definition: aio_types.h:68

References _, Assert(), PgAioTargetData::blockNum, buffer_readv_decode_error(), ereport, errcode(), ERRCODE_DATA_CORRUPTED, errdetail(), errdetail_internal(), errhint(), errhint_internal(), errmsg(), errmsg_internal(), PgAioTargetData::forkNum, INVALID_PROC_NUMBER, PgAioTargetData::is_temp, MyProcNumber, PgAioTargetData::nblocks, pg_unreachable, PGAIO_RS_ERROR, relpathbackend, PgAioTargetData::rlocator, PgAioTargetData::smgr, PgAioResult::status, and RelPathStr::str.

◆ buffer_stage_common()

static pg_attribute_always_inline void buffer_stage_common ( PgAioHandle ioh,
bool  is_write,
bool  is_temp 
)
static

Definition at line 6717 of file bufmgr.c.

6718{
6719 uint64 *io_data;
6720 uint8 handle_data_len;
6721 PgAioWaitRef io_ref;
6722 BufferTag first PG_USED_FOR_ASSERTS_ONLY = {0};
6723
6724 io_data = pgaio_io_get_handle_data(ioh, &handle_data_len);
6725
6726 pgaio_io_get_wref(ioh, &io_ref);
6727
6728 /* iterate over all buffers affected by the vectored readv/writev */
6729 for (int i = 0; i < handle_data_len; i++)
6730 {
6731 Buffer buffer = (Buffer) io_data[i];
6732 BufferDesc *buf_hdr = is_temp ?
6733 GetLocalBufferDescriptor(-buffer - 1)
6734 : GetBufferDescriptor(buffer - 1);
6735 uint32 buf_state;
6736
6737 /*
6738 * Check that all the buffers are actually ones that could conceivably
6739 * be done in one IO, i.e. are sequential. This is the last
6740 * buffer-aware code before IO is actually executed and confusion
6741 * about which buffers are targeted by IO can be hard to debug, making
6742 * it worth doing extra-paranoid checks.
6743 */
6744 if (i == 0)
6745 first = buf_hdr->tag;
6746 else
6747 {
6748 Assert(buf_hdr->tag.relNumber == first.relNumber);
6749 Assert(buf_hdr->tag.blockNum == first.blockNum + i);
6750 }
6751
6752 if (is_temp)
6753 buf_state = pg_atomic_read_u32(&buf_hdr->state);
6754 else
6755 buf_state = LockBufHdr(buf_hdr);
6756
6757 /* verify the buffer is in the expected state */
6758 Assert(buf_state & BM_TAG_VALID);
6759 if (is_write)
6760 {
6761 Assert(buf_state & BM_VALID);
6762 Assert(buf_state & BM_DIRTY);
6763 }
6764 else
6765 {
6766 Assert(!(buf_state & BM_VALID));
6767 Assert(!(buf_state & BM_DIRTY));
6768 }
6769
6770 /* temp buffers don't use BM_IO_IN_PROGRESS */
6771 if (!is_temp)
6772 Assert(buf_state & BM_IO_IN_PROGRESS);
6773
6774 Assert(BUF_STATE_GET_REFCOUNT(buf_state) >= 1);
6775
6776 /*
6777 * Reflect that the buffer is now owned by the AIO subsystem.
6778 *
6779 * For local buffers: This can't be done just via LocalRefCount, as
6780 * one might initially think, as this backend could error out while
6781 * AIO is still in progress, releasing all the pins by the backend
6782 * itself.
6783 *
6784 * This pin is released again in TerminateBufferIO().
6785 */
6786 buf_state += BUF_REFCOUNT_ONE;
6787 buf_hdr->io_wref = io_ref;
6788
6789 if (is_temp)
6790 pg_atomic_unlocked_write_u32(&buf_hdr->state, buf_state);
6791 else
6792 UnlockBufHdr(buf_hdr, buf_state);
6793
6794 /*
6795 * Ensure the content lock that prevents buffer modifications while
6796 * the buffer is being written out is not released early due to an
6797 * error.
6798 */
6799 if (is_write && !is_temp)
6800 {
6801 LWLock *content_lock;
6802
6803 content_lock = BufferDescriptorGetContentLock(buf_hdr);
6804
6805 Assert(LWLockHeldByMe(content_lock));
6806
6807 /*
6808 * Lock is now owned by AIO subsystem.
6809 */
6810 LWLockDisown(content_lock);
6811 }
6812
6813 /*
6814 * Stop tracking this buffer via the resowner - the AIO system now
6815 * keeps track.
6816 */
6817 if (!is_temp)
6818 ResourceOwnerForgetBufferIO(CurrentResourceOwner, buffer);
6819 }
6820}
static void pg_atomic_unlocked_write_u32(volatile pg_atomic_uint32 *ptr, uint32 val)
Definition: atomics.h:295
#define BUF_REFCOUNT_ONE
Definition: buf_internals.h:51
static LWLock * BufferDescriptorGetContentLock(const BufferDesc *bdesc)
static void ResourceOwnerForgetBufferIO(ResourceOwner owner, Buffer buffer)
#define BUF_STATE_GET_REFCOUNT(state)
Definition: buf_internals.h:59
#define PG_USED_FOR_ASSERTS_ONLY
Definition: c.h:224
bool LWLockHeldByMe(LWLock *lock)
Definition: lwlock.c:1970
void LWLockDisown(LWLock *lock)
Definition: lwlock.c:1891
PgAioWaitRef io_wref
Definition: lwlock.h:42

References Assert(), buftag::blockNum, BM_DIRTY, BM_IO_IN_PROGRESS, BM_TAG_VALID, BM_VALID, BUF_REFCOUNT_ONE, BUF_STATE_GET_REFCOUNT, PrivateRefCountEntry::buffer, BufferDescriptorGetContentLock(), CurrentResourceOwner, GetBufferDescriptor(), GetLocalBufferDescriptor(), i, BufferDesc::io_wref, LockBufHdr(), LWLockDisown(), LWLockHeldByMe(), pg_atomic_read_u32(), pg_atomic_unlocked_write_u32(), PG_USED_FOR_ASSERTS_ONLY, pgaio_io_get_handle_data(), pgaio_io_get_wref(), buftag::relNumber, ResourceOwnerForgetBufferIO(), BufferDesc::state, BufferDesc::tag, and UnlockBufHdr().

Referenced by local_buffer_readv_stage(), and shared_buffer_readv_stage().
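
The heart of the paranoia above is the requirement that all buffers staged for one vectored I/O cover consecutive blocks of a single relation. A minimal standalone sketch of that invariant check follows (editor's illustration only; ToyTag is a hypothetical stand-in for BufferTag):

#include <assert.h>
#include <stdint.h>

typedef struct
{
    uint32_t    relNumber;
    uint32_t    blockNum;
} ToyTag;

/* assert the same invariant buffer_stage_common() checks before I/O */
static void
check_sequential(const ToyTag *tags, int n)
{
    for (int i = 1; i < n; i++)
    {
        assert(tags[i].relNumber == tags[0].relNumber);
        assert(tags[i].blockNum == tags[0].blockNum + (uint32_t) i);
    }
}

int
main(void)
{
    ToyTag      tags[] = {{42, 10}, {42, 11}, {42, 12}};

    check_sequential(tags, 3);
    return 0;
}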

◆ BufferAlloc()

static pg_attribute_always_inline BufferDesc * BufferAlloc ( SMgrRelation  smgr,
char  relpersistence,
ForkNumber  forkNum,
BlockNumber  blockNum,
BufferAccessStrategy  strategy,
bool *  foundPtr,
IOContext  io_context 
)
inlinestatic

Definition at line 1998 of file bufmgr.c.

2002{
2003 BufferTag newTag; /* identity of requested block */
2004 uint32 newHash; /* hash value for newTag */
2005 LWLock *newPartitionLock; /* buffer partition lock for it */
2006 int existing_buf_id;
2007 Buffer victim_buffer;
2008 BufferDesc *victim_buf_hdr;
2009 uint32 victim_buf_state;
2010
2011 /* Make sure we will have room to remember the buffer pin */
2012 ResourceOwnerEnlarge(CurrentResourceOwner);
2013 ReservePrivateRefCountEntry();
2014
2015 /* create a tag so we can lookup the buffer */
2016 InitBufferTag(&newTag, &smgr->smgr_rlocator.locator, forkNum, blockNum);
2017
2018 /* determine its hash code and partition lock ID */
2019 newHash = BufTableHashCode(&newTag);
2020 newPartitionLock = BufMappingPartitionLock(newHash);
2021
2022 /* see if the block is in the buffer pool already */
2023 LWLockAcquire(newPartitionLock, LW_SHARED);
2024 existing_buf_id = BufTableLookup(&newTag, newHash);
2025 if (existing_buf_id >= 0)
2026 {
2027 BufferDesc *buf;
2028 bool valid;
2029
2030 /*
2031 * Found it. Now, pin the buffer so no one can steal it from the
2032 * buffer pool, and check to see if the correct data has been loaded
2033 * into the buffer.
2034 */
2035 buf = GetBufferDescriptor(existing_buf_id);
2036
2037 valid = PinBuffer(buf, strategy);
2038
2039 /* Can release the mapping lock as soon as we've pinned it */
2040 LWLockRelease(newPartitionLock);
2041
2042 *foundPtr = true;
2043
2044 if (!valid)
2045 {
2046 /*
2047 * We can only get here if (a) someone else is still reading in
2048 * the page, (b) a previous read attempt failed, or (c) someone
2049 * called StartReadBuffers() but not yet WaitReadBuffers().
2050 */
2051 *foundPtr = false;
2052 }
2053
2054 return buf;
2055 }
2056
2057 /*
2058 * Didn't find it in the buffer pool. We'll have to initialize a new
2059 * buffer. Remember to unlock the mapping lock while doing the work.
2060 */
2061 LWLockRelease(newPartitionLock);
2062
2063 /*
2064 * Acquire a victim buffer. Somebody else might try to do the same, we
2065 * don't hold any conflicting locks. If so we'll have to undo our work
2066 * later.
2067 */
2068 victim_buffer = GetVictimBuffer(strategy, io_context);
2069 victim_buf_hdr = GetBufferDescriptor(victim_buffer - 1);
2070
2071 /*
2072 * Try to make a hashtable entry for the buffer under its new tag. If
2073 * somebody else inserted another buffer for the tag, we'll release the
2074 * victim buffer we acquired and use the already inserted one.
2075 */
2076 LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
2077 existing_buf_id = BufTableInsert(&newTag, newHash, victim_buf_hdr->buf_id);
2078 if (existing_buf_id >= 0)
2079 {
2080 BufferDesc *existing_buf_hdr;
2081 bool valid;
2082
2083 /*
2084 * Got a collision. Someone has already done what we were about to do.
2085 * We'll just handle this as if it were found in the buffer pool in
2086 * the first place. First, give up the buffer we were planning to
2087 * use.
2088 *
2089 * We could do this after releasing the partition lock, but then we'd
2090 * have to call ResourceOwnerEnlarge() & ReservePrivateRefCountEntry()
2091 * before acquiring the lock, for the rare case of such a collision.
2092 */
2093 UnpinBuffer(victim_buf_hdr);
2094
2095 /*
2096 * The victim buffer we acquired previously is clean and unused, let
2097 * it be found again quickly
2098 */
2099 StrategyFreeBuffer(victim_buf_hdr);
2100
2101 /* remaining code should match code at top of routine */
2102
2103 existing_buf_hdr = GetBufferDescriptor(existing_buf_id);
2104
2105 valid = PinBuffer(existing_buf_hdr, strategy);
2106
2107 /* Can release the mapping lock as soon as we've pinned it */
2108 LWLockRelease(newPartitionLock);
2109
2110 *foundPtr = true;
2111
2112 if (!valid)
2113 {
2114 /*
2115 * We can only get here if (a) someone else is still reading in
2116 * the page, (b) a previous read attempt failed, or (c) someone
2117 * called StartReadBuffers() but not yet WaitReadBuffers().
2118 */
2119 *foundPtr = false;
2120 }
2121
2122 return existing_buf_hdr;
2123 }
2124
2125 /*
2126 * Need to lock the buffer header too in order to change its tag.
2127 */
2128 victim_buf_state = LockBufHdr(victim_buf_hdr);
2129
2130 /* some sanity checks while we hold the buffer header lock */
2131 Assert(BUF_STATE_GET_REFCOUNT(victim_buf_state) == 1);
2132 Assert(!(victim_buf_state & (BM_TAG_VALID | BM_VALID | BM_DIRTY | BM_IO_IN_PROGRESS)));
2133
2134 victim_buf_hdr->tag = newTag;
2135
2136 /*
2137 * Make sure BM_PERMANENT is set for buffers that must be written at every
2138 * checkpoint. Unlogged buffers only need to be written at shutdown
2139 * checkpoints, except for their "init" forks, which need to be treated
2140 * just like permanent relations.
2141 */
2142 victim_buf_state |= BM_TAG_VALID | BUF_USAGECOUNT_ONE;
2143 if (relpersistence == RELPERSISTENCE_PERMANENT || forkNum == INIT_FORKNUM)
2144 victim_buf_state |= BM_PERMANENT;
2145
2146 UnlockBufHdr(victim_buf_hdr, victim_buf_state);
2147
2148 LWLockRelease(newPartitionLock);
2149
2150 /*
2151 * Buffer contents are currently invalid.
2152 */
2153 *foundPtr = false;
2154
2155 return victim_buf_hdr;
2156}
static void InitBufferTag(BufferTag *tag, const RelFileLocator *rlocator, ForkNumber forkNum, BlockNumber blockNum)
#define BM_PERMANENT
Definition: buf_internals.h:77
#define BUF_USAGECOUNT_ONE
Definition: buf_internals.h:54
static LWLock * BufMappingPartitionLock(uint32 hashcode)
int BufTableLookup(BufferTag *tagPtr, uint32 hashcode)
Definition: buf_table.c:90
uint32 BufTableHashCode(BufferTag *tagPtr)
Definition: buf_table.c:78
int BufTableInsert(BufferTag *tagPtr, uint32 hashcode, int buf_id)
Definition: buf_table.c:118
static bool PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy)
Definition: bufmgr.c:3065
static Buffer GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context)
Definition: bufmgr.c:2343
static void ReservePrivateRefCountEntry(void)
Definition: bufmgr.c:256
static void UnpinBuffer(BufferDesc *buf)
Definition: bufmgr.c:3257
void StrategyFreeBuffer(BufferDesc *buf)
Definition: freelist.c:363
bool LWLockAcquire(LWLock *lock, LWLockMode mode)
Definition: lwlock.c:1182
void LWLockRelease(LWLock *lock)
Definition: lwlock.c:1902
@ LW_SHARED
Definition: lwlock.h:115
@ LW_EXCLUSIVE
Definition: lwlock.h:114
@ INIT_FORKNUM
Definition: relpath.h:61
void ResourceOwnerEnlarge(ResourceOwner owner)
Definition: resowner.c:452

References Assert(), BM_DIRTY, BM_IO_IN_PROGRESS, BM_PERMANENT, BM_TAG_VALID, BM_VALID, buf, BufferDesc::buf_id, BUF_STATE_GET_REFCOUNT, BUF_USAGECOUNT_ONE, BufMappingPartitionLock(), BufTableHashCode(), BufTableInsert(), BufTableLookup(), CurrentResourceOwner, GetBufferDescriptor(), GetVictimBuffer(), INIT_FORKNUM, InitBufferTag(), RelFileLocatorBackend::locator, LockBufHdr(), LW_EXCLUSIVE, LW_SHARED, LWLockAcquire(), LWLockRelease(), PinBuffer(), ReservePrivateRefCountEntry(), ResourceOwnerEnlarge(), SMgrRelationData::smgr_rlocator, StrategyFreeBuffer(), BufferDesc::tag, UnlockBufHdr(), and UnpinBuffer().

Referenced by PinBufferForBlock().
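
The overall shape of BufferAlloc() is a double-checked mapping-table pattern: look up under a shared lock; on a miss, acquire a victim buffer without holding the mapping lock, then insert under an exclusive lock and, on a collision, hand the victim back and adopt the concurrently inserted buffer. The following toy program is an editor's sketch of that control flow only, with a flat array and a single mutex standing in for the partitioned buffer-mapping hash table and its LWLocks; every name in it is hypothetical:

#include <pthread.h>
#include <stdio.h>

#define NSLOTS 64

static int  map_block[NSLOTS];      /* 0 = empty slot, else block number */
static int  map_buf[NSLOTS];
static pthread_mutex_t map_lock = PTHREAD_MUTEX_INITIALIZER;

/* caller must hold map_lock */
static int
lookup(int block)
{
    for (int i = 0; i < NSLOTS; i++)
        if (map_block[i] == block)
            return map_buf[i];
    return -1;
}

static int
alloc_buffer(int block, int victim_buf)
{
    int         buf;

    /* fast path: is the block already mapped? (shared lock in bufmgr.c) */
    pthread_mutex_lock(&map_lock);
    buf = lookup(block);
    pthread_mutex_unlock(&map_lock);
    if (buf >= 0)
        return buf;             /* found: pin and return */

    /* slow path: retry under the exclusive lock; someone may have raced us */
    pthread_mutex_lock(&map_lock);
    buf = lookup(block);
    if (buf < 0)
    {
        for (int i = 0; i < NSLOTS; i++)
            if (map_block[i] == 0)
            {
                map_block[i] = block;
                map_buf[i] = victim_buf;
                buf = victim_buf;
                break;
            }
    }
    /* else: collision, give the victim back and use the existing buffer */
    pthread_mutex_unlock(&map_lock);
    return buf;
}

int
main(void)
{
    printf("block 7 -> buffer %d\n", alloc_buffer(7, 3));   /* inserts */
    printf("block 7 -> buffer %d\n", alloc_buffer(7, 5));   /* finds 3 */
    return 0;
}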

◆ BufferGetBlockNumber()

BlockNumber BufferGetBlockNumber ( Buffer  buffer)

Definition at line 4161 of file bufmgr.c.

4162{
4163 BufferDesc *bufHdr;
4164
4165 Assert(BufferIsPinned(buffer));
4166
4167 if (BufferIsLocal(buffer))
4168 bufHdr = GetLocalBufferDescriptor(-buffer - 1);
4169 else
4170 bufHdr = GetBufferDescriptor(buffer - 1);
4171
4172 /* pinned, so OK to read tag without spinlock */
4173 return bufHdr->tag.blockNum;
4174}
#define BufferIsLocal(buffer)
Definition: buf.h:37

References Assert(), buftag::blockNum, PrivateRefCountEntry::buffer, BufferIsLocal, BufferIsPinned, GetBufferDescriptor(), GetLocalBufferDescriptor(), and BufferDesc::tag.

Referenced by _bt_binsrch_insert(), _bt_bottomupdel_pass(), _bt_check_unique(), _bt_checkpage(), _bt_delitems_delete(), _bt_delitems_vacuum(), _bt_doinsert(), _bt_finish_split(), _bt_getroot(), _bt_insert_parent(), _bt_insertonpg(), _bt_mark_page_halfdead(), _bt_moveright(), _bt_newlevel(), _bt_pagedel(), _bt_readpage(), _bt_restore_meta(), _bt_search(), _bt_simpledel_pass(), _bt_split(), _bt_unlink_halfdead_page(), _hash_addovflpage(), _hash_checkpage(), _hash_doinsert(), _hash_first(), _hash_freeovflpage(), _hash_getnewbuf(), _hash_readnext(), _hash_readpage(), _hash_splitbucket(), allocNewBuffer(), AsyncReadBuffers(), BitmapHeapScanNextBlock(), blinsert(), BloomInitMetapage(), brin_doinsert(), brin_doupdate(), brin_getinsertbuffer(), brin_initialize_empty_new_buffer(), brin_page_cleanup(), brin_xlog_insert_update(), brinbuild(), brinGetTupleForHeapBlock(), btvacuumpage(), check_index_page(), CheckReadBuffersOperation(), collect_corrupt_items(), collectMatchBitmap(), createPostingTree(), dataBeginPlaceToPageLeaf(), dataPrepareDownlink(), doPickSplit(), entryPrepareDownlink(), fill_seq_fork_with_data(), ginEntryInsert(), ginFindParents(), ginFinishSplit(), ginPlaceToPage(), ginRedoDeleteListPages(), ginRedoUpdateMetapage(), ginScanToDelete(), gistbufferinginserttuples(), gistbuild(), gistcheckpage(), gistdeletepage(), gistformdownlink(), gistinserttuples(), gistMemorizeAllDownlinks(), gistplacetopage(), gistRelocateBuildBuffersOnSplit(), gistScanPage(), gistvacuumpage(), hash_xlog_add_ovfl_page(), heap_delete(), heap_fetch_next_buffer(), heap_hot_search_buffer(), heap_insert(), heap_multi_insert(), heap_page_is_all_visible(), heap_page_prune_and_freeze(), heap_prepare_pagescan(), heap_update(), heap_xlog_confirm(), heap_xlog_lock(), heapam_scan_analyze_next_block(), heapgettup(), heapgettup_pagemode(), index_compute_xid_horizon_for_tuples(), lazy_scan_heap(), lazy_scan_noprune(), lazy_scan_prune(), lazy_vacuum_heap_rel(), makeSublist(), moveLeafs(), moveRightIfItNeeded(), pgstathashindex(), ReadBufferBI(), RelationAddBlocks(), RelationCopyStorageUsingBuffer(), RelationGetBufferForTuple(), RelationPutHeapTuple(), revmap_get_buffer(), revmap_physical_extend(), ScanSourceDatabasePgClassPage(), spgAddNodeAction(), spgbuild(), spgdoinsert(), SpGistSetLastUsedPage(), spgSplitNodeAction(), spgvacuumpage(), spgWalk(), StartReadBuffersImpl(), startScanEntry(), terminate_brin_buildstate(), vacuumLeafPage(), verify_heapam(), visibilitymap_clear(), visibilitymap_get_status(), visibilitymap_pin(), visibilitymap_pin_ok(), and visibilitymap_set().

◆ BufferGetLSNAtomic()

XLogRecPtr BufferGetLSNAtomic ( Buffer  buffer)

Definition at line 4423 of file bufmgr.c.

4424{
4425 char *page = BufferGetPage(buffer);
4426 BufferDesc *bufHdr;
4427 XLogRecPtr lsn;
4428 uint32 buf_state;
4429
4430 /*
4431 * If we don't need locking for correctness, fastpath out.
4432 */
4433 if (!XLogHintBitIsNeeded() || BufferIsLocal(buffer))
4434 return PageGetLSN(page);
4435
4436 /* Make sure we've got a real buffer, and that we hold a pin on it. */
4437 Assert(BufferIsValid(buffer));
4438 Assert(BufferIsPinned(buffer));
4439
4440 bufHdr = GetBufferDescriptor(buffer - 1);
4441 buf_state = LockBufHdr(bufHdr);
4442 lsn = PageGetLSN(page);
4443 UnlockBufHdr(bufHdr, buf_state);
4444
4445 return lsn;
4446}
static Page BufferGetPage(Buffer buffer)
Definition: bufmgr.h:414
static XLogRecPtr PageGetLSN(const PageData *page)
Definition: bufpage.h:386
#define XLogHintBitIsNeeded()
Definition: xlog.h:120
uint64 XLogRecPtr
Definition: xlogdefs.h:21

References Assert(), PrivateRefCountEntry::buffer, BufferGetPage(), BufferIsLocal, BufferIsPinned, BufferIsValid(), GetBufferDescriptor(), LockBufHdr(), PageGetLSN(), UnlockBufHdr(), and XLogHintBitIsNeeded.

Referenced by _bt_killitems(), _bt_readpage(), gin_check_parent_keys_consistency(), gistdoinsert(), gistFindPath(), gistkillitems(), gistScanPage(), SetHintBits(), and XLogSaveBufferForHint().

◆ BufferGetTag()

void BufferGetTag ( Buffer  buffer,
RelFileLocator rlocator,
ForkNumber forknum,
BlockNumber blknum 
)

Definition at line 4182 of file bufmgr.c.

4184{
4185 BufferDesc *bufHdr;
4186
4187 /* Do the same checks as BufferGetBlockNumber. */
4188 Assert(BufferIsPinned(buffer));
4189
4190 if (BufferIsLocal(buffer))
4191 bufHdr = GetLocalBufferDescriptor(-buffer - 1);
4192 else
4193 bufHdr = GetBufferDescriptor(buffer - 1);
4194
4195 /* pinned, so OK to read tag without spinlock */
4196 *rlocator = BufTagGetRelFileLocator(&bufHdr->tag);
4197 *forknum = BufTagGetForkNum(&bufHdr->tag);
4198 *blknum = bufHdr->tag.blockNum;
4199}

References Assert(), buftag::blockNum, PrivateRefCountEntry::buffer, BufferIsLocal, BufferIsPinned, BufTagGetForkNum(), BufTagGetRelFileLocator(), GetBufferDescriptor(), GetLocalBufferDescriptor(), and BufferDesc::tag.

Referenced by fsm_search_avail(), ginRedoInsertEntry(), heap_inplace_update_and_unlock(), log_newpage_buffer(), ResolveCminCmaxDuringDecoding(), XLogRegisterBuffer(), and XLogSaveBufferForHint().

◆ BufferIsDirty()

bool BufferIsDirty ( Buffer  buffer)

Definition at line 2912 of file bufmgr.c.

2913{
2914 BufferDesc *bufHdr;
2915
2916 Assert(BufferIsPinned(buffer));
2917
2918 if (BufferIsLocal(buffer))
2919 {
2920 int bufid = -buffer - 1;
2921
2922 bufHdr = GetLocalBufferDescriptor(bufid);
2923 /* Content locks are not maintained for local buffers. */
2924 }
2925 else
2926 {
2927 bufHdr = GetBufferDescriptor(buffer - 1);
2928 Assert(LWLockHeldByMeInMode(BufferDescriptorGetContentLock(bufHdr),
2929 LW_EXCLUSIVE));
2930 }
2931
2932 return pg_atomic_read_u32(&bufHdr->state) & BM_DIRTY;
2933}
bool LWLockHeldByMeInMode(LWLock *lock, LWLockMode mode)
Definition: lwlock.c:2014

References Assert(), BM_DIRTY, PrivateRefCountEntry::buffer, BufferDescriptorGetContentLock(), BufferIsLocal, BufferIsPinned, GetBufferDescriptor(), GetLocalBufferDescriptor(), LW_EXCLUSIVE, LWLockHeldByMeInMode(), pg_atomic_read_u32(), and BufferDesc::state.

Referenced by XLogRegisterBuffer().

◆ BufferIsExclusiveLocked()

bool BufferIsExclusiveLocked ( Buffer  buffer)

Definition at line 2884 of file bufmgr.c.

2885{
2886 BufferDesc *bufHdr;
2887
2888 Assert(BufferIsPinned(buffer));
2889
2890 if (BufferIsLocal(buffer))
2891 {
2892 /* Content locks are not maintained for local buffers. */
2893 return true;
2894 }
2895 else
2896 {
2897 bufHdr = GetBufferDescriptor(buffer - 1);
2898 return LWLockHeldByMeInMode(BufferDescriptorGetContentLock(bufHdr),
2899 LW_EXCLUSIVE);
2900 }
2901}

References Assert(), PrivateRefCountEntry::buffer, BufferDescriptorGetContentLock(), BufferIsLocal, BufferIsPinned, GetBufferDescriptor(), LW_EXCLUSIVE, and LWLockHeldByMeInMode().

Referenced by XLogRegisterBuffer().

◆ BufferIsPermanent()

bool BufferIsPermanent ( Buffer  buffer)

Definition at line 4393 of file bufmgr.c.

4394{
4395 BufferDesc *bufHdr;
4396
4397 /* Local buffers are used only for temp relations. */
4398 if (BufferIsLocal(buffer))
4399 return false;
4400
4401 /* Make sure we've got a real buffer, and that we hold a pin on it. */
4402 Assert(BufferIsValid(buffer));
4403 Assert(BufferIsPinned(buffer));
4404
4405 /*
4406 * BM_PERMANENT can't be changed while we hold a pin on the buffer, so we
4407 * need not bother with the buffer header spinlock. Even if someone else
4408 * changes the buffer header state while we're doing this, the state is
4409 * changed atomically, so we'll read the old value or the new value, but
4410 * not random garbage.
4411 */
4412 bufHdr = GetBufferDescriptor(buffer - 1);
4413 return (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT) != 0;
4414}

References Assert(), BM_PERMANENT, PrivateRefCountEntry::buffer, BufferIsLocal, BufferIsPinned, BufferIsValid(), GetBufferDescriptor(), pg_atomic_read_u32(), and BufferDesc::state.

Referenced by SetHintBits().

◆ BufferSync()

static void BufferSync ( int  flags)
static

Definition at line 3342 of file bufmgr.c.

3343{
3344 uint32 buf_state;
3345 int buf_id;
3346 int num_to_scan;
3347 int num_spaces;
3348 int num_processed;
3349 int num_written;
3350 CkptTsStatus *per_ts_stat = NULL;
3351 Oid last_tsid;
3352 binaryheap *ts_heap;
3353 int i;
3354 int mask = BM_DIRTY;
3355 WritebackContext wb_context;
3356
3357 /*
3358 * Unless this is a shutdown checkpoint or we have been explicitly told,
3359 * we write only permanent, dirty buffers. But at shutdown or end of
3360 * recovery, we write all dirty buffers.
3361 */
3362 if (flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY |
3363 CHECKPOINT_FLUSH_ALL))
3364 mask |= BM_PERMANENT;
3365
3366 /*
3367 * Loop over all buffers, and mark the ones that need to be written with
3368 * BM_CHECKPOINT_NEEDED. Count them as we go (num_to_scan), so that we
3369 * can estimate how much work needs to be done.
3370 *
3371 * This allows us to write only those pages that were dirty when the
3372 * checkpoint began, and not those that get dirtied while it proceeds.
3373 * Whenever a page with BM_CHECKPOINT_NEEDED is written out, either by us
3374 * later in this function, or by normal backends or the bgwriter cleaning
3375 * scan, the flag is cleared. Any buffer dirtied after this point won't
3376 * have the flag set.
3377 *
3378 * Note that if we fail to write some buffer, we may leave buffers with
3379 * BM_CHECKPOINT_NEEDED still set. This is OK since any such buffer would
3380 * certainly need to be written for the next checkpoint attempt, too.
3381 */
3382 num_to_scan = 0;
3383 for (buf_id = 0; buf_id < NBuffers; buf_id++)
3384 {
3385 BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
3386
3387 /*
3388 * Header spinlock is enough to examine BM_DIRTY, see comment in
3389 * SyncOneBuffer.
3390 */
3391 buf_state = LockBufHdr(bufHdr);
3392
3393 if ((buf_state & mask) == mask)
3394 {
3395 CkptSortItem *item;
3396
3397 buf_state |= BM_CHECKPOINT_NEEDED;
3398
3399 item = &CkptBufferIds[num_to_scan++];
3400 item->buf_id = buf_id;
3401 item->tsId = bufHdr->tag.spcOid;
3402 item->relNumber = BufTagGetRelNumber(&bufHdr->tag);
3403 item->forkNum = BufTagGetForkNum(&bufHdr->tag);
3404 item->blockNum = bufHdr->tag.blockNum;
3405 }
3406
3407 UnlockBufHdr(bufHdr, buf_state);
3408
3409 /* Check for barrier events in case NBuffers is large. */
3410 if (ProcSignalBarrierPending)
3411 ProcessProcSignalBarrier();
3412 }
3413
3414 if (num_to_scan == 0)
3415 return; /* nothing to do */
3416
3417 WritebackContextInit(&wb_context, &checkpoint_flush_after);
3418
3419 TRACE_POSTGRESQL_BUFFER_SYNC_START(NBuffers, num_to_scan);
3420
3421 /*
3422 * Sort buffers that need to be written to reduce the likelihood of random
3423 * IO. The sorting is also important for the implementation of balancing
3424 * writes between tablespaces. Without balancing writes we'd potentially
3425 * end up writing to the tablespaces one-by-one; possibly overloading the
3426 * underlying system.
3427 */
3428 sort_checkpoint_bufferids(CkptBufferIds, num_to_scan);
3429
3430 num_spaces = 0;
3431
3432 /*
3433 * Allocate progress status for each tablespace with buffers that need to
3434 * be flushed. This requires the to-be-flushed array to be sorted.
3435 */
3436 last_tsid = InvalidOid;
3437 for (i = 0; i < num_to_scan; i++)
3438 {
3439 CkptTsStatus *s;
3440 Oid cur_tsid;
3441
3442 cur_tsid = CkptBufferIds[i].tsId;
3443
3444 /*
3445 * Grow array of per-tablespace status structs, every time a new
3446 * tablespace is found.
3447 */
3448 if (last_tsid == InvalidOid || last_tsid != cur_tsid)
3449 {
3450 Size sz;
3451
3452 num_spaces++;
3453
3454 /*
3455 * Not worth adding grow-by-power-of-2 logic here - even with a
3456 * few hundred tablespaces this should be fine.
3457 */
3458 sz = sizeof(CkptTsStatus) * num_spaces;
3459
3460 if (per_ts_stat == NULL)
3461 per_ts_stat = (CkptTsStatus *) palloc(sz);
3462 else
3463 per_ts_stat = (CkptTsStatus *) repalloc(per_ts_stat, sz);
3464
3465 s = &per_ts_stat[num_spaces - 1];
3466 memset(s, 0, sizeof(*s));
3467 s->tsId = cur_tsid;
3468
3469 /*
3470 * The first buffer in this tablespace. As CkptBufferIds is sorted
3471 * by tablespace all (s->num_to_scan) buffers in this tablespace
3472 * will follow afterwards.
3473 */
3474 s->index = i;
3475
3476 /*
3477 * progress_slice will be determined once we know how many buffers
3478 * are in each tablespace, i.e. after this loop.
3479 */
3480
3481 last_tsid = cur_tsid;
3482 }
3483 else
3484 {
3485 s = &per_ts_stat[num_spaces - 1];
3486 }
3487
3488 s->num_to_scan++;
3489
3490 /* Check for barrier events. */
3491 if (ProcSignalBarrierPending)
3492 ProcessProcSignalBarrier();
3493 }
3494
3495 Assert(num_spaces > 0);
3496
3497 /*
3498 * Build a min-heap over the write-progress in the individual tablespaces,
3499 * and compute how large a portion of the total progress a single
3500 * processed buffer is.
3501 */
3502 ts_heap = binaryheap_allocate(num_spaces,
3503 ts_ckpt_progress_comparator,
3504 NULL);
3505
3506 for (i = 0; i < num_spaces; i++)
3507 {
3508 CkptTsStatus *ts_stat = &per_ts_stat[i];
3509
3510 ts_stat->progress_slice = (float8) num_to_scan / ts_stat->num_to_scan;
3511
3512 binaryheap_add_unordered(ts_heap, PointerGetDatum(ts_stat));
3513 }
3514
3515 binaryheap_build(ts_heap);
3516
3517 /*
3518 * Iterate through to-be-checkpointed buffers and write the ones (still)
3519 * marked with BM_CHECKPOINT_NEEDED. The writes are balanced between
3520 * tablespaces; otherwise the sorting would lead to only one tablespace
3521 * receiving writes at a time, making inefficient use of the hardware.
3522 */
3523 num_processed = 0;
3524 num_written = 0;
3525 while (!binaryheap_empty(ts_heap))
3526 {
3527 BufferDesc *bufHdr = NULL;
3528 CkptTsStatus *ts_stat = (CkptTsStatus *)
3529 DatumGetPointer(binaryheap_first(ts_heap));
3530
3531 buf_id = CkptBufferIds[ts_stat->index].buf_id;
3532 Assert(buf_id != -1);
3533
3534 bufHdr = GetBufferDescriptor(buf_id);
3535
3536 num_processed++;
3537
3538 /*
3539 * We don't need to acquire the lock here, because we're only looking
3540 * at a single bit. It's possible that someone else writes the buffer
3541 * and clears the flag right after we check, but that doesn't matter
3542 * since SyncOneBuffer will then do nothing. However, there is a
3543 * further race condition: it's conceivable that between the time we
3544 * examine the bit here and the time SyncOneBuffer acquires the lock,
3545 * someone else not only wrote the buffer but replaced it with another
3546 * page and dirtied it. In that improbable case, SyncOneBuffer will
3547 * write the buffer though we didn't need to. It doesn't seem worth
3548 * guarding against this, though.
3549 */
3550 if (pg_atomic_read_u32(&bufHdr->state) & BM_CHECKPOINT_NEEDED)
3551 {
3552 if (SyncOneBuffer(buf_id, false, &wb_context) & BUF_WRITTEN)
3553 {
3554 TRACE_POSTGRESQL_BUFFER_SYNC_WRITTEN(buf_id);
3555 PendingCheckpointerStats.buffers_written++;
3556 num_written++;
3557 }
3558 }
3559
3560 /*
3561 * Measure progress independent of actually having to flush the buffer
3562 * - otherwise writing becomes unbalanced.
3563 */
3564 ts_stat->progress += ts_stat->progress_slice;
3565 ts_stat->num_scanned++;
3566 ts_stat->index++;
3567
3568 /* Have all the buffers from the tablespace been processed? */
3569 if (ts_stat->num_scanned == ts_stat->num_to_scan)
3570 {
3571 binaryheap_remove_first(ts_heap);
3572 }
3573 else
3574 {
3575 /* update heap with the new progress */
3576 binaryheap_replace_first(ts_heap, PointerGetDatum(ts_stat));
3577 }
3578
3579 /*
3580 * Sleep to throttle our I/O rate.
3581 *
3582 * (This will check for barrier events even if it doesn't sleep.)
3583 */
3584 CheckpointWriteDelay(flags, (double) num_processed / num_to_scan);
3585 }
3586
3587 /*
3588 * Issue all pending flushes. Only checkpointer calls BufferSync(), so
3589 * IOContext will always be IOCONTEXT_NORMAL.
3590 */
3591 IssuePendingWritebacks(&wb_context, IOCONTEXT_NORMAL);
3592
3593 pfree(per_ts_stat);
3594 per_ts_stat = NULL;
3595 binaryheap_free(ts_heap);
3596
3597 /*
3598 * Update checkpoint statistics. As noted above, this doesn't include
3599 * buffers written by other backends or bgwriter scan.
3600 */
3601 CheckpointStats.ckpt_bufs_written += num_written;
3602
3603 TRACE_POSTGRESQL_BUFFER_SYNC_DONE(NBuffers, num_written, num_to_scan);
3604}
void binaryheap_build(binaryheap *heap)
Definition: binaryheap.c:138
void binaryheap_replace_first(binaryheap *heap, bh_node_type d)
Definition: binaryheap.c:255
bh_node_type binaryheap_first(binaryheap *heap)
Definition: binaryheap.c:177
bh_node_type binaryheap_remove_first(binaryheap *heap)
Definition: binaryheap.c:192
void binaryheap_free(binaryheap *heap)
Definition: binaryheap.c:75
void binaryheap_add_unordered(binaryheap *heap, bh_node_type d)
Definition: binaryheap.c:116
binaryheap * binaryheap_allocate(int capacity, binaryheap_comparator compare, void *arg)
Definition: binaryheap.c:39
#define binaryheap_empty(h)
Definition: binaryheap.h:65
CkptSortItem * CkptBufferIds
Definition: buf_init.c:25
static RelFileNumber BufTagGetRelNumber(const BufferTag *tag)
#define BM_CHECKPOINT_NEEDED
Definition: buf_internals.h:76
static int ts_ckpt_progress_comparator(Datum a, Datum b, void *arg)
Definition: bufmgr.c:6308
int checkpoint_flush_after
Definition: bufmgr.c:175
void WritebackContextInit(WritebackContext *context, int *max_pending)
Definition: bufmgr.c:6331
void IssuePendingWritebacks(WritebackContext *wb_context, IOContext io_context)
Definition: bufmgr.c:6393
struct CkptTsStatus CkptTsStatus
double float8
Definition: c.h:601
size_t Size
Definition: c.h:576
void CheckpointWriteDelay(int flags, double progress)
Definition: checkpointer.c:773
volatile sig_atomic_t ProcSignalBarrierPending
Definition: globals.c:40
void * repalloc(void *pointer, Size size)
Definition: mcxt.c:2167
void pfree(void *pointer)
Definition: mcxt.c:2147
void * palloc(Size size)
Definition: mcxt.c:1940
PgStat_CheckpointerStats PendingCheckpointerStats
static Datum PointerGetDatum(const void *X)
Definition: postgres.h:327
static Pointer DatumGetPointer(Datum X)
Definition: postgres.h:317
#define InvalidOid
Definition: postgres_ext.h:35
unsigned int Oid
Definition: postgres_ext.h:30
void ProcessProcSignalBarrier(void)
Definition: procsignal.c:498
int ckpt_bufs_written
Definition: xlog.h:167
ForkNumber forkNum
RelFileNumber relNumber
BlockNumber blockNum
float8 progress_slice
Definition: bufmgr.c:116
int index
Definition: bufmgr.c:124
int num_scanned
Definition: bufmgr.c:121
float8 progress
Definition: bufmgr.c:115
int num_to_scan
Definition: bufmgr.c:119
Oid tsId
Definition: bufmgr.c:106
PgStat_Counter buffers_written
Definition: pgstat.h:263
CheckpointStatsData CheckpointStats
Definition: xlog.c:209
#define CHECKPOINT_END_OF_RECOVERY
Definition: xlog.h:140
#define CHECKPOINT_FLUSH_ALL
Definition: xlog.h:143
#define CHECKPOINT_IS_SHUTDOWN
Definition: xlog.h:139

References Assert(), binaryheap_add_unordered(), binaryheap_allocate(), binaryheap_build(), binaryheap_empty, binaryheap_first(), binaryheap_free(), binaryheap_remove_first(), binaryheap_replace_first(), buftag::blockNum, CkptSortItem::blockNum, BM_CHECKPOINT_NEEDED, BM_DIRTY, BM_PERMANENT, CkptSortItem::buf_id, BUF_WRITTEN, PgStat_CheckpointerStats::buffers_written, BufTagGetForkNum(), BufTagGetRelNumber(), CHECKPOINT_END_OF_RECOVERY, checkpoint_flush_after, CHECKPOINT_FLUSH_ALL, CHECKPOINT_IS_SHUTDOWN, CheckpointStats, CheckpointWriteDelay(), CheckpointStatsData::ckpt_bufs_written, CkptBufferIds, DatumGetPointer(), CkptSortItem::forkNum, GetBufferDescriptor(), i, CkptTsStatus::index, InvalidOid, IOCONTEXT_NORMAL, IssuePendingWritebacks(), LockBufHdr(), NBuffers, CkptTsStatus::num_scanned, CkptTsStatus::num_to_scan, palloc(), PendingCheckpointerStats, pfree(), pg_atomic_read_u32(), PointerGetDatum(), ProcessProcSignalBarrier(), ProcSignalBarrierPending, CkptTsStatus::progress, CkptTsStatus::progress_slice, CkptSortItem::relNumber, repalloc(), buftag::spcOid, BufferDesc::state, SyncOneBuffer(), BufferDesc::tag, ts_ckpt_progress_comparator(), CkptTsStatus::tsId, CkptSortItem::tsId, UnlockBufHdr(), and WritebackContextInit().

Referenced by CheckPointBuffers().
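
The write balancing rests on simple arithmetic: a tablespace holding n of the T to-be-written buffers advances its progress by progress_slice = T / n per written buffer, so every tablespace reaches the common total T at the same time, and always servicing the least-progressed tablespace (the heap minimum) interleaves writes across them. Below is a standalone sketch of that arithmetic (editor's illustration; a linear minimum search stands in for the binary heap):

#include <stdio.h>

int
main(void)
{
    int         per_ts[3] = {600, 300, 100};    /* dirty buffers per tablespace */
    int         remaining[3] = {600, 300, 100};
    int         num_to_scan = 1000;             /* 600 + 300 + 100 */
    double      progress[3] = {0.0, 0.0, 0.0};

    for (int step = 0; step < 12; step++)       /* first twelve writes */
    {
        int         best = -1;

        /* binaryheap_first() in BufferSync(): least-progressed tablespace */
        for (int i = 0; i < 3; i++)
            if (remaining[i] > 0 && (best < 0 || progress[i] < progress[best]))
                best = i;

        progress[best] += (double) num_to_scan / per_ts[best];  /* progress_slice */
        remaining[best]--;
        printf("%d ", best);
    }
    putchar('\n');              /* prints an interleaved sequence like 0 1 2 0 0 1 ... */
    return 0;
}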

◆ buffertag_comparator()

static int buffertag_comparator ( const BufferTag ba,
const BufferTag bb 
)
inlinestatic

Definition at line 6243 of file bufmgr.c.

6244{
6245 int ret;
6246 RelFileLocator rlocatora;
6247 RelFileLocator rlocatorb;
6248
6249 rlocatora = BufTagGetRelFileLocator(ba);
6250 rlocatorb = BufTagGetRelFileLocator(bb);
6251
6252 ret = rlocator_comparator(&rlocatora, &rlocatorb);
6253
6254 if (ret != 0)
6255 return ret;
6256
6257 if (BufTagGetForkNum(ba) < BufTagGetForkNum(bb))
6258 return -1;
6259 if (BufTagGetForkNum(ba) > BufTagGetForkNum(bb))
6260 return 1;
6261
6262 if (ba->blockNum < bb->blockNum)
6263 return -1;
6264 if (ba->blockNum > bb->blockNum)
6265 return 1;
6266
6267 return 0;
6268}
static int rlocator_comparator(const void *p1, const void *p2)
Definition: bufmgr.c:6162

References buftag::blockNum, BufTagGetForkNum(), BufTagGetRelFileLocator(), and rlocator_comparator().

◆ CheckBufferIsPinnedOnce()

void CheckBufferIsPinnedOnce ( Buffer  buffer)

Definition at line 5584 of file bufmgr.c.

5585{
5586 if (BufferIsLocal(buffer))
5587 {
5588 if (LocalRefCount[-buffer - 1] != 1)
5589 elog(ERROR, "incorrect local pin count: %d",
5590 LocalRefCount[-buffer - 1]);
5591 }
5592 else
5593 {
5594 if (GetPrivateRefCount(buffer) != 1)
5595 elog(ERROR, "incorrect local pin count: %d",
5596 GetPrivateRefCount(buffer));
5597 }
5598}
#define ERROR
Definition: elog.h:39

References PrivateRefCountEntry::buffer, BufferIsLocal, elog, ERROR, GetPrivateRefCount(), and LocalRefCount.

Referenced by GetVictimBuffer(), lazy_scan_heap(), and LockBufferForCleanup().

◆ CheckForBufferLeaks()

static void CheckForBufferLeaks ( void  )
static

Definition at line 4058 of file bufmgr.c.

4059{
4060#ifdef USE_ASSERT_CHECKING
4061 int RefCountErrors = 0;
4062 PrivateRefCountEntry *res;
4063 int i;
4064 char *s;
4065
4066 /* check the array */
4067 for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
4068 {
4069 res = &PrivateRefCountArray[i];
4070
4071 if (res->buffer != InvalidBuffer)
4072 {
4073 s = DebugPrintBufferRefcount(res->buffer);
4074 elog(WARNING, "buffer refcount leak: %s", s);
4075 pfree(s);
4076
4077 RefCountErrors++;
4078 }
4079 }
4080
4081 /* if necessary search the hash */
4082 if (PrivateRefCountOverflowed)
4083 {
4084 HASH_SEQ_STATUS hstat;
4085
4086 hash_seq_init(&hstat, PrivateRefCountHash);
4087 while ((res = (PrivateRefCountEntry *) hash_seq_search(&hstat)) != NULL)
4088 {
4089 s = DebugPrintBufferRefcount(res->buffer);
4090 elog(WARNING, "buffer refcount leak: %s", s);
4091 pfree(s);
4092 RefCountErrors++;
4093 }
4094 }
4095
4096 Assert(RefCountErrors == 0);
4097#endif
4098}
#define InvalidBuffer
Definition: buf.h:25
char * DebugPrintBufferRefcount(Buffer buffer)
Definition: bufmgr.c:4104
#define REFCOUNT_ARRAY_ENTRIES
Definition: bufmgr.c:97
static struct PrivateRefCountEntry PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES]
Definition: bufmgr.c:212
static HTAB * PrivateRefCountHash
Definition: bufmgr.c:213
void * hash_seq_search(HASH_SEQ_STATUS *status)
Definition: dynahash.c:1420
void hash_seq_init(HASH_SEQ_STATUS *status, HTAB *hashp)
Definition: dynahash.c:1385

References Assert(), PrivateRefCountEntry::buffer, DebugPrintBufferRefcount(), elog, hash_seq_init(), hash_seq_search(), i, InvalidBuffer, pfree(), PrivateRefCountArray, PrivateRefCountHash, PrivateRefCountOverflowed, REFCOUNT_ARRAY_ENTRIES, and WARNING.

Referenced by AtEOXact_Buffers(), and AtProcExit_Buffers().

◆ CheckPointBuffers()

void CheckPointBuffers ( int  flags)

Definition at line 4147 of file bufmgr.c.

4148{
4149 BufferSync(flags);
4150}
static void BufferSync(int flags)
Definition: bufmgr.c:3342

References BufferSync().

Referenced by CheckPointGuts().

◆ CheckReadBuffersOperation()

static void CheckReadBuffersOperation ( ReadBuffersOperation operation,
bool  is_complete 
)
static

Definition at line 1525 of file bufmgr.c.

1526{
1527#ifdef USE_ASSERT_CHECKING
1528 Assert(operation->nblocks_done <= operation->nblocks);
1529 Assert(!is_complete || operation->nblocks == operation->nblocks_done);
1530
1531 for (int i = 0; i < operation->nblocks; i++)
1532 {
1533 Buffer buffer = operation->buffers[i];
1534 BufferDesc *buf_hdr = BufferIsLocal(buffer) ?
1535 GetLocalBufferDescriptor(-buffer - 1) :
1536 GetBufferDescriptor(buffer - 1);
1537
1538 Assert(BufferGetBlockNumber(buffer) == operation->blocknum + i);
1539 Assert(pg_atomic_read_u32(&buf_hdr->state) & BM_TAG_VALID);
1540
1541 if (i < operation->nblocks_done)
1542 Assert(pg_atomic_read_u32(&buf_hdr->state) & BM_VALID);
1543 }
1544#endif
1545}

References Assert(), ReadBuffersOperation::blocknum, BM_TAG_VALID, BM_VALID, PrivateRefCountEntry::buffer, BufferGetBlockNumber(), BufferIsLocal, ReadBuffersOperation::buffers, GetBufferDescriptor(), GetLocalBufferDescriptor(), i, ReadBuffersOperation::nblocks, ReadBuffersOperation::nblocks_done, pg_atomic_read_u32(), and BufferDesc::state.

Referenced by StartReadBuffersImpl(), and WaitReadBuffers().

◆ ckpt_buforder_comparator()

static int ckpt_buforder_comparator ( const CkptSortItem a,
const CkptSortItem b 
)
inlinestatic

Definition at line 6277 of file bufmgr.c.

6278{
6279 /* compare tablespace */
6280 if (a->tsId < b->tsId)
6281 return -1;
6282 else if (a->tsId > b->tsId)
6283 return 1;
6284 /* compare relation */
6285 if (a->relNumber < b->relNumber)
6286 return -1;
6287 else if (a->relNumber > b->relNumber)
6288 return 1;
6289 /* compare fork */
6290 else if (a->forkNum < b->forkNum)
6291 return -1;
6292 else if (a->forkNum > b->forkNum)
6293 return 1;
6294 /* compare block number */
6295 else if (a->blockNum < b->blockNum)
6296 return -1;
6297 else if (a->blockNum > b->blockNum)
6298 return 1;
6299 /* equal page IDs are unlikely, but not impossible */
6300 return 0;
6301}
int b
Definition: isn.c:74
int a
Definition: isn.c:73

References a, and b.
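
This four-key ordering (tablespace, relation, fork, block) is what allows BufferSync() both to group buffers by tablespace for write balancing and to issue mostly sequential writes within each relation. An editor's sketch of the same ordering expressed as a qsort(3) comparator over a hypothetical flat struct:

#include <stdio.h>
#include <stdlib.h>

typedef struct
{
    unsigned    tsId;
    unsigned    relNumber;
    unsigned    forkNum;
    unsigned    blockNum;
} Item;

static int
cmp(const void *pa, const void *pb)
{
    const Item *a = pa;
    const Item *b = pb;

    if (a->tsId != b->tsId)
        return a->tsId < b->tsId ? -1 : 1;
    if (a->relNumber != b->relNumber)
        return a->relNumber < b->relNumber ? -1 : 1;
    if (a->forkNum != b->forkNum)
        return a->forkNum < b->forkNum ? -1 : 1;
    if (a->blockNum != b->blockNum)
        return a->blockNum < b->blockNum ? -1 : 1;
    return 0;
}

int
main(void)
{
    Item        items[] = {{2, 9, 0, 5}, {1, 9, 0, 7}, {1, 9, 0, 2}};

    qsort(items, 3, sizeof(Item), cmp);
    for (int i = 0; i < 3; i++)
        printf("ts=%u block=%u\n", items[i].tsId, items[i].blockNum);
    return 0;
}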

◆ ConditionalLockBuffer()

bool ConditionalLockBuffer ( Buffer  buffer )

Definition at line 5563 of file bufmgr.c.

◆ ConditionalLockBufferForCleanup()

bool ConditionalLockBufferForCleanup ( Buffer  buffer)

Definition at line 5785 of file bufmgr.c.

5786{
5787 BufferDesc *bufHdr;
5788 uint32 buf_state,
5789 refcount;
5790
5791 Assert(BufferIsValid(buffer));
5792
5793 /* see AIO related comment in LockBufferForCleanup() */
5794
5795 if (BufferIsLocal(buffer))
5796 {
5797 refcount = LocalRefCount[-buffer - 1];
5798 /* There should be exactly one pin */
5799 Assert(refcount > 0);
5800 if (refcount != 1)
5801 return false;
5802 /* Nobody else to wait for */
5803 return true;
5804 }
5805
5806 /* There should be exactly one local pin */
5807 refcount = GetPrivateRefCount(buffer);
5808 Assert(refcount);
5809 if (refcount != 1)
5810 return false;
5811
5812 /* Try to acquire lock */
5813 if (!ConditionalLockBuffer(buffer))
5814 return false;
5815
5816 bufHdr = GetBufferDescriptor(buffer - 1);
5817 buf_state = LockBufHdr(bufHdr);
5818 refcount = BUF_STATE_GET_REFCOUNT(buf_state);
5819
5820 Assert(refcount > 0);
5821 if (refcount == 1)
5822 {
5823 /* Successfully acquired exclusive lock with pincount 1 */
5824 UnlockBufHdr(bufHdr, buf_state);
5825 return true;
5826 }
5827
5828 /* Failed, so release the lock */
5829 UnlockBufHdr(bufHdr, buf_state);
5830 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
5831 return false;
5832}
bool ConditionalLockBuffer(Buffer buffer)
Definition: bufmgr.c:5563
void LockBuffer(Buffer buffer, int mode)
Definition: bufmgr.c:5537
#define BUFFER_LOCK_UNLOCK
Definition: bufmgr.h:196

References Assert(), BUF_STATE_GET_REFCOUNT, PrivateRefCountEntry::buffer, BUFFER_LOCK_UNLOCK, BufferIsLocal, BufferIsValid(), ConditionalLockBuffer(), GetBufferDescriptor(), GetPrivateRefCount(), LocalRefCount, LockBuffer(), LockBufHdr(), PrivateRefCountEntry::refcount, and UnlockBufHdr().

Referenced by _hash_finish_split(), _hash_getbuf_with_condlock_cleanup(), heap_page_prune_opt(), and lazy_scan_heap().
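
Callers generally treat a false result as "skip this page for now" rather than as an error. Below is an editor's sketch of that calling pattern, loosely modeled on callers such as lazy_scan_heap(); the function name is hypothetical and the caller is assumed to already hold a pin on buf:

#include "postgres.h"

#include "storage/bufmgr.h"

/* editor's sketch: opportunistic page cleanup, skipping contended pages */
static void
cleanup_page_if_possible(Buffer buf)
{
    /* caller already holds a pin on buf */
    if (!ConditionalLockBufferForCleanup(buf))
    {
        ReleaseBuffer(buf);     /* contended: skip this page, retry later */
        return;
    }

    /* pincount is 1 and we hold the exclusive lock: cleanup is safe */
    /* ... page-level work here ... */

    UnlockReleaseBuffer(buf);
}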

◆ CreateAndCopyRelationData()

void CreateAndCopyRelationData ( RelFileLocator  src_rlocator,
RelFileLocator  dst_rlocator,
bool  permanent 
)

Definition at line 5177 of file bufmgr.c.

5179{
5180 char relpersistence;
5181 SMgrRelation src_rel;
5182 SMgrRelation dst_rel;
5183
5184 /* Set the relpersistence. */
5185 relpersistence = permanent ?
5186 RELPERSISTENCE_PERMANENT : RELPERSISTENCE_UNLOGGED;
5187
5188 src_rel = smgropen(src_rlocator, INVALID_PROC_NUMBER);
5189 dst_rel = smgropen(dst_rlocator, INVALID_PROC_NUMBER);
5190
5191 /*
5192 * Create and copy all forks of the relation. During create database we
5193 * have a separate cleanup mechanism which deletes complete database
5194 * directory. Therefore, each individual relation doesn't need to be
5195 * registered for cleanup.
5196 */
5197 RelationCreateStorage(dst_rlocator, relpersistence, false);
5198
5199 /* copy main fork. */
5200 RelationCopyStorageUsingBuffer(src_rlocator, dst_rlocator, MAIN_FORKNUM,
5201 permanent);
5202
5203 /* copy those extra forks that exist */
5204 for (ForkNumber forkNum = MAIN_FORKNUM + 1;
5205 forkNum <= MAX_FORKNUM; forkNum++)
5206 {
5207 if (smgrexists(src_rel, forkNum))
5208 {
5209 smgrcreate(dst_rel, forkNum, false);
5210
5211 /*
5212 * WAL log creation if the relation is persistent, or this is the
5213 * init fork of an unlogged relation.
5214 */
5215 if (permanent || forkNum == INIT_FORKNUM)
5216 log_smgrcreate(&dst_rlocator, forkNum);
5217
5218 /* Copy a fork's data, block by block. */
5219 RelationCopyStorageUsingBuffer(src_rlocator, dst_rlocator, forkNum,
5220 permanent);
5221 }
5222 }
5223}
static void RelationCopyStorageUsingBuffer(RelFileLocator srclocator, RelFileLocator dstlocator, ForkNumber forkNum, bool permanent)
Definition: bufmgr.c:5063
@ MAIN_FORKNUM
Definition: relpath.h:58
#define MAX_FORKNUM
Definition: relpath.h:70
SMgrRelation smgropen(RelFileLocator rlocator, ProcNumber backend)
Definition: smgr.c:240
void smgrcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo)
Definition: smgr.c:481
bool smgrexists(SMgrRelation reln, ForkNumber forknum)
Definition: smgr.c:462
SMgrRelation RelationCreateStorage(RelFileLocator rlocator, char relpersistence, bool register_delete)
Definition: storage.c:122
void log_smgrcreate(const RelFileLocator *rlocator, ForkNumber forkNum)
Definition: storage.c:187

References INIT_FORKNUM, INVALID_PROC_NUMBER, log_smgrcreate(), MAIN_FORKNUM, MAX_FORKNUM, RelationCopyStorageUsingBuffer(), RelationCreateStorage(), smgrcreate(), smgrexists(), and smgropen().

Referenced by CreateDatabaseUsingWalLog().

◆ DebugPrintBufferRefcount()

char * DebugPrintBufferRefcount ( Buffer  buffer)

Definition at line 4104 of file bufmgr.c.

4105{
4106 BufferDesc *buf;
4107 int32 loccount;
4108 char *result;
4109 ProcNumber backend;
4110 uint32 buf_state;
4111
4112 Assert(BufferIsValid(buffer));
4113 if (BufferIsLocal(buffer))
4114 {
4115 buf = GetLocalBufferDescriptor(-buffer - 1);
4116 loccount = LocalRefCount[-buffer - 1];
4117 backend = MyProcNumber;
4118 }
4119 else
4120 {
4121 buf = GetBufferDescriptor(buffer - 1);
4122 loccount = GetPrivateRefCount(buffer);
4123 backend = INVALID_PROC_NUMBER;
4124 }
4125
4126 /* theoretically we should lock the bufhdr here */
4127 buf_state = pg_atomic_read_u32(&buf->state);
4128
4129 result = psprintf("[%03d] (rel=%s, blockNum=%u, flags=0x%x, refcount=%u %d)",
4130 buffer,
4131 relpathbackend(BufTagGetRelFileLocator(&buf->tag), backend,
4132 BufTagGetForkNum(&buf->tag)).str,
4133 buf->tag.blockNum, buf_state & BUF_FLAG_MASK,
4134 BUF_STATE_GET_REFCOUNT(buf_state), loccount);
4135 return result;
4136}
#define BUF_FLAG_MASK
Definition: buf_internals.h:56
char * psprintf(const char *fmt,...)
Definition: psprintf.c:43

References Assert(), buf, BUF_FLAG_MASK, BUF_STATE_GET_REFCOUNT, PrivateRefCountEntry::buffer, BufferIsLocal, BufferIsValid(), BufTagGetForkNum(), BufTagGetRelFileLocator(), GetBufferDescriptor(), GetLocalBufferDescriptor(), GetPrivateRefCount(), INVALID_PROC_NUMBER, LocalRefCount, MyProcNumber, pg_atomic_read_u32(), psprintf(), and relpathbackend.

Referenced by buffer_call_start_io(), buffer_call_terminate_io(), CheckForBufferLeaks(), CheckForLocalBufferLeaks(), and ResOwnerPrintBufferPin().

◆ DropDatabaseBuffers()

void DropDatabaseBuffers ( Oid  dbid)

Definition at line 4825 of file bufmgr.c.

4826{
4827 int i;
4828
4829 /*
4830 * We needn't consider local buffers, since by assumption the target
4831 * database isn't our own.
4832 */
4833
4834 for (i = 0; i < NBuffers; i++)
4835 {
4836 BufferDesc *bufHdr = GetBufferDescriptor(i);
4837 uint32 buf_state;
4838
4839 /*
4840 * As in DropRelationBuffers, an unlocked precheck should be safe and
4841 * saves some cycles.
4842 */
4843 if (bufHdr->tag.dbOid != dbid)
4844 continue;
4845
4846 buf_state = LockBufHdr(bufHdr);
4847 if (bufHdr->tag.dbOid == dbid)
4848 InvalidateBuffer(bufHdr); /* releases spinlock */
4849 else
4850 UnlockBufHdr(bufHdr, buf_state);
4851 }
4852}
static void InvalidateBuffer(BufferDesc *buf)
Definition: bufmgr.c:2176

References buftag::dbOid, GetBufferDescriptor(), i, InvalidateBuffer(), LockBufHdr(), NBuffers, BufferDesc::tag, and UnlockBufHdr().

Referenced by createdb_failure_callback(), dbase_redo(), dropdb(), and movedb().
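
The unlocked precheck used here (and in DropRelationBuffers()) is a recurring pattern: a racy, lock-free read of the tag cheaply filters out the vast majority of buffers, and any apparent match is re-verified under the buffer-header lock before invalidation. A standalone sketch follows (editor's illustration; a mutex stands in for the header spinlock):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

typedef struct
{
    pthread_mutex_t lock;
    int         dbOid;
    bool        valid;
} ToyBuffer;

static void
drop_if_matches(ToyBuffer *b, int dbid)
{
    /*
     * Unlocked precheck: the tag can only change away from dbid while the
     * database is being dropped, so false negatives are impossible and
     * false positives are caught by the locked recheck below.
     */
    if (b->dbOid != dbid)
        return;

    pthread_mutex_lock(&b->lock);
    if (b->dbOid == dbid)
        b->valid = false;       /* InvalidateBuffer() in bufmgr.c */
    pthread_mutex_unlock(&b->lock);
}

int
main(void)
{
    ToyBuffer   b = {PTHREAD_MUTEX_INITIALIZER, 5, true};

    drop_if_matches(&b, 5);
    printf("valid=%d\n", (int) b.valid);
    return 0;
}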

◆ DropRelationBuffers()

void DropRelationBuffers ( SMgrRelation  smgr_reln,
ForkNumber forkNum,
int  nforks,
BlockNumber firstDelBlock 
)

Definition at line 4470 of file bufmgr.c.

4472{
4473 int i;
4474 int j;
4475 RelFileLocatorBackend rlocator;
4476 BlockNumber nForkBlock[MAX_FORKNUM];
4477 uint64 nBlocksToInvalidate = 0;
4478
4479 rlocator = smgr_reln->smgr_rlocator;
4480
4481 /* If it's a local relation, it's localbuf.c's problem. */
4482 if (RelFileLocatorBackendIsTemp(rlocator))
4483 {
4484 if (rlocator.backend == MyProcNumber)
4485 {
4486 for (j = 0; j < nforks; j++)
4487 DropRelationLocalBuffers(rlocator.locator, forkNum[j],
4488 firstDelBlock[j]);
4489 }
4490 return;
4491 }
4492
4493 /*
4494 * To remove all the pages of the specified relation forks from the buffer
4495 * pool, we need to scan the entire buffer pool but we can optimize it by
4496 * finding the buffers from BufMapping table provided we know the exact
4497 * size of each fork of the relation. The exact size is required to ensure
4498 * that we don't leave any buffer for the relation being dropped as
4499 * otherwise the background writer or checkpointer can lead to a PANIC
4500 * error while flushing buffers corresponding to files that don't exist.
4501 *
4502 * To know the exact size, we rely on the size cached for each fork by us
4503 * during recovery which limits the optimization to recovery and on
4504 * standbys but we can easily extend it once we have shared cache for
4505 * relation size.
4506 *
4507 * In recovery, we cache the value returned by the first lseek(SEEK_END)
4508 * and the future writes keeps the cached value up-to-date. See
4509 * smgrextend. It is possible that the value of the first lseek is smaller
4510 * than the actual number of existing blocks in the file due to buggy
4511 * Linux kernels that might not have accounted for the recent write. But
4512 * that should be fine because there must not be any buffers after that
4513 * file size.
4514 */
4515 for (i = 0; i < nforks; i++)
4516 {
4517 /* Get the number of blocks for a relation's fork */
4518 nForkBlock[i] = smgrnblocks_cached(smgr_reln, forkNum[i]);
4519
4520 if (nForkBlock[i] == InvalidBlockNumber)
4521 {
4522 nBlocksToInvalidate = InvalidBlockNumber;
4523 break;
4524 }
4525
4526 /* calculate the number of blocks to be invalidated */
4527 nBlocksToInvalidate += (nForkBlock[i] - firstDelBlock[i]);
4528 }
4529
4530 /*
4531 * We apply the optimization iff the total number of blocks to invalidate
4532 * is below the BUF_DROP_FULL_SCAN_THRESHOLD.
4533 */
4534 if (BlockNumberIsValid(nBlocksToInvalidate) &&
4535 nBlocksToInvalidate < BUF_DROP_FULL_SCAN_THRESHOLD)
4536 {
4537 for (j = 0; j < nforks; j++)
4538 FindAndDropRelationBuffers(rlocator.locator, forkNum[j],
4539 nForkBlock[j], firstDelBlock[j]);
4540 return;
4541 }
4542
4543 for (i = 0; i < NBuffers; i++)
4544 {
4545 BufferDesc *bufHdr = GetBufferDescriptor(i);
4546 uint32 buf_state;
4547
4548 /*
4549 * We can make this a tad faster by prechecking the buffer tag before
4550 * we attempt to lock the buffer; this saves a lot of lock
4551 * acquisitions in typical cases. It should be safe because the
4552 * caller must have AccessExclusiveLock on the relation, or some other
4553 * reason to be certain that no one is loading new pages of the rel
4554 * into the buffer pool. (Otherwise we might well miss such pages
4555 * entirely.) Therefore, while the tag might be changing while we
4556 * look at it, it can't be changing *to* a value we care about, only
4557 * *away* from such a value. So false negatives are impossible, and
4558 * false positives are safe because we'll recheck after getting the
4559 * buffer lock.
4560 *
4561 * We could check forkNum and blockNum as well as the rlocator, but
4562 * the incremental win from doing so seems small.
4563 */
4564 if (!BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator.locator))
4565 continue;
4566
4567 buf_state = LockBufHdr(bufHdr);
4568
4569 for (j = 0; j < nforks; j++)
4570 {
4571 if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator.locator) &&
4572 BufTagGetForkNum(&bufHdr->tag) == forkNum[j] &&
4573 bufHdr->tag.blockNum >= firstDelBlock[j])
4574 {
4575 InvalidateBuffer(bufHdr); /* releases spinlock */
4576 break;
4577 }
4578 }
4579 if (j >= nforks)
4580 UnlockBufHdr(bufHdr, buf_state);
4581 }
4582}
#define InvalidBlockNumber
Definition: block.h:33
static bool BlockNumberIsValid(BlockNumber blockNumber)
Definition: block.h:71
static bool BufTagMatchesRelFileLocator(const BufferTag *tag, const RelFileLocator *rlocator)
#define BUF_DROP_FULL_SCAN_THRESHOLD
Definition: bufmgr.c:88
static void FindAndDropRelationBuffers(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber nForkBlock, BlockNumber firstDelBlock)
Definition: bufmgr.c:4764
int j
Definition: isn.c:78
void DropRelationLocalBuffers(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber firstDelBlock)
Definition: localbuf.c:663
#define RelFileLocatorBackendIsTemp(rlocator)
BlockNumber smgrnblocks_cached(SMgrRelation reln, ForkNumber forknum)
Definition: smgr.c:847

References RelFileLocatorBackend::backend, buftag::blockNum, BlockNumberIsValid(), BUF_DROP_FULL_SCAN_THRESHOLD, BufTagGetForkNum(), BufTagMatchesRelFileLocator(), DropRelationLocalBuffers(), FindAndDropRelationBuffers(), GetBufferDescriptor(), i, InvalidateBuffer(), InvalidBlockNumber, j, RelFileLocatorBackend::locator, LockBufHdr(), MAX_FORKNUM, MyProcNumber, NBuffers, RelFileLocatorBackendIsTemp, SMgrRelationData::smgr_rlocator, smgrnblocks_cached(), BufferDesc::tag, and UnlockBufHdr().

Referenced by smgrtruncate().
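
The cutoff itself is plain arithmetic: BUF_DROP_FULL_SCAN_THRESHOLD is NBuffers / 32, so with, say, 16384 shared buffers (128MB at the default 8kB block size) a truncation touching fewer than 512 blocks probes the mapping table directly, while anything larger, or any drop whose fork sizes are not cached, scans the whole pool. A standalone sketch of the decision (editor's illustration):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool
use_targeted_drop(uint64_t nblocks_to_invalidate, int nbuffers)
{
    /* mirrors: nBlocksToInvalidate < BUF_DROP_FULL_SCAN_THRESHOLD */
    return nblocks_to_invalidate < (uint64_t) (nbuffers / 32);
}

int
main(void)
{
    printf("%d\n", (int) use_targeted_drop(100, 16384));    /* 1: targeted */
    printf("%d\n", (int) use_targeted_drop(4096, 16384));   /* 0: full scan */
    return 0;
}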

◆ DropRelationsAllBuffers()

void DropRelationsAllBuffers ( SMgrRelation smgr_reln,
int  nlocators 
)

Definition at line 4593 of file bufmgr.c.

4594{
4595 int i;
4596 int n = 0;
4597 SMgrRelation *rels;
4598 BlockNumber (*block)[MAX_FORKNUM + 1];
4599 uint64 nBlocksToInvalidate = 0;
4600 RelFileLocator *locators;
4601 bool cached = true;
4602 bool use_bsearch;
4603
4604 if (nlocators == 0)
4605 return;
4606
4607 rels = palloc(sizeof(SMgrRelation) * nlocators); /* non-local relations */
4608
4609 /* If it's a local relation, it's localbuf.c's problem. */
4610 for (i = 0; i < nlocators; i++)
4611 {
4612 if (RelFileLocatorBackendIsTemp(smgr_reln[i]->smgr_rlocator))
4613 {
4614 if (smgr_reln[i]->smgr_rlocator.backend == MyProcNumber)
4615 DropRelationAllLocalBuffers(smgr_reln[i]->smgr_rlocator.locator);
4616 }
4617 else
4618 rels[n++] = smgr_reln[i];
4619 }
4620
4621 /*
4622 * If there are no non-local relations, then we're done. Release the
4623 * memory and return.
4624 */
4625 if (n == 0)
4626 {
4627 pfree(rels);
4628 return;
4629 }
4630
4631 /*
4632 * This is used to remember the number of blocks for all the relations'
4633 * forks.
4634 */
4635 block = (BlockNumber (*)[MAX_FORKNUM + 1])
4636 palloc(sizeof(BlockNumber) * n * (MAX_FORKNUM + 1));
4637
4638 /*
4639 * We can avoid scanning the entire buffer pool if we know the exact size
4640 * of each of the given relation forks. See DropRelationBuffers.
4641 */
4642 for (i = 0; i < n && cached; i++)
4643 {
4644 for (int j = 0; j <= MAX_FORKNUM; j++)
4645 {
4646 /* Get the number of blocks for a relation's fork. */
4647 block[i][j] = smgrnblocks_cached(rels[i], j);
4648
4649 /* We need to only consider the relation forks that exist. */
4650 if (block[i][j] == InvalidBlockNumber)
4651 {
4652 if (!smgrexists(rels[i], j))
4653 continue;
4654 cached = false;
4655 break;
4656 }
4657
4658 /* calculate the total number of blocks to be invalidated */
4659 nBlocksToInvalidate += block[i][j];
4660 }
4661 }
4662
4663 /*
4664 * We apply the optimization iff the total number of blocks to invalidate
4665 * is below the BUF_DROP_FULL_SCAN_THRESHOLD.
4666 */
4667 if (cached && nBlocksToInvalidate < BUF_DROP_FULL_SCAN_THRESHOLD)
4668 {
4669 for (i = 0; i < n; i++)
4670 {
4671 for (int j = 0; j <= MAX_FORKNUM; j++)
4672 {
4673 /* ignore relation forks that don't exist */
4674 if (!BlockNumberIsValid(block[i][j]))
4675 continue;
4676
4677 /* drop all the buffers for a particular relation fork */
4678 FindAndDropRelationBuffers(rels[i]->smgr_rlocator.locator,
4679 j, block[i][j], 0);
4680 }
4681 }
4682
4683 pfree(block);
4684 pfree(rels);
4685 return;
4686 }
4687
4688 pfree(block);
4689 locators = palloc(sizeof(RelFileLocator) * n); /* non-local relations */
4690 for (i = 0; i < n; i++)
4691 locators[i] = rels[i]->smgr_rlocator.locator;
4692
4693 /*
4694 * For low number of relations to drop just use a simple walk through, to
4695 * save the bsearch overhead. The threshold to use is rather a guess than
4696 * an exactly determined value, as it depends on many factors (CPU and RAM
4697 * speeds, amount of shared buffers etc.).
4698 */
4699 use_bsearch = n > RELS_BSEARCH_THRESHOLD;
4700
4701 /* sort the list of rlocators if necessary */
4702 if (use_bsearch)
4703 qsort(locators, n, sizeof(RelFileLocator), rlocator_comparator);
4704
4705 for (i = 0; i < NBuffers; i++)
4706 {
4707 RelFileLocator *rlocator = NULL;
4708 BufferDesc *bufHdr = GetBufferDescriptor(i);
4709 uint32 buf_state;
4710
4711 /*
4712 * As in DropRelationBuffers, an unlocked precheck should be safe and
4713 * saves some cycles.
4714 */
4715
4716 if (!use_bsearch)
4717 {
4718 int j;
4719
4720 for (j = 0; j < n; j++)
4721 {
4722 if (BufTagMatchesRelFileLocator(&bufHdr->tag, &locators[j]))
4723 {
4724 rlocator = &locators[j];
4725 break;
4726 }
4727 }
4728 }
4729 else
4730 {
4731 RelFileLocator locator;
4732
4733 locator = BufTagGetRelFileLocator(&bufHdr->tag);
4734 rlocator = bsearch(&locator,
4735 locators, n, sizeof(RelFileLocator),
4737 }
4738
4739 /* buffer doesn't belong to any of the given relfilelocators; skip it */
4740 if (rlocator == NULL)
4741 continue;
4742
4743 buf_state = LockBufHdr(bufHdr);
4744 if (BufTagMatchesRelFileLocator(&bufHdr->tag, rlocator))
4745 InvalidateBuffer(bufHdr); /* releases spinlock */
4746 else
4747 UnlockBufHdr(bufHdr, buf_state);
4748 }
4749
4750 pfree(locators);
4751 pfree(rels);
4752}

References BlockNumberIsValid(), BUF_DROP_FULL_SCAN_THRESHOLD, BufTagGetRelFileLocator(), BufTagMatchesRelFileLocator(), DropRelationAllLocalBuffers(), FindAndDropRelationBuffers(), GetBufferDescriptor(), i, InvalidateBuffer(), InvalidBlockNumber, j, LockBufHdr(), MAX_FORKNUM, MyProcNumber, NBuffers, palloc(), pfree(), qsort, RelFileLocatorBackendIsTemp, RELS_BSEARCH_THRESHOLD, rlocator_comparator(), smgrexists(), smgrnblocks_cached(), BufferDesc::tag, and UnlockBufHdr().

Referenced by smgrdounlinkall().
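
A hedged usage sketch in the style of smgrdounlinkall(); rels and nrels are assumed to be supplied by the caller. The thresholds above are worth making concrete: with NBuffers = 16384 (128MB of shared buffers), BUF_DROP_FULL_SCAN_THRESHOLD is 16384 / 32 = 512, so the targeted per-block lookup is used only when all fork sizes are cached and sum to fewer than 512 blocks; in the full-scan path, bsearch kicks in once more than RELS_BSEARCH_THRESHOLD (20) relations are being dropped.

    /* rels[] holds the SMgrRelations whose buffers must all be discarded */
    DropRelationsAllBuffers(rels, nrels);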

◆ EvictAllUnpinnedBuffers()

void EvictAllUnpinnedBuffers ( int32 buffers_evicted,
int32 buffers_flushed,
int32 buffers_skipped 
)

Definition at line 6610 of file bufmgr.c.

6612{
6613 *buffers_evicted = 0;
6614 *buffers_skipped = 0;
6615 *buffers_flushed = 0;
6616
6617 for (int buf = 1; buf <= NBuffers; buf++)
6618 {
6619 BufferDesc *desc = GetBufferDescriptor(buf - 1);
6620 uint32 buf_state;
6621 bool buffer_flushed;
6622
6623 buf_state = pg_atomic_read_u32(&desc->state);
6624 if (!(buf_state & BM_VALID))
6625 continue;
6626
6627 ResourceOwnerEnlarge(CurrentResourceOwner);
6628 ReservePrivateRefCountEntry();
6629
6630 LockBufHdr(desc);
6631
6632 if (EvictUnpinnedBufferInternal(desc, &buffer_flushed))
6633 (*buffers_evicted)++;
6634 else
6635 (*buffers_skipped)++;
6636
6637 if (buffer_flushed)
6638 (*buffers_flushed)++;
6639 }
6640}

References BM_VALID, buf, CurrentResourceOwner, EvictUnpinnedBufferInternal(), GetBufferDescriptor(), LockBufHdr(), NBuffers, pg_atomic_read_u32(), ReservePrivateRefCountEntry(), ResourceOwnerEnlarge(), and BufferDesc::state.

Referenced by pg_buffercache_evict_all().
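
A minimal sketch of a caller in the style of pg_buffercache_evict_all(); the elog() reporting is illustrative only:

    int32   evicted,
            flushed,
            skipped;

    EvictAllUnpinnedBuffers(&evicted, &flushed, &skipped);
    elog(INFO, "evicted %d, flushed %d, skipped %d",
         evicted, flushed, skipped);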

◆ EvictRelUnpinnedBuffers()

void EvictRelUnpinnedBuffers ( Relation  rel,
int32 buffers_evicted,
int32 buffers_flushed,
int32 buffers_skipped 
)

Definition at line 6658 of file bufmgr.c.

6660{
6661 Assert(!RelationUsesLocalBuffers(rel));
6662
6663 *buffers_skipped = 0;
6664 *buffers_evicted = 0;
6665 *buffers_flushed = 0;
6666
6667 for (int buf = 1; buf <= NBuffers; buf++)
6668 {
6669 BufferDesc *desc = GetBufferDescriptor(buf - 1);
6670 uint32 buf_state = pg_atomic_read_u32(&(desc->state));
6671 bool buffer_flushed;
6672
6673 /* An unlocked precheck should be safe and saves some cycles. */
6674 if ((buf_state & BM_VALID) == 0 ||
6675 !BufTagMatchesRelFileLocator(&desc->tag, &rel->rd_locator))
6676 continue;
6677
6678 /* Make sure we can pin the buffer. */
6679 ResourceOwnerEnlarge(CurrentResourceOwner);
6680 ReservePrivateRefCountEntry();
6681
6682 buf_state = LockBufHdr(desc);
6683
6684 /* recheck, could have changed without the lock */
6685 if ((buf_state & BM_VALID) == 0 ||
6686 !BufTagMatchesRelFileLocator(&desc->tag, &rel->rd_locator))
6687 {
6688 UnlockBufHdr(desc, buf_state);
6689 continue;
6690 }
6691
6692 if (EvictUnpinnedBufferInternal(desc, &buffer_flushed))
6693 (*buffers_evicted)++;
6694 else
6695 (*buffers_skipped)++;
6696
6697 if (buffer_flushed)
6698 (*buffers_flushed)++;
6699 }
6700}

References Assert(), BM_VALID, buf, BufTagMatchesRelFileLocator(), CurrentResourceOwner, EvictUnpinnedBufferInternal(), GetBufferDescriptor(), LockBufHdr(), NBuffers, pg_atomic_read_u32(), RelationData::rd_locator, RelationUsesLocalBuffers, ReservePrivateRefCountEntry(), ResourceOwnerEnlarge(), BufferDesc::state, BufferDesc::tag, and UnlockBufHdr().

Referenced by pg_buffercache_evict_relation().

◆ EvictUnpinnedBuffer()

bool EvictUnpinnedBuffer ( Buffer  buf,
bool *  buffer_flushed 
)

Definition at line 6581 of file bufmgr.c.

6582{
6583 BufferDesc *desc;
6584
6585 Assert(BufferIsValid(buf) && !BufferIsLocal(buf));
6586
6587 /* Make sure we can pin the buffer. */
6588 ResourceOwnerEnlarge(CurrentResourceOwner);
6589 ReservePrivateRefCountEntry();
6590
6591 desc = GetBufferDescriptor(buf - 1);
6592 LockBufHdr(desc);
6593
6594 return EvictUnpinnedBufferInternal(desc, buffer_flushed);
6595}

References Assert(), buf, BufferIsLocal, BufferIsValid(), CurrentResourceOwner, EvictUnpinnedBufferInternal(), GetBufferDescriptor(), LockBufHdr(), ReservePrivateRefCountEntry(), and ResourceOwnerEnlarge().

Referenced by invalidate_rel_block(), modify_rel_block(), and pg_buffercache_evict().
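
A hedged single-buffer sketch, roughly what pg_buffercache_evict() does; buf is assumed to be a valid shared (not local) buffer that this backend has not pinned:

    bool    flushed;

    if (EvictUnpinnedBuffer(buf, &flushed))
        elog(INFO, "buffer %d evicted%s",
             buf, flushed ? " (written out first)" : "");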

◆ EvictUnpinnedBufferInternal()

static bool EvictUnpinnedBufferInternal ( BufferDesc desc,
bool *  buffer_flushed 
)
static

Definition at line 6517 of file bufmgr.c.

6518{
6519 uint32 buf_state;
6520 bool result;
6521
6522 *buffer_flushed = false;
6523
6524 buf_state = pg_atomic_read_u32(&(desc->state));
6525 Assert(buf_state & BM_LOCKED);
6526
6527 if ((buf_state & BM_VALID) == 0)
6528 {
6529 UnlockBufHdr(desc, buf_state);
6530 return false;
6531 }
6532
6533 /* Check that it's not pinned already. */
6534 if (BUF_STATE_GET_REFCOUNT(buf_state) > 0)
6535 {
6536 UnlockBufHdr(desc, buf_state);
6537 return false;
6538 }
6539
6540 PinBuffer_Locked(desc); /* releases spinlock */
6541
6542 /* If it was dirty, try to clean it once. */
6543 if (buf_state & BM_DIRTY)
6544 {
6545 LWLockAcquire(BufferDescriptorGetContentLock(desc), LW_SHARED);
6546 FlushBuffer(desc, NULL, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
6547 *buffer_flushed = true;
6548 LWLockRelease(BufferDescriptorGetContentLock(desc));
6549 }
6550
6551 /* This will return false if it becomes dirty or someone else pins it. */
6552 result = InvalidateVictimBuffer(desc);
6553
6554 UnpinBuffer(desc);
6555
6556 return result;
6557}

References Assert(), BM_DIRTY, BM_LOCKED, BM_VALID, BUF_STATE_GET_REFCOUNT, BufferDescriptorGetContentLock(), FlushBuffer(), InvalidateVictimBuffer(), IOCONTEXT_NORMAL, IOOBJECT_RELATION, LW_SHARED, LWLockAcquire(), LWLockRelease(), pg_atomic_read_u32(), PinBuffer_Locked(), BufferDesc::state, UnlockBufHdr(), and UnpinBuffer().

Referenced by EvictAllUnpinnedBuffers(), EvictRelUnpinnedBuffers(), and EvictUnpinnedBuffer().

◆ ExtendBufferedRel()

Buffer ExtendBufferedRel ( BufferManagerRelation  bmr,
ForkNumber  forkNum,
BufferAccessStrategy  strategy,
uint32  flags 
)

Definition at line 851 of file bufmgr.c.

855{
856 Buffer buf;
857 uint32 extend_by = 1;
858
859 ExtendBufferedRelBy(bmr, forkNum, strategy, flags, extend_by,
860 &buf, &extend_by);
861
862 return buf;
863}

References buf, and ExtendBufferedRelBy().

Referenced by _bt_allocbuf(), _hash_getnewbuf(), BloomNewBuffer(), brinbuild(), brinbuildempty(), fill_seq_fork_with_data(), ginbuildempty(), GinNewBuffer(), gistbuildempty(), gistNewBuffer(), ReadBuffer_common(), revmap_physical_extend(), and SpGistNewBuffer().
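
A one-block extension in the style of _bt_allocbuf(): the relation grows by a single zero-filled page and, because of EB_LOCK_FIRST, the returned buffer is already exclusively locked, so the caller can initialize it immediately.

    Buffer  buf;

    buf = ExtendBufferedRel(BMR_REL(rel), MAIN_FORKNUM, NULL, EB_LOCK_FIRST);
    /* buf is pinned and exclusively locked; set up the page, then release */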

◆ ExtendBufferedRelBy()

BlockNumber ExtendBufferedRelBy ( BufferManagerRelation  bmr,
ForkNumber  fork,
BufferAccessStrategy  strategy,
uint32  flags,
uint32  extend_by,
Buffer buffers,
uint32 extended_by 
)

Definition at line 883 of file bufmgr.c.

890{
891 Assert((bmr.rel != NULL) != (bmr.smgr != NULL));
892 Assert(bmr.smgr == NULL || bmr.relpersistence != 0);
893 Assert(extend_by > 0);
894
895 if (bmr.smgr == NULL)
896 {
897 bmr.smgr = RelationGetSmgr(bmr.rel);
898 bmr.relpersistence = bmr.rel->rd_rel->relpersistence;
899 }
900
901 return ExtendBufferedRelCommon(bmr, fork, strategy, flags,
902 extend_by, InvalidBlockNumber,
903 buffers, extended_by);
904}

References Assert(), ExtendBufferedRelCommon(), InvalidBlockNumber, RelationData::rd_rel, BufferManagerRelation::rel, RelationGetSmgr(), BufferManagerRelation::relpersistence, and BufferManagerRelation::smgr.

Referenced by ExtendBufferedRel(), grow_rel(), and RelationAddBlocks().
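
A hedged sketch of a bulk extension, similar in spirit to RelationAddBlocks(): up to lengthof(victims) pages are requested in one call, and extended_by reports how many were actually added (the count can be clamped by the per-backend pin limit).

    Buffer      victims[8];
    uint32      extended_by = 0;
    BlockNumber first;

    first = ExtendBufferedRelBy(BMR_REL(rel), MAIN_FORKNUM,
                                NULL /* strategy */, 0 /* flags */,
                                lengthof(victims), victims, &extended_by);
    /* victims[0 .. extended_by - 1] come back pinned but unlocked */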

◆ ExtendBufferedRelCommon()

static BlockNumber ExtendBufferedRelCommon ( BufferManagerRelation  bmr,
ForkNumber  fork,
BufferAccessStrategy  strategy,
uint32  flags,
uint32  extend_by,
BlockNumber  extend_upto,
Buffer buffers,
uint32 extended_by 
)
static

Definition at line 2559 of file bufmgr.c.

2567{
2568 BlockNumber first_block;
2569
2570 TRACE_POSTGRESQL_BUFFER_EXTEND_START(fork,
2571 bmr.smgr->smgr_rlocator.locator.spcOid,
2572 bmr.smgr->smgr_rlocator.locator.dbOid,
2573 bmr.smgr->smgr_rlocator.locator.relNumber,
2574 bmr.smgr->smgr_rlocator.backend,
2575 extend_by);
2576
2577 if (bmr.relpersistence == RELPERSISTENCE_TEMP)
2578 first_block = ExtendBufferedRelLocal(bmr, fork, flags,
2579 extend_by, extend_upto,
2580 buffers, &extend_by);
2581 else
2582 first_block = ExtendBufferedRelShared(bmr, fork, strategy, flags,
2583 extend_by, extend_upto,
2584 buffers, &extend_by);
2585 *extended_by = extend_by;
2586
2587 TRACE_POSTGRESQL_BUFFER_EXTEND_DONE(fork,
2588 bmr.smgr->smgr_rlocator.locator.spcOid,
2589 bmr.smgr->smgr_rlocator.locator.dbOid,
2590 bmr.smgr->smgr_rlocator.locator.relNumber,
2591 bmr.smgr->smgr_rlocator.backend,
2592 *extended_by,
2593 first_block);
2594
2595 return first_block;
2596}

References RelFileLocatorBackend::backend, RelFileLocator::dbOid, ExtendBufferedRelLocal(), ExtendBufferedRelShared(), RelFileLocatorBackend::locator, RelFileLocator::relNumber, BufferManagerRelation::relpersistence, BufferManagerRelation::smgr, SMgrRelationData::smgr_rlocator, and RelFileLocator::spcOid.

Referenced by ExtendBufferedRelBy(), and ExtendBufferedRelTo().

◆ ExtendBufferedRelShared()

static BlockNumber ExtendBufferedRelShared ( BufferManagerRelation  bmr,
ForkNumber  fork,
BufferAccessStrategy  strategy,
uint32  flags,
uint32  extend_by,
BlockNumber  extend_upto,
Buffer buffers,
uint32 extended_by 
)
static

Definition at line 2603 of file bufmgr.c.

2611{
2612 BlockNumber first_block;
2613 IOContext io_context = IOContextForStrategy(strategy);
2614 instr_time io_start;
2615
2616 LimitAdditionalPins(&extend_by);
2617
2618 /*
2619 * Acquire victim buffers for extension without holding extension lock.
2620 * Writing out victim buffers is the most expensive part of extending the
2621 * relation, particularly when doing so requires WAL flushes. Zeroing out
2622 * the buffers is also quite expensive, so do that before holding the
2623 * extension lock as well.
2624 *
2625 * These pages are pinned by us and not valid. While we hold the pin they
2626 * can't be acquired as victim buffers by another backend.
2627 */
2628 for (uint32 i = 0; i < extend_by; i++)
2629 {
2630 Block buf_block;
2631
2632 buffers[i] = GetVictimBuffer(strategy, io_context);
2633 buf_block = BufHdrGetBlock(GetBufferDescriptor(buffers[i] - 1));
2634
2635 /* new buffers are zero-filled */
2636 MemSet(buf_block, 0, BLCKSZ);
2637 }
2638
2639 /*
2640 * Lock relation against concurrent extensions, unless requested not to.
2641 *
2642 * We use the same extension lock for all forks. That's unnecessarily
2643 * restrictive, but currently extensions for forks don't happen often
2644 * enough to make it worth locking more granularly.
2645 *
2646 * Note that another backend might have extended the relation by the time
2647 * we get the lock.
2648 */
2649 if (!(flags & EB_SKIP_EXTENSION_LOCK))
2650 LockRelationForExtension(bmr.rel, ExclusiveLock);
2651
2652 /*
2653 * If requested, invalidate size cache, so that smgrnblocks asks the
2654 * kernel.
2655 */
2656 if (flags & EB_CLEAR_SIZE_CACHE)
2657 bmr.smgr->smgr_cached_nblocks[fork] = InvalidBlockNumber;
2658
2659 first_block = smgrnblocks(bmr.smgr, fork);
2660
2661 /*
2662 * Now that we have the accurate relation size, check if the caller wants
2663 * us to extend to only up to a specific size. If there were concurrent
2664 * extensions, we might have acquired too many buffers and need to release
2665 * them.
2666 */
2667 if (extend_upto != InvalidBlockNumber)
2668 {
2669 uint32 orig_extend_by = extend_by;
2670
2671 if (first_block > extend_upto)
2672 extend_by = 0;
2673 else if ((uint64) first_block + extend_by > extend_upto)
2674 extend_by = extend_upto - first_block;
2675
2676 for (uint32 i = extend_by; i < orig_extend_by; i++)
2677 {
2678 BufferDesc *buf_hdr = GetBufferDescriptor(buffers[i] - 1);
2679
2680 /*
2681 * The victim buffer we acquired previously is clean and unused,
2682 * let it be found again quickly
2683 */
2684 StrategyFreeBuffer(buf_hdr);
2685 UnpinBuffer(buf_hdr);
2686 }
2687
2688 if (extend_by == 0)
2689 {
2690 if (!(flags & EB_SKIP_EXTENSION_LOCK))
2691 UnlockRelationForExtension(bmr.rel, ExclusiveLock);
2692 *extended_by = extend_by;
2693 return first_block;
2694 }
2695 }
2696
2697 /* Fail if relation is already at maximum possible length */
2698 if ((uint64) first_block + extend_by >= MaxBlockNumber)
2699 ereport(ERROR,
2700 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
2701 errmsg("cannot extend relation %s beyond %u blocks",
2702 relpath(bmr.smgr->smgr_rlocator, fork).str,
2703 MaxBlockNumber)));
2704
2705 /*
2706 * Insert buffers into buffer table, mark as IO_IN_PROGRESS.
2707 *
2708 * This needs to happen before we extend the relation, because as soon as
2709 * we do, other backends can start to read in those pages.
2710 */
2711 for (uint32 i = 0; i < extend_by; i++)
2712 {
2713 Buffer victim_buf = buffers[i];
2714 BufferDesc *victim_buf_hdr = GetBufferDescriptor(victim_buf - 1);
2715 BufferTag tag;
2716 uint32 hash;
2717 LWLock *partition_lock;
2718 int existing_id;
2719
2720 /* in case we need to pin an existing buffer below */
2721 ResourceOwnerEnlarge(CurrentResourceOwner);
2722 ReservePrivateRefCountEntry();
2723
2724 InitBufferTag(&tag, &bmr.smgr->smgr_rlocator.locator, fork, first_block + i);
2725 hash = BufTableHashCode(&tag);
2726 partition_lock = BufMappingPartitionLock(hash);
2727
2728 LWLockAcquire(partition_lock, LW_EXCLUSIVE);
2729
2730 existing_id = BufTableInsert(&tag, hash, victim_buf_hdr->buf_id);
2731
2732 /*
2733 * We get here only in the corner case where we are trying to extend
2734 * the relation but we found a pre-existing buffer. This can happen
2735 * because a prior attempt at extending the relation failed, and
2736 * because mdread doesn't complain about reads beyond EOF (when
2737 * zero_damaged_pages is ON) and so a previous attempt to read a block
2738 * beyond EOF could have left a "valid" zero-filled buffer.
2739 * Unfortunately, we have also seen this case occurring because of
2740 * buggy Linux kernels that sometimes return an lseek(SEEK_END) result
2741 * that doesn't account for a recent write. In that situation, the
2742 * pre-existing buffer would contain valid data that we don't want to
2743 * overwrite. Since the legitimate cases should always have left a
2744 * zero-filled buffer, complain if not PageIsNew.
2745 */
2746 if (existing_id >= 0)
2747 {
2748 BufferDesc *existing_hdr = GetBufferDescriptor(existing_id);
2749 Block buf_block;
2750 bool valid;
2751
2752 /*
2753 * Pin the existing buffer before releasing the partition lock,
2754 * preventing it from being evicted.
2755 */
2756 valid = PinBuffer(existing_hdr, strategy);
2757
2758 LWLockRelease(partition_lock);
2759
2760 /*
2761 * The victim buffer we acquired previously is clean and unused,
2762 * let it be found again quickly
2763 */
2764 StrategyFreeBuffer(victim_buf_hdr);
2765 UnpinBuffer(victim_buf_hdr);
2766
2767 buffers[i] = BufferDescriptorGetBuffer(existing_hdr);
2768 buf_block = BufHdrGetBlock(existing_hdr);
2769
2770 if (valid && !PageIsNew((Page) buf_block))
2771 ereport(ERROR,
2772 (errmsg("unexpected data beyond EOF in block %u of relation %s",
2773 existing_hdr->tag.blockNum,
2774 relpath(bmr.smgr->smgr_rlocator, fork).str),
2775 errhint("This has been seen to occur with buggy kernels; consider updating your system.")));
2776
2777 /*
2778 * We *must* do smgr[zero]extend before succeeding, else the page
2779 * will not be reserved by the kernel, and the next P_NEW call
2780 * will decide to return the same page. Clear the BM_VALID bit,
2781 * do StartBufferIO() and proceed.
2782 *
2783 * Loop to handle the very small possibility that someone re-sets
2784 * BM_VALID between our clearing it and StartBufferIO inspecting
2785 * it.
2786 */
2787 do
2788 {
2789 uint32 buf_state = LockBufHdr(existing_hdr);
2790
2791 buf_state &= ~BM_VALID;
2792 UnlockBufHdr(existing_hdr, buf_state);
2793 } while (!StartBufferIO(existing_hdr, true, false));
2794 }
2795 else
2796 {
2797 uint32 buf_state;
2798
2799 buf_state = LockBufHdr(victim_buf_hdr);
2800
2801 /* some sanity checks while we hold the buffer header lock */
2802 Assert(!(buf_state & (BM_VALID | BM_TAG_VALID | BM_DIRTY | BM_JUST_DIRTIED)));
2803 Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 1);
2804
2805 victim_buf_hdr->tag = tag;
2806
2807 buf_state |= BM_TAG_VALID | BUF_USAGECOUNT_ONE;
2808 if (bmr.relpersistence == RELPERSISTENCE_PERMANENT || fork == INIT_FORKNUM)
2809 buf_state |= BM_PERMANENT;
2810
2811 UnlockBufHdr(victim_buf_hdr, buf_state);
2812
2813 LWLockRelease(partition_lock);
2814
2815 /* XXX: could combine the locked operations in it with the above */
2816 StartBufferIO(victim_buf_hdr, true, false);
2817 }
2818 }
2819
2820 io_start = pgstat_prepare_io_time(track_io_timing);
2821
2822 /*
2823 * Note: if smgrzeroextend fails, we will end up with buffers that are
2824 * allocated but not marked BM_VALID. The next relation extension will
2825 * still select the same block number (because the relation didn't get any
2826 * longer on disk) and so future attempts to extend the relation will find
2827 * the same buffers (if they have not been recycled) but come right back
2828 * here to try smgrzeroextend again.
2829 *
2830 * We don't need to set checksum for all-zero pages.
2831 */
2832 smgrzeroextend(bmr.smgr, fork, first_block, extend_by, false);
2833
2834 /*
2835 * Release the file-extension lock; it's now OK for someone else to extend
2836 * the relation some more.
2837 *
2838 * We remove IO_IN_PROGRESS after this, as waking up waiting backends can
2839 * take noticeable time.
2840 */
2841 if (!(flags & EB_SKIP_EXTENSION_LOCK))
2842 UnlockRelationForExtension(bmr.rel, ExclusiveLock);
2843
2844 pgstat_count_io_op_time(IOOBJECT_RELATION, io_context, IOOP_EXTEND,
2845 io_start, 1, extend_by * BLCKSZ);
2846
2847 /* Set BM_VALID, terminate IO, and wake up any waiters */
2848 for (uint32 i = 0; i < extend_by; i++)
2849 {
2850 Buffer buf = buffers[i];
2851 BufferDesc *buf_hdr = GetBufferDescriptor(buf - 1);
2852 bool lock = false;
2853
2854 if (flags & EB_LOCK_FIRST && i == 0)
2855 lock = true;
2856 else if (flags & EB_LOCK_TARGET)
2857 {
2858 Assert(extend_upto != InvalidBlockNumber);
2859 if (first_block + i + 1 == extend_upto)
2860 lock = true;
2861 }
2862
2863 if (lock)
2864 LWLockAcquire(BufferDescriptorGetContentLock(buf_hdr), LW_EXCLUSIVE);
2865
2866 TerminateBufferIO(buf_hdr, false, BM_VALID, true, false);
2867 }
2868
2869 pgBufferUsage.shared_blks_written += extend_by;
2870
2871 *extended_by = extend_by;
2872
2873 return first_block;
2874}

References Assert(), buftag::blockNum, BM_DIRTY, BM_JUST_DIRTIED, BM_PERMANENT, BM_TAG_VALID, BM_VALID, buf, BufferDesc::buf_id, BUF_STATE_GET_REFCOUNT, BUF_USAGECOUNT_ONE, BufferDescriptorGetBuffer(), BufferDescriptorGetContentLock(), BufHdrGetBlock, BufMappingPartitionLock(), BufTableHashCode(), BufTableInsert(), CurrentResourceOwner, EB_CLEAR_SIZE_CACHE, EB_LOCK_FIRST, EB_LOCK_TARGET, EB_SKIP_EXTENSION_LOCK, ereport, errcode(), errhint(), errmsg(), ERROR, ExclusiveLock, GetBufferDescriptor(), GetVictimBuffer(), hash(), i, INIT_FORKNUM, InitBufferTag(), InvalidBlockNumber, IOContextForStrategy(), IOOBJECT_RELATION, IOOP_EXTEND, LimitAdditionalPins(), RelFileLocatorBackend::locator, LockBufHdr(), LockRelationForExtension(), LW_EXCLUSIVE, LWLockAcquire(), LWLockRelease(), MaxBlockNumber, MemSet, PageIsNew(), pgBufferUsage, pgstat_count_io_op_time(), pgstat_prepare_io_time(), PinBuffer(), BufferManagerRelation::rel, relpath, BufferManagerRelation::relpersistence, ReservePrivateRefCountEntry(), ResourceOwnerEnlarge(), BufferUsage::shared_blks_written, BufferManagerRelation::smgr, SMgrRelationData::smgr_cached_nblocks, SMgrRelationData::smgr_rlocator, smgrnblocks(), smgrzeroextend(), StartBufferIO(), StrategyFreeBuffer(), BufferDesc::tag, TerminateBufferIO(), track_io_timing, UnlockBufHdr(), UnlockRelationForExtension(), and UnpinBuffer().

Referenced by ExtendBufferedRelCommon().
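
To make the extend_upto clamp above concrete, a worked example with illustrative numbers: suppose a concurrent extension means smgrnblocks() now reports first_block = 100, we already hold extend_by = 8 pinned victim buffers, and the caller capped growth at extend_upto = 104.

    if (first_block > extend_upto)                          /* 100 > 104? no */
        extend_by = 0;
    else if ((uint64) first_block + extend_by > extend_upto)    /* 108 > 104 */
        extend_by = extend_upto - first_block;              /* clamped to 4 */

The four surplus victim buffers are then handed back through StrategyFreeBuffer() and UnpinBuffer(), as shown in the listing.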

◆ ExtendBufferedRelTo()

Buffer ExtendBufferedRelTo ( BufferManagerRelation  bmr,
ForkNumber  fork,
BufferAccessStrategy  strategy,
uint32  flags,
BlockNumber  extend_to,
ReadBufferMode  mode 
)

Definition at line 915 of file bufmgr.c.

921{
922 BlockNumber current_size;
923 uint32 extended_by = 0;
924 Buffer buffer = InvalidBuffer;
925 Buffer buffers[64];
926
927 Assert((bmr.rel != NULL) != (bmr.smgr != NULL));
928 Assert(bmr.smgr == NULL || bmr.relpersistence != 0);
929 Assert(extend_to != InvalidBlockNumber && extend_to > 0);
930
931 if (bmr.smgr == NULL)
932 {
933 bmr.smgr = RelationGetSmgr(bmr.rel);
934 bmr.relpersistence = bmr.rel->rd_rel->relpersistence;
935 }
936
937 /*
938 * If desired, create the file if it doesn't exist. If
939 * smgr_cached_nblocks[fork] is positive then it must exist, no need for
940 * an smgrexists call.
941 */
942 if ((flags & EB_CREATE_FORK_IF_NEEDED) &&
943 (bmr.smgr->smgr_cached_nblocks[fork] == 0 ||
944 bmr.smgr->smgr_cached_nblocks[fork] == InvalidBlockNumber) &&
945 !smgrexists(bmr.smgr, fork))
946 {
947 LockRelationForExtension(bmr.rel, ExclusiveLock);
948
949 /* recheck, fork might have been created concurrently */
950 if (!smgrexists(bmr.smgr, fork))
951 smgrcreate(bmr.smgr, fork, flags & EB_PERFORMING_RECOVERY);
952
953 UnlockRelationForExtension(bmr.rel, ExclusiveLock);
954 }
955
956 /*
957 * If requested, invalidate size cache, so that smgrnblocks asks the
958 * kernel.
959 */
960 if (flags & EB_CLEAR_SIZE_CACHE)
961 bmr.smgr->smgr_cached_nblocks[fork] = InvalidBlockNumber;
962
963 /*
964 * Estimate how many pages we'll need to extend by. This avoids acquiring
965 * unnecessarily many victim buffers.
966 */
967 current_size = smgrnblocks(bmr.smgr, fork);
968
969 /*
970 * Since no-one else can be looking at the page contents yet, there is no
971 * difference between an exclusive lock and a cleanup-strength lock. Note
972 * that we pass the original mode to ReadBuffer_common() below, when
973 * falling back to reading the buffer due to a concurrent relation extension.
974 */
975 if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK)
976 flags |= EB_LOCK_TARGET;
977
978 while (current_size < extend_to)
979 {
980 uint32 num_pages = lengthof(buffers);
981 BlockNumber first_block;
982
983 if ((uint64) current_size + num_pages > extend_to)
984 num_pages = extend_to - current_size;
985
986 first_block = ExtendBufferedRelCommon(bmr, fork, strategy, flags,
987 num_pages, extend_to,
988 buffers, &extended_by);
989
990 current_size = first_block + extended_by;
991 Assert(num_pages != 0 || current_size >= extend_to);
992
993 for (uint32 i = 0; i < extended_by; i++)
994 {
995 if (first_block + i != extend_to - 1)
996 ReleaseBuffer(buffers[i]);
997 else
998 buffer = buffers[i];
999 }
1000 }
1001
1002 /*
1003 * It's possible that another backend concurrently extended the relation.
1004 * In that case read the buffer.
1005 *
1006 * XXX: Should we control this via a flag?
1007 */
1008 if (buffer == InvalidBuffer)
1009 {
1010 Assert(extended_by == 0);
1011 buffer = ReadBuffer_common(bmr.rel, bmr.smgr, bmr.relpersistence,
1012 fork, extend_to - 1, mode, strategy);
1013 }
1014
1015 return buffer;
1016}

References Assert(), PrivateRefCountEntry::buffer, current_size, EB_CLEAR_SIZE_CACHE, EB_CREATE_FORK_IF_NEEDED, EB_LOCK_TARGET, EB_PERFORMING_RECOVERY, ExclusiveLock, ExtendBufferedRelCommon(), i, InvalidBlockNumber, InvalidBuffer, lengthof, LockRelationForExtension(), mode, RBM_ZERO_AND_CLEANUP_LOCK, RBM_ZERO_AND_LOCK, RelationData::rd_rel, ReadBuffer_common(), BufferManagerRelation::rel, RelationGetSmgr(), ReleaseBuffer(), BufferManagerRelation::relpersistence, BufferManagerRelation::smgr, SMgrRelationData::smgr_cached_nblocks, smgrcreate(), smgrexists(), smgrnblocks(), and UnlockRelationForExtension().

Referenced by fsm_extend(), vm_extend(), and XLogReadBufferExtended().
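
A hedged sketch of how vm_extend()-style code grows a secondary fork to a target length, creating the fork first if it is missing; vm_nblocks is the caller's target size:

    Buffer  buf;

    buf = ExtendBufferedRelTo(BMR_REL(rel), VISIBILITYMAP_FORKNUM, NULL,
                              EB_CREATE_FORK_IF_NEEDED | EB_CLEAR_SIZE_CACHE,
                              vm_nblocks, RBM_ZERO_ON_ERROR);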

◆ FindAndDropRelationBuffers()

static void FindAndDropRelationBuffers ( RelFileLocator  rlocator,
ForkNumber  forkNum,
BlockNumber  nForkBlock,
BlockNumber  firstDelBlock 
)
static

Definition at line 4764 of file bufmgr.c.

4767{
4768 BlockNumber curBlock;
4769
4770 for (curBlock = firstDelBlock; curBlock < nForkBlock; curBlock++)
4771 {
4772 uint32 bufHash; /* hash value for tag */
4773 BufferTag bufTag; /* identity of requested block */
4774 LWLock *bufPartitionLock; /* buffer partition lock for it */
4775 int buf_id;
4776 BufferDesc *bufHdr;
4777 uint32 buf_state;
4778
4779 /* create a tag so we can lookup the buffer */
4780 InitBufferTag(&bufTag, &rlocator, forkNum, curBlock);
4781
4782 /* determine its hash code and partition lock ID */
4783 bufHash = BufTableHashCode(&bufTag);
4784 bufPartitionLock = BufMappingPartitionLock(bufHash);
4785
4786 /* Check that it is in the buffer pool. If not, do nothing. */
4787 LWLockAcquire(bufPartitionLock, LW_SHARED);
4788 buf_id = BufTableLookup(&bufTag, bufHash);
4789 LWLockRelease(bufPartitionLock);
4790
4791 if (buf_id < 0)
4792 continue;
4793
4794 bufHdr = GetBufferDescriptor(buf_id);
4795
4796 /*
4797 * We need to lock the buffer header and recheck if the buffer is
4798 * still associated with the same block because the buffer could be
4799 * evicted by some other backend loading blocks for a different
4800 * relation after we release lock on the BufMapping table.
4801 */
4802 buf_state = LockBufHdr(bufHdr);
4803
4804 if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator) &&
4805 BufTagGetForkNum(&bufHdr->tag) == forkNum &&
4806 bufHdr->tag.blockNum >= firstDelBlock)
4807 InvalidateBuffer(bufHdr); /* releases spinlock */
4808 else
4809 UnlockBufHdr(bufHdr, buf_state);
4810 }
4811}

References buftag::blockNum, BufMappingPartitionLock(), BufTableHashCode(), BufTableLookup(), BufTagGetForkNum(), BufTagMatchesRelFileLocator(), GetBufferDescriptor(), InitBufferTag(), InvalidateBuffer(), LockBufHdr(), LW_SHARED, LWLockAcquire(), LWLockRelease(), BufferDesc::tag, and UnlockBufHdr().

Referenced by DropRelationBuffers(), and DropRelationsAllBuffers().

◆ FlushBuffer()

static void FlushBuffer ( BufferDesc buf,
SMgrRelation  reln,
IOObject  io_object,
IOContext  io_context 
)
static

Definition at line 4221 of file bufmgr.c.

4223{
4224 XLogRecPtr recptr;
4225 ErrorContextCallback errcallback;
4226 instr_time io_start;
4227 Block bufBlock;
4228 char *bufToWrite;
4229 uint32 buf_state;
4230
4231 /*
4232 * Try to start an I/O operation. If StartBufferIO returns false, then
4233 * someone else flushed the buffer before we could, so we need not do
4234 * anything.
4235 */
4236 if (!StartBufferIO(buf, false, false))
4237 return;
4238
4239 /* Setup error traceback support for ereport() */
4240 errcallback.callback = shared_buffer_write_error_callback;
4241 errcallback.arg = buf;
4242 errcallback.previous = error_context_stack;
4243 error_context_stack = &errcallback;
4244
4245 /* Find smgr relation for buffer */
4246 if (reln == NULL)
4247 reln = smgropen(BufTagGetRelFileLocator(&buf->tag), INVALID_PROC_NUMBER);
4248
4249 TRACE_POSTGRESQL_BUFFER_FLUSH_START(BufTagGetForkNum(&buf->tag),
4250 buf->tag.blockNum,
4251 reln->smgr_rlocator.locator.spcOid,
4252 reln->smgr_rlocator.locator.dbOid,
4253 reln->smgr_rlocator.locator.relNumber);
4254
4255 buf_state = LockBufHdr(buf);
4256
4257 /*
4258 * Run PageGetLSN while holding header lock, since we don't have the
4259 * buffer locked exclusively in all cases.
4260 */
4261 recptr = BufferGetLSN(buf);
4262
4263 /* To check if block content changes while flushing. - vadim 01/17/97 */
4264 buf_state &= ~BM_JUST_DIRTIED;
4265 UnlockBufHdr(buf, buf_state);
4266
4267 /*
4268 * Force XLOG flush up to buffer's LSN. This implements the basic WAL
4269 * rule that log updates must hit disk before any of the data-file changes
4270 * they describe do.
4271 *
4272 * However, this rule does not apply to unlogged relations, which will be
4273 * lost after a crash anyway. Most unlogged relation pages do not bear
4274 * LSNs since we never emit WAL records for them, and therefore flushing
4275 * up through the buffer LSN would be useless, but harmless. However,
4276 * GiST indexes use LSNs internally to track page-splits, and therefore
4277 * unlogged GiST pages bear "fake" LSNs generated by
4278 * GetFakeLSNForUnloggedRel. It is unlikely but possible that the fake
4279 * LSN counter could advance past the WAL insertion point; and if it did
4280 * happen, attempting to flush WAL through that location would fail, with
4281 * disastrous system-wide consequences. To make sure that can't happen,
4282 * skip the flush if the buffer isn't permanent.
4283 */
4284 if (buf_state & BM_PERMANENT)
4285 XLogFlush(recptr);
4286
4287 /*
4288 * Now it's safe to write the buffer to disk. Note that no one else should
4289 * have been able to write it, while we were busy with log flushing,
4290 * because we got the exclusive right to perform I/O by setting the
4291 * BM_IO_IN_PROGRESS bit.
4292 */
4293 bufBlock = BufHdrGetBlock(buf);
4294
4295 /*
4296 * Update page checksum if desired. Since we have only shared lock on the
4297 * buffer, other processes might be updating hint bits in it, so we must
4298 * copy the page to private storage if we do checksumming.
4299 */
4300 bufToWrite = PageSetChecksumCopy((Page) bufBlock, buf->tag.blockNum);
4301
4301
4302 io_start = pgstat_prepare_io_time(track_io_timing);
4303
4304 /*
4305 * bufToWrite is either the shared buffer or a copy, as appropriate.
4306 */
4307 smgrwrite(reln,
4308 BufTagGetForkNum(&buf->tag),
4309 buf->tag.blockNum,
4310 bufToWrite,
4311 false);
4312
4313 /*
4314 * When a strategy is in use, only flushes of dirty buffers already in the
4315 * strategy ring are counted as strategy writes (IOCONTEXT
4316 * [BULKREAD|BULKWRITE|VACUUM] IOOP_WRITE) for the purpose of IO
4317 * statistics tracking.
4318 *
4319 * If a shared buffer initially added to the ring must be flushed before
4320 * being used, this is counted as an IOCONTEXT_NORMAL IOOP_WRITE.
4321 *
4322 * If a shared buffer which was added to the ring later because the
4323 * current strategy buffer is pinned or in use or because all strategy
4324 * buffers were dirty and rejected (for BAS_BULKREAD operations only)
4325 * requires flushing, this is counted as an IOCONTEXT_NORMAL IOOP_WRITE
4326 * (from_ring will be false).
4327 *
4328 * When a strategy is not in use, the write can only be a "regular" write
4329 * of a dirty shared buffer (IOCONTEXT_NORMAL IOOP_WRITE).
4330 */
4331 pgstat_count_io_op_time(IOOBJECT_RELATION, io_object, io_context,
4332 IOOP_WRITE, io_start, 1, BLCKSZ);
4333
4334 pgBufferUsage.shared_blks_written++;
4335
4336 /*
4337 * Mark the buffer as clean (unless BM_JUST_DIRTIED has become set) and
4338 * end the BM_IO_IN_PROGRESS state.
4339 */
4340 TerminateBufferIO(buf, true, 0, true, false);
4341
4342 TRACE_POSTGRESQL_BUFFER_FLUSH_DONE(BufTagGetForkNum(&buf->tag),
4343 buf->tag.blockNum,
4344 reln->smgr_rlocator.locator.spcOid,
4345 reln->smgr_rlocator.locator.dbOid,
4346 reln->smgr_rlocator.locator.relNumber);
4347
4348 /* Pop the error context stack */
4349 error_context_stack = errcallback.previous;
4350}

References ErrorContextCallback::arg, BM_PERMANENT, buf, BufferGetLSN, BufHdrGetBlock, BufTagGetForkNum(), BufTagGetRelFileLocator(), ErrorContextCallback::callback, RelFileLocator::dbOid, error_context_stack, INVALID_PROC_NUMBER, IOOBJECT_RELATION, IOOP_WRITE, RelFileLocatorBackend::locator, LockBufHdr(), PageSetChecksumCopy(), pgBufferUsage, pgstat_count_io_op_time(), pgstat_prepare_io_time(), ErrorContextCallback::previous, RelFileLocator::relNumber, BufferUsage::shared_blks_written, shared_buffer_write_error_callback(), SMgrRelationData::smgr_rlocator, smgropen(), smgrwrite(), RelFileLocator::spcOid, StartBufferIO(), TerminateBufferIO(), track_io_timing, UnlockBufHdr(), and XLogFlush().

Referenced by EvictUnpinnedBufferInternal(), FlushDatabaseBuffers(), FlushOneBuffer(), FlushRelationBuffers(), FlushRelationsAllBuffers(), GetVictimBuffer(), and SyncOneBuffer().
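
The caller is expected to hold a pin and at least a share content lock on the buffer. A hedged sketch of the surrounding protocol, the same pattern that FlushDatabaseBuffers() below follows:

    PinBuffer_Locked(bufHdr);       /* takes the pin, drops the header lock */
    LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
    FlushBuffer(bufHdr, NULL, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
    LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
    UnpinBuffer(bufHdr);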

◆ FlushDatabaseBuffers()

void FlushDatabaseBuffers ( Oid  dbid)

Definition at line 5241 of file bufmgr.c.

5242{
5243 int i;
5244 BufferDesc *bufHdr;
5245
5246 for (i = 0; i < NBuffers; i++)
5247 {
5248 uint32 buf_state;
5249
5250 bufHdr = GetBufferDescriptor(i);
5251
5252 /*
5253 * As in DropRelationBuffers, an unlocked precheck should be safe and
5254 * saves some cycles.
5255 */
5256 if (bufHdr->tag.dbOid != dbid)
5257 continue;
5258
5259 /* Make sure we can handle the pin */
5260 ReservePrivateRefCountEntry();
5261 ResourceOwnerEnlarge(CurrentResourceOwner);
5262
5263 buf_state = LockBufHdr(bufHdr);
5264 if (bufHdr->tag.dbOid == dbid &&
5265 (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
5266 {
5267 PinBuffer_Locked(bufHdr);
5268 LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
5269 FlushBuffer(bufHdr, NULL, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
5270 LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
5271 UnpinBuffer(bufHdr);
5272 }
5273 else
5274 UnlockBufHdr(bufHdr, buf_state);
5275 }
5276}

References BM_DIRTY, BM_VALID, BufferDescriptorGetContentLock(), CurrentResourceOwner, buftag::dbOid, FlushBuffer(), GetBufferDescriptor(), i, IOCONTEXT_NORMAL, IOOBJECT_RELATION, LockBufHdr(), LW_SHARED, LWLockAcquire(), LWLockRelease(), NBuffers, PinBuffer_Locked(), ReservePrivateRefCountEntry(), ResourceOwnerEnlarge(), BufferDesc::tag, UnlockBufHdr(), and UnpinBuffer().

Referenced by dbase_redo().

◆ FlushOneBuffer()

void FlushOneBuffer ( Buffer  buffer)

Definition at line 5283 of file bufmgr.c.

5284{
5285 BufferDesc *bufHdr;
5286
5287 /* currently not needed, but no fundamental reason not to support */
5288 Assert(!BufferIsLocal(buffer));
5289
5290 Assert(BufferIsPinned(buffer));
5291
5292 bufHdr = GetBufferDescriptor(buffer - 1);
5293
5294 Assert(LWLockHeldByMe(BufferDescriptorGetContentLock(bufHdr)));
5295
5296 FlushBuffer(bufHdr, NULL, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
5297}

References Assert(), PrivateRefCountEntry::buffer, BufferDescriptorGetContentLock(), BufferIsLocal, BufferIsPinned, FlushBuffer(), GetBufferDescriptor(), IOCONTEXT_NORMAL, IOOBJECT_RELATION, and LWLockHeldByMe().

Referenced by hash_xlog_init_bitmap_page(), hash_xlog_init_meta_page(), invalidate_rel_block(), and XLogReadBufferForRedoExtended().

◆ FlushRelationBuffers()

void FlushRelationBuffers ( Relation  rel)

Definition at line 4873 of file bufmgr.c.

4874{
4875 int i;
4876 BufferDesc *bufHdr;
4877 SMgrRelation srel = RelationGetSmgr(rel);
4878
4879 if (RelationUsesLocalBuffers(rel))
4880 {
4881 for (i = 0; i < NLocBuffer; i++)
4882 {
4883 uint32 buf_state;
4884
4885 bufHdr = GetLocalBufferDescriptor(i);
4886 if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rel->rd_locator) &&
4887 ((buf_state = pg_atomic_read_u32(&bufHdr->state)) &
4888 (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
4889 {
4890 ErrorContextCallback errcallback;
4891
4892 /* Setup error traceback support for ereport() */
4893 errcallback.callback = local_buffer_write_error_callback;
4894 errcallback.arg = bufHdr;
4895 errcallback.previous = error_context_stack;
4896 error_context_stack = &errcallback;
4897
4898 /* Make sure we can handle the pin */
4899 ReservePrivateRefCountEntry();
4900 ResourceOwnerEnlarge(CurrentResourceOwner);
4901
4902 /*
4903 * Pin/unpin mostly to make valgrind work, but it also seems
4904 * like the right thing to do.
4905 */
4906 PinLocalBuffer(bufHdr, false);
4907
4908
4909 FlushLocalBuffer(bufHdr, srel);
4910
4911 UnpinLocalBuffer(BufferDescriptorGetBuffer(bufHdr));
4912
4913 /* Pop the error context stack */
4914 error_context_stack = errcallback.previous;
4915 }
4916 }
4917
4918 return;
4919 }
4920
4921 for (i = 0; i < NBuffers; i++)
4922 {
4923 uint32 buf_state;
4924
4925 bufHdr = GetBufferDescriptor(i);
4926
4927 /*
4928 * As in DropRelationBuffers, an unlocked precheck should be safe and
4929 * saves some cycles.
4930 */
4931 if (!BufTagMatchesRelFileLocator(&bufHdr->tag, &rel->rd_locator))
4932 continue;
4933
4934 /* Make sure we can handle the pin */
4935 ReservePrivateRefCountEntry();
4936 ResourceOwnerEnlarge(CurrentResourceOwner);
4937
4938 buf_state = LockBufHdr(bufHdr);
4939 if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rel->rd_locator) &&
4940 (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
4941 {
4942 PinBuffer_Locked(bufHdr);
4943 LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
4944 FlushBuffer(bufHdr, srel, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
4945 LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
4946 UnpinBuffer(bufHdr);
4947 }
4948 else
4949 UnlockBufHdr(bufHdr, buf_state);
4950 }
4951}

References ErrorContextCallback::arg, BM_DIRTY, BM_VALID, BufferDescriptorGetBuffer(), BufferDescriptorGetContentLock(), BufTagMatchesRelFileLocator(), ErrorContextCallback::callback, CurrentResourceOwner, error_context_stack, FlushBuffer(), FlushLocalBuffer(), GetBufferDescriptor(), GetLocalBufferDescriptor(), i, IOCONTEXT_NORMAL, IOOBJECT_RELATION, local_buffer_write_error_callback(), LockBufHdr(), LW_SHARED, LWLockAcquire(), LWLockRelease(), NBuffers, NLocBuffer, pg_atomic_read_u32(), PinBuffer_Locked(), PinLocalBuffer(), ErrorContextCallback::previous, RelationData::rd_locator, RelationGetSmgr(), RelationUsesLocalBuffers, ReservePrivateRefCountEntry(), ResourceOwnerEnlarge(), BufferDesc::state, BufferDesc::tag, UnlockBufHdr(), UnpinBuffer(), and UnpinLocalBuffer().

Referenced by fill_seq_with_data(), heapam_relation_copy_data(), and index_copy_data().

◆ FlushRelationsAllBuffers()

void FlushRelationsAllBuffers ( SMgrRelation smgrs,
int  nrels 
)

Definition at line 4963 of file bufmgr.c.

4964{
4965 int i;
4966 SMgrSortArray *srels;
4967 bool use_bsearch;
4968
4969 if (nrels == 0)
4970 return;
4971
4972 /* fill-in array for qsort */
4973 srels = palloc(sizeof(SMgrSortArray) * nrels);
4974
4975 for (i = 0; i < nrels; i++)
4976 {
4977 Assert(!RelFileLocatorBackendIsTemp(smgrs[i]->smgr_rlocator));
4978
4979 srels[i].rlocator = smgrs[i]->smgr_rlocator.locator;
4980 srels[i].srel = smgrs[i];
4981 }
4982
4983 /*
4984 * Save the bsearch overhead for low number of relations to sync. See
4985 * DropRelationsAllBuffers for details.
4986 */
4987 use_bsearch = nrels > RELS_BSEARCH_THRESHOLD;
4988
4989 /* sort the list of SMgrRelations if necessary */
4990 if (use_bsearch)
4991 qsort(srels, nrels, sizeof(SMgrSortArray), rlocator_comparator);
4992
4993 for (i = 0; i < NBuffers; i++)
4994 {
4995 SMgrSortArray *srelent = NULL;
4996 BufferDesc *bufHdr = GetBufferDescriptor(i);
4997 uint32 buf_state;
4998
4999 /*
5000 * As in DropRelationBuffers, an unlocked precheck should be safe and
5001 * saves some cycles.
5002 */
5003
5004 if (!use_bsearch)
5005 {
5006 int j;
5007
5008 for (j = 0; j < nrels; j++)
5009 {
5010 if (BufTagMatchesRelFileLocator(&bufHdr->tag, &srels[j].rlocator))
5011 {
5012 srelent = &srels[j];
5013 break;
5014 }
5015 }
5016 }
5017 else
5018 {
5019 RelFileLocator rlocator;
5020
5021 rlocator = BufTagGetRelFileLocator(&bufHdr->tag);
5022 srelent = bsearch(&rlocator,
5023 srels, nrels, sizeof(SMgrSortArray),
5024 rlocator_comparator);
5025 }
5026
5027 /* buffer doesn't belong to any of the given relfilelocators; skip it */
5028 if (srelent == NULL)
5029 continue;
5030
5031 /* Make sure we can handle the pin */
5032 ReservePrivateRefCountEntry();
5033 ResourceOwnerEnlarge(CurrentResourceOwner);
5034
5035 buf_state = LockBufHdr(bufHdr);
5036 if (BufTagMatchesRelFileLocator(&bufHdr->tag, &srelent->rlocator) &&
5037 (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
5038 {
5039 PinBuffer_Locked(bufHdr);
5040 LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
5041 FlushBuffer(bufHdr, srelent->srel, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
5042 LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
5043 UnpinBuffer(bufHdr);
5044 }
5045 else
5046 UnlockBufHdr(bufHdr, buf_state);
5047 }
5048
5049 pfree(srels);
5050}

References Assert(), BM_DIRTY, BM_VALID, BufferDescriptorGetContentLock(), BufTagGetRelFileLocator(), BufTagMatchesRelFileLocator(), CurrentResourceOwner, FlushBuffer(), GetBufferDescriptor(), i, IOCONTEXT_NORMAL, IOOBJECT_RELATION, j, RelFileLocatorBackend::locator, LockBufHdr(), LW_SHARED, LWLockAcquire(), LWLockRelease(), NBuffers, palloc(), pfree(), PinBuffer_Locked(), qsort, RelFileLocatorBackendIsTemp, RELS_BSEARCH_THRESHOLD, ReservePrivateRefCountEntry(), ResourceOwnerEnlarge(), SMgrSortArray::rlocator, rlocator_comparator(), SMgrRelationData::smgr_rlocator, SMgrSortArray::srel, BufferDesc::tag, UnlockBufHdr(), and UnpinBuffer().

Referenced by smgrdosyncall().

◆ ForgetPrivateRefCountEntry()

static void ForgetPrivateRefCountEntry ( PrivateRefCountEntry ref)
static

Definition at line 445 of file bufmgr.c.

446{
447 Assert(ref->refcount == 0);
448
449 if (ref >= &PrivateRefCountArray[0] &&
450 ref < &PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES])
451 {
452 ref->buffer = InvalidBuffer;
453
454 /*
455 * Mark the just used entry as reserved - in many scenarios that
456 * allows us to avoid ever having to search the array/hash for free
457 * entries.
458 */
459 ReservedRefCountEntry = ref;
460 }
461 else
462 {
463 bool found;
464 Buffer buffer = ref->buffer;
465
466 hash_search(PrivateRefCountHash, &buffer, HASH_REMOVE, &found);
467 Assert(found);
468 Assert(PrivateRefCountOverflowed > 0);
469 PrivateRefCountOverflowed--;
470 }
471}

References Assert(), PrivateRefCountEntry::buffer, HASH_REMOVE, hash_search(), InvalidBuffer, PrivateRefCountArray, PrivateRefCountHash, PrivateRefCountOverflowed, PrivateRefCountEntry::refcount, REFCOUNT_ARRAY_ENTRIES, and ReservedRefCountEntry.

Referenced by UnpinBufferNoOwner().

◆ GetAdditionalPinLimit()

uint32 GetAdditionalPinLimit ( void  )

Definition at line 2515 of file bufmgr.c.

2516{
2517 uint32 estimated_pins_held;
2518
2519 /*
2520 * We get the number of "overflowed" pins for free, but don't know the
2521 * number of pins in PrivateRefCountArray. The cost of calculating that
2522 * exactly doesn't seem worth it, so just assume the max.
2523 */
2524 estimated_pins_held = PrivateRefCountOverflowed + REFCOUNT_ARRAY_ENTRIES;
2525
2526 /* Is this backend already holding more than its fair share? */
2527 if (estimated_pins_held > MaxProportionalPins)
2528 return 0;
2529
2530 return MaxProportionalPins - estimated_pins_held;
2531}

References MaxProportionalPins, PrivateRefCountOverflowed, and REFCOUNT_ARRAY_ENTRIES.

Referenced by LimitAdditionalPins(), and read_stream_start_pending_read().
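
A minimal sketch of the clamping a batched reader might perform with this value; wanted is hypothetical. Because the estimate can legitimately reach zero, a caller that must make progress still grants itself one pin:

    uint32  limit = GetAdditionalPinLimit();
    uint32  batch = Min(wanted, Max(limit, 1));     /* never below one pin */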

◆ GetPinLimit()

uint32 GetPinLimit ( void  )

Definition at line 2503 of file bufmgr.c.

2504{
2505 return MaxProportionalPins;
2506}

References MaxProportionalPins.

Referenced by GetAccessStrategy(), and read_stream_begin_impl().

◆ GetPrivateRefCount()

static int32 GetPrivateRefCount ( Buffer  buffer)
inlinestatic

Definition at line 422 of file bufmgr.c.

423{
424 PrivateRefCountEntry *ref;
425
426 Assert(BufferIsValid(buffer));
427 Assert(!BufferIsLocal(buffer));
428
429 /*
430 * Not moving the entry - that's ok for the current users, but we might
431 * want to change this one day.
432 */
433 ref = GetPrivateRefCountEntry(buffer, false);
434
435 if (ref == NULL)
436 return 0;
437 return ref->refcount;
438}

References Assert(), PrivateRefCountEntry::buffer, BufferIsLocal, BufferIsValid(), GetPrivateRefCountEntry(), and PrivateRefCountEntry::refcount.

Referenced by CheckBufferIsPinnedOnce(), ConditionalLockBufferForCleanup(), DebugPrintBufferRefcount(), HoldingBufferPinThatDelaysRecovery(), InvalidateBuffer(), InvalidateVictimBuffer(), IsBufferCleanupOK(), MarkBufferDirtyHint(), and ReadRecentBuffer().

◆ GetPrivateRefCountEntry()

static PrivateRefCountEntry * GetPrivateRefCountEntry ( Buffer  buffer,
bool  do_move 
)
static

Definition at line 348 of file bufmgr.c.

349{
350 PrivateRefCountEntry *res;
351 int i;
352
353 Assert(BufferIsValid(buffer));
354 Assert(!BufferIsLocal(buffer));
355
356 /*
357 * First search for references in the array, that'll be sufficient in the
358 * majority of cases.
359 */
360 for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
361 {
362 res = &PrivateRefCountArray[i];
363
364 if (res->buffer == buffer)
365 return res;
366 }
367
368 /*
369 * By here we know that the buffer, if already pinned, isn't residing in
370 * the array.
371 *
372 * Only look up the buffer in the hashtable if we've previously overflowed
373 * into it.
374 */
376 return NULL;
377
378 res = hash_search(PrivateRefCountHash, &buffer, HASH_FIND, NULL);
379
380 if (res == NULL)
381 return NULL;
382 else if (!do_move)
383 {
384 /* caller doesn't want us to move the hash entry into the array */
385 return res;
386 }
387 else
388 {
389 /* move buffer from hashtable into the free array slot */
390 bool found;
391 PrivateRefCountEntry *free;
392
393 /* Ensure there's a free array slot */
394 ReservePrivateRefCountEntry();
395
396 /* Use up the reserved slot */
397 Assert(ReservedRefCountEntry != NULL);
398 free = ReservedRefCountEntry;
399 ReservedRefCountEntry = NULL;
400 Assert(free->buffer == InvalidBuffer);
401
402 /* and fill it */
403 free->buffer = buffer;
404 free->refcount = res->refcount;
405
406 /* delete from hashtable */
407 hash_search(PrivateRefCountHash, &buffer, HASH_REMOVE, &found);
408 Assert(found);
409 Assert(PrivateRefCountOverflowed > 0);
410 PrivateRefCountOverflowed--;
411
412 return free;
413 }
414}

References Assert(), PrivateRefCountEntry::buffer, BufferIsLocal, BufferIsValid(), free, HASH_FIND, HASH_REMOVE, hash_search(), i, InvalidBuffer, PrivateRefCountArray, PrivateRefCountHash, PrivateRefCountOverflowed, PrivateRefCountEntry::refcount, REFCOUNT_ARRAY_ENTRIES, ReservedRefCountEntry, and ReservePrivateRefCountEntry().

Referenced by GetPrivateRefCount(), IncrBufferRefCount(), PinBuffer(), PinBuffer_Locked(), and UnpinBufferNoOwner().

◆ GetVictimBuffer()

static Buffer GetVictimBuffer ( BufferAccessStrategy  strategy,
IOContext  io_context 
)
static

Definition at line 2343 of file bufmgr.c.

2344{
2345 BufferDesc *buf_hdr;
2346 Buffer buf;
2347 uint32 buf_state;
2348 bool from_ring;
2349
2350 /*
2351 * Ensure, while the spinlock's not yet held, that there's a free refcount
2352 * entry, and a resource owner slot for the pin.
2353 */
2354 ReservePrivateRefCountEntry();
2355 ResourceOwnerEnlarge(CurrentResourceOwner);
2356
2357 /* we return here if a prospective victim buffer gets used concurrently */
2358again:
2359
2360 /*
2361 * Select a victim buffer. The buffer is returned with its header
2362 * spinlock still held!
2363 */
2364 buf_hdr = StrategyGetBuffer(strategy, &buf_state, &from_ring);
2365 buf = BufferDescriptorGetBuffer(buf_hdr);
2366
2367 Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 0);
2368
2369 /* Pin the buffer and then release the buffer spinlock */
2370 PinBuffer_Locked(buf_hdr);
2371
2372 /*
2373 * We shouldn't have any other pins for this buffer.
2374 */
2375 CheckBufferIsPinnedOnce(buf);
2376
2377 /*
2378 * If the buffer was dirty, try to write it out. There is a race
2379 * condition here, in that someone might dirty it after we released the
2380 * buffer header lock above, or even while we are writing it out (since
2381 * our share-lock won't prevent hint-bit updates). We will recheck the
2382 * dirty bit after re-locking the buffer header.
2383 */
2384 if (buf_state & BM_DIRTY)
2385 {
2386 LWLock *content_lock;
2387
2388 Assert(buf_state & BM_TAG_VALID);
2389 Assert(buf_state & BM_VALID);
2390
2391 /*
2392 * We need a share-lock on the buffer contents to write it out (else
2393 * we might write invalid data, eg because someone else is compacting
2394 * the page contents while we write). We must use a conditional lock
2395 * acquisition here to avoid deadlock. Even though the buffer was not
2396 * pinned (and therefore surely not locked) when StrategyGetBuffer
2397 * returned it, someone else could have pinned and exclusive-locked it
2398 * by the time we get here. If we try to get the lock unconditionally,
2399 * we'd block waiting for them; if they later block waiting for us,
2400 * deadlock ensues. (This has been observed to happen when two
2401 * backends are both trying to split btree index pages, and the second
2402 * one just happens to be trying to split the page the first one got
2403 * from StrategyGetBuffer.)
2404 */
2405 content_lock = BufferDescriptorGetContentLock(buf_hdr);
2406 if (!LWLockConditionalAcquire(content_lock, LW_SHARED))
2407 {
2408 /*
2409 * Someone else has locked the buffer, so give it up and loop back
2410 * to get another one.
2411 */
2412 UnpinBuffer(buf_hdr);
2413 goto again;
2414 }
2415
2416 /*
2417 * If using a nondefault strategy, and writing the buffer would
2418 * require a WAL flush, let the strategy decide whether to go ahead
2419 * and write/reuse the buffer or to choose another victim. We need a
2420 * lock to inspect the page LSN, so this can't be done inside
2421 * StrategyGetBuffer.
2422 */
2423 if (strategy != NULL)
2424 {
2425 XLogRecPtr lsn;
2426
2427 /* Read the LSN while holding buffer header lock */
2428 buf_state = LockBufHdr(buf_hdr);
2429 lsn = BufferGetLSN(buf_hdr);
2430 UnlockBufHdr(buf_hdr, buf_state);
2431
2432 if (XLogNeedsFlush(lsn)
2433 && StrategyRejectBuffer(strategy, buf_hdr, from_ring))
2434 {
2435 LWLockRelease(content_lock);
2436 UnpinBuffer(buf_hdr);
2437 goto again;
2438 }
2439 }
2440
2441 /* OK, do the I/O */
2442 FlushBuffer(buf_hdr, NULL, IOOBJECT_RELATION, io_context);
2443 LWLockRelease(content_lock);
2444
2445 ScheduleBufferTagForWriteback(&BackendWritebackContext, io_context,
2446 &buf_hdr->tag);
2447 }
2448
2449
2450 if (buf_state & BM_VALID)
2451 {
2452 /*
2453 * When a BufferAccessStrategy is in use, blocks evicted from shared
2454 * buffers are counted as IOOP_EVICT in the corresponding context
2455 * (e.g. IOCONTEXT_BULKWRITE). Shared buffers are evicted by a
2456 * strategy in two cases: 1) while initially claiming buffers for the
2457 * strategy ring 2) to replace an existing strategy ring buffer
2458 * because it is pinned or in use and cannot be reused.
2459 *
2460 * Blocks evicted from buffers already in the strategy ring are
2461 * counted as IOOP_REUSE in the corresponding strategy context.
2462 *
2463 * At this point, we can accurately count evictions and reuses,
2464 * because we have successfully claimed the valid buffer. Previously,
2465 * we may have been forced to release the buffer due to concurrent
2466 * pinners or erroring out.
2467 */
2468 pgstat_count_io_op(IOOBJECT_RELATION, io_context,
2469 from_ring ? IOOP_REUSE : IOOP_EVICT, 1, 0);
2470 }
2471
2472 /*
2473 * If the buffer has an entry in the buffer mapping table, delete it. This
2474 * can fail because another backend could have pinned or dirtied the
2475 * buffer.
2476 */
2477 if ((buf_state & BM_TAG_VALID) && !InvalidateVictimBuffer(buf_hdr))
2478 {
2479 UnpinBuffer(buf_hdr);
2480 goto again;
2481 }
2482
2483 /* a final set of sanity checks */
2484#ifdef USE_ASSERT_CHECKING
2485 buf_state = pg_atomic_read_u32(&buf_hdr->state);
2486
2487 Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 1);
2488 Assert(!(buf_state & (BM_TAG_VALID | BM_VALID | BM_DIRTY)));
2489
2490 CheckBufferIsPinnedOnce(buf);
2491#endif
2492
2493 return buf;
2494}

References Assert(), BackendWritebackContext, BM_DIRTY, BM_TAG_VALID, BM_VALID, buf, BUF_STATE_GET_REFCOUNT, BufferDescriptorGetBuffer(), BufferDescriptorGetContentLock(), BufferGetLSN, CheckBufferIsPinnedOnce(), CurrentResourceOwner, FlushBuffer(), InvalidateVictimBuffer(), IOOBJECT_RELATION, IOOP_EVICT, IOOP_REUSE, LockBufHdr(), LW_SHARED, LWLockConditionalAcquire(), LWLockRelease(), pg_atomic_read_u32(), pgstat_count_io_op(), PinBuffer_Locked(), ReservePrivateRefCountEntry(), ResourceOwnerEnlarge(), ScheduleBufferTagForWriteback(), BufferDesc::state, StrategyGetBuffer(), StrategyRejectBuffer(), BufferDesc::tag, UnlockBufHdr(), UnpinBuffer(), and XLogNeedsFlush().

Referenced by BufferAlloc(), and ExtendBufferedRelShared().

◆ HoldingBufferPinThatDelaysRecovery()

bool HoldingBufferPinThatDelaysRecovery ( void  )

Definition at line 5759 of file bufmgr.c.

5760{
5761 int bufid = GetStartupBufferPinWaitBufId();
5762
5763 /*
5764 * If we get woken slowly then it's possible that the Startup process was
5765 * already woken by other backends before we got here. It's also possible
5766 * that we get here via multiple interrupts or interrupts at inappropriate
5767 * times, so make sure we do nothing if the bufid is not set.
5768 */
5769 if (bufid < 0)
5770 return false;
5771
5772 if (GetPrivateRefCount(bufid + 1) > 0)
5773 return true;
5774
5775 return false;
5776}
int GetStartupBufferPinWaitBufId(void)
Definition: proc.c:767

References GetPrivateRefCount(), and GetStartupBufferPinWaitBufId().

Referenced by CheckRecoveryConflictDeadlock(), and ProcessRecoveryConflictInterrupt().
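A minimal caller-side sketch (illustrative only; the helper name and the exact error report are hypothetical, loosely following the recovery-conflict interrupt path):

#include "postgres.h"
#include "storage/bufmgr.h"

/* Hypothetical check used from a recovery-conflict interrupt path:
 * only cancel the current statement if it is this backend's own pin
 * that is blocking the startup process. */
static void
cancel_if_blocking_recovery(void)
{
	if (HoldingBufferPinThatDelaysRecovery())
		ereport(ERROR,
				(errcode(ERRCODE_T_R_DEADLOCK_DETECTED),
				 errmsg("canceling statement due to conflict with recovery")));
}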

◆ IncrBufferRefCount()

void IncrBufferRefCount ( Buffer  buffer)

◆ InitBufferManagerAccess()

void InitBufferManagerAccess ( void  )

Definition at line 4006 of file bufmgr.c.

4007{
4008 HASHCTL hash_ctl;
4009
4010 /*
4011 * An advisory limit on the number of pins each backend should hold, based
4012 * on shared_buffers and the maximum number of connections possible.
4013 * That's very pessimistic, but outside toy-sized shared_buffers it should
4014 * allow plenty of pins. LimitAdditionalPins() and
4015 * GetAdditionalPinLimit() can be used to check the remaining balance.
4016 */
4017 MaxProportionalPins = NBuffers / (MaxBackends + NUM_AUXILIARY_PROCS);
4018
4019 memset(&PrivateRefCountArray, 0, sizeof(PrivateRefCountArray));
4020
4021 hash_ctl.keysize = sizeof(int32);
4022 hash_ctl.entrysize = sizeof(PrivateRefCountEntry);
4023
4024 PrivateRefCountHash = hash_create("PrivateRefCount", 100, &hash_ctl,
4025 HASH_ELEM | HASH_BLOBS);
4026
4027 /*
4028 * AtProcExit_Buffers needs LWLock access, and thereby has to be called at
4029 * the corresponding phase of backend shutdown.
4030 */
4031 Assert(MyProc != NULL);
4032 on_shmem_exit(AtProcExit_Buffers, 0);
4033}
static void AtProcExit_Buffers(int code, Datum arg)
Definition: bufmgr.c:4040
struct PrivateRefCountEntry PrivateRefCountEntry
HTAB * hash_create(const char *tabname, long nelem, const HASHCTL *info, int flags)
Definition: dynahash.c:352
int MaxBackends
Definition: globals.c:147
#define HASH_ELEM
Definition: hsearch.h:95
#define HASH_BLOBS
Definition: hsearch.h:97
void on_shmem_exit(pg_on_exit_callback function, Datum arg)
Definition: ipc.c:365
#define NUM_AUXILIARY_PROCS
Definition: proc.h:447
PGPROC * MyProc
Definition: proc.c:67
Size keysize
Definition: hsearch.h:75
Size entrysize
Definition: hsearch.h:76

References Assert(), AtProcExit_Buffers(), HASHCTL::entrysize, HASH_BLOBS, hash_create(), HASH_ELEM, HASHCTL::keysize, MaxBackends, MaxProportionalPins, MyProc, NBuffers, NUM_AUXILIARY_PROCS, on_shmem_exit(), PrivateRefCountArray, and PrivateRefCountHash.

Referenced by BaseInit().

◆ InvalidateBuffer()

static void InvalidateBuffer ( BufferDesc buf)
static

Definition at line 2176 of file bufmgr.c.

2177{
2178 BufferTag oldTag;
2179 uint32 oldHash; /* hash value for oldTag */
2180 LWLock *oldPartitionLock; /* buffer partition lock for it */
2181 uint32 oldFlags;
2182 uint32 buf_state;
2183
2184 /* Save the original buffer tag before dropping the spinlock */
2185 oldTag = buf->tag;
2186
2187 buf_state = pg_atomic_read_u32(&buf->state);
2188 Assert(buf_state & BM_LOCKED);
2189 UnlockBufHdr(buf, buf_state);
2190
2191 /*
2192 * Need to compute the old tag's hashcode and partition lock ID. XXX is it
2193 * worth storing the hashcode in BufferDesc so we need not recompute it
2194 * here? Probably not.
2195 */
2196 oldHash = BufTableHashCode(&oldTag);
2197 oldPartitionLock = BufMappingPartitionLock(oldHash);
2198
2199retry:
2200
2201 /*
2202 * Acquire exclusive mapping lock in preparation for changing the buffer's
2203 * association.
2204 */
2205 LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
2206
2207 /* Re-lock the buffer header */
2208 buf_state = LockBufHdr(buf);
2209
2210 /* If it's changed while we were waiting for lock, do nothing */
2211 if (!BufferTagsEqual(&buf->tag, &oldTag))
2212 {
2213 UnlockBufHdr(buf, buf_state);
2214 LWLockRelease(oldPartitionLock);
2215 return;
2216 }
2217
2218 /*
2219 * We assume the reason for it to be pinned is that either we were
2220 * asynchronously reading the page in before erroring out or someone else
2221 * is flushing the page out. Wait for the IO to finish. (This could be
2222 * an infinite loop if the refcount is messed up... it would be nice to
2223 * time out after awhile, but there seems no way to be sure how many loops
2224 * may be needed. Note that if the other guy has pinned the buffer but
2225 * not yet done StartBufferIO, WaitIO will fall through and we'll
2226 * effectively be busy-looping here.)
2227 */
2228 if (BUF_STATE_GET_REFCOUNT(buf_state) != 0)
2229 {
2230 UnlockBufHdr(buf, buf_state);
2231 LWLockRelease(oldPartitionLock);
2232 /* safety check: should definitely not be our *own* pin */
2233 if (GetPrivateRefCount(BufferDescriptorGetBuffer(buf)) > 0)
2234 elog(ERROR, "buffer is pinned in InvalidateBuffer");
2235 WaitIO(buf);
2236 goto retry;
2237 }
2238
2239 /*
2240 * Clear out the buffer's tag and flags. We must do this to ensure that
2241 * linear scans of the buffer array don't think the buffer is valid.
2242 */
2243 oldFlags = buf_state & BUF_FLAG_MASK;
2244 ClearBufferTag(&buf->tag);
2245 buf_state &= ~(BUF_FLAG_MASK | BUF_USAGECOUNT_MASK);
2246 UnlockBufHdr(buf, buf_state);
2247
2248 /*
2249 * Remove the buffer from the lookup hashtable, if it was in there.
2250 */
2251 if (oldFlags & BM_TAG_VALID)
2252 BufTableDelete(&oldTag, oldHash);
2253
2254 /*
2255 * Done with mapping lock.
2256 */
2257 LWLockRelease(oldPartitionLock);
2258
2259 /*
2260 * Insert the buffer at the head of the list of free buffers.
2261 */
2262 StrategyFreeBuffer(buf);
2263}
#define BUF_USAGECOUNT_MASK
Definition: buf_internals.h:53
static bool BufferTagsEqual(const BufferTag *tag1, const BufferTag *tag2)
static void ClearBufferTag(BufferTag *tag)
void BufTableDelete(BufferTag *tagPtr, uint32 hashcode)
Definition: buf_table.c:148
static void WaitIO(BufferDesc *buf)
Definition: bufmgr.c:5896

References Assert(), BM_LOCKED, BM_TAG_VALID, buf, BUF_FLAG_MASK, BUF_STATE_GET_REFCOUNT, BUF_USAGECOUNT_MASK, BufferDescriptorGetBuffer(), BufferTagsEqual(), BufMappingPartitionLock(), BufTableDelete(), BufTableHashCode(), ClearBufferTag(), elog, ERROR, GetPrivateRefCount(), LockBufHdr(), LW_EXCLUSIVE, LWLockAcquire(), LWLockRelease(), pg_atomic_read_u32(), StrategyFreeBuffer(), UnlockBufHdr(), and WaitIO().

Referenced by DropDatabaseBuffers(), DropRelationBuffers(), DropRelationsAllBuffers(), and FindAndDropRelationBuffers().

◆ InvalidateVictimBuffer()

static bool InvalidateVictimBuffer ( BufferDesc buf_hdr)
static

Definition at line 2275 of file bufmgr.c.

2276{
2277 uint32 buf_state;
2278 uint32 hash;
2279 LWLock *partition_lock;
2280 BufferTag tag;
2281
2282 Assert(BUF_STATE_GET_REFCOUNT(pg_atomic_read_u32(&buf_hdr->state)) > 0);
2283
2284 /* have buffer pinned, so it's safe to read tag without lock */
2285 tag = buf_hdr->tag;
2286
2287 hash = BufTableHashCode(&tag);
2288 partition_lock = BufMappingPartitionLock(hash);
2289
2290 LWLockAcquire(partition_lock, LW_EXCLUSIVE);
2291
2292 /* lock the buffer header */
2293 buf_state = LockBufHdr(buf_hdr);
2294
2295 /*
2296 * We have the buffer pinned, so nobody else should have been able to
2297 * unset this concurrently.
2298 */
2299 Assert(buf_state & BM_TAG_VALID);
2300 Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
2301 Assert(BufferTagsEqual(&buf_hdr->tag, &tag));
2302
2303 /*
2304 * If somebody else pinned the buffer since, or even worse, dirtied it,
2305 * give up on this buffer: It's clearly in use.
2306 */
2307 if (BUF_STATE_GET_REFCOUNT(buf_state) != 1 || (buf_state & BM_DIRTY))
2308 {
2309 Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
2310
2311 UnlockBufHdr(buf_hdr, buf_state);
2312 LWLockRelease(partition_lock);
2313
2314 return false;
2315 }
2316
2317 /*
2318 * Clear out the buffer's tag and flags and usagecount. This is not
2319 * strictly required, as BM_TAG_VALID/BM_VALID needs to be checked before
2320 * doing anything with the buffer. But currently it's beneficial, as the
2321 * cheaper pre-check for several linear scans of shared buffers uses the
2322 * tag (see e.g. FlushDatabaseBuffers()).
2323 */
2324 ClearBufferTag(&buf_hdr->tag);
2325 buf_state &= ~(BUF_FLAG_MASK | BUF_USAGECOUNT_MASK);
2326 UnlockBufHdr(buf_hdr, buf_state);
2327
2328 Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
2329
2330 /* finally delete buffer from the buffer mapping table */
2331 BufTableDelete(&tag, hash);
2332
2333 LWLockRelease(partition_lock);
2334
2335 Assert(!(buf_state & (BM_DIRTY | BM_VALID | BM_TAG_VALID)));
2336 Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
2337 Assert(BUF_STATE_GET_REFCOUNT(pg_atomic_read_u32(&buf_hdr->state)) > 0);
2338
2339 return true;
2340}

References Assert(), BM_DIRTY, BM_TAG_VALID, BM_VALID, BUF_FLAG_MASK, BUF_STATE_GET_REFCOUNT, BUF_USAGECOUNT_MASK, BufferDescriptorGetBuffer(), BufferTagsEqual(), BufMappingPartitionLock(), BufTableDelete(), BufTableHashCode(), ClearBufferTag(), GetPrivateRefCount(), hash(), LockBufHdr(), LW_EXCLUSIVE, LWLockAcquire(), LWLockRelease(), pg_atomic_read_u32(), BufferDesc::state, BufferDesc::tag, and UnlockBufHdr().

Referenced by EvictUnpinnedBufferInternal(), and GetVictimBuffer().

◆ IsBufferCleanupOK()

bool IsBufferCleanupOK ( Buffer  buffer)

Definition at line 5843 of file bufmgr.c.

5844{
5845 BufferDesc *bufHdr;
5846 uint32 buf_state;
5847
5848 Assert(BufferIsValid(buffer));
5849
5850 /* see AIO related comment in LockBufferForCleanup() */
5851
5852 if (BufferIsLocal(buffer))
5853 {
5854 /* There should be exactly one pin */
5855 if (LocalRefCount[-buffer - 1] != 1)
5856 return false;
5857 /* Nobody else to wait for */
5858 return true;
5859 }
5860
5861 /* There should be exactly one local pin */
5862 if (GetPrivateRefCount(buffer) != 1)
5863 return false;
5864
5865 bufHdr = GetBufferDescriptor(buffer - 1);
5866
5867 /* caller must hold exclusive lock on buffer */
5868 Assert(LWLockHeldByMeInMode(BufferDescriptorGetContentLock(bufHdr),
5869 LW_EXCLUSIVE));
5870
5871 buf_state = LockBufHdr(bufHdr);
5872
5873 Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
5874 if (BUF_STATE_GET_REFCOUNT(buf_state) == 1)
5875 {
5876 /* pincount is OK. */
5877 UnlockBufHdr(bufHdr, buf_state);
5878 return true;
5879 }
5880
5881 UnlockBufHdr(bufHdr, buf_state);
5882 return false;
5883}

References Assert(), BUF_STATE_GET_REFCOUNT, PrivateRefCountEntry::buffer, BufferDescriptorGetContentLock(), BufferIsLocal, BufferIsValid(), GetBufferDescriptor(), GetPrivateRefCount(), LocalRefCount, LockBufHdr(), LW_EXCLUSIVE, LWLockHeldByMeInMode(), and UnlockBufHdr().

Referenced by _hash_doinsert(), _hash_expandtable(), _hash_splitbucket(), and hashbucketcleanup().
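A sketch of the opportunistic-cleanup pattern visible in the hash index callers above (the helper name is hypothetical; buf is assumed to be pinned already):

#include "postgres.h"
#include "storage/bufmgr.h"

/* Try a cleanup-strength page operation only when this backend holds
 * the sole pin; returns whether the cleanup actually ran. */
static bool
try_page_cleanup(Buffer buf)
{
	bool		ok;

	LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
	ok = IsBufferCleanupOK(buf);
	if (ok)
	{
		/* work that requires pincount == 1 would go here */
	}
	LockBuffer(buf, BUFFER_LOCK_UNLOCK);
	return ok;
}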

◆ IssuePendingWritebacks()

void IssuePendingWritebacks ( WritebackContext wb_context,
IOContext  io_context 
)

Definition at line 6393 of file bufmgr.c.

6394{
6395 instr_time io_start;
6396 int i;
6397
6398 if (wb_context->nr_pending == 0)
6399 return;
6400
6401 /*
6402 * Executing the writes in-order can make them a lot faster, and allows us
6403 * to merge writeback requests to consecutive blocks into larger writebacks.
6404 */
6405 sort_pending_writebacks(wb_context->pending_writebacks,
6406 wb_context->nr_pending);
6407
6408 io_start = pgstat_prepare_io_time(track_io_timing);
6409
6410 /*
6411 * Coalesce neighbouring writes, but nothing else. For that we iterate
6412 * through the now-sorted array of pending flushes, and look forward to
6413 * find all neighbouring (or identical) writes.
6414 */
6415 for (i = 0; i < wb_context->nr_pending; i++)
6416 {
6417 PendingWriteback *cur;
6418 PendingWriteback *next;
6419 SMgrRelation reln;
6420 int ahead;
6421 BufferTag tag;
6422 RelFileLocator currlocator;
6423 Size nblocks = 1;
6424
6425 cur = &wb_context->pending_writebacks[i];
6426 tag = cur->tag;
6427 currlocator = BufTagGetRelFileLocator(&tag);
6428
6429 /*
6430 * Peek ahead, into following writeback requests, to see if they can
6431 * be combined with the current one.
6432 */
6433 for (ahead = 0; i + ahead + 1 < wb_context->nr_pending; ahead++)
6434 {
6435
6436 next = &wb_context->pending_writebacks[i + ahead + 1];
6437
6438 /* different file, stop */
6439 if (!RelFileLocatorEquals(currlocator,
6440 BufTagGetRelFileLocator(&next->tag)) ||
6441 BufTagGetForkNum(&cur->tag) != BufTagGetForkNum(&next->tag))
6442 break;
6443
6444 /* ok, block queued twice, skip */
6445 if (cur->tag.blockNum == next->tag.blockNum)
6446 continue;
6447
6448 /* only merge consecutive writes */
6449 if (cur->tag.blockNum + 1 != next->tag.blockNum)
6450 break;
6451
6452 nblocks++;
6453 cur = next;
6454 }
6455
6456 i += ahead;
6457
6458 /* and finally tell the kernel to write the data to storage */
6459 reln = smgropen(currlocator, INVALID_PROC_NUMBER);
6460 smgrwriteback(reln, BufTagGetForkNum(&tag), tag.blockNum, nblocks);
6461 }
6462
6463 /*
6464 * Assume that writeback requests are only issued for buffers containing
6465 * blocks of permanent relations.
6466 */
6467 pgstat_count_io_op_time(IOOBJECT_RELATION, io_context,
6468 IOOP_WRITEBACK, io_start, wb_context->nr_pending, 0);
6469
6470 wb_context->nr_pending = 0;
6471}
static int32 next
Definition: blutils.c:224
struct cursor * cur
Definition: ecpg.c:29
@ IOOP_WRITEBACK
Definition: pgstat.h:308
#define RelFileLocatorEquals(locator1, locator2)
void smgrwriteback(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, BlockNumber nblocks)
Definition: smgr.c:805
PendingWriteback pending_writebacks[WRITEBACK_MAX_PENDING_FLUSHES]

References buftag::blockNum, BufTagGetForkNum(), BufTagGetRelFileLocator(), cur, i, INVALID_PROC_NUMBER, IOOBJECT_RELATION, IOOP_WRITEBACK, next, WritebackContext::nr_pending, WritebackContext::pending_writebacks, pgstat_count_io_op_time(), pgstat_prepare_io_time(), RelFileLocatorEquals, smgropen(), smgrwriteback(), and track_io_timing.

Referenced by BufferSync(), and ScheduleBufferTagForWriteback().

◆ LimitAdditionalPins()

void LimitAdditionalPins ( uint32 additional_pins)

Definition at line 2541 of file bufmgr.c.

2542{
2543 uint32 limit;
2544
2545 if (*additional_pins <= 1)
2546 return;
2547
2548 limit = GetAdditionalPinLimit();
2549 limit = Max(limit, 1);
2550 if (limit < *additional_pins)
2551 *additional_pins = limit;
2552}
uint32 GetAdditionalPinLimit(void)
Definition: bufmgr.c:2515
#define Max(x, y)
Definition: c.h:969

References GetAdditionalPinLimit(), and Max.

Referenced by ExtendBufferedRelShared().
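A small sketch of how a bulk operation might clamp its pin request before pinning a batch (the helper and the initial request size are hypothetical):

#include "postgres.h"
#include "storage/bufmgr.h"

/* Hypothetical batch setup: ask for up to 64 extra pins, but accept
 * whatever the per-backend balance allows (never less than 1). */
static uint32
plan_batch_size(void)
{
	uint32		want_pins = 64;

	LimitAdditionalPins(&want_pins);
	return want_pins;			/* pin at most this many buffers */
}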

◆ local_buffer_readv_complete()

static PgAioResult local_buffer_readv_complete ( PgAioHandle ioh,
PgAioResult  prior_result,
uint8  cb_data 
)
static

Definition at line 7360 of file bufmgr.c.

7362{
7363 return buffer_readv_complete(ioh, prior_result, cb_data, true);
7364}
static pg_attribute_always_inline PgAioResult buffer_readv_complete(PgAioHandle *ioh, PgAioResult prior_result, uint8 cb_data, bool is_temp)
Definition: bufmgr.c:7107

References buffer_readv_complete().

◆ local_buffer_readv_stage()

static void local_buffer_readv_stage ( PgAioHandle ioh,
uint8  cb_data 
)
static

Definition at line 7354 of file bufmgr.c.

7355{
7356 buffer_stage_common(ioh, false, true);
7357}
static pg_attribute_always_inline void buffer_stage_common(PgAioHandle *ioh, bool is_write, bool is_temp)
Definition: bufmgr.c:6717

References buffer_stage_common().

◆ local_buffer_write_error_callback()

static void local_buffer_write_error_callback ( void *  arg)
static

Definition at line 6146 of file bufmgr.c.

6147{
6148 BufferDesc *bufHdr = (BufferDesc *) arg;
6149
6150 if (bufHdr != NULL)
6151 errcontext("writing block %u of relation %s",
6152 bufHdr->tag.blockNum,
6155 BufTagGetForkNum(&bufHdr->tag)).str);
6156}
#define errcontext
Definition: elog.h:197
void * arg

References arg, buftag::blockNum, BufTagGetForkNum(), BufTagGetRelFileLocator(), errcontext, MyProcNumber, relpathbackend, and BufferDesc::tag.

Referenced by FlushRelationBuffers().

◆ LockBuffer()

void LockBuffer ( Buffer  buffer,
int  mode 
)

Definition at line 5537 of file bufmgr.c.

5538{
5539 BufferDesc *buf;
5540
5541 Assert(BufferIsPinned(buffer));
5542 if (BufferIsLocal(buffer))
5543 return; /* local buffers need no lock */
5544
5545 buf = GetBufferDescriptor(buffer - 1);
5546
5547 if (mode == BUFFER_LOCK_UNLOCK)
5548 LWLockRelease(BufferDescriptorGetContentLock(buf));
5549 else if (mode == BUFFER_LOCK_SHARE)
5550 LWLockAcquire(BufferDescriptorGetContentLock(buf), LW_SHARED);
5551 else if (mode == BUFFER_LOCK_EXCLUSIVE)
5552 LWLockAcquire(BufferDescriptorGetContentLock(buf), LW_EXCLUSIVE);
5553 else
5554 elog(ERROR, "unrecognized buffer lock mode: %d", mode);
5555}
#define BUFFER_LOCK_SHARE
Definition: bufmgr.h:197
#define BUFFER_LOCK_EXCLUSIVE
Definition: bufmgr.h:198

References Assert(), buf, PrivateRefCountEntry::buffer, BUFFER_LOCK_EXCLUSIVE, BUFFER_LOCK_SHARE, BUFFER_LOCK_UNLOCK, BufferDescriptorGetContentLock(), BufferIsLocal, BufferIsPinned, elog, ERROR, GetBufferDescriptor(), LW_EXCLUSIVE, LW_SHARED, LWLockAcquire(), LWLockRelease(), and mode.

Referenced by _bt_lockbuf(), _bt_unlockbuf(), _bt_upgradelockbufcleanup(), _hash_addovflpage(), _hash_doinsert(), _hash_expandtable(), _hash_finish_split(), _hash_first(), _hash_freeovflpage(), _hash_getbuf(), _hash_getbuf_with_strategy(), _hash_getcachedmetap(), _hash_init(), _hash_kill_items(), _hash_readnext(), _hash_readpage(), _hash_readprev(), _hash_splitbucket(), _hash_squeezebucket(), _hash_vacuum_one_page(), BitmapHeapScanNextBlock(), blbulkdelete(), blgetbitmap(), blinsert(), BloomInitMetapage(), BloomNewBuffer(), blvacuumcleanup(), brin_doinsert(), brin_doupdate(), brin_evacuate_page(), brin_getinsertbuffer(), brin_page_cleanup(), bringetbitmap(), brinGetStats(), brinGetTupleForHeapBlock(), brininsert(), brinLockRevmapPageForUpdate(), brinRevmapDesummarizeRange(), brinRevmapInitialize(), brinsummarize(), bt_metap(), bt_multi_page_stats(), bt_page_items_internal(), bt_page_stats_internal(), bt_recheck_sibling_links(), collect_corrupt_items(), collect_visibility_data(), collectMatchBitmap(), ConditionalLockBufferForCleanup(), count_nondeletable_pages(), create_toy_buffer(), entryLoadMoreItems(), FreeSpaceMapPrepareTruncateRel(), fsm_readbuf(), fsm_search(), fsm_search_avail(), fsm_set_and_search(), fsm_vacuum_page(), get_raw_page_internal(), GetVisibilityMapPins(), gin_check_parent_keys_consistency(), gin_check_posting_tree_parent_keys_consistency(), gin_refind_parent(), ginbulkdelete(), ginEntryInsert(), ginFindLeafPage(), ginFindParents(), ginFinishOldSplit(), ginFinishSplit(), ginGetStats(), ginHeapTupleFastInsert(), ginInsertCleanup(), ginInsertValue(), GinNewBuffer(), ginScanToDelete(), ginStepRight(), ginTraverseLock(), ginUpdateStats(), ginvacuumcleanup(), ginVacuumPostingTreeLeaves(), gistBufferingFindCorrectParent(), gistbufferinginserttuples(), gistdoinsert(), gistFindCorrectParent(), gistFindPath(), gistfinishsplit(), gistfixsplit(), gistformdownlink(), gistGetMaxLevel(), gistinserttuples(), gistkillitems(), gistNewBuffer(), gistProcessItup(), gistScanPage(), gistvacuum_delete_empty_pages(), gistvacuumpage(), hashbucketcleanup(), hashbulkdelete(), heap_abort_speculative(), heap_delete(), heap_fetch(), heap_finish_speculative(), heap_get_latest_tid(), heap_index_delete_tuples(), heap_inplace_lock(), heap_inplace_unlock(), heap_inplace_update_and_unlock(), heap_lock_tuple(), heap_lock_updated_tuple_rec(), heap_page_prune_opt(), heap_prepare_pagescan(), heap_update(), heap_xlog_visible(), heapam_index_build_range_scan(), heapam_index_fetch_tuple(), heapam_index_validate_scan(), heapam_relation_copy_for_cluster(), heapam_scan_analyze_next_block(), heapam_scan_sample_next_tuple(), heapam_tuple_satisfies_snapshot(), heapgettup(), initBloomState(), invalidate_rel_block(), lazy_scan_heap(), lazy_scan_new_or_empty(), lazy_vacuum_heap_rel(), LockBufferForCleanup(), log_newpage_range(), modify_rel_block(), palloc_btree_page(), pg_visibility(), pgrowlocks(), pgstat_btree_page(), pgstat_gist_page(), pgstat_heap(), pgstatginindex_internal(), pgstathashindex(), pgstatindex_impl(), read_seq_tuple(), RelationAddBlocks(), RelationCopyStorageUsingBuffer(), RelationGetBufferForTuple(), revmap_physical_extend(), scanGetCandidate(), scanPendingInsert(), ScanSourceDatabasePgClass(), shiftList(), spgdoinsert(), spgGetCache(), SpGistNewBuffer(), spgprocesspending(), spgvacuumpage(), spgWalk(), startScanEntry(), statapprox_heap(), summarize_range(), UnlockReleaseBuffer(), verify_heapam(), verifyBackupPageConsistency(), visibilitymap_clear(), visibilitymap_prepare_truncate(), visibilitymap_set(), 
vm_readbuf(), XLogReadBufferForRedoExtended(), XLogRecordPageWithFreeSpace(), and ZeroAndLockBuffer().
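A minimal pin-lock-examine-release sketch (rel and blkno are assumed to be supplied by the caller; the helper is hypothetical):

#include "postgres.h"
#include "storage/bufmgr.h"
#include "storage/bufpage.h"
#include "utils/rel.h"

/* Read one block, inspect it under a share lock, then drop lock and
 * pin together with UnlockReleaseBuffer(). */
static void
examine_page(Relation rel, BlockNumber blkno)
{
	Buffer		buf = ReadBuffer(rel, blkno);
	Page		page;

	LockBuffer(buf, BUFFER_LOCK_SHARE);
	page = BufferGetPage(buf);
	if (PageIsNew(page))
		elog(DEBUG1, "block %u is uninitialized", blkno);
	UnlockReleaseBuffer(buf);
}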

◆ LockBufferForCleanup()

void LockBufferForCleanup ( Buffer  buffer)

Definition at line 5617 of file bufmgr.c.

5618{
5619 BufferDesc *bufHdr;
5620 TimestampTz waitStart = 0;
5621 bool waiting = false;
5622 bool logged_recovery_conflict = false;
5623
5624 Assert(BufferIsPinned(buffer));
5625 Assert(PinCountWaitBuf == NULL);
5626
5627 CheckBufferIsPinnedOnce(buffer);
5628
5629 /*
5630 * We do not yet need to be worried about in-progress AIOs holding a pin,
5631 * as we, so far, only support doing reads via AIO and this function can
5632 * only be called once the buffer is valid (i.e. no read can be in
5633 * flight).
5634 */
5635
5636 /* Nobody else to wait for */
5637 if (BufferIsLocal(buffer))
5638 return;
5639
5640 bufHdr = GetBufferDescriptor(buffer - 1);
5641
5642 for (;;)
5643 {
5644 uint32 buf_state;
5645
5646 /* Try to acquire lock */
5647 LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
5648 buf_state = LockBufHdr(bufHdr);
5649
5650 Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
5651 if (BUF_STATE_GET_REFCOUNT(buf_state) == 1)
5652 {
5653 /* Successfully acquired exclusive lock with pincount 1 */
5654 UnlockBufHdr(bufHdr, buf_state);
5655
5656 /*
5657 * Emit the log message if recovery conflict on buffer pin was
5658 * resolved but the startup process waited longer than
5659 * deadlock_timeout for it.
5660 */
5661 if (logged_recovery_conflict)
5662 LogRecoveryConflict(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN,
5663 waitStart, GetCurrentTimestamp(),
5664 NULL, false);
5665
5666 if (waiting)
5667 {
5668 /* reset ps display to remove the suffix if we added one */
5669 set_ps_display_remove_suffix();
5670 waiting = false;
5671 }
5672 return;
5673 }
5674 /* Failed, so mark myself as waiting for pincount 1 */
5675 if (buf_state & BM_PIN_COUNT_WAITER)
5676 {
5677 UnlockBufHdr(bufHdr, buf_state);
5678 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
5679 elog(ERROR, "multiple backends attempting to wait for pincount 1");
5680 }
5681 bufHdr->wait_backend_pgprocno = MyProcNumber;
5682 PinCountWaitBuf = bufHdr;
5683 buf_state |= BM_PIN_COUNT_WAITER;
5684 UnlockBufHdr(bufHdr, buf_state);
5685 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
5686
5687 /* Wait to be signaled by UnpinBuffer() */
5688 if (InHotStandby)
5689 {
5690 if (!waiting)
5691 {
5692 /* adjust the process title to indicate that it's waiting */
5693 set_ps_display_suffix("waiting");
5694 waiting = true;
5695 }
5696
5697 /*
5698 * Emit the log message if the startup process is waiting longer
5699 * than deadlock_timeout for recovery conflict on buffer pin.
5700 *
5701 * Skip this if first time through because the startup process has
5702 * not started waiting yet in this case. So, the wait start
5703 * timestamp is set after this logic.
5704 */
5705 if (waitStart != 0 && !logged_recovery_conflict)
5706 {
5707 TimestampTz now = GetCurrentTimestamp();
5708
5709 if (TimestampDifferenceExceeds(waitStart, now,
5710 DeadlockTimeout))
5711 {
5712 LogRecoveryConflict(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN,
5713 waitStart, now, NULL, true);
5714 logged_recovery_conflict = true;
5715 }
5716 }
5717
5718 /*
5719 * Set the wait start timestamp if logging is enabled and first
5720 * time through.
5721 */
5722 if (log_recovery_conflict_waits && waitStart == 0)
5723 waitStart = GetCurrentTimestamp();
5724
5725 /* Publish the bufid that Startup process waits on */
5726 SetStartupBufferPinWaitBufId(buffer - 1);
5727 /* Set alarm and then wait to be signaled by UnpinBuffer() */
5728 ResolveRecoveryConflictWithBufferPin();
5729 /* Reset the published bufid */
5730 SetStartupBufferPinWaitBufId(-1);
5731 }
5732 else
5733 ProcWaitForSignal(WAIT_EVENT_BUFFER_PIN);
5734
5735 /*
5736 * Remove flag marking us as waiter. Normally this will not be set
5737 * anymore, but ProcWaitForSignal() can return for other signals as
5738 * well. We take care to only reset the flag if we're the waiter, as
5739 * theoretically another backend could have started waiting. That's
5740 * impossible with the current usages due to table level locking, but
5741 * better be safe.
5742 */
5743 buf_state = LockBufHdr(bufHdr);
5744 if ((buf_state & BM_PIN_COUNT_WAITER) != 0 &&
5745 bufHdr->wait_backend_pgprocno == MyProcNumber)
5746 buf_state &= ~BM_PIN_COUNT_WAITER;
5747 UnlockBufHdr(bufHdr, buf_state);
5748
5749 PinCountWaitBuf = NULL;
5750 /* Loop back and try again */
5751 }
5752}
bool TimestampDifferenceExceeds(TimestampTz start_time, TimestampTz stop_time, int msec)
Definition: timestamp.c:1781
TimestampTz GetCurrentTimestamp(void)
Definition: timestamp.c:1645
Datum now(PG_FUNCTION_ARGS)
Definition: timestamp.c:1609
#define BM_PIN_COUNT_WAITER
Definition: buf_internals.h:75
static BufferDesc * PinCountWaitBuf
Definition: bufmgr.c:180
int64 TimestampTz
Definition: timestamp.h:39
@ PROCSIG_RECOVERY_CONFLICT_BUFFERPIN
Definition: procsignal.h:48
void set_ps_display_remove_suffix(void)
Definition: ps_status.c:423
void set_ps_display_suffix(const char *suffix)
Definition: ps_status.c:371
int DeadlockTimeout
Definition: proc.c:58
void SetStartupBufferPinWaitBufId(int bufid)
Definition: proc.c:755
void ProcWaitForSignal(uint32 wait_event_info)
Definition: proc.c:1975
void ResolveRecoveryConflictWithBufferPin(void)
Definition: standby.c:793
bool log_recovery_conflict_waits
Definition: standby.c:42
void LogRecoveryConflict(ProcSignalReason reason, TimestampTz wait_start, TimestampTz now, VirtualTransactionId *wait_list, bool still_waiting)
Definition: standby.c:274
int wait_backend_pgprocno
static volatile sig_atomic_t waiting
Definition: waiteventset.c:170
#define InHotStandby
Definition: xlogutils.h:60

References Assert(), BM_PIN_COUNT_WAITER, BUF_STATE_GET_REFCOUNT, PrivateRefCountEntry::buffer, BUFFER_LOCK_EXCLUSIVE, BUFFER_LOCK_UNLOCK, BufferIsLocal, BufferIsPinned, CheckBufferIsPinnedOnce(), DeadlockTimeout, elog, ERROR, GetBufferDescriptor(), GetCurrentTimestamp(), InHotStandby, LockBuffer(), LockBufHdr(), log_recovery_conflict_waits, LogRecoveryConflict(), MyProcNumber, now(), PinCountWaitBuf, PROCSIG_RECOVERY_CONFLICT_BUFFERPIN, ProcWaitForSignal(), ResolveRecoveryConflictWithBufferPin(), set_ps_display_remove_suffix(), set_ps_display_suffix(), SetStartupBufferPinWaitBufId(), TimestampDifferenceExceeds(), UnlockBufHdr(), BufferDesc::wait_backend_pgprocno, and waiting.

Referenced by _bt_upgradelockbufcleanup(), ginVacuumPostingTree(), hashbulkdelete(), heap_force_common(), lazy_scan_heap(), XLogReadBufferForRedoExtended(), and ZeroAndLockBuffer().
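A VACUUM-style sketch (vac_strategy is a hypothetical BufferAccessStrategy; error handling elided):

#include "postgres.h"
#include "common/relpath.h"
#include "storage/bufmgr.h"
#include "utils/rel.h"

/* Acquire a cleanup lock: an exclusive content lock plus the
 * guarantee that no other backend holds a pin on the page. */
static void
cleanup_one_block(Relation rel, BlockNumber blkno,
				  BufferAccessStrategy vac_strategy)
{
	Buffer		buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno,
										 RBM_NORMAL, vac_strategy);

	LockBufferForCleanup(buf);
	/* pruning/defragmentation that moves tuples would go here */
	UnlockReleaseBuffer(buf);
}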

◆ LockBufHdr()

uint32 LockBufHdr ( BufferDesc desc)

Definition at line 6189 of file bufmgr.c.

6190{
6191 SpinDelayStatus delayStatus;
6192 uint32 old_buf_state;
6193
6194 Assert(!BufferIsLocal(BufferDescriptorGetBuffer(desc)));
6195
6196 init_local_spin_delay(&delayStatus);
6197
6198 while (true)
6199 {
6200 /* set BM_LOCKED flag */
6201 old_buf_state = pg_atomic_fetch_or_u32(&desc->state, BM_LOCKED);
6202 /* if it wasn't set before we're OK */
6203 if (!(old_buf_state & BM_LOCKED))
6204 break;
6205 perform_spin_delay(&delayStatus);
6206 }
6207 finish_spin_delay(&delayStatus);
6208 return old_buf_state | BM_LOCKED;
6209}
static uint32 pg_atomic_fetch_or_u32(volatile pg_atomic_uint32 *ptr, uint32 or_)
Definition: atomics.h:410
void perform_spin_delay(SpinDelayStatus *status)
Definition: s_lock.c:126
void finish_spin_delay(SpinDelayStatus *status)
Definition: s_lock.c:186
#define init_local_spin_delay(status)
Definition: s_lock.h:751

References Assert(), BM_LOCKED, BufferDescriptorGetBuffer(), BufferIsLocal, finish_spin_delay(), init_local_spin_delay, perform_spin_delay(), pg_atomic_fetch_or_u32(), and BufferDesc::state.

Referenced by AbortBufferIO(), apw_dump_now(), buffer_stage_common(), BufferAlloc(), BufferGetLSNAtomic(), BufferSync(), ConditionalLockBufferForCleanup(), create_toy_buffer(), DropDatabaseBuffers(), DropRelationBuffers(), DropRelationsAllBuffers(), EvictAllUnpinnedBuffers(), EvictRelUnpinnedBuffers(), EvictUnpinnedBuffer(), ExtendBufferedRelShared(), FindAndDropRelationBuffers(), FlushBuffer(), FlushDatabaseBuffers(), FlushRelationBuffers(), FlushRelationsAllBuffers(), GetBufferFromRing(), GetVictimBuffer(), InvalidateBuffer(), InvalidateVictimBuffer(), IsBufferCleanupOK(), LockBufferForCleanup(), MarkBufferDirtyHint(), pg_buffercache_numa_pages(), pg_buffercache_pages(), ReadRecentBuffer(), StartBufferIO(), StrategyGetBuffer(), SyncOneBuffer(), TerminateBufferIO(), UnlockBuffers(), WaitIO(), and WakePinCountWaiter().
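The header-lock protocol in miniature: LockBufHdr() returns the state word with BM_LOCKED set, and the caller hands a (possibly modified) state word back to UnlockBufHdr(). A read-only sketch along the lines of pg_buffercache's usage (the helper name is hypothetical):

#include "postgres.h"
#include "storage/buf_internals.h"

/* Read a consistent snapshot of a buffer's refcount under the header
 * spinlock; the state passed to UnlockBufHdr() is stored as-is with
 * BM_LOCKED cleared. */
static uint32
snapshot_refcount(BufferDesc *bufHdr)
{
	uint32		buf_state = LockBufHdr(bufHdr);
	uint32		refcount = BUF_STATE_GET_REFCOUNT(buf_state);

	UnlockBufHdr(bufHdr, buf_state);
	return refcount;
}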

◆ MarkBufferDirty()

void MarkBufferDirty ( Buffer  buffer)

Definition at line 2945 of file bufmgr.c.

2946{
2947 BufferDesc *bufHdr;
2948 uint32 buf_state;
2949 uint32 old_buf_state;
2950
2951 if (!BufferIsValid(buffer))
2952 elog(ERROR, "bad buffer ID: %d", buffer);
2953
2954 if (BufferIsLocal(buffer))
2955 {
2956 MarkLocalBufferDirty(buffer);
2957 return;
2958 }
2959
2960 bufHdr = GetBufferDescriptor(buffer - 1);
2961
2962 Assert(BufferIsPinned(buffer));
2963 Assert(LWLockHeldByMeInMode(BufferDescriptorGetContentLock(bufHdr),
2964 LW_EXCLUSIVE));
2965
2966 old_buf_state = pg_atomic_read_u32(&bufHdr->state);
2967 for (;;)
2968 {
2969 if (old_buf_state & BM_LOCKED)
2970 old_buf_state = WaitBufHdrUnlocked(bufHdr);
2971
2972 buf_state = old_buf_state;
2973
2974 Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
2975 buf_state |= BM_DIRTY | BM_JUST_DIRTIED;
2976
2977 if (pg_atomic_compare_exchange_u32(&bufHdr->state, &old_buf_state,
2978 buf_state))
2979 break;
2980 }
2981
2982 /*
2983 * If the buffer was not dirty already, do vacuum accounting.
2984 */
2985 if (!(old_buf_state & BM_DIRTY))
2986 {
2987 pgBufferUsage.shared_blks_dirtied++;
2988 if (VacuumCostActive)
2989 VacuumCostBalance += VacuumCostPageDirty;
2990 }
2991}
static bool pg_atomic_compare_exchange_u32(volatile pg_atomic_uint32 *ptr, uint32 *expected, uint32 newval)
Definition: atomics.h:349
static uint32 WaitBufHdrUnlocked(BufferDesc *buf)
Definition: bufmgr.c:6219
int VacuumCostPageDirty
Definition: globals.c:154
void MarkLocalBufferDirty(Buffer buffer)
Definition: localbuf.c:489
int64 shared_blks_dirtied
Definition: instrument.h:28

References Assert(), BM_DIRTY, BM_JUST_DIRTIED, BM_LOCKED, BUF_STATE_GET_REFCOUNT, PrivateRefCountEntry::buffer, BufferDescriptorGetContentLock(), BufferIsLocal, BufferIsPinned, BufferIsValid(), elog, ERROR, GetBufferDescriptor(), LW_EXCLUSIVE, LWLockHeldByMeInMode(), MarkLocalBufferDirty(), pg_atomic_compare_exchange_u32(), pg_atomic_read_u32(), pgBufferUsage, BufferUsage::shared_blks_dirtied, BufferDesc::state, VacuumCostActive, VacuumCostBalance, VacuumCostPageDirty, and WaitBufHdrUnlocked().

Referenced by _bt_clear_incomplete_split(), _bt_dedup_pass(), _bt_delitems_delete(), _bt_delitems_vacuum(), _bt_getroot(), _bt_insertonpg(), _bt_mark_page_halfdead(), _bt_newlevel(), _bt_restore_meta(), _bt_set_cleanup_info(), _bt_split(), _bt_unlink_halfdead_page(), _hash_addovflpage(), _hash_doinsert(), _hash_expandtable(), _hash_freeovflpage(), _hash_init(), _hash_splitbucket(), _hash_squeezebucket(), _hash_vacuum_one_page(), addLeafTuple(), brin_doinsert(), brin_doupdate(), brin_initialize_empty_new_buffer(), brin_xlog_createidx(), brin_xlog_desummarize_page(), brin_xlog_insert_update(), brin_xlog_revmap_extend(), brin_xlog_samepage_update(), brin_xlog_update(), brinbuild(), brinbuildempty(), brinRevmapDesummarizeRange(), btree_xlog_dedup(), btree_xlog_delete(), btree_xlog_insert(), btree_xlog_mark_page_halfdead(), btree_xlog_newroot(), btree_xlog_split(), btree_xlog_unlink_page(), btree_xlog_vacuum(), createPostingTree(), dataExecPlaceToPageInternal(), dataExecPlaceToPageLeaf(), do_setval(), doPickSplit(), entryExecPlaceToPage(), fill_seq_fork_with_data(), FreeSpaceMapPrepareTruncateRel(), generic_redo(), GenericXLogFinish(), ginbuild(), ginbuildempty(), ginbulkdelete(), ginDeletePage(), ginHeapTupleFastInsert(), ginPlaceToPage(), ginRedoClearIncompleteSplit(), ginRedoCreatePTree(), ginRedoDeleteListPages(), ginRedoDeletePage(), ginRedoInsert(), ginRedoInsertListPage(), ginRedoUpdateMetapage(), ginRedoVacuumDataLeafPage(), ginUpdateStats(), ginVacuumPostingTreeLeaf(), gistbuild(), gistbuildempty(), gistdeletepage(), gistplacetopage(), gistprunepage(), gistRedoClearFollowRight(), gistRedoDeleteRecord(), gistRedoPageDelete(), gistRedoPageSplitRecord(), gistRedoPageUpdateRecord(), gistvacuumpage(), hash_xlog_add_ovfl_page(), hash_xlog_delete(), hash_xlog_init_bitmap_page(), hash_xlog_init_meta_page(), hash_xlog_insert(), hash_xlog_move_page_contents(), hash_xlog_split_allocate_page(), hash_xlog_split_cleanup(), hash_xlog_split_complete(), hash_xlog_squeeze_page(), hash_xlog_update_meta_page(), hash_xlog_vacuum_one_page(), hashbucketcleanup(), hashbulkdelete(), heap_abort_speculative(), heap_delete(), heap_finish_speculative(), heap_force_common(), heap_inplace_update_and_unlock(), heap_insert(), heap_lock_tuple(), heap_lock_updated_tuple_rec(), heap_multi_insert(), heap_page_prune_and_freeze(), heap_update(), heap_xlog_confirm(), heap_xlog_delete(), heap_xlog_inplace(), heap_xlog_insert(), heap_xlog_lock(), heap_xlog_lock_updated(), heap_xlog_multi_insert(), heap_xlog_prune_freeze(), heap_xlog_update(), heap_xlog_visible(), lazy_scan_new_or_empty(), lazy_scan_prune(), lazy_vacuum_heap_page(), log_newpage_range(), moveLeafs(), nextval_internal(), RelationAddBlocks(), RelationCopyStorageUsingBuffer(), RelationGetBufferForTuple(), revmap_physical_extend(), saveNodeLink(), seq_redo(), shiftList(), spgAddNodeAction(), spgbuild(), SpGistUpdateMetaPage(), spgRedoAddLeaf(), spgRedoAddNode(), spgRedoMoveLeafs(), spgRedoPickSplit(), spgRedoSplitTuple(), spgRedoVacuumLeaf(), spgRedoVacuumRedirect(), spgRedoVacuumRoot(), spgSplitNodeAction(), vacuumLeafPage(), vacuumLeafRoot(), vacuumRedirectAndPlaceholder(), visibilitymap_clear(), visibilitymap_prepare_truncate(), visibilitymap_set(), writeListPage(), and XLogReadBufferForRedoExtended().
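The standard modify-WAL-dirty ordering, sketched with the WAL details elided (a hedged outline, not a complete example; see the real callers above):

#include "postgres.h"
#include "miscadmin.h"
#include "storage/bufmgr.h"

/* Sketch: page modifications happen inside a critical section, and
 * MarkBufferDirty() must be called before the WAL record's LSN is
 * written back onto the page. XLogInsert()/PageSetLSN() are elided. */
static void
modify_page_sketch(Buffer buf)
{
	LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);

	START_CRIT_SECTION();
	/* ... apply the change to BufferGetPage(buf) ... */
	MarkBufferDirty(buf);
	/* ... XLogInsert() the change, then PageSetLSN() ... */
	END_CRIT_SECTION();

	UnlockReleaseBuffer(buf);
}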

◆ MarkBufferDirtyHint()

void MarkBufferDirtyHint ( Buffer  buffer,
bool  buffer_std 
)

Definition at line 5367 of file bufmgr.c.

5368{
5369 BufferDesc *bufHdr;
5370 Page page = BufferGetPage(buffer);
5371
5372 if (!BufferIsValid(buffer))
5373 elog(ERROR, "bad buffer ID: %d", buffer);
5374
5375 if (BufferIsLocal(buffer))
5376 {
5377 MarkLocalBufferDirty(buffer);
5378 return;
5379 }
5380
5381 bufHdr = GetBufferDescriptor(buffer - 1);
5382
5383 Assert(GetPrivateRefCount(buffer) > 0);
5384 /* here, either share or exclusive lock is OK */
5385 Assert(LWLockHeldByMe(BufferDescriptorGetContentLock(bufHdr)));
5386
5387 /*
5388 * This routine might get called many times on the same page, if we are
5389 * making the first scan after commit of an xact that added/deleted many
5390 * tuples. So, be as quick as we can if the buffer is already dirty. We
5391 * do this by not acquiring spinlock if it looks like the status bits are
5392 * already set. Since we make this test unlocked, there's a chance we
5393 * might fail to notice that the flags have just been cleared, and fail
5394 * to reset them, due to memory-ordering issues. But since this function
5395 * is only intended to be used in cases where failing to write out the
5396 * data would be harmless anyway, it doesn't really matter.
5397 */
5398 if ((pg_atomic_read_u32(&bufHdr->state) & (BM_DIRTY | BM_JUST_DIRTIED)) !=
5399 (BM_DIRTY | BM_JUST_DIRTIED))
5400 {
5401 XLogRecPtr lsn = InvalidXLogRecPtr;
5402 bool dirtied = false;
5403 bool delayChkptFlags = false;
5404 uint32 buf_state;
5405
5406 /*
5407 * If we need to protect hint bit updates from torn writes, WAL-log a
5408 * full page image of the page. This full page image is only necessary
5409 * if the hint bit update is the first change to the page since the
5410 * last checkpoint.
5411 *
5412 * We don't check full_page_writes here because that logic is included
5413 * when we call XLogInsert() since the value changes dynamically.
5414 */
5415 if (XLogHintBitIsNeeded() &&
5416 (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT))
5417 {
5418 /*
5419 * If we must not write WAL, due to a relfilelocator-specific
5420 * condition or being in recovery, don't dirty the page. We can
5421 * set the hint, just not dirty the page as a result so the hint
5422 * is lost when we evict the page or shutdown.
5423 *
5424 * See src/backend/storage/page/README for longer discussion.
5425 */
5426 if (RecoveryInProgress() ||
5427 RelFileLocatorSkippingWAL(BufTagGetRelFileLocator(&bufHdr->tag)))
5428 return;
5429
5430 /*
5431 * If the block is already dirty because we either made a change
5432 * or set a hint already, then we don't need to write a full page
5433 * image. Note that aggressive cleaning of blocks dirtied by hint
5434 * bit setting would increase the call rate. Bulk setting of hint
5435 * bits would reduce the call rate...
5436 *
5437 * We must issue the WAL record before we mark the buffer dirty.
5438 * Otherwise we might write the page before we write the WAL. That
5439 * causes a race condition, since a checkpoint might occur between
5440 * writing the WAL record and marking the buffer dirty. We solve
5441 * that with a kluge, but one that is already in use during
5442 * transaction commit to prevent race conditions. Basically, we
5443 * simply prevent the checkpoint WAL record from being written
5444 * until we have marked the buffer dirty. We don't start the
5445 * checkpoint flush until we have marked dirty, so our checkpoint
5446 * must flush the change to disk successfully or the checkpoint
5447 * never gets written, so crash recovery will fix things.
5448 *
5449 * It's possible we may enter here without an xid, so it is
5450 * essential that CreateCheckPoint waits for virtual transactions
5451 * rather than full transactionids.
5452 */
5453 Assert((MyProc->delayChkptFlags & DELAY_CHKPT_START) == 0);
5454 MyProc->delayChkptFlags |= DELAY_CHKPT_START;
5455 delayChkptFlags = true;
5456 lsn = XLogSaveBufferForHint(buffer, buffer_std);
5457 }
5458
5459 buf_state = LockBufHdr(bufHdr);
5460
5461 Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
5462
5463 if (!(buf_state & BM_DIRTY))
5464 {
5465 dirtied = true; /* Means "will be dirtied by this action" */
5466
5467 /*
5468 * Set the page LSN if we wrote a backup block. We aren't supposed
5469 * to set this when only holding a share lock but as long as we
5470 * serialise it somehow we're OK. We choose to set LSN while
5471 * holding the buffer header lock, which causes any reader of an
5472 * LSN who holds only a share lock to also obtain a buffer header
5473 * lock before using PageGetLSN(), which is enforced in
5474 * BufferGetLSNAtomic().
5475 *
5476 * If checksums are enabled, you might think we should reset the
5477 * checksum here. That will happen when the page is written
5478 * sometime later in this checkpoint cycle.
5479 */
5480 if (!XLogRecPtrIsInvalid(lsn))
5481 PageSetLSN(page, lsn);
5482 }
5483
5484 buf_state |= BM_DIRTY | BM_JUST_DIRTIED;
5485 UnlockBufHdr(bufHdr, buf_state);
5486
5487 if (delayChkptFlags)
5488 MyProc->delayChkptFlags &= ~DELAY_CHKPT_START;
5489
5490 if (dirtied)
5491 {
5492 pgBufferUsage.shared_blks_dirtied++;
5493 if (VacuumCostActive)
5494 VacuumCostBalance += VacuumCostPageDirty;
5495 }
5496 }
5497}
static void PageSetLSN(Page page, XLogRecPtr lsn)
Definition: bufpage.h:391
#define DELAY_CHKPT_START
Definition: proc.h:120
bool RelFileLocatorSkippingWAL(RelFileLocator rlocator)
Definition: storage.c:573
int delayChkptFlags
Definition: proc.h:241
bool RecoveryInProgress(void)
Definition: xlog.c:6522
#define XLogRecPtrIsInvalid(r)
Definition: xlogdefs.h:29
#define InvalidXLogRecPtr
Definition: xlogdefs.h:28
XLogRecPtr XLogSaveBufferForHint(Buffer buffer, bool buffer_std)
Definition: xloginsert.c:1065

References Assert(), BM_DIRTY, BM_JUST_DIRTIED, BM_PERMANENT, BUF_STATE_GET_REFCOUNT, PrivateRefCountEntry::buffer, BufferDescriptorGetContentLock(), BufferGetPage(), BufferIsLocal, BufferIsValid(), BufTagGetRelFileLocator(), DELAY_CHKPT_START, PGPROC::delayChkptFlags, elog, ERROR, GetBufferDescriptor(), GetPrivateRefCount(), InvalidXLogRecPtr, LockBufHdr(), LWLockHeldByMe(), MarkLocalBufferDirty(), MyProc, PageSetLSN(), pg_atomic_read_u32(), pgBufferUsage, RecoveryInProgress(), RelFileLocatorSkippingWAL(), BufferUsage::shared_blks_dirtied, BufferDesc::state, BufferDesc::tag, UnlockBufHdr(), VacuumCostActive, VacuumCostBalance, VacuumCostPageDirty, XLogHintBitIsNeeded, XLogRecPtrIsInvalid, and XLogSaveBufferForHint().

Referenced by _bt_check_unique(), _bt_killitems(), _hash_kill_items(), brin_start_evacuating_page(), btvacuumpage(), fsm_search(), fsm_search_avail(), fsm_set_and_search(), fsm_vacuum_page(), gistkillitems(), heap_page_prune_and_freeze(), read_seq_tuple(), SetHintBits(), and XLogRecordPageWithFreeSpace().
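A sketch of the hint-bit idiom this function exists for (modeled loosely on SetHintBits(); the helper is hypothetical and omits the XLogNeedsFlush()-style checks real callers may need):

#include "postgres.h"
#include "access/htup_details.h"
#include "storage/bufmgr.h"

/* Set a tuple status hint under (at least) a share lock and record
 * the page as dirtied by a hint only; losing this write is harmless. */
static void
set_xmin_committed_hint(HeapTupleHeader tuple, Buffer buffer)
{
	tuple->t_infomask |= HEAP_XMIN_COMMITTED;
	MarkBufferDirtyHint(buffer, true);	/* true: standard page layout */
}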

◆ NewPrivateRefCountEntry()

static PrivateRefCountEntry * NewPrivateRefCountEntry ( Buffer  buffer)
static

Definition at line 322 of file bufmgr.c.

323{
324 PrivateRefCountEntry *res;
325
326 /* only allowed to be called when a reservation has been made */
327 Assert(ReservedRefCountEntry != NULL);
328
329 /* use up the reserved entry */
330 res = ReservedRefCountEntry;
331 ReservedRefCountEntry = NULL;
332
333 /* and fill it */
334 res->buffer = buffer;
335 res->refcount = 0;
336
337 return res;
338}

References Assert(), PrivateRefCountEntry::buffer, PrivateRefCountEntry::refcount, and ReservedRefCountEntry.

Referenced by PinBuffer(), and PinBuffer_Locked().

◆ PinBuffer()

static bool PinBuffer ( BufferDesc buf,
BufferAccessStrategy  strategy 
)
static

Definition at line 3065 of file bufmgr.c.

3066{
3067 Buffer b = BufferDescriptorGetBuffer(buf);
3068 bool result;
3069 PrivateRefCountEntry *ref;
3070
3071 Assert(!BufferIsLocal(b));
3072 Assert(ReservedRefCountEntry != NULL);
3073
3074 ref = GetPrivateRefCountEntry(b, true);
3075
3076 if (ref == NULL)
3077 {
3078 uint32 buf_state;
3079 uint32 old_buf_state;
3080
3081 ref = NewPrivateRefCountEntry(b);
3082
3083 old_buf_state = pg_atomic_read_u32(&buf->state);
3084 for (;;)
3085 {
3086 if (old_buf_state & BM_LOCKED)
3087 old_buf_state = WaitBufHdrUnlocked(buf);
3088
3089 buf_state = old_buf_state;
3090
3091 /* increase refcount */
3092 buf_state += BUF_REFCOUNT_ONE;
3093
3094 if (strategy == NULL)
3095 {
3096 /* Default case: increase usagecount unless already max. */
3097 if (BUF_STATE_GET_USAGECOUNT(buf_state) != BM_MAX_USAGE_COUNT)
3098 buf_state += BUF_USAGECOUNT_ONE;
3099 }
3100 else
3101 {
3102 /*
3103 * Ring buffers shouldn't evict others from the pool. Thus we
3104 * don't make usagecount more than 1.
3105 */
3106 if (BUF_STATE_GET_USAGECOUNT(buf_state) == 0)
3107 buf_state += BUF_USAGECOUNT_ONE;
3108 }
3109
3110 if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state,
3111 buf_state))
3112 {
3113 result = (buf_state & BM_VALID) != 0;
3114
3115 /*
3116 * Assume that we acquired a buffer pin for the purposes of
3117 * Valgrind buffer client checks (even in !result case) to
3118 * keep things simple. Buffers that are unsafe to access are
3119 * not generally guaranteed to be marked undefined or
3120 * non-accessible in any case.
3121 */
3122 VALGRIND_MAKE_MEM_DEFINED(BufHdrGetBlock(buf), BLCKSZ);
3123 break;
3124 }
3125 }
3126 }
3127 else
3128 {
3129 /*
3130 * If we previously pinned the buffer, it is likely to be valid, but
3131 * it may not be if StartReadBuffers() was called and
3132 * WaitReadBuffers() hasn't been called yet. We'll check by loading
3133 * the flags without locking. This is racy, but it's OK to return
3134 * false spuriously: when WaitReadBuffers() calls StartBufferIO(),
3135 * it'll see that it's now valid.
3136 *
3137 * Note: We deliberately avoid a Valgrind client request here.
3138 * Individual access methods can optionally superimpose buffer page
3139 * client requests on top of our client requests to enforce that
3140 * buffers are only accessed while locked (and pinned). It's possible
3141 * that the buffer page is legitimately non-accessible here. We
3142 * cannot meddle with that.
3143 */
3144 result = (pg_atomic_read_u32(&buf->state) & BM_VALID) != 0;
3145 }
3146
3147 ref->refcount++;
3148 Assert(ref->refcount > 0);
3149 ResourceOwnerRememberBuffer(CurrentResourceOwner, b);
3150 return result;
3151}
#define BM_MAX_USAGE_COUNT
Definition: buf_internals.h:86
#define BUF_STATE_GET_USAGECOUNT(state)
Definition: buf_internals.h:60
static PrivateRefCountEntry * NewPrivateRefCountEntry(Buffer buffer)
Definition: bufmgr.c:322

References Assert(), b, BM_LOCKED, BM_MAX_USAGE_COUNT, BM_VALID, buf, BUF_REFCOUNT_ONE, BUF_STATE_GET_USAGECOUNT, BUF_USAGECOUNT_ONE, BufferDescriptorGetBuffer(), BufferIsLocal, BufHdrGetBlock, CurrentResourceOwner, GetPrivateRefCountEntry(), NewPrivateRefCountEntry(), pg_atomic_compare_exchange_u32(), pg_atomic_read_u32(), PrivateRefCountEntry::refcount, ReservedRefCountEntry, ResourceOwnerRememberBuffer(), VALGRIND_MAKE_MEM_DEFINED, and WaitBufHdrUnlocked().

Referenced by BufferAlloc(), ExtendBufferedRelShared(), and ReadRecentBuffer().

◆ PinBuffer_Locked()

static void PinBuffer_Locked ( BufferDesc buf)
static

Definition at line 3176 of file bufmgr.c.

3177{
3178 Buffer b;
3179 PrivateRefCountEntry *ref;
3180 uint32 buf_state;
3181
3182 /*
3183 * As explained, we don't expect any preexisting pins. That allows us to
3184 * manipulate the PrivateRefCount after releasing the spinlock.
3185 */
3186 Assert(GetPrivateRefCountEntry(BufferDescriptorGetBuffer(buf), false) == NULL);
3187
3188 /*
3189 * Buffer can't have a preexisting pin, so mark its page as defined to
3190 * Valgrind (this is similar to the PinBuffer() case where the backend
3191 * doesn't already have a buffer pin)
3192 */
3193 VALGRIND_MAKE_MEM_DEFINED(BufHdrGetBlock(buf), BLCKSZ);
3194
3195 /*
3196 * Since we hold the buffer spinlock, we can update the buffer state and
3197 * release the lock in one operation.
3198 */
3199 buf_state = pg_atomic_read_u32(&buf->state);
3200 Assert(buf_state & BM_LOCKED);
3201 buf_state += BUF_REFCOUNT_ONE;
3202 UnlockBufHdr(buf, buf_state);
3203
3204 b = BufferDescriptorGetBuffer(buf);
3205
3206 ref = NewPrivateRefCountEntry(b);
3207 ref->refcount++;
3208
3209 ResourceOwnerRememberBuffer(CurrentResourceOwner, b);
3210}

References Assert(), b, BM_LOCKED, buf, BUF_REFCOUNT_ONE, BufferDescriptorGetBuffer(), BufHdrGetBlock, CurrentResourceOwner, GetPrivateRefCountEntry(), NewPrivateRefCountEntry(), pg_atomic_read_u32(), PrivateRefCountEntry::refcount, ResourceOwnerRememberBuffer(), UnlockBufHdr(), and VALGRIND_MAKE_MEM_DEFINED.

Referenced by EvictUnpinnedBufferInternal(), FlushDatabaseBuffers(), FlushRelationBuffers(), FlushRelationsAllBuffers(), GetVictimBuffer(), ReadRecentBuffer(), and SyncOneBuffer().

◆ PinBufferForBlock()

static pg_attribute_always_inline Buffer PinBufferForBlock ( Relation  rel,
SMgrRelation  smgr,
char  persistence,
ForkNumber  forkNum,
BlockNumber  blockNum,
BufferAccessStrategy  strategy,
bool *  foundPtr 
)
static

Definition at line 1103 of file bufmgr.c.

1110{
1111 BufferDesc *bufHdr;
1112 IOContext io_context;
1113 IOObject io_object;
1114
1115 Assert(blockNum != P_NEW);
1116
1117 /* Persistence should be set before */
1118 Assert((persistence == RELPERSISTENCE_TEMP ||
1119 persistence == RELPERSISTENCE_PERMANENT ||
1120 persistence == RELPERSISTENCE_UNLOGGED));
1121
1122 if (persistence == RELPERSISTENCE_TEMP)
1123 {
1124 io_context = IOCONTEXT_NORMAL;
1125 io_object = IOOBJECT_TEMP_RELATION;
1126 }
1127 else
1128 {
1129 io_context = IOContextForStrategy(strategy);
1130 io_object = IOOBJECT_RELATION;
1131 }
1132
1133 TRACE_POSTGRESQL_BUFFER_READ_START(forkNum, blockNum,
1134 smgr->smgr_rlocator.locator.spcOid,
1135 smgr->smgr_rlocator.locator.dbOid,
1136 smgr->smgr_rlocator.locator.relNumber,
1137 smgr->smgr_rlocator.backend);
1138
1139 if (persistence == RELPERSISTENCE_TEMP)
1140 {
1141 bufHdr = LocalBufferAlloc(smgr, forkNum, blockNum, foundPtr);
1142 if (*foundPtr)
1143 pgBufferUsage.local_blks_hit++;
1144 }
1145 else
1146 {
1147 bufHdr = BufferAlloc(smgr, persistence, forkNum, blockNum,
1148 strategy, foundPtr, io_context);
1149 if (*foundPtr)
1150 pgBufferUsage.shared_blks_hit++;
1151 }
1152 if (rel)
1153 {
1154 /*
1155 * While pgBufferUsage's "read" counter isn't bumped unless we reach
1156 * WaitReadBuffers() (so, not for hits, and not for buffers that are
1157 * zeroed instead), the per-relation stats always count them.
1158 */
1159 pgstat_count_buffer_read(rel);
1160 if (*foundPtr)
1161 pgstat_count_buffer_hit(rel);
1162 }
1163 if (*foundPtr)
1164 {
1165 pgstat_count_io_op(io_object, io_context, IOOP_HIT, 1, 0);
1166 if (VacuumCostActive)
1167 VacuumCostBalance += VacuumCostPageHit;
1168
1169 TRACE_POSTGRESQL_BUFFER_READ_DONE(forkNum, blockNum,
1170 smgr->smgr_rlocator.locator.spcOid,
1171 smgr->smgr_rlocator.locator.dbOid,
1172 smgr->smgr_rlocator.locator.relNumber,
1173 smgr->smgr_rlocator.backend,
1174 true);
1175 }
1176
1177 return BufferDescriptorGetBuffer(bufHdr);
1178}
static BufferDesc * BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, BlockNumber blockNum, BufferAccessStrategy strategy, bool *foundPtr, IOContext io_context)
Definition: bufmgr.c:1998
#define P_NEW
Definition: bufmgr.h:191
BufferDesc * LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum, bool *foundPtr)
Definition: localbuf.c:118
#define pgstat_count_buffer_read(rel)
Definition: pgstat.h:704

References Assert(), RelFileLocatorBackend::backend, BufferAlloc(), BufferDescriptorGetBuffer(), RelFileLocator::dbOid, IOCONTEXT_NORMAL, IOContextForStrategy(), IOOBJECT_RELATION, IOOBJECT_TEMP_RELATION, IOOP_HIT, BufferUsage::local_blks_hit, LocalBufferAlloc(), RelFileLocatorBackend::locator, P_NEW, pgBufferUsage, pgstat_count_buffer_hit, pgstat_count_buffer_read, pgstat_count_io_op(), RelFileLocator::relNumber, BufferUsage::shared_blks_hit, SMgrRelationData::smgr_rlocator, RelFileLocator::spcOid, VacuumCostActive, VacuumCostBalance, and VacuumCostPageHit.

Referenced by ReadBuffer_common(), and StartReadBuffersImpl().

◆ PrefetchBuffer()

PrefetchBufferResult PrefetchBuffer ( Relation  reln,
ForkNumber  forkNum,
BlockNumber  blockNum 
)

Definition at line 644 of file bufmgr.c.

645{
646 Assert(RelationIsValid(reln));
647 Assert(BlockNumberIsValid(blockNum));
648
649 if (RelationUsesLocalBuffers(reln))
650 {
651 /* see comments in ReadBufferExtended */
652 if (RELATION_IS_OTHER_TEMP(reln))
653 ereport(ERROR,
654 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
655 errmsg("cannot access temporary tables of other sessions")));
656
657 /* pass it off to localbuf.c */
658 return PrefetchLocalBuffer(RelationGetSmgr(reln), forkNum, blockNum);
659 }
660 else
661 {
662 /* pass it to the shared buffer version */
663 return PrefetchSharedBuffer(RelationGetSmgr(reln), forkNum, blockNum);
664 }
665}
PrefetchBufferResult PrefetchSharedBuffer(SMgrRelation smgr_reln, ForkNumber forkNum, BlockNumber blockNum)
Definition: bufmgr.c:554
PrefetchBufferResult PrefetchLocalBuffer(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum)
Definition: localbuf.c:71
#define RELATION_IS_OTHER_TEMP(relation)
Definition: rel.h:669
#define RelationIsValid(relation)
Definition: rel.h:489

References Assert(), BlockNumberIsValid(), ereport, errcode(), errmsg(), ERROR, PrefetchLocalBuffer(), PrefetchSharedBuffer(), RELATION_IS_OTHER_TEMP, RelationGetSmgr(), RelationIsValid, and RelationUsesLocalBuffers.

Referenced by count_nondeletable_pages(), invalidate_rel_block(), and pg_prewarm().
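A sketch of the prefetch-then-read pattern (rel and blkno assumed caller-provided; the result is only a hint):

#include "postgres.h"
#include "common/relpath.h"
#include "storage/bufmgr.h"
#include "utils/rel.h"

/* Hint the kernel about an upcoming read, then read the block.
 * pr.recent_buffer, when valid, is unpinned and must be re-verified
 * (e.g. via ReadRecentBuffer()) before use. */
static void
prefetch_and_read(Relation rel, BlockNumber blkno)
{
	PrefetchBufferResult pr = PrefetchBuffer(rel, MAIN_FORKNUM, blkno);
	Buffer		buf = ReadBuffer(rel, blkno);

	(void) pr;					/* treated as a hint only in this sketch */
	ReleaseBuffer(buf);
}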

◆ PrefetchSharedBuffer()

PrefetchBufferResult PrefetchSharedBuffer ( SMgrRelation  smgr_reln,
ForkNumber  forkNum,
BlockNumber  blockNum 
)

Definition at line 554 of file bufmgr.c.

557{
558 PrefetchBufferResult result = {InvalidBuffer, false};
559 BufferTag newTag; /* identity of requested block */
560 uint32 newHash; /* hash value for newTag */
561 LWLock *newPartitionLock; /* buffer partition lock for it */
562 int buf_id;
563
564 Assert(BlockNumberIsValid(blockNum));
565
566 /* create a tag so we can lookup the buffer */
567 InitBufferTag(&newTag, &smgr_reln->smgr_rlocator.locator,
568 forkNum, blockNum);
569
570 /* determine its hash code and partition lock ID */
571 newHash = BufTableHashCode(&newTag);
572 newPartitionLock = BufMappingPartitionLock(newHash);
573
574 /* see if the block is in the buffer pool already */
575 LWLockAcquire(newPartitionLock, LW_SHARED);
576 buf_id = BufTableLookup(&newTag, newHash);
577 LWLockRelease(newPartitionLock);
578
579 /* If not in buffers, initiate prefetch */
580 if (buf_id < 0)
581 {
582#ifdef USE_PREFETCH
583 /*
584 * Try to initiate an asynchronous read. This returns false in
585 * recovery if the relation file doesn't exist.
586 */
587 if ((io_direct_flags & IO_DIRECT_DATA) == 0 &&
588 smgrprefetch(smgr_reln, forkNum, blockNum, 1))
589 {
590 result.initiated_io = true;
591 }
592#endif /* USE_PREFETCH */
593 }
594 else
595 {
596 /*
597 * Report the buffer it was in at that time. The caller may be able
598 * to avoid a buffer table lookup, but it's not pinned and it must be
599 * rechecked!
600 */
601 result.recent_buffer = buf_id + 1;
602 }
603
604 /*
605 * If the block *is* in buffers, we do nothing. This is not really ideal:
606 * the block might be just about to be evicted, which would be stupid
607 * since we know we are going to need it soon. But the only easy answer
608 * is to bump the usage_count, which does not seem like a great solution:
609 * when the caller does ultimately touch the block, usage_count would get
610 * bumped again, resulting in too much favoritism for blocks that are
611 * involved in a prefetch sequence. A real fix would involve some
612 * additional per-buffer state, and it's not clear that there's enough of
613 * a problem to justify that.
614 */
615
616 return result;
617}
int io_direct_flags
Definition: fd.c:168
#define IO_DIRECT_DATA
Definition: fd.h:54
bool smgrprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, int nblocks)
Definition: smgr.c:678
Buffer recent_buffer
Definition: bufmgr.h:61

References Assert(), BlockNumberIsValid(), BufMappingPartitionLock(), BufTableHashCode(), BufTableLookup(), InitBufferTag(), PrefetchBufferResult::initiated_io, InvalidBuffer, IO_DIRECT_DATA, io_direct_flags, RelFileLocatorBackend::locator, LW_SHARED, LWLockAcquire(), LWLockRelease(), PrefetchBufferResult::recent_buffer, SMgrRelationData::smgr_rlocator, and smgrprefetch().

Referenced by PrefetchBuffer(), and XLogPrefetcherNextBlock().

◆ ProcessReadBuffersResult()

static void ProcessReadBuffersResult ( ReadBuffersOperation operation)
static

Definition at line 1591 of file bufmgr.c.

1592{
1593 PgAioReturn *aio_ret = &operation->io_return;
1594 PgAioResultStatus rs = aio_ret->result.status;
1595 int newly_read_blocks = 0;
1596
1597 Assert(pgaio_wref_valid(&operation->io_wref));
1598 Assert(aio_ret->result.status != PGAIO_RS_UNKNOWN);
1599
1600 /*
1601 * SMGR reports the number of blocks successfully read as the result of
1602 * the IO operation. Thus we can simply add that to ->nblocks_done.
1603 */
1604
1605 if (likely(rs != PGAIO_RS_ERROR))
1606 newly_read_blocks = aio_ret->result.result;
1607
1608 if (rs == PGAIO_RS_ERROR || rs == PGAIO_RS_WARNING)
1609 pgaio_result_report(aio_ret->result, &aio_ret->target_data,
1610 rs == PGAIO_RS_ERROR ? ERROR : WARNING);
1611 else if (aio_ret->result.status == PGAIO_RS_PARTIAL)
1612 {
1613 /*
1614 * We'll retry, so we just emit a debug message to the server log (or
1615 * not even that in prod scenarios).
1616 */
1617 pgaio_result_report(aio_ret->result, &aio_ret->target_data, DEBUG1);
1618 elog(DEBUG3, "partial read, will retry");
1619 }
1620
1621 Assert(newly_read_blocks > 0);
1622 Assert(newly_read_blocks <= MAX_IO_COMBINE_LIMIT);
1623
1624 operation->nblocks_done += newly_read_blocks;
1625
1626 Assert(operation->nblocks_done <= operation->nblocks);
1627}
bool pgaio_wref_valid(PgAioWaitRef *iow)
Definition: aio.c:873
PgAioResultStatus
Definition: aio_types.h:79
@ PGAIO_RS_UNKNOWN
Definition: aio_types.h:80
@ PGAIO_RS_PARTIAL
Definition: aio_types.h:82
#define likely(x)
Definition: c.h:346
#define DEBUG3
Definition: elog.h:28
PgAioResult result
Definition: aio_types.h:132
PgAioTargetData target_data
Definition: aio_types.h:133

References Assert(), DEBUG1, DEBUG3, elog, ERROR, ReadBuffersOperation::io_return, ReadBuffersOperation::io_wref, likely, MAX_IO_COMBINE_LIMIT, ReadBuffersOperation::nblocks, ReadBuffersOperation::nblocks_done, pgaio_result_report(), PGAIO_RS_ERROR, PGAIO_RS_PARTIAL, PGAIO_RS_UNKNOWN, PGAIO_RS_WARNING, pgaio_wref_valid(), PgAioResult::result, PgAioReturn::result, PgAioResult::status, PgAioReturn::target_data, and WARNING.

Referenced by WaitReadBuffers().

◆ ReadBuffer()

Buffer ReadBuffer ( Relation  reln,
BlockNumber  blockNum 
)

Definition at line 751 of file bufmgr.c.

752{
753 return ReadBufferExtended(reln, MAIN_FORKNUM, blockNum, RBM_NORMAL, NULL);
754}
Buffer ReadBufferExtended(Relation reln, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy)
Definition: bufmgr.c:798
@ RBM_NORMAL
Definition: bufmgr.h:46

References MAIN_FORKNUM, RBM_NORMAL, and ReadBufferExtended().

Referenced by _bt_allocbuf(), _bt_getbuf(), _bt_search_insert(), _hash_getbuf(), _hash_getbuf_with_condlock_cleanup(), blbulkdelete(), blinsert(), BloomNewBuffer(), brin_getinsertbuffer(), brinGetStats(), brinGetTupleForHeapBlock(), brinRevmapDesummarizeRange(), brinRevmapInitialize(), bt_metap(), bt_multi_page_stats(), bt_page_items_internal(), bt_page_stats_internal(), ginFindLeafPage(), ginFindParents(), ginGetStats(), ginHeapTupleFastInsert(), ginInsertCleanup(), GinNewBuffer(), ginStepRight(), ginUpdateStats(), gistBufferingFindCorrectParent(), gistbufferinginserttuples(), gistdoinsert(), gistFindCorrectParent(), gistFindPath(), gistfixsplit(), gistGetMaxLevel(), gistkillitems(), gistNewBuffer(), gistProcessItup(), gistScanPage(), heap_abort_speculative(), heap_delete(), heap_fetch(), heap_finish_speculative(), heap_force_common(), heap_get_latest_tid(), heap_index_delete_tuples(), heap_lock_tuple(), heap_update(), initBloomState(), pg_visibility(), pgstatginindex_internal(), read_seq_tuple(), RelationGetBufferForTuple(), ReleaseAndReadBuffer(), revmap_get_buffer(), revmap_physical_extend(), scanGetCandidate(), scanPendingInsert(), shiftList(), spgdoinsert(), spgGetCache(), SpGistGetBuffer(), SpGistNewBuffer(), SpGistUpdateMetaPage(), and spgWalk().

◆ ReadBuffer_common()

static pg_attribute_always_inline Buffer ReadBuffer_common ( Relation  rel,
SMgrRelation  smgr,
char  smgr_persistence,
ForkNumber  forkNum,
BlockNumber  blockNum,
ReadBufferMode  mode,
BufferAccessStrategy  strategy 
)
static

Definition at line 1186 of file bufmgr.c.

1190{
1191 ReadBuffersOperation operation;
1192 Buffer buffer;
1193 int flags;
1194 char persistence;
1195
1196 /*
1197 * Backward compatibility path, most code should use ExtendBufferedRel()
1198 * instead, as acquiring the extension lock inside ExtendBufferedRel()
1199 * scales a lot better.
1200 */
1201 if (unlikely(blockNum == P_NEW))
1202 {
1203 uint32 flags = EB_SKIP_EXTENSION_LOCK;
1204
1205 /*
1206 * Since no-one else can be looking at the page contents yet, there is
1207 * no difference between an exclusive lock and a cleanup-strength
1208 * lock.
1209 */
1210 if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK)
1211 flags |= EB_LOCK_FIRST;
1212
1213 return ExtendBufferedRel(BMR_REL(rel), forkNum, strategy, flags);
1214 }
1215
1216 if (rel)
1217 persistence = rel->rd_rel->relpersistence;
1218 else
1219 persistence = smgr_persistence;
1220
1221 if (mode == RBM_ZERO_AND_LOCK ||
1222 mode == RBM_ZERO_AND_CLEANUP_LOCK)
1223 {
1224 bool found;
1225
1226 buffer = PinBufferForBlock(rel, smgr, persistence,
1227 forkNum, blockNum, strategy, &found);
1228 ZeroAndLockBuffer(buffer, mode, found);
1229 return buffer;
1230 }
1231
1232 /*
1233 * Signal that we are going to immediately wait. If we're immediately
1234 * waiting, there is no benefit in actually executing the IO
1235 * asynchronously; it would just add dispatch overhead.
1236 */
1237 flags = READ_BUFFERS_SYNCHRONOUSLY;
1238 if (mode == RBM_ZERO_ON_ERROR)
1239 flags |= READ_BUFFERS_ZERO_ON_ERROR;
1240 operation.smgr = smgr;
1241 operation.rel = rel;
1242 operation.persistence = persistence;
1243 operation.forknum = forkNum;
1244 operation.strategy = strategy;
1245 if (StartReadBuffer(&operation,
1246 &buffer,
1247 blockNum,
1248 flags))
1249 WaitReadBuffers(&operation);
1250
1251 return buffer;
1252}
Buffer ExtendBufferedRel(BufferManagerRelation bmr, ForkNumber forkNum, BufferAccessStrategy strategy, uint32 flags)
Definition: bufmgr.c:851
static void ZeroAndLockBuffer(Buffer buffer, ReadBufferMode mode, bool already_valid)
Definition: bufmgr.c:1024
static pg_attribute_always_inline Buffer PinBufferForBlock(Relation rel, SMgrRelation smgr, char persistence, ForkNumber forkNum, BlockNumber blockNum, BufferAccessStrategy strategy, bool *foundPtr)
Definition: bufmgr.c:1103
void WaitReadBuffers(ReadBuffersOperation *operation)
Definition: bufmgr.c:1630
bool StartReadBuffer(ReadBuffersOperation *operation, Buffer *buffer, BlockNumber blocknum, int flags)
Definition: bufmgr.c:1506
@ RBM_ZERO_ON_ERROR
Definition: bufmgr.h:51
#define BMR_REL(p_rel)
Definition: bufmgr.h:108

References BMR_REL, PrivateRefCountEntry::buffer, EB_LOCK_FIRST, EB_SKIP_EXTENSION_LOCK, ExtendBufferedRel(), ReadBuffersOperation::forknum, mode, P_NEW, ReadBuffersOperation::persistence, PinBufferForBlock(), RBM_ZERO_AND_CLEANUP_LOCK, RBM_ZERO_AND_LOCK, RBM_ZERO_ON_ERROR, RelationData::rd_rel, READ_BUFFERS_SYNCHRONOUSLY, READ_BUFFERS_ZERO_ON_ERROR, ReadBuffersOperation::rel, ReadBuffersOperation::smgr, StartReadBuffer(), ReadBuffersOperation::strategy, unlikely, WaitReadBuffers(), and ZeroAndLockBuffer().

Referenced by ExtendBufferedRelTo(), ReadBufferExtended(), and ReadBufferWithoutRelcache().

◆ ReadBufferExtended()

Buffer ReadBufferExtended ( Relation  reln,
ForkNumber  forkNum,
BlockNumber  blockNum,
ReadBufferMode  mode,
BufferAccessStrategy  strategy 
)
inline

Definition at line 798 of file bufmgr.c.

800{
801 Buffer buf;
802
803 /*
804 * Reject attempts to read non-local temporary relations; we would be
805 * likely to get wrong data since we have no visibility into the owning
806 * session's local buffers.
807 */
808 if (RELATION_IS_OTHER_TEMP(reln))
809 ereport(ERROR,
810 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
811 errmsg("cannot access temporary tables of other sessions")));
812
813 /*
814 * Read the buffer, and update pgstat counters to reflect a cache hit or
815 * miss.
816 */
817 buf = ReadBuffer_common(reln, RelationGetSmgr(reln), 0,
818 forkNum, blockNum, mode, strategy);
819
820 return buf;
821}

References buf, ereport, errcode(), errmsg(), ERROR, mode, ReadBuffer_common(), RELATION_IS_OTHER_TEMP, and RelationGetSmgr().

Referenced by _hash_getbuf_with_strategy(), _hash_getinitbuf(), _hash_getnewbuf(), blbulkdelete(), blgetbitmap(), BloomInitMetapage(), blvacuumcleanup(), brin_vacuum_scan(), bt_recheck_sibling_links(), btvacuumpage(), count_nondeletable_pages(), create_toy_buffer(), fsm_readbuf(), get_raw_page_internal(), gin_check_parent_keys_consistency(), gin_check_posting_tree_parent_keys_consistency(), gin_refind_parent(), ginbulkdelete(), ginDeletePage(), ginScanToDelete(), ginvacuumcleanup(), ginVacuumPostingTree(), ginVacuumPostingTreeLeaves(), gistvacuum_delete_empty_pages(), gistvacuumpage(), hashbulkdelete(), heapam_scan_sample_next_block(), log_newpage_range(), modify_rel_block(), palloc_btree_page(), pgstat_btree_page(), pgstat_gist_page(), pgstat_heap(), pgstathashindex(), pgstatindex_impl(), ReadBuffer(), ReadBufferBI(), spgprocesspending(), statapprox_heap(), and vm_readbuf().
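
A hedged sketch of driving ReadBufferExtended() with a ring-buffer strategy, loosely modeled on the bulk-read callers listed above (the helper name is ours):

/* Hypothetical: visit every main-fork page using a BAS_BULKREAD ring. */
static void
toy_scan_main_fork(Relation rel)
{
	BufferAccessStrategy strategy = GetAccessStrategy(BAS_BULKREAD);
	BlockNumber nblocks = RelationGetNumberOfBlocks(rel);

	for (BlockNumber blkno = 0; blkno < nblocks; blkno++)
	{
		Buffer		buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno,
											 RBM_NORMAL, strategy);

		/* ... lock and examine the page here ... */

		ReleaseBuffer(buf);
	}

	FreeAccessStrategy(strategy);
}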

◆ ReadBuffersCanStartIO()

static bool ReadBuffersCanStartIO ( Buffer  buffer,
bool  nowait 
)
inlinestatic

Definition at line 1562 of file bufmgr.c.

1563{
1564 /*
1565 * If this backend currently has staged IO, we need to submit the pending
1566 * IO before waiting for the right to issue IO, to avoid the potential for
1567 * deadlocks (and, more commonly, unnecessary delays for other backends).
1568 */
1569 if (!nowait && pgaio_have_staged())
1570 {
1571 if (ReadBuffersCanStartIOOnce(buffer, true))
1572 return true;
1573
1574 /*
1575 * Unfortunately StartBufferIO() returning false doesn't allow us to
1576 * distinguish between the buffer already being valid and IO already
1577 * being in progress. Since IO already being in progress is quite
1578 * rare, this approach seems fine.
1579 */
1580 pgaio_submit_staged();
1581 }
1582
1583 return ReadBuffersCanStartIOOnce(buffer, nowait);
1584}
bool pgaio_have_staged(void)
Definition: aio.c:1004
static bool ReadBuffersCanStartIOOnce(Buffer buffer, bool nowait)
Definition: bufmgr.c:1549

References PrivateRefCountEntry::buffer, pgaio_have_staged(), pgaio_submit_staged(), and ReadBuffersCanStartIOOnce().

Referenced by AsyncReadBuffers().

◆ ReadBuffersCanStartIOOnce()

static bool ReadBuffersCanStartIOOnce ( Buffer  buffer,
bool  nowait 
)
inlinestatic

Definition at line 1549 of file bufmgr.c.

1550{
1551 if (BufferIsLocal(buffer))
1552 return StartLocalBufferIO(GetLocalBufferDescriptor(-buffer - 1),
1553 true, nowait);
1554 else
1555 return StartBufferIO(GetBufferDescriptor(buffer - 1), true, nowait);
1556}
bool StartLocalBufferIO(BufferDesc *bufHdr, bool forInput, bool nowait)
Definition: localbuf.c:521

References PrivateRefCountEntry::buffer, BufferIsLocal, GetBufferDescriptor(), GetLocalBufferDescriptor(), StartBufferIO(), and StartLocalBufferIO().

Referenced by ReadBuffersCanStartIO().

◆ ReadBufferWithoutRelcache()

Buffer ReadBufferWithoutRelcache ( RelFileLocator  rlocator,
ForkNumber  forkNum,
BlockNumber  blockNum,
ReadBufferMode  mode,
BufferAccessStrategy  strategy,
bool  permanent 
)

Definition at line 835 of file bufmgr.c.

838{
839 SMgrRelation smgr = smgropen(rlocator, INVALID_PROC_NUMBER);
840
841 return ReadBuffer_common(NULL, smgr,
842 permanent ? RELPERSISTENCE_PERMANENT : RELPERSISTENCE_UNLOGGED,
843 forkNum, blockNum,
844 mode, strategy);
845}

References INVALID_PROC_NUMBER, mode, ReadBuffer_common(), and smgropen().

Referenced by RelationCopyStorageUsingBuffer(), ScanSourceDatabasePgClass(), and XLogReadBufferExtended().
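
For illustration only (the variable names are assumed): recovery-side callers such as XLogReadBufferExtended() read by RelFileLocator because no Relation is available during redo:

/* 'rlocator' and 'blkno' would come from a decoded WAL record. */
Buffer		buf = ReadBufferWithoutRelcache(rlocator, MAIN_FORKNUM, blkno,
											RBM_NORMAL, NULL,
											true);	/* permanent relation */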

◆ ReadRecentBuffer()

bool ReadRecentBuffer ( RelFileLocator  rlocator,
ForkNumber  forkNum,
BlockNumber  blockNum,
Buffer  recent_buffer 
)

Definition at line 675 of file bufmgr.c.

677{
678 BufferDesc *bufHdr;
679 BufferTag tag;
680 uint32 buf_state;
681 bool have_private_ref;
682
683 Assert(BufferIsValid(recent_buffer));
684
685 ResourceOwnerEnlarge(CurrentResourceOwner);
686 ReservePrivateRefCountEntry();
687 InitBufferTag(&tag, &rlocator, forkNum, blockNum);
688
689 if (BufferIsLocal(recent_buffer))
690 {
691 int b = -recent_buffer - 1;
692
693 bufHdr = GetLocalBufferDescriptor(b);
694 buf_state = pg_atomic_read_u32(&bufHdr->state);
695
696 /* Is it still valid and holding the right tag? */
697 if ((buf_state & BM_VALID) && BufferTagsEqual(&tag, &bufHdr->tag))
698 {
699 PinLocalBuffer(bufHdr, true);
700
701 pgBufferUsage.local_blks_hit++;
702
703 return true;
704 }
705 }
706 else
707 {
708 bufHdr = GetBufferDescriptor(recent_buffer - 1);
709 have_private_ref = GetPrivateRefCount(recent_buffer) > 0;
710
711 /*
712 * Do we already have this buffer pinned with a private reference? If
713 * so, it must be valid and it is safe to check the tag without
714 * locking. If not, we have to lock the header first and then check.
715 */
716 if (have_private_ref)
717 buf_state = pg_atomic_read_u32(&bufHdr->state);
718 else
719 buf_state = LockBufHdr(bufHdr);
720
721 if ((buf_state & BM_VALID) && BufferTagsEqual(&tag, &bufHdr->tag))
722 {
723 /*
724 * It's now safe to pin the buffer. We can't pin first and ask
725 * questions later, because it might confuse code paths like
726 * InvalidateBuffer() if we pinned a random non-matching buffer.
727 */
728 if (have_private_ref)
729 PinBuffer(bufHdr, NULL); /* bump pin count */
730 else
731 PinBuffer_Locked(bufHdr); /* pin for first time */
732
733 pgBufferUsage.shared_blks_hit++;
734
735 return true;
736 }
737
738 /* If we locked the header above, now unlock. */
739 if (!have_private_ref)
740 UnlockBufHdr(bufHdr, buf_state);
741 }
742
743 return false;
744}

References Assert(), b, BM_VALID, BufferIsLocal, BufferIsValid(), BufferTagsEqual(), CurrentResourceOwner, GetBufferDescriptor(), GetLocalBufferDescriptor(), GetPrivateRefCount(), InitBufferTag(), BufferUsage::local_blks_hit, LockBufHdr(), pg_atomic_read_u32(), pgBufferUsage, PinBuffer(), PinBuffer_Locked(), PinLocalBuffer(), ReservePrivateRefCountEntry(), ResourceOwnerEnlarge(), BufferUsage::shared_blks_hit, BufferDesc::state, BufferDesc::tag, and UnlockBufHdr().

Referenced by invalidate_rel_block(), and XLogReadBufferExtended().
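
A minimal sketch of the intended usage (helper invented), mirroring XLogReadBufferExtended(): try the remembered buffer first and fall back to a normal lookup on a miss:

static Buffer
toy_read_with_hint(Relation rel, BlockNumber blkno, Buffer recent)
{
	if (BufferIsValid(recent) &&
		ReadRecentBuffer(rel->rd_locator, MAIN_FORKNUM, blkno, recent))
		return recent;			/* fast path: pinned without a hash lookup */

	return ReadBuffer(rel, blkno);	/* slow path: full buffer lookup */
}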

◆ RelationCopyStorageUsingBuffer()

static void RelationCopyStorageUsingBuffer ( RelFileLocator  srclocator,
RelFileLocator  dstlocator,
ForkNumber  forkNum,
bool  permanent 
)
static

Definition at line 5063 of file bufmgr.c.

5066{
5067 Buffer srcBuf;
5068 Buffer dstBuf;
5069 Page srcPage;
5070 Page dstPage;
5071 bool use_wal;
5072 BlockNumber nblocks;
5073 BlockNumber blkno;
5074 PGIOAlignedBlock buf;
5075 BufferAccessStrategy bstrategy_src;
5076 BufferAccessStrategy bstrategy_dst;
5077 BlockRangeReadStreamPrivate p;
5078 ReadStream *src_stream;
5079 SMgrRelation src_smgr;
5080
5081 /*
5082 * In general, we want to write WAL whenever wal_level > 'minimal', but we
5083 * can skip it when copying any fork of an unlogged relation other than
5084 * the init fork.
5085 */
5086 use_wal = XLogIsNeeded() && (permanent || forkNum == INIT_FORKNUM);
5087
5088 /* Get number of blocks in the source relation. */
5089 nblocks = smgrnblocks(smgropen(srclocator, INVALID_PROC_NUMBER),
5090 forkNum);
5091
5092 /* Nothing to copy; just return. */
5093 if (nblocks == 0)
5094 return;
5095
5096 /*
5097 * Bulk extend the destination relation to the same size as the source
5098 * relation before starting to copy block by block.
5099 */
5100 memset(buf.data, 0, BLCKSZ);
5101 smgrextend(smgropen(dstlocator, INVALID_PROC_NUMBER), forkNum, nblocks - 1,
5102 buf.data, true);
5103
5104 /* This is a bulk operation, so use buffer access strategies. */
5105 bstrategy_src = GetAccessStrategy(BAS_BULKREAD);
5106 bstrategy_dst = GetAccessStrategy(BAS_BULKWRITE);
5107
5108 /* Initialize streaming read */
5109 p.current_blocknum = 0;
5110 p.last_exclusive = nblocks;
5111 src_smgr = smgropen(srclocator, INVALID_PROC_NUMBER);
5112
5113 /*
5114 * It is safe to use batchmode as block_range_read_stream_cb takes no
5115 * locks.
5116 */
5117 src_stream = read_stream_begin_smgr_relation(READ_STREAM_FULL |
5118 READ_STREAM_USE_BATCHING,
5119 bstrategy_src,
5120 src_smgr,
5121 permanent ? RELPERSISTENCE_PERMANENT : RELPERSISTENCE_UNLOGGED,
5122 forkNum,
5123 block_range_read_stream_cb,
5124 &p,
5125 0);
5126
5127 /* Iterate over each block of the source relation file. */
5128 for (blkno = 0; blkno < nblocks; blkno++)
5129 {
5130 CHECK_FOR_INTERRUPTS();
5131
5132 /* Read block from source relation. */
5133 srcBuf = read_stream_next_buffer(src_stream, NULL);
5134 LockBuffer(srcBuf, BUFFER_LOCK_SHARE);
5135 srcPage = BufferGetPage(srcBuf);
5136
5137 dstBuf = ReadBufferWithoutRelcache(dstlocator, forkNum,
5138 BufferGetBlockNumber(srcBuf),
5139 RBM_ZERO_AND_LOCK, bstrategy_dst,
5140 permanent);
5141 dstPage = BufferGetPage(dstBuf);
5142
5143 START_CRIT_SECTION();
5144
5145 /* Copy page data from the source to the destination. */
5146 memcpy(dstPage, srcPage, BLCKSZ);
5147 MarkBufferDirty(dstBuf);
5148
5149 /* WAL-log the copied page. */
5150 if (use_wal)
5151 log_newpage_buffer(dstBuf, true);
5152
5153 END_CRIT_SECTION();
5154
5155 UnlockReleaseBuffer(dstBuf);
5156 UnlockReleaseBuffer(srcBuf);
5157 }
5158 Assert(read_stream_next_buffer(src_stream, NULL) == InvalidBuffer);
5159 read_stream_end(src_stream);
5160
5161 FreeAccessStrategy(bstrategy_src);
5162 FreeAccessStrategy(bstrategy_dst);
5163}
void UnlockReleaseBuffer(Buffer buffer)
Definition: bufmgr.c:5320
void MarkBufferDirty(Buffer buffer)
Definition: bufmgr.c:2945
Buffer ReadBufferWithoutRelcache(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy, bool permanent)
Definition: bufmgr.c:835
@ BAS_BULKREAD
Definition: bufmgr.h:37
@ BAS_BULKWRITE
Definition: bufmgr.h:39
BufferAccessStrategy GetAccessStrategy(BufferAccessStrategyType btype)
Definition: freelist.c:541
void FreeAccessStrategy(BufferAccessStrategy strategy)
Definition: freelist.c:723
#define START_CRIT_SECTION()
Definition: miscadmin.h:150
#define CHECK_FOR_INTERRUPTS()
Definition: miscadmin.h:123
#define END_CRIT_SECTION()
Definition: miscadmin.h:152
ReadStream * read_stream_begin_smgr_relation(int flags, BufferAccessStrategy strategy, SMgrRelation smgr, char smgr_persistence, ForkNumber forknum, ReadStreamBlockNumberCB callback, void *callback_private_data, size_t per_buffer_data_size)
Definition: read_stream.c:740
Buffer read_stream_next_buffer(ReadStream *stream, void **per_buffer_data)
Definition: read_stream.c:770
void read_stream_end(ReadStream *stream)
Definition: read_stream.c:1055
BlockNumber block_range_read_stream_cb(ReadStream *stream, void *callback_private_data, void *per_buffer_data)
Definition: read_stream.c:162
#define READ_STREAM_USE_BATCHING
Definition: read_stream.h:64
#define READ_STREAM_FULL
Definition: read_stream.h:43
void smgrextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const void *buffer, bool skipFsync)
Definition: smgr.c:620
#define XLogIsNeeded()
Definition: xlog.h:109
XLogRecPtr log_newpage_buffer(Buffer buffer, bool page_std)
Definition: xloginsert.c:1237

References Assert(), BAS_BULKREAD, BAS_BULKWRITE, block_range_read_stream_cb(), buf, BUFFER_LOCK_SHARE, BufferGetBlockNumber(), BufferGetPage(), CHECK_FOR_INTERRUPTS, BlockRangeReadStreamPrivate::current_blocknum, END_CRIT_SECTION, FreeAccessStrategy(), GetAccessStrategy(), INIT_FORKNUM, INVALID_PROC_NUMBER, InvalidBuffer, BlockRangeReadStreamPrivate::last_exclusive, LockBuffer(), log_newpage_buffer(), MarkBufferDirty(), RBM_ZERO_AND_LOCK, read_stream_begin_smgr_relation(), read_stream_end(), READ_STREAM_FULL, read_stream_next_buffer(), READ_STREAM_USE_BATCHING, ReadBufferWithoutRelcache(), smgrextend(), smgrnblocks(), smgropen(), START_CRIT_SECTION, UnlockReleaseBuffer(), and XLogIsNeeded.

Referenced by CreateAndCopyRelationData().

◆ RelationGetNumberOfBlocksInFork()

BlockNumber RelationGetNumberOfBlocksInFork ( Relation  relation,
ForkNumber  forkNum 
)

Definition at line 4361 of file bufmgr.c.

4362{
4363 if (RELKIND_HAS_TABLE_AM(relation->rd_rel->relkind))
4364 {
4365 /*
4366 * Not every table AM uses BLCKSZ wide fixed size blocks. Therefore
4367 * tableam returns the size in bytes - but for the purpose of this
4368 * routine, we want the number of blocks. Therefore divide, rounding
4369 * up.
4370 */
4371 uint64 szbytes;
4372
4373 szbytes = table_relation_size(relation, forkNum);
4374
4375 return (szbytes + (BLCKSZ - 1)) / BLCKSZ;
4376 }
4377 else if (RELKIND_HAS_STORAGE(relation->rd_rel->relkind))
4378 {
4379 return smgrnblocks(RelationGetSmgr(relation), forkNum);
4380 }
4381 else
4382 Assert(false);
4383
4384 return 0; /* keep compiler quiet */
4385}
static uint64 table_relation_size(Relation rel, ForkNumber forkNumber)
Definition: tableam.h:1828

References Assert(), RelationData::rd_rel, RelationGetSmgr(), smgrnblocks(), and table_relation_size().

Referenced by _hash_getnewbuf(), _hash_init(), autoprewarm_database_main(), get_raw_page_internal(), and pg_prewarm().
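
As a quick illustration of the rounding-up division above: with the default BLCKSZ of 8192, a table AM fork of 8193 bytes reports two blocks, since (8193 + 8191) / 8192 = 2. Typical calls just name the fork of interest (fragment; 'rel' is assumed to be an open, locked relation):

BlockNumber heap_blocks = RelationGetNumberOfBlocksInFork(rel, MAIN_FORKNUM);
BlockNumber fsm_blocks = RelationGetNumberOfBlocksInFork(rel, FSM_FORKNUM);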

◆ ReleaseAndReadBuffer()

Buffer ReleaseAndReadBuffer ( Buffer  buffer,
Relation  relation,
BlockNumber  blockNum 
)

Definition at line 3007 of file bufmgr.c.

3010{
3011 ForkNumber forkNum = MAIN_FORKNUM;
3012 BufferDesc *bufHdr;
3013
3014 if (BufferIsValid(buffer))
3015 {
3016 Assert(BufferIsPinned(buffer));
3017 if (BufferIsLocal(buffer))
3018 {
3019 bufHdr = GetLocalBufferDescriptor(-buffer - 1);
3020 if (bufHdr->tag.blockNum == blockNum &&
3021 BufTagMatchesRelFileLocator(&bufHdr->tag, &relation->rd_locator) &&
3022 BufTagGetForkNum(&bufHdr->tag) == forkNum)
3023 return buffer;
3024 UnpinLocalBuffer(buffer);
3025 }
3026 else
3027 {
3028 bufHdr = GetBufferDescriptor(buffer - 1);
3029 /* we have pin, so it's ok to examine tag without spinlock */
3030 if (bufHdr->tag.blockNum == blockNum &&
3031 BufTagMatchesRelFileLocator(&bufHdr->tag, &relation->rd_locator) &&
3032 BufTagGetForkNum(&bufHdr->tag) == forkNum)
3033 return buffer;
3034 UnpinBuffer(bufHdr);
3035 }
3036 }
3037
3038 return ReadBuffer(relation, blockNum);
3039}
Buffer ReadBuffer(Relation reln, BlockNumber blockNum)
Definition: bufmgr.c:751

References Assert(), buftag::blockNum, PrivateRefCountEntry::buffer, BufferIsLocal, BufferIsPinned, BufferIsValid(), BufTagGetForkNum(), BufTagMatchesRelFileLocator(), GetBufferDescriptor(), GetLocalBufferDescriptor(), MAIN_FORKNUM, RelationData::rd_locator, ReadBuffer(), BufferDesc::tag, UnpinBuffer(), and UnpinLocalBuffer().

Referenced by _bt_relandgetbuf(), ginFindLeafPage(), and heapam_index_fetch_tuple().
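
A sketch of the hand-over-hand pattern this enables (helper name ours), as in _bt_relandgetbuf(): move the pin to the next block, keeping it when the same block is requested again:

/* Hypothetical: hop along a chain of block numbers, one pin at a time. */
static Buffer
toy_follow_chain(Relation rel, BlockNumber start)
{
	Buffer		buf = InvalidBuffer;
	BlockNumber blkno = start;

	while (BlockNumberIsValid(blkno))
	{
		buf = ReleaseAndReadBuffer(buf, rel, blkno);
		/* ... lock the page, load the next block number into 'blkno' ... */
		blkno = InvalidBlockNumber; /* stand-in: stop after one hop */
	}

	return buf;
}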

◆ ReleaseBuffer()

void ReleaseBuffer ( Buffer  buffer)

Definition at line 5303 of file bufmgr.c.

5304{
5305 if (!BufferIsValid(buffer))
5306 elog(ERROR, "bad buffer ID: %d", buffer);
5307
5308 if (BufferIsLocal(buffer))
5309 UnpinLocalBuffer(buffer);
5310 else
5311 UnpinBuffer(GetBufferDescriptor(buffer - 1));
5312}

References PrivateRefCountEntry::buffer, BufferIsLocal, BufferIsValid(), elog, ERROR, GetBufferDescriptor(), UnpinBuffer(), and UnpinLocalBuffer().

Referenced by _bt_allocbuf(), _bt_drop_lock_and_maybe_pin(), _bt_pagedel(), _bt_relbuf(), _bt_search_insert(), _bt_unlink_halfdead_page(), _hash_dropbuf(), _hash_getbuf_with_condlock_cleanup(), autoprewarm_database_main(), BitmapHeapScanNextBlock(), blinsert(), BloomNewBuffer(), brin_getinsertbuffer(), brin_vacuum_scan(), bringetbitmap(), brinGetTupleForHeapBlock(), brininsert(), brinRevmapTerminate(), brinsummarize(), buffer_create_toy(), collect_corrupt_items(), collect_visibility_data(), entryLoadMoreItems(), ExecEndIndexOnlyScan(), ExtendBufferedRelTo(), FreeBulkInsertState(), freeGinBtreeStack(), fsm_search(), fsm_vacuum_page(), get_actual_variable_endpoint(), get_raw_page_internal(), GetRecordedFreeSpace(), gin_check_parent_keys_consistency(), gin_check_posting_tree_parent_keys_consistency(), ginDeletePage(), ginFindParents(), ginFinishSplit(), ginFreeScanKeys(), ginInsertCleanup(), GinNewBuffer(), ginScanToDelete(), gistdoinsert(), gistFindCorrectParent(), gistNewBuffer(), gistvacuum_delete_empty_pages(), grow_rel(), heap_abort_speculative(), heap_delete(), heap_endscan(), heap_fetch(), heap_fetch_next_buffer(), heap_force_common(), heap_insert(), heap_lock_tuple(), heap_lock_updated_tuple_rec(), heap_multi_insert(), heap_rescan(), heap_update(), heap_vac_scan_next_block(), heap_xlog_delete(), heap_xlog_insert(), heap_xlog_lock(), heap_xlog_lock_updated(), heap_xlog_multi_insert(), heap_xlog_update(), heap_xlog_visible(), heapam_index_fetch_reset(), heapam_scan_sample_next_block(), heapam_tuple_lock(), heapgettup(), heapgettup_pagemode(), invalidate_rel_block(), lazy_scan_heap(), lazy_vacuum_heap_rel(), modify_rel_block(), pg_prewarm(), pg_visibility(), pg_visibility_map(), pg_visibility_map_summary(), pgstatindex_impl(), read_rel_block_ll(), read_stream_reset(), ReadBufferBI(), RelationAddBlocks(), RelationGetBufferForTuple(), ReleaseBulkInsertStatePin(), revmap_get_buffer(), spgdoinsert(), SpGistGetBuffer(), SpGistNewBuffer(), SpGistUpdateMetaPage(), statapprox_heap(), summarize_range(), terminate_brin_buildstate(), tts_buffer_heap_clear(), tts_buffer_heap_materialize(), tts_buffer_heap_store_tuple(), UnlockReleaseBuffer(), verify_heapam(), visibilitymap_count(), visibilitymap_get_status(), visibilitymap_pin(), and XLogReadBufferExtended().

◆ ReservePrivateRefCountEntry()

static void ReservePrivateRefCountEntry ( void  )
static

Definition at line 256 of file bufmgr.c.

257{
258 /* Already reserved (or freed), nothing to do */
259 if (ReservedRefCountEntry != NULL)
260 return;
261
262 /*
263 * First search for a free entry in the array; that'll be sufficient in the
264 * majority of cases.
265 */
266 {
267 int i;
268
269 for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
270 {
271 PrivateRefCountEntry *res;
272
273 res = &PrivateRefCountArray[i];
274
275 if (res->buffer == InvalidBuffer)
276 {
277 ReservedRefCountEntry = res;
278 return;
279 }
280 }
281 }
282
283 /*
284 * No luck. All array entries are full. Move one array entry into the hash
285 * table.
286 */
287 {
288 /*
289 * Move entry from the current clock position in the array into the
290 * hashtable. Use that slot.
291 */
292 PrivateRefCountEntry *hashent;
293 bool found;
294
295 /* select victim slot */
296 ReservedRefCountEntry =
297 &PrivateRefCountArray[PrivateRefCountClock++ % REFCOUNT_ARRAY_ENTRIES];
298
299 /* Better be used, otherwise we shouldn't get here. */
300 Assert(ReservedRefCountEntry->buffer != InvalidBuffer);
301
302 /* enter victim array entry into hashtable */
303 hashent = hash_search(PrivateRefCountHash,
304 &(ReservedRefCountEntry->buffer),
305 HASH_ENTER,
306 &found);
307 Assert(!found);
308 hashent->refcount = ReservedRefCountEntry->refcount;
309
310 /* clear the now free array slot */
311 ReservedRefCountEntry->buffer = InvalidBuffer;
312 ReservedRefCountEntry->refcount = 0;
313
314 PrivateRefCountOverflowed++;
315 }
316}
static uint32 PrivateRefCountClock
Definition: bufmgr.c:215
@ HASH_ENTER
Definition: hsearch.h:114

References Assert(), PrivateRefCountEntry::buffer, HASH_ENTER, hash_search(), i, InvalidBuffer, PrivateRefCountArray, PrivateRefCountClock, PrivateRefCountHash, PrivateRefCountOverflowed, PrivateRefCountEntry::refcount, REFCOUNT_ARRAY_ENTRIES, and ReservedRefCountEntry.

Referenced by BufferAlloc(), EvictAllUnpinnedBuffers(), EvictRelUnpinnedBuffers(), EvictUnpinnedBuffer(), ExtendBufferedRelShared(), FlushDatabaseBuffers(), FlushRelationBuffers(), FlushRelationsAllBuffers(), GetPrivateRefCountEntry(), GetVictimBuffer(), ReadRecentBuffer(), and SyncOneBuffer().
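
A minimal sketch of the calling protocol (written as if inside bufmgr.c, since these helpers are static): reserve the refcount entry and resource-owner slot while no spinlock is held, so the subsequent pin cannot fail partway:

/* Fragment only; mirrors the sequence visible in ReadRecentBuffer(). */
ReservePrivateRefCountEntry();				/* may allocate: do it lock-free */
ResourceOwnerEnlarge(CurrentResourceOwner); /* likewise */

buf_state = LockBufHdr(bufHdr);
/* ... examine tag and state under the header spinlock ... */
PinBuffer_Locked(bufHdr);	/* consumes the reservation, releases the lock */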

◆ ResOwnerPrintBufferIO()

static char * ResOwnerPrintBufferIO ( Datum  res)
static

Definition at line 6484 of file bufmgr.c.

6485{
6486 Buffer buffer = DatumGetInt32(res);
6487
6488 return psprintf("lost track of buffer IO on buffer %d", buffer);
6489}
static int32 DatumGetInt32(Datum X)
Definition: postgres.h:207

References PrivateRefCountEntry::buffer, DatumGetInt32(), and psprintf().

◆ ResOwnerPrintBufferPin()

static char * ResOwnerPrintBufferPin ( Datum  res)
static

Definition at line 6507 of file bufmgr.c.

6508{
6509 return DebugPrintBufferRefcount(DatumGetInt32(res));
6510}

References DatumGetInt32(), and DebugPrintBufferRefcount().

◆ ResOwnerReleaseBufferIO()

static void ResOwnerReleaseBufferIO ( Datum  res)
static

Definition at line 6476 of file bufmgr.c.

6477{
6478 Buffer buffer = DatumGetInt32(res);
6479
6480 AbortBufferIO(buffer);
6481}
static void AbortBufferIO(Buffer buffer)
Definition: bufmgr.c:6091

References AbortBufferIO(), PrivateRefCountEntry::buffer, and DatumGetInt32().

◆ ResOwnerReleaseBufferPin()

static void ResOwnerReleaseBufferPin ( Datum  res)
static

Definition at line 6492 of file bufmgr.c.

6493{
6494 Buffer buffer = DatumGetInt32(res);
6495
6496 /* Like ReleaseBuffer, but don't call ResourceOwnerForgetBuffer */
6497 if (!BufferIsValid(buffer))
6498 elog(ERROR, "bad buffer ID: %d", buffer);
6499
6500 if (BufferIsLocal(buffer))
6501 UnpinLocalBufferNoOwner(buffer);
6502 else
6503 UnpinBufferNoOwner(GetBufferDescriptor(buffer - 1));
6504}
static void UnpinBufferNoOwner(BufferDesc *buf)
Definition: bufmgr.c:3266
void UnpinLocalBufferNoOwner(Buffer buffer)
Definition: localbuf.c:839

References PrivateRefCountEntry::buffer, BufferIsLocal, BufferIsValid(), DatumGetInt32(), elog, ERROR, GetBufferDescriptor(), UnpinBufferNoOwner(), and UnpinLocalBufferNoOwner().

◆ rlocator_comparator()

static int rlocator_comparator ( const void *  p1,
const void *  p2 
)
static

Definition at line 6162 of file bufmgr.c.

6163{
6164 RelFileLocator n1 = *(const RelFileLocator *) p1;
6165 RelFileLocator n2 = *(const RelFileLocator *) p2;
6166
6167 if (n1.relNumber < n2.relNumber)
6168 return -1;
6169 else if (n1.relNumber > n2.relNumber)
6170 return 1;
6171
6172 if (n1.dbOid < n2.dbOid)
6173 return -1;
6174 else if (n1.dbOid > n2.dbOid)
6175 return 1;
6176
6177 if (n1.spcOid < n2.spcOid)
6178 return -1;
6179 else if (n1.spcOid > n2.spcOid)
6180 return 1;
6181 else
6182 return 0;
6183}

References RelFileLocator::dbOid, RelFileLocator::relNumber, and RelFileLocator::spcOid.

Referenced by buffertag_comparator(), DropRelationsAllBuffers(), and FlushRelationsAllBuffers().

◆ ScheduleBufferTagForWriteback()

void ScheduleBufferTagForWriteback ( WritebackContext wb_context,
IOContext  io_context,
BufferTag tag 
)

Definition at line 6343 of file bufmgr.c.

6345{
6346 PendingWriteback *pending;
6347
6348 /*
6349 * As pg_flush_data() doesn't do anything with fsync disabled, there's no
6350 * point in tracking in that case.
6351 */
6352 if (io_direct_flags & IO_DIRECT_DATA ||
6353 !enableFsync)
6354 return;
6355
6356 /*
6357 * Add buffer to the pending writeback array, unless writeback control is
6358 * disabled.
6359 */
6360 if (*wb_context->max_pending > 0)
6361 {
6362 Assert(*wb_context->max_pending <= WRITEBACK_MAX_PENDING_FLUSHES);
6363
6364 pending = &wb_context->pending_writebacks[wb_context->nr_pending++];
6365
6366 pending->tag = *tag;
6367 }
6368
6369 /*
6370 * Perform pending flushes if the writeback limit is exceeded. This
6371 * includes the case where previously an item has been added, but control
6372 * is now disabled.
6373 */
6374 if (wb_context->nr_pending >= *wb_context->max_pending)
6375 IssuePendingWritebacks(wb_context, io_context);
6376}
bool enableFsync
Definition: globals.c:130
#define WRITEBACK_MAX_PENDING_FLUSHES

References Assert(), enableFsync, IO_DIRECT_DATA, io_direct_flags, IssuePendingWritebacks(), WritebackContext::max_pending, WritebackContext::nr_pending, WritebackContext::pending_writebacks, PendingWriteback::tag, and WRITEBACK_MAX_PENDING_FLUSHES.

Referenced by GetVictimBuffer(), and SyncOneBuffer().
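
Sketch of the surrounding lifecycle (fragment; the limit value is invented, and real callers derive it from a GUC such as checkpoint_flush_after): initialize a WritebackContext once, then feed it one tag per written buffer:

static int	toy_max_pending = 32;	/* must be <= WRITEBACK_MAX_PENDING_FLUSHES */
static WritebackContext toy_wb_context;

/* once, at startup */
WritebackContextInit(&toy_wb_context, &toy_max_pending);

/* per buffer, after FlushBuffer() and unpinning; 'tag' was copied earlier */
ScheduleBufferTagForWriteback(&toy_wb_context, IOCONTEXT_NORMAL, &tag);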

◆ shared_buffer_readv_complete()

static PgAioResult shared_buffer_readv_complete ( PgAioHandle ioh,
PgAioResult  prior_result,
uint8  cb_data 
)
static

Definition at line 7309 of file bufmgr.c.

7311{
7312 return buffer_readv_complete(ioh, prior_result, cb_data, false);
7313}

References buffer_readv_complete().

◆ shared_buffer_readv_complete_local()

static PgAioResult shared_buffer_readv_complete_local ( PgAioHandle ioh,
PgAioResult  prior_result,
uint8  cb_data 
)
static

Definition at line 7323 of file bufmgr.c.

7325{
7326 bool zeroed_any,
7327 ignored_any;
7328 uint8 zeroed_or_error_count,
7329 checkfail_count,
7330 first_off;
7331
7332 if (prior_result.status == PGAIO_RS_OK)
7333 return prior_result;
7334
7335 buffer_readv_decode_error(prior_result,
7336 &zeroed_any,
7337 &ignored_any,
7338 &zeroed_or_error_count,
7339 &checkfail_count,
7340 &first_off);
7341
7342 if (checkfail_count)
7343 {
7344 PgAioTargetData *td = pgaio_io_get_target_data(ioh);
7345
7346 pgstat_report_checksum_failures_in_db(td->smgr.rlocator.dbOid,
7347 checkfail_count);
7348 }
7349
7350 return prior_result;
7351}
@ PGAIO_RS_OK
Definition: aio_types.h:81

References buffer_readv_decode_error(), RelFileLocator::dbOid, pgaio_io_get_target_data(), PGAIO_RS_OK, pgstat_report_checksum_failures_in_db(), PgAioTargetData::rlocator, PgAioTargetData::smgr, and PgAioResult::status.

◆ shared_buffer_readv_stage()

static void shared_buffer_readv_stage ( PgAioHandle ioh,
uint8  cb_data 
)
static

Definition at line 7303 of file bufmgr.c.

7304{
7305 buffer_stage_common(ioh, false, false);
7306}

References buffer_stage_common().

◆ shared_buffer_write_error_callback()

static void shared_buffer_write_error_callback ( void *  arg)
static

Definition at line 6130 of file bufmgr.c.

6131{
6132 BufferDesc *bufHdr = (BufferDesc *) arg;
6133
6134 /* Buffer is pinned, so we can read the tag without locking the spinlock */
6135 if (bufHdr != NULL)
6136 errcontext("writing block %u of relation %s",
6137 bufHdr->tag.blockNum,
6138 relpathperm(BufTagGetRelFileLocator(&bufHdr->tag),
6139 BufTagGetForkNum(&bufHdr->tag)).str);
6140}

References arg, buftag::blockNum, BufTagGetForkNum(), BufTagGetRelFileLocator(), errcontext, relpathperm, and BufferDesc::tag.

Referenced by FlushBuffer().

◆ StartBufferIO()

bool StartBufferIO ( BufferDesc buf,
bool  forInput,
bool  nowait 
)

Definition at line 5975 of file bufmgr.c.

5976{
5977 uint32 buf_state;
5978
5979 ResourceOwnerEnlarge(CurrentResourceOwner);
5980
5981 for (;;)
5982 {
5983 buf_state = LockBufHdr(buf);
5984
5985 if (!(buf_state & BM_IO_IN_PROGRESS))
5986 break;
5987 UnlockBufHdr(buf, buf_state);
5988 if (nowait)
5989 return false;
5990 WaitIO(buf);
5991 }
5992
5993 /* Once we get here, there is definitely no I/O active on this buffer */
5994
5995 /* Check if someone else already did the I/O */
5996 if (forInput ? (buf_state & BM_VALID) : !(buf_state & BM_DIRTY))
5997 {
5998 UnlockBufHdr(buf, buf_state);
5999 return false;
6000 }
6001
6002 buf_state |= BM_IO_IN_PROGRESS;
6003 UnlockBufHdr(buf, buf_state);
6004
6005 ResourceOwnerRememberBufferIO(CurrentResourceOwner,
6006 BufferDescriptorGetBuffer(buf));
6007
6008 return true;
6009}
static void ResourceOwnerRememberBufferIO(ResourceOwner owner, Buffer buffer)

References BM_DIRTY, BM_IO_IN_PROGRESS, BM_VALID, buf, BufferDescriptorGetBuffer(), CurrentResourceOwner, LockBufHdr(), ResourceOwnerEnlarge(), ResourceOwnerRememberBufferIO(), UnlockBufHdr(), and WaitIO().

Referenced by buffer_call_start_io(), ExtendBufferedRelShared(), FlushBuffer(), read_rel_block_ll(), ReadBuffersCanStartIOOnce(), and ZeroAndLockBuffer().
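
Sketched from FlushBuffer()'s usage (simplified; error paths omitted): a write brackets the smgr call between StartBufferIO() and TerminateBufferIO(), so a concurrent backend waits in WaitIO() instead of double-writing:

/* 'buf' is a pinned shared buffer; forInput=false means writing out. */
if (StartBufferIO(buf, false, false))
{
	/* ... checksum the page and smgrwrite() it ... */

	/* clear BM_DIRTY (unless redirtied meanwhile), drop BM_IO_IN_PROGRESS */
	TerminateBufferIO(buf, true, 0, true, false);
}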

◆ StartReadBuffer()

bool StartReadBuffer ( ReadBuffersOperation operation,
Buffer buffer,
BlockNumber  blocknum,
int  flags 
)

Definition at line 1506 of file bufmgr.c.

1510{
1511 int nblocks = 1;
1512 bool result;
1513
1514 result = StartReadBuffersImpl(operation, buffer, blocknum, &nblocks, flags,
1515 false /* single block, no forwarding */ );
1516 Assert(nblocks == 1); /* single block can't be short */
1517
1518 return result;
1519}
static pg_attribute_always_inline bool StartReadBuffersImpl(ReadBuffersOperation *operation, Buffer *buffers, BlockNumber blockNum, int *nblocks, int flags, bool allow_forwarding)
Definition: bufmgr.c:1255

References Assert(), PrivateRefCountEntry::buffer, and StartReadBuffersImpl().

Referenced by read_stream_next_buffer(), and ReadBuffer_common().
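
A self-contained sketch of the single-block API (helper name ours), following the same steps as ReadBuffer_common(): fill in the operation's target fields, start the read, and wait only if an IO was actually started:

/* Hypothetical: pin and release blocks [blkno, blkno + want) one by one. */
static void
toy_read_range(Relation rel, BlockNumber blkno, int want)
{
	for (int i = 0; i < want; i++)
	{
		ReadBuffersOperation op;
		Buffer		buf;

		op.rel = rel;
		op.smgr = RelationGetSmgr(rel);
		op.persistence = rel->rd_rel->relpersistence;
		op.forknum = MAIN_FORKNUM;
		op.strategy = NULL;

		/* we wait right away, so request synchronous execution */
		if (StartReadBuffer(&op, &buf, blkno + i, READ_BUFFERS_SYNCHRONOUSLY))
			WaitReadBuffers(&op);	/* an IO was started: wait for it */

		/* ... use the now-valid buffer here ... */
		ReleaseBuffer(buf);
	}
}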

◆ StartReadBuffers()

bool StartReadBuffers ( ReadBuffersOperation operation,
Buffer buffers,
BlockNumber  blockNum,
int *  nblocks,
int  flags 
)

Definition at line 1487 of file bufmgr.c.

1492{
1493 return StartReadBuffersImpl(operation, buffers, blockNum, nblocks, flags,
1494 true /* expect forwarded buffers */ );
1495}

References StartReadBuffersImpl().

Referenced by read_stream_start_pending_read().

◆ StartReadBuffersImpl()

static pg_attribute_always_inline bool StartReadBuffersImpl ( ReadBuffersOperation operation,
Buffer buffers,
BlockNumber  blockNum,
int *  nblocks,
int  flags,
bool  allow_forwarding 
)
static

Definition at line 1255 of file bufmgr.c.

1261{
1262 int actual_nblocks = *nblocks;
1263 int maxcombine = 0;
1264 bool did_start_io;
1265
1266 Assert(*nblocks == 1 || allow_forwarding);
1267 Assert(*nblocks > 0);
1268 Assert(*nblocks <= MAX_IO_COMBINE_LIMIT);
1269
1270 for (int i = 0; i < actual_nblocks; ++i)
1271 {
1272 bool found;
1273
1274 if (allow_forwarding && buffers[i] != InvalidBuffer)
1275 {
1276 BufferDesc *bufHdr;
1277
1278 /*
1279 * This is a buffer that was pinned by an earlier call to
1280 * StartReadBuffers(), but couldn't be handled in one operation at
1281 * that time. The operation was split, and the caller has passed
1282 * an already pinned buffer back to us to handle the rest of the
1283 * operation. It must continue at the expected block number.
1284 */
1285 Assert(BufferGetBlockNumber(buffers[i]) == blockNum + i);
1286
1287 /*
1288 * It might be an already valid buffer (a hit) that followed the
1289 * final contiguous block of an earlier I/O (a miss) marking the
1290 * end of it, or a buffer that some other backend has since made
1291 * valid by performing the I/O for us, in which case we can handle
1292 * it as a hit now. It is safe to check for a BM_VALID flag with
1293 * a relaxed load, because we got a fresh view of it while pinning
1294 * it in the previous call.
1295 *
1296 * On the other hand if we don't see BM_VALID yet, it must be an
1297 * I/O that was split by the previous call and we need to try to
1298 * start a new I/O from this block. We're also racing against any
1299 * other backend that might start the I/O or even manage to mark
1300 * it BM_VALID after this check, but StartBufferIO() will handle
1301 * those cases.
1302 */
1303 if (BufferIsLocal(buffers[i]))
1304 bufHdr = GetLocalBufferDescriptor(-buffers[i] - 1);
1305 else
1306 bufHdr = GetBufferDescriptor(buffers[i] - 1);
1307 Assert(pg_atomic_read_u32(&bufHdr->state) & BM_TAG_VALID);
1308 found = pg_atomic_read_u32(&bufHdr->state) & BM_VALID;
1309 }
1310 else
1311 {
1312 buffers[i] = PinBufferForBlock(operation->rel,
1313 operation->smgr,
1314 operation->persistence,
1315 operation->forknum,
1316 blockNum + i,
1317 operation->strategy,
1318 &found);
1319 }
1320
1321 if (found)
1322 {
1323 /*
1324 * We have a hit. If it's the first block in the requested range,
1325 * we can return it immediately and report that WaitReadBuffers()
1326 * does not need to be called. If the initial value of *nblocks
1327 * was larger, the caller will have to call again for the rest.
1328 */
1329 if (i == 0)
1330 {
1331 *nblocks = 1;
1332
1333#ifdef USE_ASSERT_CHECKING
1334
1335 /*
1336 * Initialize enough of ReadBuffersOperation to make
1337 * CheckReadBuffersOperation() work. Outside of assertions
1338 * that's not necessary when no IO is issued.
1339 */
1340 operation->buffers = buffers;
1341 operation->blocknum = blockNum;
1342 operation->nblocks = 1;
1343 operation->nblocks_done = 1;
1344 CheckReadBuffersOperation(operation, true);
1345#endif
1346 return false;
1347 }
1348
1349 /*
1350 * Otherwise we already have an I/O to perform, but this block
1351 * can't be included as it is already valid. Split the I/O here.
1352 * There may or may not be more blocks requiring I/O after this
1353 * one, we haven't checked, but they can't be contiguous with this
1354 * one in the way. We'll leave this buffer pinned, forwarding it
1355 * to the next call, avoiding the need to unpin it here and re-pin
1356 * it in the next call.
1357 */
1358 actual_nblocks = i;
1359 break;
1360 }
1361 else
1362 {
1363 /*
1364 * Check how many blocks we can cover with the same IO. The smgr
1365 * implementation might e.g. be limited due to a segment boundary.
1366 */
1367 if (i == 0 && actual_nblocks > 1)
1368 {
1369 maxcombine = smgrmaxcombine(operation->smgr,
1370 operation->forknum,
1371 blockNum);
1372 if (unlikely(maxcombine < actual_nblocks))
1373 {
1374 elog(DEBUG2, "limiting nblocks at %u from %u to %u",
1375 blockNum, actual_nblocks, maxcombine);
1376 actual_nblocks = maxcombine;
1377 }
1378 }
1379 }
1380 }
1381 *nblocks = actual_nblocks;
1382
1383 /* Populate information needed for I/O. */
1384 operation->buffers = buffers;
1385 operation->blocknum = blockNum;
1386 operation->flags = flags;
1387 operation->nblocks = actual_nblocks;
1388 operation->nblocks_done = 0;
1389 pgaio_wref_clear(&operation->io_wref);
1390
1391 /*
1392 * When using AIO, start the IO in the background. If not, issue prefetch
1393 * requests if desired by the caller.
1394 *
1395 * The reason we have a dedicated path for IOMETHOD_SYNC here is to
1396 * de-risk the introduction of AIO somewhat. It's a large architectural
1397 * change, with lots of chances for unanticipated performance effects.
1398 *
1399 * Use of IOMETHOD_SYNC already leads to not actually performing IO
1400 * asynchronously, but without the check here we'd execute IO earlier than
1401 * we used to. Eventually this IOMETHOD_SYNC specific path should go away.
1402 */
1403 if (io_method != IOMETHOD_SYNC)
1404 {
1405 /*
1406 * Try to start IO asynchronously. It's possible that no IO needs to
1407 * be started, if another backend already performed the IO.
1408 *
1409 * Note that if an IO is started, it might not cover the entire
1410 * requested range, e.g. because an intermediary block has been read
1411 * in by another backend. In that case any "trailing" buffers we
1412 * already pinned above will be "forwarded" by read_stream.c to the
1413 * next call to StartReadBuffers().
1414 *
1415 * This is signalled to the caller by decrementing *nblocks *and*
1416 * reducing operation->nblocks. The latter is done here, but not below
1417 * WaitReadBuffers(), as in WaitReadBuffers() we can't "shorten" the
1418 * overall read size anymore, we need to retry until done in its
1419 * entirety or until failed.
1420 */
1421 did_start_io = AsyncReadBuffers(operation, nblocks);
1422
1423 operation->nblocks = *nblocks;
1424 }
1425 else
1426 {
1427 operation->flags |= READ_BUFFERS_SYNCHRONOUSLY;
1428
1429 if (flags & READ_BUFFERS_ISSUE_ADVICE)
1430 {
1431 /*
1432 * In theory we should only do this if PinBufferForBlock() had to
1433 * allocate new buffers above. That way, if two calls to
1434 * StartReadBuffers() were made for the same blocks before
1435 * WaitReadBuffers(), only the first would issue the advice.
1436 * That'd be a better simulation of true asynchronous I/O, which
1437 * would only start the I/O once, but isn't done here for
1438 * simplicity.
1439 */
1440 smgrprefetch(operation->smgr,
1441 operation->forknum,
1442 blockNum,
1443 actual_nblocks);
1444 }
1445
1446 /*
1447 * Indicate that WaitReadBuffers() should be called. WaitReadBuffers()
1448 * will initiate the necessary IO.
1449 */
1450 did_start_io = true;
1451 }
1452
1453 CheckReadBuffersOperation(operation, !did_start_io);
1454
1455 return did_start_io;
1456}
int io_method
Definition: aio.c:77
@ IOMETHOD_SYNC
Definition: aio.h:34
static void CheckReadBuffersOperation(ReadBuffersOperation *operation, bool is_complete)
Definition: bufmgr.c:1525
static bool AsyncReadBuffers(ReadBuffersOperation *operation, int *nblocks_progress)
Definition: bufmgr.c:1762
#define READ_BUFFERS_ISSUE_ADVICE
Definition: bufmgr.h:114
uint32 smgrmaxcombine(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
Definition: smgr.c:697

References Assert(), AsyncReadBuffers(), ReadBuffersOperation::blocknum, BM_TAG_VALID, BM_VALID, BufferGetBlockNumber(), BufferIsLocal, ReadBuffersOperation::buffers, CheckReadBuffersOperation(), DEBUG2, elog, ReadBuffersOperation::flags, ReadBuffersOperation::forknum, GetBufferDescriptor(), GetLocalBufferDescriptor(), i, InvalidBuffer, io_method, ReadBuffersOperation::io_wref, IOMETHOD_SYNC, MAX_IO_COMBINE_LIMIT, ReadBuffersOperation::nblocks, ReadBuffersOperation::nblocks_done, ReadBuffersOperation::persistence, pg_atomic_read_u32(), pgaio_wref_clear(), PinBufferForBlock(), READ_BUFFERS_ISSUE_ADVICE, READ_BUFFERS_SYNCHRONOUSLY, ReadBuffersOperation::rel, ReadBuffersOperation::smgr, smgrmaxcombine(), smgrprefetch(), BufferDesc::state, ReadBuffersOperation::strategy, and unlikely.

Referenced by StartReadBuffer(), and StartReadBuffers().

◆ SyncOneBuffer()

static int SyncOneBuffer ( int  buf_id,
bool  skip_recently_used,
WritebackContext wb_context 
)
static

Definition at line 3916 of file bufmgr.c.

3917{
3918 BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
3919 int result = 0;
3920 uint32 buf_state;
3921 BufferTag tag;
3922
3923 /* Make sure we can handle the pin */
3924 ReservePrivateRefCountEntry();
3925 ResourceOwnerEnlarge(CurrentResourceOwner);
3926
3927 /*
3928 * Check whether buffer needs writing.
3929 *
3930 * We can make this check without taking the buffer content lock so long
3931 * as we mark pages dirty in access methods *before* logging changes with
3932 * XLogInsert(): if someone marks the buffer dirty just after our check we
3933 * don't worry, because our checkpoint.redo points before the log record for
3934 * the upcoming changes, and so we are not required to write such a dirty buffer.
3935 */
3936 buf_state = LockBufHdr(bufHdr);
3937
3938 if (BUF_STATE_GET_REFCOUNT(buf_state) == 0 &&
3939 BUF_STATE_GET_USAGECOUNT(buf_state) == 0)
3940 {
3941 result |= BUF_REUSABLE;
3942 }
3943 else if (skip_recently_used)
3944 {
3945 /* Caller told us not to write recently-used buffers */
3946 UnlockBufHdr(bufHdr, buf_state);
3947 return result;
3948 }
3949
3950 if (!(buf_state & BM_VALID) || !(buf_state & BM_DIRTY))
3951 {
3952 /* It's clean, so nothing to do */
3953 UnlockBufHdr(bufHdr, buf_state);
3954 return result;
3955 }
3956
3957 /*
3958 * Pin it, share-lock it, write it. (FlushBuffer will do nothing if the
3959 * buffer is clean by the time we've locked it.)
3960 */
3961 PinBuffer_Locked(bufHdr);
3962 LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
3963
3964 FlushBuffer(bufHdr, NULL, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
3965
3966 LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
3967
3968 tag = bufHdr->tag;
3969
3970 UnpinBuffer(bufHdr);
3971
3972 /*
3973 * SyncOneBuffer() is only called by checkpointer and bgwriter, so
3974 * IOContext will always be IOCONTEXT_NORMAL.
3975 */
3976 ScheduleBufferTagForWriteback(wb_context, IOCONTEXT_NORMAL, &tag);
3977
3978 return result | BUF_WRITTEN;
3979}

References BM_DIRTY, BM_VALID, BUF_REUSABLE, BUF_STATE_GET_REFCOUNT, BUF_STATE_GET_USAGECOUNT, BUF_WRITTEN, BufferDescriptorGetContentLock(), CurrentResourceOwner, FlushBuffer(), GetBufferDescriptor(), IOCONTEXT_NORMAL, IOOBJECT_RELATION, LockBufHdr(), LW_SHARED, LWLockAcquire(), LWLockRelease(), PinBuffer_Locked(), ReservePrivateRefCountEntry(), ResourceOwnerEnlarge(), ScheduleBufferTagForWriteback(), BufferDesc::tag, UnlockBufHdr(), and UnpinBuffer().

Referenced by BgBufferSync(), and BufferSync().

◆ TerminateBufferIO()

void TerminateBufferIO ( BufferDesc buf,
bool  clear_dirty,
uint32  set_flag_bits,
bool  forget_owner,
bool  release_aio 
)

Definition at line 6032 of file bufmgr.c.

6034{
6035 uint32 buf_state;
6036
6037 buf_state = LockBufHdr(buf);
6038
6039 Assert(buf_state & BM_IO_IN_PROGRESS);
6040 buf_state &= ~BM_IO_IN_PROGRESS;
6041
6042 /* Clear earlier errors, if this IO failed, it'll be marked again */
6043 buf_state &= ~BM_IO_ERROR;
6044
6045 if (clear_dirty && !(buf_state & BM_JUST_DIRTIED))
6046 buf_state &= ~(BM_DIRTY | BM_CHECKPOINT_NEEDED);
6047
6048 if (release_aio)
6049 {
6050 /* release ownership by the AIO subsystem */
6051 Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
6052 buf_state -= BUF_REFCOUNT_ONE;
6053 pgaio_wref_clear(&buf->io_wref);
6054 }
6055
6056 buf_state |= set_flag_bits;
6057 UnlockBufHdr(buf, buf_state);
6058
6059 if (forget_owner)
6060 ResourceOwnerForgetBufferIO(CurrentResourceOwner,
6061 BufferDescriptorGetBuffer(buf));
6062
6063 ConditionVariableBroadcast(BufferDescriptorGetIOCV(buf));
6064
6065 /*
6066 * Support LockBufferForCleanup()
6067 *
6068 * We may have just released the last pin other than the waiter's. In most
6069 * cases, this backend holds another pin on the buffer. But, if, for
6070 * example, this backend is completing an IO issued by another backend, it
6071 * may be time to wake the waiter.
6072 */
6073 if (release_aio && (buf_state & BM_PIN_COUNT_WAITER))
6074 WakePinCountWaiter(buf);
6075}
static ConditionVariable * BufferDescriptorGetIOCV(const BufferDesc *bdesc)
static void WakePinCountWaiter(BufferDesc *buf)
Definition: bufmgr.c:3222
void ConditionVariableBroadcast(ConditionVariable *cv)

References Assert(), BM_CHECKPOINT_NEEDED, BM_DIRTY, BM_IO_IN_PROGRESS, BM_JUST_DIRTIED, BM_PIN_COUNT_WAITER, buf, BUF_REFCOUNT_ONE, BUF_STATE_GET_REFCOUNT, BufferDescriptorGetBuffer(), BufferDescriptorGetIOCV(), ConditionVariableBroadcast(), CurrentResourceOwner, LockBufHdr(), pgaio_wref_clear(), ResourceOwnerForgetBufferIO(), UnlockBufHdr(), and WakePinCountWaiter().

Referenced by AbortBufferIO(), buffer_call_terminate_io(), buffer_readv_complete_one(), ExtendBufferedRelShared(), FlushBuffer(), and ZeroAndLockBuffer().

◆ ts_ckpt_progress_comparator()

static int ts_ckpt_progress_comparator ( Datum  a,
Datum  b,
void *  arg 
)
static

Definition at line 6308 of file bufmgr.c.

6309{
6310 CkptTsStatus *sa = (CkptTsStatus *) a;
6311 CkptTsStatus *sb = (CkptTsStatus *) b;
6312
6313 /* we want a min-heap, so return 1 when a < b */
6314 if (sa->progress < sb->progress)
6315 return 1;
6316 else if (sa->progress == sb->progress)
6317 return 0;
6318 else
6319 return -1;
6320}

References a, b, and CkptTsStatus::progress.

Referenced by BufferSync().

◆ UnlockBuffers()

void UnlockBuffers ( void  )

Definition at line 5509 of file bufmgr.c.

5510{
5511 BufferDesc *buf = PinCountWaitBuf;
5512
5513 if (buf)
5514 {
5515 uint32 buf_state;
5516
5517 buf_state = LockBufHdr(buf);
5518
5519 /*
5520 * Don't complain if flag bit not set; it could have been reset but we
5521 * got a cancel/die interrupt before getting the signal.
5522 */
5523 if ((buf_state & BM_PIN_COUNT_WAITER) != 0 &&
5524 buf->wait_backend_pgprocno == MyProcNumber)
5525 buf_state &= ~BM_PIN_COUNT_WAITER;
5526
5527 UnlockBufHdr(buf, buf_state);
5528
5529 PinCountWaitBuf = NULL;
5530 }
5531}

References BM_PIN_COUNT_WAITER, buf, LockBufHdr(), MyProcNumber, PinCountWaitBuf, and UnlockBufHdr().

Referenced by AbortSubTransaction(), AbortTransaction(), AtProcExit_Buffers(), AutoVacLauncherMain(), BackgroundWriterMain(), CheckpointerMain(), and WalWriterMain().

◆ UnlockReleaseBuffer()

void UnlockReleaseBuffer ( Buffer  buffer)

Definition at line 5320 of file bufmgr.c.

5321{
5322 LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
5323 ReleaseBuffer(buffer);
5324}

References PrivateRefCountEntry::buffer, BUFFER_LOCK_UNLOCK, LockBuffer(), and ReleaseBuffer().

Referenced by _bt_clear_incomplete_split(), _bt_restore_meta(), _hash_relbuf(), allocNewBuffer(), AlterSequence(), blbulkdelete(), blgetbitmap(), blinsert(), BloomInitMetapage(), blvacuumcleanup(), brin_doinsert(), brin_doupdate(), brin_evacuate_page(), brin_getinsertbuffer(), brin_xlog_createidx(), brin_xlog_desummarize_page(), brin_xlog_insert_update(), brin_xlog_revmap_extend(), brin_xlog_samepage_update(), brin_xlog_update(), brinbuild(), brinbuildempty(), brinGetStats(), brinRevmapDesummarizeRange(), bt_metap(), bt_multi_page_stats(), bt_page_items_internal(), bt_page_stats_internal(), bt_recheck_sibling_links(), btree_xlog_dedup(), btree_xlog_delete(), btree_xlog_insert(), btree_xlog_mark_page_halfdead(), btree_xlog_newroot(), btree_xlog_split(), btree_xlog_unlink_page(), btree_xlog_vacuum(), collect_corrupt_items(), collect_visibility_data(), count_nondeletable_pages(), createPostingTree(), do_setval(), doPickSplit(), entryLoadMoreItems(), fill_seq_fork_with_data(), flushCachedPage(), FreeSpaceMapPrepareTruncateRel(), fsm_search(), fsm_set_and_search(), generic_redo(), gin_refind_parent(), ginbuild(), ginbuildempty(), ginbulkdelete(), ginGetStats(), ginHeapTupleFastInsert(), ginInsertCleanup(), ginPlaceToPage(), ginRedoClearIncompleteSplit(), ginRedoCreatePTree(), ginRedoDeleteListPages(), ginRedoDeletePage(), ginRedoInsert(), ginRedoInsertListPage(), ginRedoSplit(), ginRedoUpdateMetapage(), ginRedoVacuumDataLeafPage(), ginRedoVacuumPage(), ginScanToDelete(), ginStepRight(), ginUpdateStats(), ginvacuumcleanup(), ginVacuumPostingTree(), ginVacuumPostingTreeLeaves(), gistbufferinginserttuples(), gistbuild(), gistbuildempty(), gistdoinsert(), gistFindCorrectParent(), gistFindPath(), gistGetMaxLevel(), gistinserttuples(), gistkillitems(), gistplacetopage(), gistProcessItup(), gistRedoClearFollowRight(), gistRedoDeleteRecord(), gistRedoPageDelete(), gistRedoPageSplitRecord(), gistRedoPageUpdateRecord(), gistScanPage(), gistvacuum_delete_empty_pages(), gistvacuumpage(), hash_xlog_add_ovfl_page(), hash_xlog_delete(), hash_xlog_init_bitmap_page(), hash_xlog_init_meta_page(), hash_xlog_insert(), hash_xlog_move_page_contents(), hash_xlog_split_allocate_page(), hash_xlog_split_cleanup(), hash_xlog_split_complete(), hash_xlog_split_page(), hash_xlog_squeeze_page(), hash_xlog_update_meta_page(), hash_xlog_vacuum_one_page(), heap_delete(), heap_finish_speculative(), heap_force_common(), heap_get_latest_tid(), heap_index_delete_tuples(), heap_insert(), heap_lock_updated_tuple_rec(), heap_multi_insert(), heap_update(), heap_xlog_confirm(), heap_xlog_delete(), heap_xlog_inplace(), heap_xlog_insert(), heap_xlog_lock(), heap_xlog_lock_updated(), heap_xlog_multi_insert(), heap_xlog_prune_freeze(), heap_xlog_update(), heap_xlog_visible(), heapam_scan_analyze_next_tuple(), initBloomState(), lazy_scan_heap(), lazy_scan_new_or_empty(), lazy_vacuum_heap_rel(), log_newpage_range(), moveLeafs(), nextval_internal(), palloc_btree_page(), pg_get_sequence_data(), pg_sequence_last_value(), pg_visibility(), pgstat_gist_page(), pgstat_heap(), pgstatginindex_internal(), pgstathashindex(), RelationCopyStorageUsingBuffer(), RelationGetBufferForTuple(), ResetSequence(), revmap_physical_extend(), scanGetCandidate(), scanPendingInsert(), scanPostingTree(), ScanSourceDatabasePgClass(), seq_redo(), SequenceChangePersistence(), shiftList(), spgAddNodeAction(), spgbuild(), spgdoinsert(), spgGetCache(), SpGistGetBuffer(), SpGistUpdateMetaPage(), spgMatchNodeAction(), spgprocesspending(), spgRedoAddLeaf(), spgRedoAddNode(), 
spgRedoMoveLeafs(), spgRedoPickSplit(), spgRedoSplitTuple(), spgRedoVacuumLeaf(), spgRedoVacuumRedirect(), spgRedoVacuumRoot(), spgSplitNodeAction(), spgvacuumpage(), spgWalk(), statapprox_heap(), verify_heapam(), verifyBackupPageConsistency(), visibilitymap_prepare_truncate(), writeListPage(), xlog_redo(), and XLogRecordPageWithFreeSpace().

◆ UnpinBuffer()

◆ UnpinBufferNoOwner()

static void UnpinBufferNoOwner ( BufferDesc buf)
static

Definition at line 3266 of file bufmgr.c.

3267{
3268 PrivateRefCountEntry *ref;
3269 Buffer b = BufferDescriptorGetBuffer(buf);
3270
3271 Assert(!BufferIsLocal(b));
3272
3273 /* not moving as we're likely deleting it soon anyway */
3274 ref = GetPrivateRefCountEntry(b, false);
3275 Assert(ref != NULL);
3276 Assert(ref->refcount > 0);
3277 ref->refcount--;
3278 if (ref->refcount == 0)
3279 {
3280 uint32 buf_state;
3281 uint32 old_buf_state;
3282
3283 /*
3284 * Mark buffer non-accessible to Valgrind.
3285 *
3286 * Note that the buffer may have already been marked non-accessible
3287 * within access method code that enforces that buffers are only
3288 * accessed while a buffer lock is held.
3289 */
3290 VALGRIND_MAKE_MEM_NOACCESS(BufHdrGetBlock(buf), BLCKSZ);
3291
3292 /* I'd better not still hold the buffer content lock */
3293 Assert(!LWLockHeldByMe(BufferDescriptorGetContentLock(buf)));
3294
3295 /*
3296 * Decrement the shared reference count.
3297 *
3298 * Since buffer spinlock holder can update status using just write,
3299 * it's not safe to use atomic decrement here; thus use a CAS loop.
3300 */
3301 old_buf_state = pg_atomic_read_u32(&buf->state);
3302 for (;;)
3303 {
3304 if (old_buf_state & BM_LOCKED)
3305 old_buf_state = WaitBufHdrUnlocked(buf);
3306
3307 buf_state = old_buf_state;
3308
3309 buf_state -= BUF_REFCOUNT_ONE;
3310
3311 if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state,
3312 buf_state))
3313 break;
3314 }
3315
3316 /* Support LockBufferForCleanup() */
3317 if (buf_state & BM_PIN_COUNT_WAITER)
3318 WakePinCountWaiter(buf);
3319
3320 ForgetPrivateRefCountEntry(ref);
3321 }
3322}
static void ForgetPrivateRefCountEntry(PrivateRefCountEntry *ref)
Definition: bufmgr.c:445

References Assert(), b, BM_LOCKED, BM_PIN_COUNT_WAITER, buf, BUF_REFCOUNT_ONE, BufferDescriptorGetBuffer(), BufferDescriptorGetContentLock(), BufferIsLocal, BufHdrGetBlock, ForgetPrivateRefCountEntry(), GetPrivateRefCountEntry(), LWLockHeldByMe(), pg_atomic_compare_exchange_u32(), pg_atomic_read_u32(), PrivateRefCountEntry::refcount, VALGRIND_MAKE_MEM_NOACCESS, WaitBufHdrUnlocked(), and WakePinCountWaiter().

Referenced by ResOwnerReleaseBufferPin(), and UnpinBuffer().

◆ WaitBufHdrUnlocked()

static uint32 WaitBufHdrUnlocked ( BufferDesc buf)
static

Definition at line 6219 of file bufmgr.c.

6220{
6221 SpinDelayStatus delayStatus;
6222 uint32 buf_state;
6223
6224 init_local_spin_delay(&delayStatus);
6225
6226 buf_state = pg_atomic_read_u32(&buf->state);
6227
6228 while (buf_state & BM_LOCKED)
6229 {
6230 perform_spin_delay(&delayStatus);
6231 buf_state = pg_atomic_read_u32(&buf->state);
6232 }
6233
6234 finish_spin_delay(&delayStatus);
6235
6236 return buf_state;
6237}

References BM_LOCKED, buf, finish_spin_delay(), init_local_spin_delay, perform_spin_delay(), and pg_atomic_read_u32().

Referenced by MarkBufferDirty(), PinBuffer(), and UnpinBufferNoOwner().

◆ WaitIO()

static void WaitIO ( BufferDesc buf)
static

Definition at line 5896 of file bufmgr.c.

5897{
5898 ConditionVariable *cv = BufferDescriptorGetIOCV(buf);
5899
5900 ConditionVariablePrepareToSleep(cv);
5901 for (;;)
5902 {
5903 uint32 buf_state;
5904 PgAioWaitRef iow;
5905
5906 /*
5907 * It may not be necessary to acquire the spinlock to check the flag
5908 * here, but since this test is essential for correctness, we'd better
5909 * play it safe.
5910 */
5911 buf_state = LockBufHdr(buf);
5912
5913 /*
5914 * Copy the wait reference while holding the spinlock. This protects
5915 * against a concurrent TerminateBufferIO() in another backend from
5916 * clearing the wref while it's being read.
5917 */
5918 iow = buf->io_wref;
5919 UnlockBufHdr(buf, buf_state);
5920
5921 /* no IO in progress, we don't need to wait */
5922 if (!(buf_state & BM_IO_IN_PROGRESS))
5923 break;
5924
5925 /*
5926 * The buffer has asynchronous IO in progress, wait for it to
5927 * complete.
5928 */
5929 if (pgaio_wref_valid(&iow))
5930 {
5931 pgaio_wref_wait(&iow);
5932
5933 /*
5934 * The AIO subsystem internally uses condition variables and thus
5935 * might remove this backend from the BufferDesc's CV. While that
5936 * wouldn't cause a correctness issue (the first CV sleep just
5937 * immediately returns if not already registered), it seems worth
5938 * avoiding unnecessary loop iterations, given that we take care
5939 * to do so at the start of the function.
5940 */
5941 ConditionVariablePrepareToSleep(cv);
5942 continue;
5943 }
5944
5945 /* wait on BufferDesc->cv, e.g. for concurrent synchronous IO */
5946 ConditionVariableSleep(cv, WAIT_EVENT_BUFFER_IO);
5947 }
5948 ConditionVariableCancelSleep();
5949}
void pgaio_wref_wait(PgAioWaitRef *iow)
Definition: aio.c:893
bool ConditionVariableCancelSleep(void)
void ConditionVariablePrepareToSleep(ConditionVariable *cv)
void ConditionVariableSleep(ConditionVariable *cv, uint32 wait_event_info)

References BM_IO_IN_PROGRESS, buf, BufferDescriptorGetIOCV(), ConditionVariableCancelSleep(), ConditionVariablePrepareToSleep(), ConditionVariableSleep(), LockBufHdr(), pgaio_wref_valid(), pgaio_wref_wait(), and UnlockBufHdr().

Referenced by InvalidateBuffer(), and StartBufferIO().

◆ WaitReadBuffers()

void WaitReadBuffers ( ReadBuffersOperation operation)

Definition at line 1630 of file bufmgr.c.

1631{
1632 PgAioReturn *aio_ret = &operation->io_return;
1633 IOContext io_context;
1634 IOObject io_object;
1635
1636 if (operation->persistence == RELPERSISTENCE_TEMP)
1637 {
1638 io_context = IOCONTEXT_NORMAL;
1639 io_object = IOOBJECT_TEMP_RELATION;
1640 }
1641 else
1642 {
1643 io_context = IOContextForStrategy(operation->strategy);
1644 io_object = IOOBJECT_RELATION;
1645 }
1646
1647 /*
1648 * If we get here without an IO operation having been issued, the
1649 * io_method == IOMETHOD_SYNC path must have been used. Otherwise the
1650 * caller should not have called WaitReadBuffers().
1651 *
1652 * In the case of IOMETHOD_SYNC, we start the IO in WaitReadBuffers(),
1653 * as we did before the introduction of AIO. This is done as part
1654 * of the retry logic below; no extra code is required.
1655 *
1656 * This path is expected to eventually go away.
1657 */
1658 if (!pgaio_wref_valid(&operation->io_wref) && io_method != IOMETHOD_SYNC)
1659 elog(ERROR, "waiting for read operation that didn't read");
1660
1661 /*
1662 * To handle partial reads, and IOMETHOD_SYNC, we re-issue IO until we're
1663 * done. We may need multiple retries, not just because we could get
1664 * multiple partial reads, but also because some of the remaining
1665 * to-be-read buffers may have been read in by other backends, limiting
1666 * the IO size.
1667 */
1668 while (true)
1669 {
1670 int ignored_nblocks_progress;
1671
1672 CheckReadBuffersOperation(operation, false);
1673
1674 /*
1675 * If there is an IO associated with the operation, we may need to
1676 * wait for it.
1677 */
1678 if (pgaio_wref_valid(&operation->io_wref))
1679 {
1680 /*
1681 * Track the time spent waiting for the IO to complete. As
1682 * tracking a wait even if we don't actually need to wait
1683 *
1684 * a) is not cheap, due to the timestamping overhead
1685 *
1686 * b) reports some time as waiting, even if we never waited
1687 *
1688 * we first check if we already know the IO is complete.
1689 */
1690 if (aio_ret->result.status == PGAIO_RS_UNKNOWN &&
1691 !pgaio_wref_check_done(&operation->io_wref))
1692 {
1693 instr_time io_start = pgstat_prepare_io_time(track_io_timing);
1694
1695 pgaio_wref_wait(&operation->io_wref);
1696
1697 /*
1698 * The IO operation itself was already counted earlier, in
1699 * AsyncReadBuffers(), this just accounts for the wait time.
1700 */
1701 pgstat_count_io_op_time(io_object, io_context, IOOP_READ,
1702 io_start, 0, 0);
1703 }
1704 else
1705 {
1706 Assert(pgaio_wref_check_done(&operation->io_wref));
1707 }
1708
1709 /*
1710 * We now are sure the IO completed. Check the results. This
1711 * includes reporting on errors if there were any.
1712 */
1713 ProcessReadBuffersResult(operation);
1714 }
1715
1716 /*
1717 * Most of the time, the one IO we already started, will read in
1718 * everything. But we need to deal with partial reads and buffers not
1719 * needing IO anymore.
1720 */
1721 if (operation->nblocks_done == operation->nblocks)
1722 break;
1723
1724 CHECK_FOR_INTERRUPTS();
1725
1726 /*
1727 * This may only complete the IO partially, either because some
1728 * buffers were already valid, or because of a partial read.
1729 *
1730 * NB: In contrast to after the AsyncReadBuffers() call in
1731 * StartReadBuffers(), we do *not* reduce
1732 * ReadBuffersOperation->nblocks here, callers expect the full
1733 * operation to be completed at this point (as more operations may
1734 * have been queued).
1735 */
1736 AsyncReadBuffers(operation, &ignored_nblocks_progress);
1737 }
1738
1739 CheckReadBuffersOperation(operation, true);
1740
1741 /* NB: READ_DONE tracepoint was already executed in completion callback */
1742}
bool pgaio_wref_check_done(PgAioWaitRef *iow)
Definition: aio.c:907
static void ProcessReadBuffersResult(ReadBuffersOperation *operation)
Definition: bufmgr.c:1591

References Assert(), AsyncReadBuffers(), CHECK_FOR_INTERRUPTS, CheckReadBuffersOperation(), elog, ERROR, io_method, ReadBuffersOperation::io_return, ReadBuffersOperation::io_wref, IOCONTEXT_NORMAL, IOContextForStrategy(), IOMETHOD_SYNC, IOOBJECT_RELATION, IOOBJECT_TEMP_RELATION, IOOP_READ, ReadBuffersOperation::nblocks, ReadBuffersOperation::nblocks_done, ReadBuffersOperation::persistence, PGAIO_RS_UNKNOWN, pgaio_wref_check_done(), pgaio_wref_valid(), pgaio_wref_wait(), pgstat_count_io_op_time(), pgstat_prepare_io_time(), ProcessReadBuffersResult(), PgAioReturn::result, PgAioResult::status, ReadBuffersOperation::strategy, and track_io_timing.

Referenced by read_stream_next_buffer(), and ReadBuffer_common().
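
The protocol above is driven from the caller side through StartReadBuffers() and WaitReadBuffers(). The fragment below is a hypothetical caller-side sketch: the struct fields and function signatures come from bufmgr.h, but rel and blocknum are assumed to be supplied by the surrounding code.

 ReadBuffersOperation op = {0};
 Buffer buffers[8];
 int nblocks = 8;

 /* caller-visible fields must be filled in before StartReadBuffers() */
 op.rel = rel;
 op.smgr = RelationGetSmgr(rel);
 op.persistence = rel->rd_rel->relpersistence;
 op.forknum = MAIN_FORKNUM;
 op.strategy = NULL;

 if (StartReadBuffers(&op, buffers, blocknum, &nblocks, 0))
     WaitReadBuffers(&op); /* wait only if IO was actually started */

 /* buffers[0 .. nblocks-1] are now pinned and valid */

Note that StartReadBuffers() may reduce nblocks if it could not combine the whole range into one operation; read_stream.c, the main consumer of this pair, loops to cover the remainder.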

◆ WakePinCountWaiter()

static void WakePinCountWaiter ( BufferDesc *  buf)
static

Definition at line 3222 of file bufmgr.c.

3223{
3224 /*
 3225 * Acquire the buffer header lock and re-check that there's a waiter. Another
3226 * backend could have unpinned this buffer, and already woken up the
3227 * waiter.
3228 *
3229 * There's no danger of the buffer being replaced after we unpinned it
3230 * above, as it's pinned by the waiter. The waiter removes
3231 * BM_PIN_COUNT_WAITER if it stops waiting for a reason other than this
3232 * backend waking it up.
3233 */
3234 uint32 buf_state = LockBufHdr(buf);
3235
3236 if ((buf_state & BM_PIN_COUNT_WAITER) &&
3237 BUF_STATE_GET_REFCOUNT(buf_state) == 1)
3238 {
3239 /* we just released the last pin other than the waiter's */
3240 int wait_backend_pgprocno = buf->wait_backend_pgprocno;
3241
3242 buf_state &= ~BM_PIN_COUNT_WAITER;
3243 UnlockBufHdr(buf, buf_state);
3244 ProcSendSignal(wait_backend_pgprocno);
3245 }
3246 else
3247 UnlockBufHdr(buf, buf_state);
3248}

References BM_PIN_COUNT_WAITER, buf, BUF_STATE_GET_REFCOUNT, LockBufHdr(), ProcSendSignal(), and UnlockBufHdr().

Referenced by TerminateBufferIO(), and UnpinBufferNoOwner().
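
For context, the unpin path that leads here looks roughly like the following sketch (modeled on UnpinBufferNoOwner(), not the verbatim source; BUF_REFCOUNT_ONE is from buf_internals.h):

 uint32 old_buf_state;

 /* drop our pin atomically, without taking the buffer header lock */
 old_buf_state = pg_atomic_fetch_sub_u32(&buf->state, BUF_REFCOUNT_ONE);

 /* if a backend is waiting for the pin count to drop, wake it */
 if (old_buf_state & BM_PIN_COUNT_WAITER)
     WakePinCountWaiter(buf);

Because WakePinCountWaiter() re-checks the flag and the refcount under the header lock, acting on a stale BM_PIN_COUNT_WAITER bit here is harmless.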

◆ WritebackContextInit()

void WritebackContextInit ( WritebackContext *  context,
int *  max_pending 
)

Definition at line 6331 of file bufmgr.c.

6332{
6333 Assert(*max_pending <= WRITEBACK_MAX_PENDING_FLUSHES);
6334
6335 context->max_pending = max_pending;
6336 context->nr_pending = 0;
6337}

References Assert(), WritebackContext::max_pending, WritebackContext::nr_pending, and WRITEBACK_MAX_PENDING_FLUSHES.

Referenced by BackgroundWriterMain(), BufferManagerShmemInit(), and BufferSync().
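
Typical usage, modeled on BufferSync() (a sketch; the checkpoint loop around the schedule call is elided):

 WritebackContext wb_context;

 WritebackContextInit(&wb_context, &checkpoint_flush_after);

 /* for each buffer written out during the checkpoint: */
 ScheduleBufferTagForWriteback(&wb_context, IOCONTEXT_NORMAL, &buf_hdr->tag);

 /* at the end, flush any writeback requests still queued */
 IssuePendingWritebacks(&wb_context, IOCONTEXT_NORMAL);

max_pending is stored as a pointer rather than a value so that a later change to the underlying GUC takes effect without re-initializing the context.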

◆ ZeroAndLockBuffer()

static void ZeroAndLockBuffer ( Buffer  buffer,
ReadBufferMode  mode,
bool  already_valid 
)
static

Definition at line 1024 of file bufmgr.c.

1025{
1026 BufferDesc *bufHdr;
1027 bool need_to_zero;
1028 bool isLocalBuf = BufferIsLocal(buffer);
1029
 1030 Assert(mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK);
 1031
1032 if (already_valid)
1033 {
1034 /*
1035 * If the caller already knew the buffer was valid, we can skip some
1036 * header interaction. The caller just wants to lock the buffer.
1037 */
1038 need_to_zero = false;
1039 }
1040 else if (isLocalBuf)
1041 {
1042 /* Simple case for non-shared buffers. */
1043 bufHdr = GetLocalBufferDescriptor(-buffer - 1);
1044 need_to_zero = StartLocalBufferIO(bufHdr, true, false);
1045 }
1046 else
1047 {
1048 /*
1049 * Take BM_IO_IN_PROGRESS, or discover that BM_VALID has been set
1050 * concurrently. Even though we aren't doing I/O, that ensures that
1051 * we don't zero a page that someone else has pinned. An exclusive
1052 * content lock wouldn't be enough, because readers are allowed to
1053 * drop the content lock after determining that a tuple is visible
1054 * (see buffer access rules in README).
1055 */
1056 bufHdr = GetBufferDescriptor(buffer - 1);
1057 need_to_zero = StartBufferIO(bufHdr, true, false);
1058 }
1059
1060 if (need_to_zero)
1061 {
1062 memset(BufferGetPage(buffer), 0, BLCKSZ);
1063
1064 /*
1065 * Grab the buffer content lock before marking the page as valid, to
1066 * make sure that no other backend sees the zeroed page before the
1067 * caller has had a chance to initialize it.
1068 *
1069 * Since no-one else can be looking at the page contents yet, there is
1070 * no difference between an exclusive lock and a cleanup-strength
1071 * lock. (Note that we cannot use LockBuffer() or
1072 * LockBufferForCleanup() here, because they assert that the buffer is
1073 * already valid.)
1074 */
1075 if (!isLocalBuf)
 1076 LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_EXCLUSIVE);
 1077
1078 /* Set BM_VALID, terminate IO, and wake up any waiters */
1079 if (isLocalBuf)
1080 TerminateLocalBufferIO(bufHdr, false, BM_VALID, false);
1081 else
1082 TerminateBufferIO(bufHdr, false, BM_VALID, true, false);
1083 }
1084 else if (!isLocalBuf)
1085 {
1086 /*
1087 * The buffer is valid, so we can't zero it. The caller still expects
1088 * the page to be locked on return.
1089 */
1090 if (mode == RBM_ZERO_AND_LOCK)
 1091 LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
 1092 else
1093 LockBufferForCleanup(buffer);
1094 }
1095}

References Assert(), BM_VALID, PrivateRefCountEntry::buffer, BUFFER_LOCK_EXCLUSIVE, BufferDescriptorGetContentLock(), BufferGetPage(), BufferIsLocal, GetBufferDescriptor(), GetLocalBufferDescriptor(), LockBuffer(), LockBufferForCleanup(), LW_EXCLUSIVE, LWLockAcquire(), mode, RBM_ZERO_AND_CLEANUP_LOCK, RBM_ZERO_AND_LOCK, StartBufferIO(), StartLocalBufferIO(), TerminateBufferIO(), and TerminateLocalBufferIO().

Referenced by ReadBuffer_common().
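
ZeroAndLockBuffer() is reached via ReadBuffer_common() when a caller passes RBM_ZERO_AND_LOCK or RBM_ZERO_AND_CLEANUP_LOCK. An illustrative caller fragment (a sketch that assumes the caller knows blkno holds a brand-new page it is entitled to overwrite):

 Buffer buf;
 Page page;

 buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno,
                          RBM_ZERO_AND_LOCK, NULL);

 /* buffer is returned pinned and exclusively locked; if it was not
  * already valid it has been zero-filled, so initialize it */
 page = BufferGetPage(buf);
 PageInit(page, BufferGetPageSize(buf), 0);
 MarkBufferDirty(buf);
 UnlockReleaseBuffer(buf);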

Variable Documentation

◆ aio_local_buffer_readv_cb

const PgAioHandleCallbacks aio_local_buffer_readv_cb
Initial value:
= {
 .stage = local_buffer_readv_stage,
 .complete_local = local_buffer_readv_complete,
 .report = buffer_readv_report,
}

Definition at line 7376 of file bufmgr.c.

◆ aio_shared_buffer_readv_cb

const PgAioHandleCallbacks aio_shared_buffer_readv_cb
Initial value:
= {
 .stage = shared_buffer_readv_stage,
 .complete_shared = shared_buffer_readv_complete,
 .complete_local = shared_buffer_readv_complete_local,
 .report = buffer_readv_report,
}

Definition at line 7367 of file bufmgr.c.
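
These callback sets are attached to an AIO handle when a buffer read is staged, roughly as below (a sketch modeled on AsyncReadBuffers(); the flags value and surrounding variables are illustrative assumptions):

 PgAioHandle *ioh;

 ioh = pgaio_io_acquire(CurrentResourceOwner, &operation->io_return);
 if (persistence == RELPERSISTENCE_TEMP)
     pgaio_io_register_callbacks(ioh, PGAIO_HCB_LOCAL_BUFFER_READV, flags);
 else
     pgaio_io_register_callbacks(ioh, PGAIO_HCB_SHARED_BUFFER_READV, flags);

The stage callback runs when the IO is staged and the complete/report callbacks when it finishes, which is why WaitReadBuffers() above only has to wait for the handle and inspect the stored result.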

◆ backend_flush_after

int backend_flush_after = DEFAULT_BACKEND_FLUSH_AFTER

Definition at line 177 of file bufmgr.c.

Referenced by BufferManagerShmemInit().

◆ bgwriter_flush_after

int bgwriter_flush_after = DEFAULT_BGWRITER_FLUSH_AFTER

Definition at line 176 of file bufmgr.c.

Referenced by BackgroundWriterMain().

◆ bgwriter_lru_maxpages

int bgwriter_lru_maxpages = 100

Definition at line 142 of file bufmgr.c.

Referenced by BgBufferSync().

◆ bgwriter_lru_multiplier

double bgwriter_lru_multiplier = 2.0

Definition at line 143 of file bufmgr.c.

Referenced by BgBufferSync().

◆ buffer_io_resowner_desc

const ResourceOwnerDesc buffer_io_resowner_desc
Initial value:
=
{
 .name = "buffer io",
 .release_phase = RESOURCE_RELEASE_BEFORE_LOCKS,
 .release_priority = RELEASE_PRIO_BUFFER_IOS,
 .ReleaseResource = ResOwnerReleaseBufferIO,
 .DebugPrint = ResOwnerPrintBufferIO
}

Definition at line 232 of file bufmgr.c.

Referenced by ResourceOwnerForgetBufferIO(), and ResourceOwnerRememberBufferIO().

◆ buffer_pin_resowner_desc

const ResourceOwnerDesc buffer_pin_resowner_desc
Initial value:
=
{
 .name = "buffer pin",
 .release_phase = RESOURCE_RELEASE_BEFORE_LOCKS,
 .release_priority = RELEASE_PRIO_BUFFER_PINS,
 .ReleaseResource = ResOwnerReleaseBufferPin,
 .DebugPrint = ResOwnerPrintBufferPin
}

Definition at line 241 of file bufmgr.c.

Referenced by ResourceOwnerForgetBuffer(), and ResourceOwnerRememberBuffer().

◆ checkpoint_flush_after

int checkpoint_flush_after = DEFAULT_CHECKPOINT_FLUSH_AFTER

Definition at line 175 of file bufmgr.c.

Referenced by BufferSync().

◆ effective_io_concurrency

◆ io_combine_limit

◆ io_combine_limit_guc

int io_combine_limit_guc = DEFAULT_IO_COMBINE_LIMIT

Definition at line 168 of file bufmgr.c.

Referenced by assign_io_combine_limit(), and assign_io_max_combine_limit().

◆ io_max_combine_limit

◆ maintenance_io_concurrency

◆ MaxProportionalPins

uint32 MaxProportionalPins
static

Definition at line 218 of file bufmgr.c.

Referenced by GetAdditionalPinLimit(), GetPinLimit(), and InitBufferManagerAccess().

◆ PinCountWaitBuf

BufferDesc* PinCountWaitBuf = NULL
static

Definition at line 180 of file bufmgr.c.

Referenced by LockBufferForCleanup(), and UnlockBuffers().

◆ PrivateRefCountArray

◆ PrivateRefCountClock

uint32 PrivateRefCountClock = 0
static

Definition at line 215 of file bufmgr.c.

Referenced by ReservePrivateRefCountEntry().

◆ PrivateRefCountHash

◆ PrivateRefCountOverflowed

◆ ReservedRefCountEntry

◆ track_io_timing

◆ zero_damaged_pages

bool zero_damaged_pages = false

Definition at line 141 of file bufmgr.c.

Referenced by AsyncReadBuffers(), mdreadv(), and read_rel_block_ll().