PostgreSQL Source Code  git master
bufmgr.c File Reference
#include "postgres.h"
#include <sys/file.h>
#include <unistd.h>
#include "access/tableam.h"
#include "access/xloginsert.h"
#include "access/xlogutils.h"
#include "catalog/storage.h"
#include "catalog/storage_xlog.h"
#include "executor/instrument.h"
#include "lib/binaryheap.h"
#include "miscadmin.h"
#include "pg_trace.h"
#include "pgstat.h"
#include "postmaster/bgwriter.h"
#include "storage/buf_internals.h"
#include "storage/bufmgr.h"
#include "storage/fd.h"
#include "storage/ipc.h"
#include "storage/lmgr.h"
#include "storage/proc.h"
#include "storage/read_stream.h"
#include "storage/smgr.h"
#include "storage/standby.h"
#include "utils/memdebug.h"
#include "utils/ps_status.h"
#include "utils/rel.h"
#include "utils/resowner.h"
#include "utils/timestamp.h"
#include <lib/sort_template.h>

Data Structures

struct  PrivateRefCountEntry
 
struct  CkptTsStatus
 
struct  SMgrSortArray
 

Macros

#define BufHdrGetBlock(bufHdr)   ((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ))
 
#define BufferGetLSN(bufHdr)   (PageGetLSN(BufHdrGetBlock(bufHdr)))
 
#define LocalBufHdrGetBlock(bufHdr)    LocalBufferBlockPointers[-((bufHdr)->buf_id + 2)]
 
#define BUF_WRITTEN   0x01
 
#define BUF_REUSABLE   0x02
 
#define RELS_BSEARCH_THRESHOLD   20
 
#define BUF_DROP_FULL_SCAN_THRESHOLD   (uint64) (NBuffers / 32)
 
#define REFCOUNT_ARRAY_ENTRIES   8
 
#define BufferIsPinned(bufnum)
 
#define ST_SORT   sort_checkpoint_bufferids
 
#define ST_ELEMENT_TYPE   CkptSortItem
 
#define ST_COMPARE(a, b)   ckpt_buforder_comparator(a, b)
 
#define ST_SCOPE   static
 
#define ST_DEFINE
 
#define ST_SORT   sort_pending_writebacks
 
#define ST_ELEMENT_TYPE   PendingWriteback
 
#define ST_COMPARE(a, b)   buffertag_comparator(&a->tag, &b->tag)
 
#define ST_SCOPE   static
 
#define ST_DEFINE
 

Typedefs

typedef struct PrivateRefCountEntry PrivateRefCountEntry
 
typedef struct CkptTsStatus CkptTsStatus
 
typedef struct SMgrSortArray SMgrSortArray
 

Functions

static void ReservePrivateRefCountEntry (void)
 
static PrivateRefCountEntry * NewPrivateRefCountEntry (Buffer buffer)
 
static PrivateRefCountEntry * GetPrivateRefCountEntry (Buffer buffer, bool do_move)
 
static int32 GetPrivateRefCount (Buffer buffer)
 
static void ForgetPrivateRefCountEntry (PrivateRefCountEntry *ref)
 
static void ResOwnerReleaseBufferIO (Datum res)
 
static char * ResOwnerPrintBufferIO (Datum res)
 
static void ResOwnerReleaseBufferPin (Datum res)
 
static char * ResOwnerPrintBufferPin (Datum res)
 
static Buffer ReadBuffer_common (Relation rel, SMgrRelation smgr, char smgr_persistence, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy)
 
static BlockNumber ExtendBufferedRelCommon (BufferManagerRelation bmr, ForkNumber fork, BufferAccessStrategy strategy, uint32 flags, uint32 extend_by, BlockNumber extend_upto, Buffer *buffers, uint32 *extended_by)
 
static BlockNumber ExtendBufferedRelShared (BufferManagerRelation bmr, ForkNumber fork, BufferAccessStrategy strategy, uint32 flags, uint32 extend_by, BlockNumber extend_upto, Buffer *buffers, uint32 *extended_by)
 
static bool PinBuffer (BufferDesc *buf, BufferAccessStrategy strategy)
 
static void PinBuffer_Locked (BufferDesc *buf)
 
static void UnpinBuffer (BufferDesc *buf)
 
static void UnpinBufferNoOwner (BufferDesc *buf)
 
static void BufferSync (int flags)
 
static uint32 WaitBufHdrUnlocked (BufferDesc *buf)
 
static int SyncOneBuffer (int buf_id, bool skip_recently_used, WritebackContext *wb_context)
 
static void WaitIO (BufferDesc *buf)
 
static bool StartBufferIO (BufferDesc *buf, bool forInput, bool nowait)
 
static void TerminateBufferIO (BufferDesc *buf, bool clear_dirty, uint32 set_flag_bits, bool forget_owner)
 
static void AbortBufferIO (Buffer buffer)
 
static void shared_buffer_write_error_callback (void *arg)
 
static void local_buffer_write_error_callback (void *arg)
 
static BufferDesc * BufferAlloc (SMgrRelation smgr, char relpersistence, ForkNumber forkNum, BlockNumber blockNum, BufferAccessStrategy strategy, bool *foundPtr, IOContext io_context)
 
static Buffer GetVictimBuffer (BufferAccessStrategy strategy, IOContext io_context)
 
static void FlushBuffer (BufferDesc *buf, SMgrRelation reln, IOObject io_object, IOContext io_context)
 
static void FindAndDropRelationBuffers (RelFileLocator rlocator, ForkNumber forkNum, BlockNumber nForkBlock, BlockNumber firstDelBlock)
 
static void RelationCopyStorageUsingBuffer (RelFileLocator srclocator, RelFileLocator dstlocator, ForkNumber forkNum, bool permanent)
 
static void AtProcExit_Buffers (int code, Datum arg)
 
static void CheckForBufferLeaks (void)
 
static int rlocator_comparator (const void *p1, const void *p2)
 
static int buffertag_comparator (const BufferTag *ba, const BufferTag *bb)
 
static int ckpt_buforder_comparator (const CkptSortItem *a, const CkptSortItem *b)
 
static int ts_ckpt_progress_comparator (Datum a, Datum b, void *arg)
 
PrefetchBufferResult PrefetchSharedBuffer (SMgrRelation smgr_reln, ForkNumber forkNum, BlockNumber blockNum)
 
PrefetchBufferResult PrefetchBuffer (Relation reln, ForkNumber forkNum, BlockNumber blockNum)
 
bool ReadRecentBuffer (RelFileLocator rlocator, ForkNumber forkNum, BlockNumber blockNum, Buffer recent_buffer)
 
Buffer ReadBuffer (Relation reln, BlockNumber blockNum)
 
Buffer ReadBufferExtended (Relation reln, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy)
 
Buffer ReadBufferWithoutRelcache (RelFileLocator rlocator, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy, bool permanent)
 
Buffer ExtendBufferedRel (BufferManagerRelation bmr, ForkNumber forkNum, BufferAccessStrategy strategy, uint32 flags)
 
BlockNumber ExtendBufferedRelBy (BufferManagerRelation bmr, ForkNumber fork, BufferAccessStrategy strategy, uint32 flags, uint32 extend_by, Buffer *buffers, uint32 *extended_by)
 
Buffer ExtendBufferedRelTo (BufferManagerRelation bmr, ForkNumber fork, BufferAccessStrategy strategy, uint32 flags, BlockNumber extend_to, ReadBufferMode mode)
 
static void ZeroAndLockBuffer (Buffer buffer, ReadBufferMode mode, bool already_valid)
 
static pg_attribute_always_inline Buffer PinBufferForBlock (Relation rel, SMgrRelation smgr, char persistence, ForkNumber forkNum, BlockNumber blockNum, BufferAccessStrategy strategy, bool *foundPtr)
 
static pg_attribute_always_inline bool StartReadBuffersImpl (ReadBuffersOperation *operation, Buffer *buffers, BlockNumber blockNum, int *nblocks, int flags)
 
bool StartReadBuffers (ReadBuffersOperation *operation, Buffer *buffers, BlockNumber blockNum, int *nblocks, int flags)
 
bool StartReadBuffer (ReadBuffersOperation *operation, Buffer *buffer, BlockNumber blocknum, int flags)
 
static bool WaitReadBuffersCanStartIO (Buffer buffer, bool nowait)
 
void WaitReadBuffers (ReadBuffersOperation *operation)
 
static void InvalidateBuffer (BufferDesc *buf)
 
static bool InvalidateVictimBuffer (BufferDesc *buf_hdr)
 
void LimitAdditionalPins (uint32 *additional_pins)
 
bool BufferIsExclusiveLocked (Buffer buffer)
 
bool BufferIsDirty (Buffer buffer)
 
void MarkBufferDirty (Buffer buffer)
 
Buffer ReleaseAndReadBuffer (Buffer buffer, Relation relation, BlockNumber blockNum)
 
bool BgBufferSync (WritebackContext *wb_context)
 
void AtEOXact_Buffers (bool isCommit)
 
void InitBufferManagerAccess (void)
 
char * DebugPrintBufferRefcount (Buffer buffer)
 
void CheckPointBuffers (int flags)
 
BlockNumber BufferGetBlockNumber (Buffer buffer)
 
void BufferGetTag (Buffer buffer, RelFileLocator *rlocator, ForkNumber *forknum, BlockNumber *blknum)
 
BlockNumber RelationGetNumberOfBlocksInFork (Relation relation, ForkNumber forkNum)
 
bool BufferIsPermanent (Buffer buffer)
 
XLogRecPtr BufferGetLSNAtomic (Buffer buffer)
 
void DropRelationBuffers (SMgrRelation smgr_reln, ForkNumber *forkNum, int nforks, BlockNumber *firstDelBlock)
 
void DropRelationsAllBuffers (SMgrRelation *smgr_reln, int nlocators)
 
void DropDatabaseBuffers (Oid dbid)
 
void FlushRelationBuffers (Relation rel)
 
void FlushRelationsAllBuffers (SMgrRelation *smgrs, int nrels)
 
void CreateAndCopyRelationData (RelFileLocator src_rlocator, RelFileLocator dst_rlocator, bool permanent)
 
void FlushDatabaseBuffers (Oid dbid)
 
void FlushOneBuffer (Buffer buffer)
 
void ReleaseBuffer (Buffer buffer)
 
void UnlockReleaseBuffer (Buffer buffer)
 
void IncrBufferRefCount (Buffer buffer)
 
void MarkBufferDirtyHint (Buffer buffer, bool buffer_std)
 
void UnlockBuffers (void)
 
void LockBuffer (Buffer buffer, int mode)
 
bool ConditionalLockBuffer (Buffer buffer)
 
void CheckBufferIsPinnedOnce (Buffer buffer)
 
void LockBufferForCleanup (Buffer buffer)
 
bool HoldingBufferPinThatDelaysRecovery (void)
 
bool ConditionalLockBufferForCleanup (Buffer buffer)
 
bool IsBufferCleanupOK (Buffer buffer)
 
uint32 LockBufHdr (BufferDesc *desc)
 
void WritebackContextInit (WritebackContext *context, int *max_pending)
 
void ScheduleBufferTagForWriteback (WritebackContext *wb_context, IOContext io_context, BufferTag *tag)
 
void IssuePendingWritebacks (WritebackContext *wb_context, IOContext io_context)
 
bool EvictUnpinnedBuffer (Buffer buf)
 

Variables

bool zero_damaged_pages = false
 
int bgwriter_lru_maxpages = 100
 
double bgwriter_lru_multiplier = 2.0
 
bool track_io_timing = false
 
int effective_io_concurrency = DEFAULT_EFFECTIVE_IO_CONCURRENCY
 
int maintenance_io_concurrency = DEFAULT_MAINTENANCE_IO_CONCURRENCY
 
int io_combine_limit = DEFAULT_IO_COMBINE_LIMIT
 
int checkpoint_flush_after = DEFAULT_CHECKPOINT_FLUSH_AFTER
 
int bgwriter_flush_after = DEFAULT_BGWRITER_FLUSH_AFTER
 
int backend_flush_after = DEFAULT_BACKEND_FLUSH_AFTER
 
static BufferDesc * PinCountWaitBuf = NULL
 
static struct PrivateRefCountEntry PrivateRefCountArray [REFCOUNT_ARRAY_ENTRIES]
 
static HTAB * PrivateRefCountHash = NULL
 
static int32 PrivateRefCountOverflowed = 0
 
static uint32 PrivateRefCountClock = 0
 
static PrivateRefCountEntry * ReservedRefCountEntry = NULL
 
const ResourceOwnerDesc buffer_io_resowner_desc
 
const ResourceOwnerDesc buffer_pin_resowner_desc
 

Macro Definition Documentation

◆ BUF_DROP_FULL_SCAN_THRESHOLD

#define BUF_DROP_FULL_SCAN_THRESHOLD   (uint64) (NBuffers / 32)

Definition at line 87 of file bufmgr.c.

◆ BUF_REUSABLE

#define BUF_REUSABLE   0x02

Definition at line 77 of file bufmgr.c.

◆ BUF_WRITTEN

#define BUF_WRITTEN   0x01

Definition at line 76 of file bufmgr.c.

◆ BufferGetLSN

#define BufferGetLSN (   bufHdr)    (PageGetLSN(BufHdrGetBlock(bufHdr)))

Definition at line 69 of file bufmgr.c.

◆ BufferIsPinned

#define BufferIsPinned (   bufnum)
Value:
( \
!BufferIsValid(bufnum) ? \
false \
: \
BufferIsLocal(bufnum) ? \
(LocalRefCount[-(bufnum) - 1] > 0) \
: \
(GetPrivateRefCount(bufnum) > 0) \
)
static int32 GetPrivateRefCount(Buffer buffer)
Definition: bufmgr.c:416
static bool BufferIsValid(Buffer bufnum)
Definition: bufmgr.h:351
int32 * LocalRefCount
Definition: localbuf.c:46

Definition at line 474 of file bufmgr.c.
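
A minimal usage sketch, assuming a hypothetical caller inside bufmgr.c itself (the macro is file-local): BufferIsPinned() is meant for cheap assertions that this backend already holds a pin, dispatching to LocalRefCount[] for local buffers and to GetPrivateRefCount() for shared ones.

    /* hypothetical helper, for illustration only */
    static void
    example_touch_pinned_page(Buffer buffer)
    {
        Assert(BufferIsPinned(buffer));
        /* with our pin held, the buffer's tag and contents cannot be evicted */
    }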

◆ BufHdrGetBlock

#define BufHdrGetBlock (   bufHdr)    ((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ))

Definition at line 68 of file bufmgr.c.

◆ LocalBufHdrGetBlock

#define LocalBufHdrGetBlock (   bufHdr)     LocalBufferBlockPointers[-((bufHdr)->buf_id + 2)]

Definition at line 72 of file bufmgr.c.

◆ REFCOUNT_ARRAY_ENTRIES

#define REFCOUNT_ARRAY_ENTRIES   8

Definition at line 96 of file bufmgr.c.

◆ RELS_BSEARCH_THRESHOLD

#define RELS_BSEARCH_THRESHOLD   20

Definition at line 79 of file bufmgr.c.

◆ ST_COMPARE [1/2]

#define ST_COMPARE (   a,
  b 
)    ckpt_buforder_comparator(a, b)

Definition at line 5929 of file bufmgr.c.

◆ ST_COMPARE [2/2]

#define ST_COMPARE (   a,
  b 
)    buffertag_comparator(&a->tag, &b->tag)

Definition at line 5929 of file bufmgr.c.

◆ ST_DEFINE [1/2]

#define ST_DEFINE

Definition at line 5931 of file bufmgr.c.

◆ ST_DEFINE [2/2]

#define ST_DEFINE

Definition at line 5931 of file bufmgr.c.

◆ ST_ELEMENT_TYPE [1/2]

#define ST_ELEMENT_TYPE   CkptSortItem

Definition at line 5928 of file bufmgr.c.

◆ ST_ELEMENT_TYPE [2/2]

#define ST_ELEMENT_TYPE   PendingWriteback

Definition at line 5928 of file bufmgr.c.

◆ ST_SCOPE [1/2]

#define ST_SCOPE   static

Definition at line 5930 of file bufmgr.c.

◆ ST_SCOPE [2/2]

#define ST_SCOPE   static

Definition at line 5930 of file bufmgr.c.

◆ ST_SORT [1/2]

#define ST_SORT   sort_checkpoint_bufferids

Definition at line 5927 of file bufmgr.c.

◆ ST_SORT [2/2]

#define ST_SORT   sort_pending_writebacks

Definition at line 5927 of file bufmgr.c.
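
Taken together, the two groups of ST_* macros instantiate lib/sort_template.h twice, generating the specialized sort routines used later in this file. A condensed sketch of the first instantiation and its call site in BufferSync() (spacing simplified; the generated function sorts an array of CkptSortItem in place):

    #define ST_SORT sort_checkpoint_bufferids
    #define ST_ELEMENT_TYPE CkptSortItem
    #define ST_COMPARE(a, b) ckpt_buforder_comparator(a, b)
    #define ST_SCOPE static
    #define ST_DEFINE
    #include <lib/sort_template.h>

    /* later, in BufferSync(): order dirty buffers by tablespace, relation,
     * fork and block number to reduce random I/O */
    sort_checkpoint_bufferids(CkptBufferIds, num_to_scan);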

Typedef Documentation

◆ CkptTsStatus

typedef struct CkptTsStatus CkptTsStatus

◆ PrivateRefCountEntry

typedef struct PrivateRefCountEntry PrivateRefCountEntry

◆ SMgrSortArray

typedef struct SMgrSortArray SMgrSortArray

Function Documentation

◆ AbortBufferIO()

static void AbortBufferIO ( Buffer  buffer)
static

Definition at line 5634 of file bufmgr.c.

5635 {
5636  BufferDesc *buf_hdr = GetBufferDescriptor(buffer - 1);
5637  uint32 buf_state;
5638 
5639  buf_state = LockBufHdr(buf_hdr);
5640  Assert(buf_state & (BM_IO_IN_PROGRESS | BM_TAG_VALID));
5641 
5642  if (!(buf_state & BM_VALID))
5643  {
5644  Assert(!(buf_state & BM_DIRTY));
5645  UnlockBufHdr(buf_hdr, buf_state);
5646  }
5647  else
5648  {
5649  Assert(buf_state & BM_DIRTY);
5650  UnlockBufHdr(buf_hdr, buf_state);
5651 
5652  /* Issue notice if this is not the first failure... */
5653  if (buf_state & BM_IO_ERROR)
5654  {
5655  /* Buffer is pinned, so we can read tag without spinlock */
5656  char *path;
5657 
5658  path = relpathperm(BufTagGetRelFileLocator(&buf_hdr->tag),
5659  BufTagGetForkNum(&buf_hdr->tag));
5660  ereport(WARNING,
5661  (errcode(ERRCODE_IO_ERROR),
5662  errmsg("could not write block %u of %s",
5663  buf_hdr->tag.blockNum, path),
5664  errdetail("Multiple failures --- write error might be permanent.")));
5665  pfree(path);
5666  }
5667  }
5668 
5669  TerminateBufferIO(buf_hdr, false, BM_IO_ERROR, false);
5670 }
#define BM_TAG_VALID
Definition: buf_internals.h:63
static ForkNumber BufTagGetForkNum(const BufferTag *tag)
static BufferDesc * GetBufferDescriptor(uint32 id)
static void UnlockBufHdr(BufferDesc *desc, uint32 buf_state)
#define BM_DIRTY
Definition: buf_internals.h:61
#define BM_IO_IN_PROGRESS
Definition: buf_internals.h:64
static RelFileLocator BufTagGetRelFileLocator(const BufferTag *tag)
#define BM_VALID
Definition: buf_internals.h:62
#define BM_IO_ERROR
Definition: buf_internals.h:65
static void TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint32 set_flag_bits, bool forget_owner)
Definition: bufmgr.c:5597
uint32 LockBufHdr(BufferDesc *desc)
Definition: bufmgr.c:5743
unsigned int uint32
Definition: c.h:506
#define Assert(condition)
Definition: c.h:858
int errdetail(const char *fmt,...)
Definition: elog.c:1203
int errcode(int sqlerrcode)
Definition: elog.c:853
int errmsg(const char *fmt,...)
Definition: elog.c:1070
#define WARNING
Definition: elog.h:36
#define ereport(elevel,...)
Definition: elog.h:149
void pfree(void *pointer)
Definition: mcxt.c:1521
#define relpathperm(rlocator, forknum)
Definition: relpath.h:98
BufferTag tag
BlockNumber blockNum
Definition: buf_internals.h:98

References Assert, buftag::blockNum, BM_DIRTY, BM_IO_ERROR, BM_IO_IN_PROGRESS, BM_TAG_VALID, BM_VALID, PrivateRefCountEntry::buffer, BufTagGetForkNum(), BufTagGetRelFileLocator(), ereport, errcode(), errdetail(), errmsg(), GetBufferDescriptor(), LockBufHdr(), pfree(), relpathperm, BufferDesc::tag, TerminateBufferIO(), UnlockBufHdr(), and WARNING.

Referenced by ResOwnerReleaseBufferIO().

◆ AtEOXact_Buffers()

void AtEOXact_Buffers ( bool  isCommit)

Definition at line 3541 of file bufmgr.c.

3542 {
3543  CheckForBufferLeaks();
3544 
3545  AtEOXact_LocalBuffers(isCommit);
3546 
3547  Assert(PrivateRefCountOverflowed == 0);
3548 }
static void CheckForBufferLeaks(void)
Definition: bufmgr.c:3601
static int32 PrivateRefCountOverflowed
Definition: bufmgr.c:210
void AtEOXact_LocalBuffers(bool isCommit)
Definition: localbuf.c:819

References Assert, AtEOXact_LocalBuffers(), CheckForBufferLeaks(), and PrivateRefCountOverflowed.

Referenced by AbortTransaction(), BackgroundWriterMain(), CheckpointerMain(), CommitTransaction(), PrepareTransaction(), and WalWriterMain().

◆ AtProcExit_Buffers()

static void AtProcExit_Buffers ( int  code,
Datum  arg 
)
static

Definition at line 3583 of file bufmgr.c.

3584 {
3585  UnlockBuffers();
3586 
3587  CheckForBufferLeaks();
3588 
3589  /* localbuf.c needs a chance too */
3590  AtProcExit_LocalBuffers();
3591 }
void UnlockBuffers(void)
Definition: bufmgr.c:5112
void AtProcExit_LocalBuffers(void)
Definition: localbuf.c:830

References AtProcExit_LocalBuffers(), CheckForBufferLeaks(), and UnlockBuffers().

Referenced by InitBufferManagerAccess().

◆ BgBufferSync()

bool BgBufferSync ( WritebackContext *  wb_context)

Definition at line 3170 of file bufmgr.c.

3171 {
3172  /* info obtained from freelist.c */
3173  int strategy_buf_id;
3174  uint32 strategy_passes;
3175  uint32 recent_alloc;
3176 
3177  /*
3178  * Information saved between calls so we can determine the strategy
3179  * point's advance rate and avoid scanning already-cleaned buffers.
3180  */
3181  static bool saved_info_valid = false;
3182  static int prev_strategy_buf_id;
3183  static uint32 prev_strategy_passes;
3184  static int next_to_clean;
3185  static uint32 next_passes;
3186 
3187  /* Moving averages of allocation rate and clean-buffer density */
3188  static float smoothed_alloc = 0;
3189  static float smoothed_density = 10.0;
3190 
3191  /* Potentially these could be tunables, but for now, not */
3192  float smoothing_samples = 16;
3193  float scan_whole_pool_milliseconds = 120000.0;
3194 
3195  /* Used to compute how far we scan ahead */
3196  long strategy_delta;
3197  int bufs_to_lap;
3198  int bufs_ahead;
3199  float scans_per_alloc;
3200  int reusable_buffers_est;
3201  int upcoming_alloc_est;
3202  int min_scan_buffers;
3203 
3204  /* Variables for the scanning loop proper */
3205  int num_to_scan;
3206  int num_written;
3207  int reusable_buffers;
3208 
3209  /* Variables for final smoothed_density update */
3210  long new_strategy_delta;
3211  uint32 new_recent_alloc;
3212 
3213  /*
3214  * Find out where the freelist clock sweep currently is, and how many
3215  * buffer allocations have happened since our last call.
3216  */
3217  strategy_buf_id = StrategySyncStart(&strategy_passes, &recent_alloc);
3218 
3219  /* Report buffer alloc counts to pgstat */
3220  PendingBgWriterStats.buf_alloc += recent_alloc;
3221 
3222  /*
3223  * If we're not running the LRU scan, just stop after doing the stats
3224  * stuff. We mark the saved state invalid so that we can recover sanely
3225  * if LRU scan is turned back on later.
3226  */
3227  if (bgwriter_lru_maxpages <= 0)
3228  {
3229  saved_info_valid = false;
3230  return true;
3231  }
3232 
3233  /*
3234  * Compute strategy_delta = how many buffers have been scanned by the
3235  * clock sweep since last time. If first time through, assume none. Then
3236  * see if we are still ahead of the clock sweep, and if so, how many
3237  * buffers we could scan before we'd catch up with it and "lap" it. Note:
3238  * weird-looking coding of xxx_passes comparisons are to avoid bogus
3239  * behavior when the passes counts wrap around.
3240  */
3241  if (saved_info_valid)
3242  {
3243  int32 passes_delta = strategy_passes - prev_strategy_passes;
3244 
3245  strategy_delta = strategy_buf_id - prev_strategy_buf_id;
3246  strategy_delta += (long) passes_delta * NBuffers;
3247 
3248  Assert(strategy_delta >= 0);
3249 
3250  if ((int32) (next_passes - strategy_passes) > 0)
3251  {
3252  /* we're one pass ahead of the strategy point */
3253  bufs_to_lap = strategy_buf_id - next_to_clean;
3254 #ifdef BGW_DEBUG
3255  elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
3256  next_passes, next_to_clean,
3257  strategy_passes, strategy_buf_id,
3258  strategy_delta, bufs_to_lap);
3259 #endif
3260  }
3261  else if (next_passes == strategy_passes &&
3262  next_to_clean >= strategy_buf_id)
3263  {
3264  /* on same pass, but ahead or at least not behind */
3265  bufs_to_lap = NBuffers - (next_to_clean - strategy_buf_id);
3266 #ifdef BGW_DEBUG
3267  elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
3268  next_passes, next_to_clean,
3269  strategy_passes, strategy_buf_id,
3270  strategy_delta, bufs_to_lap);
3271 #endif
3272  }
3273  else
3274  {
3275  /*
3276  * We're behind, so skip forward to the strategy point and start
3277  * cleaning from there.
3278  */
3279 #ifdef BGW_DEBUG
3280  elog(DEBUG2, "bgwriter behind: bgw %u-%u strategy %u-%u delta=%ld",
3281  next_passes, next_to_clean,
3282  strategy_passes, strategy_buf_id,
3283  strategy_delta);
3284 #endif
3285  next_to_clean = strategy_buf_id;
3286  next_passes = strategy_passes;
3287  bufs_to_lap = NBuffers;
3288  }
3289  }
3290  else
3291  {
3292  /*
3293  * Initializing at startup or after LRU scanning had been off. Always
3294  * start at the strategy point.
3295  */
3296 #ifdef BGW_DEBUG
3297  elog(DEBUG2, "bgwriter initializing: strategy %u-%u",
3298  strategy_passes, strategy_buf_id);
3299 #endif
3300  strategy_delta = 0;
3301  next_to_clean = strategy_buf_id;
3302  next_passes = strategy_passes;
3303  bufs_to_lap = NBuffers;
3304  }
3305 
3306  /* Update saved info for next time */
3307  prev_strategy_buf_id = strategy_buf_id;
3308  prev_strategy_passes = strategy_passes;
3309  saved_info_valid = true;
3310 
3311  /*
3312  * Compute how many buffers had to be scanned for each new allocation, ie,
3313  * 1/density of reusable buffers, and track a moving average of that.
3314  *
3315  * If the strategy point didn't move, we don't update the density estimate
3316  */
3317  if (strategy_delta > 0 && recent_alloc > 0)
3318  {
3319  scans_per_alloc = (float) strategy_delta / (float) recent_alloc;
3320  smoothed_density += (scans_per_alloc - smoothed_density) /
3321  smoothing_samples;
3322  }
3323 
3324  /*
3325  * Estimate how many reusable buffers there are between the current
3326  * strategy point and where we've scanned ahead to, based on the smoothed
3327  * density estimate.
3328  */
3329  bufs_ahead = NBuffers - bufs_to_lap;
3330  reusable_buffers_est = (float) bufs_ahead / smoothed_density;
3331 
3332  /*
3333  * Track a moving average of recent buffer allocations. Here, rather than
3334  * a true average we want a fast-attack, slow-decline behavior: we
3335  * immediately follow any increase.
3336  */
3337  if (smoothed_alloc <= (float) recent_alloc)
3338  smoothed_alloc = recent_alloc;
3339  else
3340  smoothed_alloc += ((float) recent_alloc - smoothed_alloc) /
3341  smoothing_samples;
3342 
3343  /* Scale the estimate by a GUC to allow more aggressive tuning. */
3344  upcoming_alloc_est = (int) (smoothed_alloc * bgwriter_lru_multiplier);
3345 
3346  /*
3347  * If recent_alloc remains at zero for many cycles, smoothed_alloc will
3348  * eventually underflow to zero, and the underflows produce annoying
3349  * kernel warnings on some platforms. Once upcoming_alloc_est has gone to
3350  * zero, there's no point in tracking smaller and smaller values of
3351  * smoothed_alloc, so just reset it to exactly zero to avoid this
3352  * syndrome. It will pop back up as soon as recent_alloc increases.
3353  */
3354  if (upcoming_alloc_est == 0)
3355  smoothed_alloc = 0;
3356 
3357  /*
3358  * Even in cases where there's been little or no buffer allocation
3359  * activity, we want to make a small amount of progress through the buffer
3360  * cache so that as many reusable buffers as possible are clean after an
3361  * idle period.
3362  *
3363  * (scan_whole_pool_milliseconds / BgWriterDelay) computes how many times
3364  * the BGW will be called during the scan_whole_pool time; slice the
3365  * buffer pool into that many sections.
3366  */
3367  min_scan_buffers = (int) (NBuffers / (scan_whole_pool_milliseconds / BgWriterDelay));
3368 
3369  if (upcoming_alloc_est < (min_scan_buffers + reusable_buffers_est))
3370  {
3371 #ifdef BGW_DEBUG
3372  elog(DEBUG2, "bgwriter: alloc_est=%d too small, using min=%d + reusable_est=%d",
3373  upcoming_alloc_est, min_scan_buffers, reusable_buffers_est);
3374 #endif
3375  upcoming_alloc_est = min_scan_buffers + reusable_buffers_est;
3376  }
3377 
3378  /*
3379  * Now write out dirty reusable buffers, working forward from the
3380  * next_to_clean point, until we have lapped the strategy scan, or cleaned
3381  * enough buffers to match our estimate of the next cycle's allocation
3382  * requirements, or hit the bgwriter_lru_maxpages limit.
3383  */
3384 
3385  num_to_scan = bufs_to_lap;
3386  num_written = 0;
3387  reusable_buffers = reusable_buffers_est;
3388 
3389  /* Execute the LRU scan */
3390  while (num_to_scan > 0 && reusable_buffers < upcoming_alloc_est)
3391  {
3392  int sync_state = SyncOneBuffer(next_to_clean, true,
3393  wb_context);
3394 
3395  if (++next_to_clean >= NBuffers)
3396  {
3397  next_to_clean = 0;
3398  next_passes++;
3399  }
3400  num_to_scan--;
3401 
3402  if (sync_state & BUF_WRITTEN)
3403  {
3404  reusable_buffers++;
3405  if (++num_written >= bgwriter_lru_maxpages)
3406  {
3407  PendingBgWriterStats.maxwritten_clean++;
3408  break;
3409  }
3410  }
3411  else if (sync_state & BUF_REUSABLE)
3412  reusable_buffers++;
3413  }
3414 
3415  PendingBgWriterStats.buf_written_clean += num_written;
3416 
3417 #ifdef BGW_DEBUG
3418  elog(DEBUG1, "bgwriter: recent_alloc=%u smoothed=%.2f delta=%ld ahead=%d density=%.2f reusable_est=%d upcoming_est=%d scanned=%d wrote=%d reusable=%d",
3419  recent_alloc, smoothed_alloc, strategy_delta, bufs_ahead,
3420  smoothed_density, reusable_buffers_est, upcoming_alloc_est,
3421  bufs_to_lap - num_to_scan,
3422  num_written,
3423  reusable_buffers - reusable_buffers_est);
3424 #endif
3425 
3426  /*
3427  * Consider the above scan as being like a new allocation scan.
3428  * Characterize its density and update the smoothed one based on it. This
3429  * effectively halves the moving average period in cases where both the
3430  * strategy and the background writer are doing some useful scanning,
3431  * which is helpful because a long memory isn't as desirable on the
3432  * density estimates.
3433  */
3434  new_strategy_delta = bufs_to_lap - num_to_scan;
3435  new_recent_alloc = reusable_buffers - reusable_buffers_est;
3436  if (new_strategy_delta > 0 && new_recent_alloc > 0)
3437  {
3438  scans_per_alloc = (float) new_strategy_delta / (float) new_recent_alloc;
3439  smoothed_density += (scans_per_alloc - smoothed_density) /
3440  smoothing_samples;
3441 
3442 #ifdef BGW_DEBUG
3443  elog(DEBUG2, "bgwriter: cleaner density alloc=%u scan=%ld density=%.2f new smoothed=%.2f",
3444  new_recent_alloc, new_strategy_delta,
3445  scans_per_alloc, smoothed_density);
3446 #endif
3447  }
3448 
3449  /* Return true if OK to hibernate */
3450  return (bufs_to_lap == 0 && recent_alloc == 0);
3451 }
int BgWriterDelay
Definition: bgwriter.c:57
#define BUF_REUSABLE
Definition: bufmgr.c:77
double bgwriter_lru_multiplier
Definition: bufmgr.c:142
static int SyncOneBuffer(int buf_id, bool skip_recently_used, WritebackContext *wb_context)
Definition: bufmgr.c:3468
int bgwriter_lru_maxpages
Definition: bufmgr.c:141
#define BUF_WRITTEN
Definition: bufmgr.c:76
signed int int32
Definition: c.h:494
#define DEBUG2
Definition: elog.h:29
#define DEBUG1
Definition: elog.h:30
#define elog(elevel,...)
Definition: elog.h:225
int StrategySyncStart(uint32 *complete_passes, uint32 *num_buf_alloc)
Definition: freelist.c:394
int NBuffers
Definition: globals.c:141
PgStat_BgWriterStats PendingBgWriterStats
PgStat_Counter buf_written_clean
Definition: pgstat.h:287
PgStat_Counter maxwritten_clean
Definition: pgstat.h:288
PgStat_Counter buf_alloc
Definition: pgstat.h:289

References Assert, bgwriter_lru_maxpages, bgwriter_lru_multiplier, BgWriterDelay, PgStat_BgWriterStats::buf_alloc, BUF_REUSABLE, BUF_WRITTEN, PgStat_BgWriterStats::buf_written_clean, DEBUG1, DEBUG2, elog, PgStat_BgWriterStats::maxwritten_clean, NBuffers, PendingBgWriterStats, StrategySyncStart(), and SyncOneBuffer().

Referenced by BackgroundWriterMain().
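
A hedged sketch of the calling pattern in a background-writer style main loop (signal handling, fsync-request absorption and the actual sleep are elided; variable names are illustrative):

    WritebackContext wb_context;
    bool        can_hibernate;

    WritebackContextInit(&wb_context, &bgwriter_flush_after);

    for (;;)
    {
        /* ... process config reloads, signals, pending fsync requests ... */
        can_hibernate = BgBufferSync(&wb_context);
        /* sleep about BgWriterDelay ms; sleep much longer if can_hibernate */
    }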

◆ BufferAlloc()

static pg_attribute_always_inline BufferDesc * BufferAlloc ( SMgrRelation  smgr,
char  relpersistence,
ForkNumber  forkNum,
BlockNumber  blockNum,
BufferAccessStrategy  strategy,
bool *  foundPtr,
IOContext  io_context 
)
inlinestatic

Definition at line 1588 of file bufmgr.c.

1592 {
1593  BufferTag newTag; /* identity of requested block */
1594  uint32 newHash; /* hash value for newTag */
1595  LWLock *newPartitionLock; /* buffer partition lock for it */
1596  int existing_buf_id;
1597  Buffer victim_buffer;
1598  BufferDesc *victim_buf_hdr;
1599  uint32 victim_buf_state;
1600 
1601  /* Make sure we will have room to remember the buffer pin */
1602  ResourceOwnerEnlarge(CurrentResourceOwner);
1603  ReservePrivateRefCountEntry();
1604 
1605  /* create a tag so we can lookup the buffer */
1606  InitBufferTag(&newTag, &smgr->smgr_rlocator.locator, forkNum, blockNum);
1607 
1608  /* determine its hash code and partition lock ID */
1609  newHash = BufTableHashCode(&newTag);
1610  newPartitionLock = BufMappingPartitionLock(newHash);
1611 
1612  /* see if the block is in the buffer pool already */
1613  LWLockAcquire(newPartitionLock, LW_SHARED);
1614  existing_buf_id = BufTableLookup(&newTag, newHash);
1615  if (existing_buf_id >= 0)
1616  {
1617  BufferDesc *buf;
1618  bool valid;
1619 
1620  /*
1621  * Found it. Now, pin the buffer so no one can steal it from the
1622  * buffer pool, and check to see if the correct data has been loaded
1623  * into the buffer.
1624  */
1625  buf = GetBufferDescriptor(existing_buf_id);
1626 
1627  valid = PinBuffer(buf, strategy);
1628 
1629  /* Can release the mapping lock as soon as we've pinned it */
1630  LWLockRelease(newPartitionLock);
1631 
1632  *foundPtr = true;
1633 
1634  if (!valid)
1635  {
1636  /*
1637  * We can only get here if (a) someone else is still reading in
1638  * the page, (b) a previous read attempt failed, or (c) someone
1639  * called StartReadBuffers() but not yet WaitReadBuffers().
1640  */
1641  *foundPtr = false;
1642  }
1643 
1644  return buf;
1645  }
1646 
1647  /*
1648  * Didn't find it in the buffer pool. We'll have to initialize a new
1649  * buffer. Remember to unlock the mapping lock while doing the work.
1650  */
1651  LWLockRelease(newPartitionLock);
1652 
1653  /*
1654  * Acquire a victim buffer. Somebody else might try to do the same, we
1655  * don't hold any conflicting locks. If so we'll have to undo our work
1656  * later.
1657  */
1658  victim_buffer = GetVictimBuffer(strategy, io_context);
1659  victim_buf_hdr = GetBufferDescriptor(victim_buffer - 1);
1660 
1661  /*
1662  * Try to make a hashtable entry for the buffer under its new tag. If
1663  * somebody else inserted another buffer for the tag, we'll release the
1664  * victim buffer we acquired and use the already inserted one.
1665  */
1666  LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
1667  existing_buf_id = BufTableInsert(&newTag, newHash, victim_buf_hdr->buf_id);
1668  if (existing_buf_id >= 0)
1669  {
1670  BufferDesc *existing_buf_hdr;
1671  bool valid;
1672 
1673  /*
1674  * Got a collision. Someone has already done what we were about to do.
1675  * We'll just handle this as if it were found in the buffer pool in
1676  * the first place. First, give up the buffer we were planning to
1677  * use.
1678  *
1679  * We could do this after releasing the partition lock, but then we'd
1680  * have to call ResourceOwnerEnlarge() & ReservePrivateRefCountEntry()
1681  * before acquiring the lock, for the rare case of such a collision.
1682  */
1683  UnpinBuffer(victim_buf_hdr);
1684 
1685  /*
1686  * The victim buffer we acquired previously is clean and unused, let
1687  * it be found again quickly
1688  */
1689  StrategyFreeBuffer(victim_buf_hdr);
1690 
1691  /* remaining code should match code at top of routine */
1692 
1693  existing_buf_hdr = GetBufferDescriptor(existing_buf_id);
1694 
1695  valid = PinBuffer(existing_buf_hdr, strategy);
1696 
1697  /* Can release the mapping lock as soon as we've pinned it */
1698  LWLockRelease(newPartitionLock);
1699 
1700  *foundPtr = true;
1701 
1702  if (!valid)
1703  {
1704  /*
1705  * We can only get here if (a) someone else is still reading in
1706  * the page, (b) a previous read attempt failed, or (c) someone
1707  * called StartReadBuffers() but not yet WaitReadBuffers().
1708  */
1709  *foundPtr = false;
1710  }
1711 
1712  return existing_buf_hdr;
1713  }
1714 
1715  /*
1716  * Need to lock the buffer header too in order to change its tag.
1717  */
1718  victim_buf_state = LockBufHdr(victim_buf_hdr);
1719 
1720  /* some sanity checks while we hold the buffer header lock */
1721  Assert(BUF_STATE_GET_REFCOUNT(victim_buf_state) == 1);
1722  Assert(!(victim_buf_state & (BM_TAG_VALID | BM_VALID | BM_DIRTY | BM_IO_IN_PROGRESS)));
1723 
1724  victim_buf_hdr->tag = newTag;
1725 
1726  /*
1727  * Make sure BM_PERMANENT is set for buffers that must be written at every
1728  * checkpoint. Unlogged buffers only need to be written at shutdown
1729  * checkpoints, except for their "init" forks, which need to be treated
1730  * just like permanent relations.
1731  */
1732  victim_buf_state |= BM_TAG_VALID | BUF_USAGECOUNT_ONE;
1733  if (relpersistence == RELPERSISTENCE_PERMANENT || forkNum == INIT_FORKNUM)
1734  victim_buf_state |= BM_PERMANENT;
1735 
1736  UnlockBufHdr(victim_buf_hdr, victim_buf_state);
1737 
1738  LWLockRelease(newPartitionLock);
1739 
1740  /*
1741  * Buffer contents are currently invalid.
1742  */
1743  *foundPtr = false;
1744 
1745  return victim_buf_hdr;
1746 }
int Buffer
Definition: buf.h:23
static void InitBufferTag(BufferTag *tag, const RelFileLocator *rlocator, ForkNumber forkNum, BlockNumber blockNum)
#define BM_PERMANENT
Definition: buf_internals.h:69
static LWLock * BufMappingPartitionLock(uint32 hashcode)
#define BUF_USAGECOUNT_ONE
Definition: buf_internals.h:46
#define BUF_STATE_GET_REFCOUNT(state)
Definition: buf_internals.h:51
int BufTableLookup(BufferTag *tagPtr, uint32 hashcode)
Definition: buf_table.c:90
uint32 BufTableHashCode(BufferTag *tagPtr)
Definition: buf_table.c:78
int BufTableInsert(BufferTag *tagPtr, uint32 hashcode, int buf_id)
Definition: buf_table.c:118
static bool PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy)
Definition: bufmgr.c:2634
static Buffer GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context)
Definition: bufmgr.c:1932
static void ReservePrivateRefCountEntry(void)
Definition: bufmgr.c:250
static void UnpinBuffer(BufferDesc *buf)
Definition: bufmgr.c:2788
void StrategyFreeBuffer(BufferDesc *buf)
Definition: freelist.c:363
bool LWLockAcquire(LWLock *lock, LWLockMode mode)
Definition: lwlock.c:1168
void LWLockRelease(LWLock *lock)
Definition: lwlock.c:1781
@ LW_SHARED
Definition: lwlock.h:115
@ LW_EXCLUSIVE
Definition: lwlock.h:114
static char * buf
Definition: pg_test_fsync.c:73
@ INIT_FORKNUM
Definition: relpath.h:61
ResourceOwner CurrentResourceOwner
Definition: resowner.c:165
void ResourceOwnerEnlarge(ResourceOwner owner)
Definition: resowner.c:442
Definition: lwlock.h:42
RelFileLocator locator
RelFileLocatorBackend smgr_rlocator
Definition: smgr.h:37

References Assert, BM_DIRTY, BM_IO_IN_PROGRESS, BM_PERMANENT, BM_TAG_VALID, BM_VALID, buf, BufferDesc::buf_id, BUF_STATE_GET_REFCOUNT, BUF_USAGECOUNT_ONE, BufMappingPartitionLock(), BufTableHashCode(), BufTableInsert(), BufTableLookup(), CurrentResourceOwner, GetBufferDescriptor(), GetVictimBuffer(), INIT_FORKNUM, InitBufferTag(), RelFileLocatorBackend::locator, LockBufHdr(), LW_EXCLUSIVE, LW_SHARED, LWLockAcquire(), LWLockRelease(), PinBuffer(), ReservePrivateRefCountEntry(), ResourceOwnerEnlarge(), SMgrRelationData::smgr_rlocator, StrategyFreeBuffer(), BufferDesc::tag, UnlockBufHdr(), and UnpinBuffer().

Referenced by PinBufferForBlock().

◆ BufferGetBlockNumber()

BlockNumber BufferGetBlockNumber ( Buffer  buffer)

Definition at line 3706 of file bufmgr.c.

3707 {
3708  BufferDesc *bufHdr;
3709 
3710  Assert(BufferIsPinned(buffer));
3711 
3712  if (BufferIsLocal(buffer))
3713  bufHdr = GetLocalBufferDescriptor(-buffer - 1);
3714  else
3715  bufHdr = GetBufferDescriptor(buffer - 1);
3716 
3717  /* pinned, so OK to read tag without spinlock */
3718  return bufHdr->tag.blockNum;
3719 }
#define BufferIsLocal(buffer)
Definition: buf.h:37
static BufferDesc * GetLocalBufferDescriptor(uint32 id)
#define BufferIsPinned(bufnum)
Definition: bufmgr.c:474

References Assert, buftag::blockNum, PrivateRefCountEntry::buffer, BufferIsLocal, BufferIsPinned, GetBufferDescriptor(), GetLocalBufferDescriptor(), and BufferDesc::tag.

Referenced by _bt_binsrch_insert(), _bt_bottomupdel_pass(), _bt_check_unique(), _bt_checkpage(), _bt_delitems_delete(), _bt_delitems_vacuum(), _bt_doinsert(), _bt_endpoint(), _bt_finish_split(), _bt_first(), _bt_getroot(), _bt_insert_parent(), _bt_insertonpg(), _bt_mark_page_halfdead(), _bt_moveright(), _bt_newlevel(), _bt_pagedel(), _bt_readnextpage(), _bt_readpage(), _bt_restore_meta(), _bt_search(), _bt_simpledel_pass(), _bt_split(), _bt_unlink_halfdead_page(), _bt_walk_left(), _hash_addovflpage(), _hash_checkpage(), _hash_doinsert(), _hash_first(), _hash_freeovflpage(), _hash_getnewbuf(), _hash_readnext(), _hash_readpage(), _hash_splitbucket(), allocNewBuffer(), blinsert(), BloomInitMetapage(), brin_doinsert(), brin_doupdate(), brin_getinsertbuffer(), brin_initialize_empty_new_buffer(), brin_page_cleanup(), brin_xlog_insert_update(), brinbuild(), brinGetTupleForHeapBlock(), collect_corrupt_items(), collectMatchBitmap(), createPostingTree(), dataBeginPlaceToPageLeaf(), dataPrepareDownlink(), doPickSplit(), entryPrepareDownlink(), fill_seq_fork_with_data(), ginEntryInsert(), ginFindParents(), ginFinishSplit(), ginPlaceToPage(), ginRedoDeleteListPages(), ginRedoUpdateMetapage(), ginScanToDelete(), gistbufferinginserttuples(), gistbuild(), gistcheckpage(), gistdeletepage(), gistformdownlink(), gistinserttuples(), gistMemorizeAllDownlinks(), gistplacetopage(), gistRelocateBuildBuffersOnSplit(), gistScanPage(), hash_xlog_add_ovfl_page(), heap_delete(), heap_fetch_next_buffer(), heap_hot_search_buffer(), heap_insert(), heap_multi_insert(), heap_page_is_all_visible(), heap_page_prune_and_freeze(), heap_prepare_pagescan(), heap_update(), heap_xlog_confirm(), heap_xlog_lock(), heapam_scan_analyze_next_block(), heapgettup(), heapgettup_pagemode(), index_compute_xid_horizon_for_tuples(), lazy_scan_noprune(), lazy_scan_prune(), makeSublist(), moveLeafs(), moveRightIfItNeeded(), pgstathashindex(), ReadBufferBI(), RelationAddBlocks(), RelationCopyStorageUsingBuffer(), RelationGetBufferForTuple(), RelationPutHeapTuple(), revmap_get_buffer(), revmap_physical_extend(), ScanSourceDatabasePgClassPage(), spgAddNodeAction(), spgbuild(), spgdoinsert(), SpGistSetLastUsedPage(), spgSplitNodeAction(), spgWalk(), startScanEntry(), terminate_brin_buildstate(), vacuumLeafPage(), visibilitymap_clear(), visibilitymap_get_status(), visibilitymap_pin(), visibilitymap_pin_ok(), visibilitymap_set(), and WaitReadBuffers().
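
A minimal usage sketch (not from bufmgr.c; rel and blkno stand for a valid relation and block number): the buffer must remain pinned while its block number is read.

    Buffer      buf = ReadBuffer(rel, blkno);   /* returned pinned */

    Assert(BufferGetBlockNumber(buf) == blkno); /* valid only while pinned */
    ReleaseBuffer(buf);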

◆ BufferGetLSNAtomic()

XLogRecPtr BufferGetLSNAtomic ( Buffer  buffer)

Definition at line 3967 of file bufmgr.c.

3968 {
3969  BufferDesc *bufHdr = GetBufferDescriptor(buffer - 1);
3970  char *page = BufferGetPage(buffer);
3971  XLogRecPtr lsn;
3972  uint32 buf_state;
3973 
3974  /*
3975  * If we don't need locking for correctness, fastpath out.
3976  */
3977  if (!XLogHintBitIsNeeded() || BufferIsLocal(buffer))
3978  return PageGetLSN(page);
3979 
3980  /* Make sure we've got a real buffer, and that we hold a pin on it. */
3981  Assert(BufferIsValid(buffer));
3982  Assert(BufferIsPinned(buffer));
3983 
3984  buf_state = LockBufHdr(bufHdr);
3985  lsn = PageGetLSN(page);
3986  UnlockBufHdr(bufHdr, buf_state);
3987 
3988  return lsn;
3989 }
static Page BufferGetPage(Buffer buffer)
Definition: bufmgr.h:400
static XLogRecPtr PageGetLSN(const char *page)
Definition: bufpage.h:386
#define XLogHintBitIsNeeded()
Definition: xlog.h:120
uint64 XLogRecPtr
Definition: xlogdefs.h:21

References Assert, PrivateRefCountEntry::buffer, BufferGetPage(), BufferIsLocal, BufferIsPinned, BufferIsValid(), GetBufferDescriptor(), LockBufHdr(), PageGetLSN(), UnlockBufHdr(), and XLogHintBitIsNeeded.

Referenced by _bt_killitems(), _bt_readpage(), gistdoinsert(), gistFindPath(), gistkillitems(), gistScanPage(), SetHintBits(), and XLogSaveBufferForHint().
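
A minimal sketch of the intended use (names are illustrative): callers holding only a pin and a shared content lock use this instead of PageGetLSN() so the read stays consistent when wal_log_hints or data checksums allow the LSN to advance under concurrent hint-bit writes.

    /* caller holds a pin and at least a shared content lock */
    XLogRecPtr  page_lsn = BufferGetLSNAtomic(buffer);

    if (page_lsn != remembered_lsn)
    {
        /* the page has changed since we last saw it; re-validate our state */
    }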

◆ BufferGetTag()

void BufferGetTag ( Buffer  buffer,
RelFileLocator *  rlocator,
ForkNumber *  forknum,
BlockNumber *  blknum 
)

Definition at line 3727 of file bufmgr.c.

3729 {
3730  BufferDesc *bufHdr;
3731 
3732  /* Do the same checks as BufferGetBlockNumber. */
3733  Assert(BufferIsPinned(buffer));
3734 
3735  if (BufferIsLocal(buffer))
3736  bufHdr = GetLocalBufferDescriptor(-buffer - 1);
3737  else
3738  bufHdr = GetBufferDescriptor(buffer - 1);
3739 
3740  /* pinned, so OK to read tag without spinlock */
3741  *rlocator = BufTagGetRelFileLocator(&bufHdr->tag);
3742  *forknum = BufTagGetForkNum(&bufHdr->tag);
3743  *blknum = bufHdr->tag.blockNum;
3744 }

References Assert, buftag::blockNum, PrivateRefCountEntry::buffer, BufferIsLocal, BufferIsPinned, BufTagGetForkNum(), BufTagGetRelFileLocator(), GetBufferDescriptor(), GetLocalBufferDescriptor(), and BufferDesc::tag.

Referenced by fsm_search_avail(), ginRedoInsertEntry(), log_newpage_buffer(), ResolveCminCmaxDuringDecoding(), XLogRegisterBuffer(), and XLogSaveBufferForHint().
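
A minimal sketch (the buffer is assumed to be pinned by the caller, as the assertion in the code above requires):

    RelFileLocator rlocator;
    ForkNumber  forknum;
    BlockNumber blkno;

    BufferGetTag(buffer, &rlocator, &forknum, &blkno);
    /* rlocator/forknum/blkno now identify the page, e.g. for WAL registration */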

◆ BufferIsDirty()

bool BufferIsDirty ( Buffer  buffer)

Definition at line 2482 of file bufmgr.c.

2483 {
2484  BufferDesc *bufHdr;
2485 
2486  if (BufferIsLocal(buffer))
2487  {
2488  int bufid = -buffer - 1;
2489 
2490  bufHdr = GetLocalBufferDescriptor(bufid);
2491  }
2492  else
2493  {
2494  bufHdr = GetBufferDescriptor(buffer - 1);
2495  }
2496 
2497  Assert(BufferIsPinned(buffer));
2498  Assert(LWLockHeldByMeInMode(BufferDescriptorGetContentLock(bufHdr),
2499  LW_EXCLUSIVE));
2500 
2501  return pg_atomic_read_u32(&bufHdr->state) & BM_DIRTY;
2502 }
static uint32 pg_atomic_read_u32(volatile pg_atomic_uint32 *ptr)
Definition: atomics.h:239
static LWLock * BufferDescriptorGetContentLock(const BufferDesc *bdesc)
bool LWLockHeldByMeInMode(LWLock *lock, LWLockMode mode)
Definition: lwlock.c:1937
pg_atomic_uint32 state

References Assert, BM_DIRTY, PrivateRefCountEntry::buffer, BufferDescriptorGetContentLock(), BufferIsLocal, BufferIsPinned, GetBufferDescriptor(), GetLocalBufferDescriptor(), LW_EXCLUSIVE, LWLockHeldByMeInMode(), pg_atomic_read_u32(), and BufferDesc::state.

Referenced by XLogRegisterBuffer().

◆ BufferIsExclusiveLocked()

bool BufferIsExclusiveLocked ( Buffer  buffer)

Definition at line 2453 of file bufmgr.c.

2454 {
2455  BufferDesc *bufHdr;
2456 
2457  if (BufferIsLocal(buffer))
2458  {
2459  int bufid = -buffer - 1;
2460 
2461  bufHdr = GetLocalBufferDescriptor(bufid);
2462  }
2463  else
2464  {
2465  bufHdr = GetBufferDescriptor(buffer - 1);
2466  }
2467 
2468  Assert(BufferIsPinned(buffer));
2469  return LWLockHeldByMeInMode(BufferDescriptorGetContentLock(bufHdr),
2470  LW_EXCLUSIVE);
2471 }

References Assert, PrivateRefCountEntry::buffer, BufferDescriptorGetContentLock(), BufferIsLocal, BufferIsPinned, GetBufferDescriptor(), GetLocalBufferDescriptor(), LW_EXCLUSIVE, and LWLockHeldByMeInMode().

Referenced by XLogRegisterBuffer().
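
BufferIsDirty() and BufferIsExclusiveLocked() exist mainly for assertions; a hedged sketch of the kind of sanity check their caller (XLogRegisterBuffer(), per the lists above) applies to a buffer that is about to be registered for WAL:

    /* the caller must have exclusively locked and dirtied the buffer first */
    Assert(BufferIsExclusiveLocked(buffer) && BufferIsDirty(buffer));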

◆ BufferIsPermanent()

bool BufferIsPermanent ( Buffer  buffer)

Definition at line 3937 of file bufmgr.c.

3938 {
3939  BufferDesc *bufHdr;
3940 
3941  /* Local buffers are used only for temp relations. */
3942  if (BufferIsLocal(buffer))
3943  return false;
3944 
3945  /* Make sure we've got a real buffer, and that we hold a pin on it. */
3946  Assert(BufferIsValid(buffer));
3947  Assert(BufferIsPinned(buffer));
3948 
3949  /*
3950  * BM_PERMANENT can't be changed while we hold a pin on the buffer, so we
3951  * need not bother with the buffer header spinlock. Even if someone else
3952  * changes the buffer header state while we're doing this, the state is
3953  * changed atomically, so we'll read the old value or the new value, but
3954  * not random garbage.
3955  */
3956  bufHdr = GetBufferDescriptor(buffer - 1);
3957  return (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT) != 0;
3958 }

References Assert, BM_PERMANENT, PrivateRefCountEntry::buffer, BufferIsLocal, BufferIsPinned, BufferIsValid(), GetBufferDescriptor(), pg_atomic_read_u32(), and BufferDesc::state.

Referenced by SetHintBits().

◆ BufferSync()

static void BufferSync ( int  flags)
static

Definition at line 2894 of file bufmgr.c.

2895 {
2896  uint32 buf_state;
2897  int buf_id;
2898  int num_to_scan;
2899  int num_spaces;
2900  int num_processed;
2901  int num_written;
2902  CkptTsStatus *per_ts_stat = NULL;
2903  Oid last_tsid;
2904  binaryheap *ts_heap;
2905  int i;
2906  int mask = BM_DIRTY;
2907  WritebackContext wb_context;
2908 
2909  /*
2910  * Unless this is a shutdown checkpoint or we have been explicitly told,
2911  * we write only permanent, dirty buffers. But at shutdown or end of
2912  * recovery, we write all dirty buffers.
2913  */
2916  mask |= BM_PERMANENT;
2917 
2918  /*
2919  * Loop over all buffers, and mark the ones that need to be written with
2920  * BM_CHECKPOINT_NEEDED. Count them as we go (num_to_scan), so that we
2921  * can estimate how much work needs to be done.
2922  *
2923  * This allows us to write only those pages that were dirty when the
2924  * checkpoint began, and not those that get dirtied while it proceeds.
2925  * Whenever a page with BM_CHECKPOINT_NEEDED is written out, either by us
2926  * later in this function, or by normal backends or the bgwriter cleaning
2927  * scan, the flag is cleared. Any buffer dirtied after this point won't
2928  * have the flag set.
2929  *
2930  * Note that if we fail to write some buffer, we may leave buffers with
2931  * BM_CHECKPOINT_NEEDED still set. This is OK since any such buffer would
2932  * certainly need to be written for the next checkpoint attempt, too.
2933  */
2934  num_to_scan = 0;
2935  for (buf_id = 0; buf_id < NBuffers; buf_id++)
2936  {
2937  BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
2938 
2939  /*
2940  * Header spinlock is enough to examine BM_DIRTY, see comment in
2941  * SyncOneBuffer.
2942  */
2943  buf_state = LockBufHdr(bufHdr);
2944 
2945  if ((buf_state & mask) == mask)
2946  {
2947  CkptSortItem *item;
2948 
2949  buf_state |= BM_CHECKPOINT_NEEDED;
2950 
2951  item = &CkptBufferIds[num_to_scan++];
2952  item->buf_id = buf_id;
2953  item->tsId = bufHdr->tag.spcOid;
2954  item->relNumber = BufTagGetRelNumber(&bufHdr->tag);
2955  item->forkNum = BufTagGetForkNum(&bufHdr->tag);
2956  item->blockNum = bufHdr->tag.blockNum;
2957  }
2958 
2959  UnlockBufHdr(bufHdr, buf_state);
2960 
2961  /* Check for barrier events in case NBuffers is large. */
2962  if (ProcSignalBarrierPending)
2963  ProcessProcSignalBarrier();
2964  }
2965 
2966  if (num_to_scan == 0)
2967  return; /* nothing to do */
2968 
2969  WritebackContextInit(&wb_context, &checkpoint_flush_after);
2970 
2971  TRACE_POSTGRESQL_BUFFER_SYNC_START(NBuffers, num_to_scan);
2972 
2973  /*
2974  * Sort buffers that need to be written to reduce the likelihood of random
2975  * IO. The sorting is also important for the implementation of balancing
2976  * writes between tablespaces. Without balancing writes we'd potentially
2977  * end up writing to the tablespaces one-by-one; possibly overloading the
2978  * underlying system.
2979  */
2980  sort_checkpoint_bufferids(CkptBufferIds, num_to_scan);
2981 
2982  num_spaces = 0;
2983 
2984  /*
2985  * Allocate progress status for each tablespace with buffers that need to
2986  * be flushed. This requires the to-be-flushed array to be sorted.
2987  */
2988  last_tsid = InvalidOid;
2989  for (i = 0; i < num_to_scan; i++)
2990  {
2991  CkptTsStatus *s;
2992  Oid cur_tsid;
2993 
2994  cur_tsid = CkptBufferIds[i].tsId;
2995 
2996  /*
2997  * Grow array of per-tablespace status structs, every time a new
2998  * tablespace is found.
2999  */
3000  if (last_tsid == InvalidOid || last_tsid != cur_tsid)
3001  {
3002  Size sz;
3003 
3004  num_spaces++;
3005 
3006  /*
3007  * Not worth adding grow-by-power-of-2 logic here - even with a
3008  * few hundred tablespaces this should be fine.
3009  */
3010  sz = sizeof(CkptTsStatus) * num_spaces;
3011 
3012  if (per_ts_stat == NULL)
3013  per_ts_stat = (CkptTsStatus *) palloc(sz);
3014  else
3015  per_ts_stat = (CkptTsStatus *) repalloc(per_ts_stat, sz);
3016 
3017  s = &per_ts_stat[num_spaces - 1];
3018  memset(s, 0, sizeof(*s));
3019  s->tsId = cur_tsid;
3020 
3021  /*
3022  * The first buffer in this tablespace. As CkptBufferIds is sorted
3023  * by tablespace all (s->num_to_scan) buffers in this tablespace
3024  * will follow afterwards.
3025  */
3026  s->index = i;
3027 
3028  /*
3029  * progress_slice will be determined once we know how many buffers
3030  * are in each tablespace, i.e. after this loop.
3031  */
3032 
3033  last_tsid = cur_tsid;
3034  }
3035  else
3036  {
3037  s = &per_ts_stat[num_spaces - 1];
3038  }
3039 
3040  s->num_to_scan++;
3041 
3042  /* Check for barrier events. */
3043  if (ProcSignalBarrierPending)
3044  ProcessProcSignalBarrier();
3045  }
3046 
3047  Assert(num_spaces > 0);
3048 
3049  /*
3050  * Build a min-heap over the write-progress in the individual tablespaces,
3051  * and compute how large a portion of the total progress a single
3052  * processed buffer is.
3053  */
3054  ts_heap = binaryheap_allocate(num_spaces,
3055  ts_ckpt_progress_comparator,
3056  NULL);
3057 
3058  for (i = 0; i < num_spaces; i++)
3059  {
3060  CkptTsStatus *ts_stat = &per_ts_stat[i];
3061 
3062  ts_stat->progress_slice = (float8) num_to_scan / ts_stat->num_to_scan;
3063 
3064  binaryheap_add_unordered(ts_heap, PointerGetDatum(ts_stat));
3065  }
3066 
3067  binaryheap_build(ts_heap);
3068 
3069  /*
3070  * Iterate through to-be-checkpointed buffers and write the ones (still)
3071  * marked with BM_CHECKPOINT_NEEDED. The writes are balanced between
3072  * tablespaces; otherwise the sorting would lead to only one tablespace
3073  * receiving writes at a time, making inefficient use of the hardware.
3074  */
3075  num_processed = 0;
3076  num_written = 0;
3077  while (!binaryheap_empty(ts_heap))
3078  {
3079  BufferDesc *bufHdr = NULL;
3080  CkptTsStatus *ts_stat = (CkptTsStatus *)
3081  DatumGetPointer(binaryheap_first(ts_heap));
3082 
3083  buf_id = CkptBufferIds[ts_stat->index].buf_id;
3084  Assert(buf_id != -1);
3085 
3086  bufHdr = GetBufferDescriptor(buf_id);
3087 
3088  num_processed++;
3089 
3090  /*
3091  * We don't need to acquire the lock here, because we're only looking
3092  * at a single bit. It's possible that someone else writes the buffer
3093  * and clears the flag right after we check, but that doesn't matter
3094  * since SyncOneBuffer will then do nothing. However, there is a
3095  * further race condition: it's conceivable that between the time we
3096  * examine the bit here and the time SyncOneBuffer acquires the lock,
3097  * someone else not only wrote the buffer but replaced it with another
3098  * page and dirtied it. In that improbable case, SyncOneBuffer will
3099  * write the buffer though we didn't need to. It doesn't seem worth
3100  * guarding against this, though.
3101  */
3102  if (pg_atomic_read_u32(&bufHdr->state) & BM_CHECKPOINT_NEEDED)
3103  {
3104  if (SyncOneBuffer(buf_id, false, &wb_context) & BUF_WRITTEN)
3105  {
3106  TRACE_POSTGRESQL_BUFFER_SYNC_WRITTEN(buf_id);
3107  PendingCheckpointerStats.buffers_written++;
3108  num_written++;
3109  }
3110  }
3111 
3112  /*
3113  * Measure progress independent of actually having to flush the buffer
3114  * - otherwise writing become unbalanced.
3115  */
3116  ts_stat->progress += ts_stat->progress_slice;
3117  ts_stat->num_scanned++;
3118  ts_stat->index++;
3119 
3120  /* Have all the buffers from the tablespace been processed? */
3121  if (ts_stat->num_scanned == ts_stat->num_to_scan)
3122  {
3123  binaryheap_remove_first(ts_heap);
3124  }
3125  else
3126  {
3127  /* update heap with the new progress */
3128  binaryheap_replace_first(ts_heap, PointerGetDatum(ts_stat));
3129  }
3130 
3131  /*
3132  * Sleep to throttle our I/O rate.
3133  *
3134  * (This will check for barrier events even if it doesn't sleep.)
3135  */
3136  CheckpointWriteDelay(flags, (double) num_processed / num_to_scan);
3137  }
3138 
3139  /*
3140  * Issue all pending flushes. Only checkpointer calls BufferSync(), so
3141  * IOContext will always be IOCONTEXT_NORMAL.
3142  */
3143  IssuePendingWritebacks(&wb_context, IOCONTEXT_NORMAL);
3144 
3145  pfree(per_ts_stat);
3146  per_ts_stat = NULL;
3147  binaryheap_free(ts_heap);
3148 
3149  /*
3150  * Update checkpoint statistics. As noted above, this doesn't include
3151  * buffers written by other backends or bgwriter scan.
3152  */
3153  CheckpointStats.ckpt_bufs_written += num_written;
3154 
3155  TRACE_POSTGRESQL_BUFFER_SYNC_DONE(NBuffers, num_written, num_to_scan);
3156 }
void binaryheap_build(binaryheap *heap)
Definition: binaryheap.c:138
void binaryheap_replace_first(binaryheap *heap, bh_node_type d)
Definition: binaryheap.c:255
bh_node_type binaryheap_first(binaryheap *heap)
Definition: binaryheap.c:177
bh_node_type binaryheap_remove_first(binaryheap *heap)
Definition: binaryheap.c:192
binaryheap * binaryheap_allocate(int capacity, binaryheap_comparator compare, void *arg)
Definition: binaryheap.c:39
void binaryheap_free(binaryheap *heap)
Definition: binaryheap.c:75
void binaryheap_add_unordered(binaryheap *heap, bh_node_type d)
Definition: binaryheap.c:116
#define binaryheap_empty(h)
Definition: binaryheap.h:65
CkptSortItem * CkptBufferIds
Definition: buf_init.c:25
static RelFileNumber BufTagGetRelNumber(const BufferTag *tag)
#define BM_CHECKPOINT_NEEDED
Definition: buf_internals.h:68
static int ts_ckpt_progress_comparator(Datum a, Datum b, void *arg)
Definition: bufmgr.c:5862
int checkpoint_flush_after
Definition: bufmgr.c:171
void WritebackContextInit(WritebackContext *context, int *max_pending)
Definition: bufmgr.c:5885
void IssuePendingWritebacks(WritebackContext *wb_context, IOContext io_context)
Definition: bufmgr.c:5942
struct CkptTsStatus CkptTsStatus
double float8
Definition: c.h:630
size_t Size
Definition: c.h:605
void CheckpointWriteDelay(int flags, double progress)
Definition: checkpointer.c:711
volatile sig_atomic_t ProcSignalBarrierPending
Definition: globals.c:39
int i
Definition: isn.c:73
void * repalloc(void *pointer, Size size)
Definition: mcxt.c:1541
void * palloc(Size size)
Definition: mcxt.c:1317
@ IOCONTEXT_NORMAL
Definition: pgstat.h:322
PgStat_CheckpointerStats PendingCheckpointerStats
static Datum PointerGetDatum(const void *X)
Definition: postgres.h:322
static Pointer DatumGetPointer(Datum X)
Definition: postgres.h:312
#define InvalidOid
Definition: postgres_ext.h:36
unsigned int Oid
Definition: postgres_ext.h:31
void ProcessProcSignalBarrier(void)
Definition: procsignal.c:496
int ckpt_bufs_written
Definition: xlog.h:167
ForkNumber forkNum
RelFileNumber relNumber
BlockNumber blockNum
float8 progress_slice
Definition: bufmgr.c:115
int index
Definition: bufmgr.c:123
int num_scanned
Definition: bufmgr.c:120
float8 progress
Definition: bufmgr.c:114
int num_to_scan
Definition: bufmgr.c:118
Oid tsId
Definition: bufmgr.c:105
PgStat_Counter buffers_written
Definition: pgstat.h:302
Oid spcOid
Definition: buf_internals.h:94
CheckpointStatsData CheckpointStats
Definition: xlog.c:208
#define CHECKPOINT_END_OF_RECOVERY
Definition: xlog.h:140
#define CHECKPOINT_FLUSH_ALL
Definition: xlog.h:143
#define CHECKPOINT_IS_SHUTDOWN
Definition: xlog.h:139

References Assert, binaryheap_add_unordered(), binaryheap_allocate(), binaryheap_build(), binaryheap_empty, binaryheap_first(), binaryheap_free(), binaryheap_remove_first(), binaryheap_replace_first(), buftag::blockNum, CkptSortItem::blockNum, BM_CHECKPOINT_NEEDED, BM_DIRTY, BM_PERMANENT, CkptSortItem::buf_id, BUF_WRITTEN, PgStat_CheckpointerStats::buffers_written, BufTagGetForkNum(), BufTagGetRelNumber(), CHECKPOINT_END_OF_RECOVERY, checkpoint_flush_after, CHECKPOINT_FLUSH_ALL, CHECKPOINT_IS_SHUTDOWN, CheckpointStats, CheckpointWriteDelay(), CheckpointStatsData::ckpt_bufs_written, CkptBufferIds, DatumGetPointer(), CkptSortItem::forkNum, GetBufferDescriptor(), i, CkptTsStatus::index, InvalidOid, IOCONTEXT_NORMAL, IssuePendingWritebacks(), LockBufHdr(), NBuffers, CkptTsStatus::num_scanned, CkptTsStatus::num_to_scan, palloc(), PendingCheckpointerStats, pfree(), pg_atomic_read_u32(), PointerGetDatum(), ProcessProcSignalBarrier(), ProcSignalBarrierPending, CkptTsStatus::progress, CkptTsStatus::progress_slice, CkptSortItem::relNumber, repalloc(), buftag::spcOid, BufferDesc::state, SyncOneBuffer(), BufferDesc::tag, ts_ckpt_progress_comparator(), CkptTsStatus::tsId, CkptSortItem::tsId, UnlockBufHdr(), and WritebackContextInit().

Referenced by CheckPointBuffers().

◆ buffertag_comparator()

static int buffertag_comparator ( const BufferTag *  ba,
const BufferTag *  bb 
)
inlinestatic

Definition at line 5797 of file bufmgr.c.

5798 {
5799  int ret;
5800  RelFileLocator rlocatora;
5801  RelFileLocator rlocatorb;
5802 
5803  rlocatora = BufTagGetRelFileLocator(ba);
5804  rlocatorb = BufTagGetRelFileLocator(bb);
5805 
5806  ret = rlocator_comparator(&rlocatora, &rlocatorb);
5807 
5808  if (ret != 0)
5809  return ret;
5810 
5811  if (BufTagGetForkNum(ba) < BufTagGetForkNum(bb))
5812  return -1;
5813  if (BufTagGetForkNum(ba) > BufTagGetForkNum(bb))
5814  return 1;
5815 
5816  if (ba->blockNum < bb->blockNum)
5817  return -1;
5818  if (ba->blockNum > bb->blockNum)
5819  return 1;
5820 
5821  return 0;
5822 }
static int rlocator_comparator(const void *p1, const void *p2)
Definition: bufmgr.c:5716

References buftag::blockNum, BufTagGetForkNum(), BufTagGetRelFileLocator(), and rlocator_comparator().

◆ CheckBufferIsPinnedOnce()

void CheckBufferIsPinnedOnce ( Buffer  buffer)

Definition at line 5187 of file bufmgr.c.

5188 {
5189  if (BufferIsLocal(buffer))
5190  {
5191  if (LocalRefCount[-buffer - 1] != 1)
5192  elog(ERROR, "incorrect local pin count: %d",
5193  LocalRefCount[-buffer - 1]);
5194  }
5195  else
5196  {
5197  if (GetPrivateRefCount(buffer) != 1)
5198  elog(ERROR, "incorrect local pin count: %d",
5199  GetPrivateRefCount(buffer));
5200  }
5201 }
#define ERROR
Definition: elog.h:39

References PrivateRefCountEntry::buffer, BufferIsLocal, elog, ERROR, GetPrivateRefCount(), and LocalRefCount.

Referenced by GetVictimBuffer(), and LockBufferForCleanup().
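
A minimal usage sketch (not taken from bufmgr.c; buf is assumed to be a Buffer already pinned exactly once by this backend):

    /* Sanity-check our single pin, then wait for a cleanup-strength lock. */
    CheckBufferIsPinnedOnce(buf);
    LockBufferForCleanup(buf);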

◆ CheckForBufferLeaks()

static void CheckForBufferLeaks ( void  )
static

Definition at line 3601 of file bufmgr.c.

3602 {
3603 #ifdef USE_ASSERT_CHECKING
3604  int RefCountErrors = 0;
3605  PrivateRefCountEntry *res;
3606  int i;
3607  char *s;
3608 
3609  /* check the array */
3610  for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
3611  {
3612  res = &PrivateRefCountArray[i];
3613 
3614  if (res->buffer != InvalidBuffer)
3615  {
3616  s = DebugPrintBufferRefcount(res->buffer);
3617  elog(WARNING, "buffer refcount leak: %s", s);
3618  pfree(s);
3619 
3620  RefCountErrors++;
3621  }
3622  }
3623 
3624  /* if necessary search the hash */
3625  if (PrivateRefCountOverflowed)
3626  {
3627  HASH_SEQ_STATUS hstat;
3628 
3629  hash_seq_init(&hstat, PrivateRefCountHash);
3630  while ((res = (PrivateRefCountEntry *) hash_seq_search(&hstat)) != NULL)
3631  {
3632  s = DebugPrintBufferRefcount(res->buffer);
3633  elog(WARNING, "buffer refcount leak: %s", s);
3634  pfree(s);
3635  RefCountErrors++;
3636  }
3637  }
3638 
3639  Assert(RefCountErrors == 0);
3640 #endif
3641 }
#define InvalidBuffer
Definition: buf.h:25
char * DebugPrintBufferRefcount(Buffer buffer)
Definition: bufmgr.c:3647
#define REFCOUNT_ARRAY_ENTRIES
Definition: bufmgr.c:96
static struct PrivateRefCountEntry PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES]
Definition: bufmgr.c:208
static HTAB * PrivateRefCountHash
Definition: bufmgr.c:209
void * hash_seq_search(HASH_SEQ_STATUS *status)
Definition: dynahash.c:1420
void hash_seq_init(HASH_SEQ_STATUS *status, HTAB *hashp)
Definition: dynahash.c:1385

References Assert, DebugPrintBufferRefcount(), elog, hash_seq_init(), hash_seq_search(), i, InvalidBuffer, pfree(), PrivateRefCountArray, PrivateRefCountHash, PrivateRefCountOverflowed, REFCOUNT_ARRAY_ENTRIES, res, and WARNING.

Referenced by AtEOXact_Buffers(), and AtProcExit_Buffers().

◆ CheckPointBuffers()

void CheckPointBuffers ( int  flags)

Definition at line 3692 of file bufmgr.c.

3693 {
3694  BufferSync(flags);
3695 }
static void BufferSync(int flags)
Definition: bufmgr.c:2894

References BufferSync().

Referenced by CheckPointGuts().

◆ ckpt_buforder_comparator()

static int ckpt_buforder_comparator ( const CkptSortItem a,
const CkptSortItem b 
)
inlinestatic

Definition at line 5831 of file bufmgr.c.

5832 {
5833  /* compare tablespace */
5834  if (a->tsId < b->tsId)
5835  return -1;
5836  else if (a->tsId > b->tsId)
5837  return 1;
5838  /* compare relation */
5839  if (a->relNumber < b->relNumber)
5840  return -1;
5841  else if (a->relNumber > b->relNumber)
5842  return 1;
5843  /* compare fork */
5844  else if (a->forkNum < b->forkNum)
5845  return -1;
5846  else if (a->forkNum > b->forkNum)
5847  return 1;
5848  /* compare block number */
5849  else if (a->blockNum < b->blockNum)
5850  return -1;
5851  else if (a->blockNum > b->blockNum)
5852  return 1;
5853  /* equal page IDs are unlikely, but not impossible */
5854  return 0;
5855 }
int b
Definition: isn.c:70
int a
Definition: isn.c:69

References a, and b.

◆ ConditionalLockBuffer()

bool ConditionalLockBuffer ( Buffer  buffer)

Definition at line 5166 of file bufmgr.c.

5167 {
5168  BufferDesc *buf;
5169 
5170  Assert(BufferIsPinned(buffer));
5171  if (BufferIsLocal(buffer))
5172  return true; /* act as though we got it */
5173 
5174  buf = GetBufferDescriptor(buffer - 1);
5175 
5176  return LWLockConditionalAcquire(BufferDescriptorGetContentLock(buf),
5177  LW_EXCLUSIVE);
5178 }
bool LWLockConditionalAcquire(LWLock *lock, LWLockMode mode)
Definition: lwlock.c:1339

References Assert, buf, PrivateRefCountEntry::buffer, BufferDescriptorGetContentLock(), BufferIsLocal, BufferIsPinned, GetBufferDescriptor(), LW_EXCLUSIVE, and LWLockConditionalAcquire().

Referenced by _bt_conditionallockbuf(), BloomNewBuffer(), ConditionalLockBufferForCleanup(), GinNewBuffer(), gistNewBuffer(), RelationGetBufferForTuple(), spgdoinsert(), SpGistGetBuffer(), SpGistNewBuffer(), and SpGistUpdateMetaPage().
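
A hedged sketch of the typical try-lock pattern (the buffer variable and the skip branch are illustrative, not from bufmgr.c):

    /* Attempt the content lock without blocking. */
    if (ConditionalLockBuffer(buf))
    {
        /* ... modify the pinned page ... */
        LockBuffer(buf, BUFFER_LOCK_UNLOCK);
    }
    else
    {
        /* lock unavailable; skip this page or try again later */
    }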

◆ ConditionalLockBufferForCleanup()

bool ConditionalLockBufferForCleanup ( Buffer  buffer)

Definition at line 5381 of file bufmgr.c.

5382 {
5383  BufferDesc *bufHdr;
5384  uint32 buf_state,
5385  refcount;
5386 
5387  Assert(BufferIsValid(buffer));
5388 
5389  if (BufferIsLocal(buffer))
5390  {
5391  refcount = LocalRefCount[-buffer - 1];
5392  /* There should be exactly one pin */
5393  Assert(refcount > 0);
5394  if (refcount != 1)
5395  return false;
5396  /* Nobody else to wait for */
5397  return true;
5398  }
5399 
5400  /* There should be exactly one local pin */
5401  refcount = GetPrivateRefCount(buffer);
5402  Assert(refcount);
5403  if (refcount != 1)
5404  return false;
5405 
5406  /* Try to acquire lock */
5407  if (!ConditionalLockBuffer(buffer))
5408  return false;
5409 
5410  bufHdr = GetBufferDescriptor(buffer - 1);
5411  buf_state = LockBufHdr(bufHdr);
5412  refcount = BUF_STATE_GET_REFCOUNT(buf_state);
5413 
5414  Assert(refcount > 0);
5415  if (refcount == 1)
5416  {
5417  /* Successfully acquired exclusive lock with pincount 1 */
5418  UnlockBufHdr(bufHdr, buf_state);
5419  return true;
5420  }
5421 
5422  /* Failed, so release the lock */
5423  UnlockBufHdr(bufHdr, buf_state);
5424  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
5425  return false;
5426 }
bool ConditionalLockBuffer(Buffer buffer)
Definition: bufmgr.c:5166
void LockBuffer(Buffer buffer, int mode)
Definition: bufmgr.c:5140
#define BUFFER_LOCK_UNLOCK
Definition: bufmgr.h:189

References Assert, BUF_STATE_GET_REFCOUNT, PrivateRefCountEntry::buffer, BUFFER_LOCK_UNLOCK, BufferIsLocal, BufferIsValid(), ConditionalLockBuffer(), GetBufferDescriptor(), GetPrivateRefCount(), LocalRefCount, LockBuffer(), LockBufHdr(), PrivateRefCountEntry::refcount, and UnlockBufHdr().

Referenced by _hash_finish_split(), _hash_getbuf_with_condlock_cleanup(), heap_page_prune_opt(), and lazy_scan_heap().
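
A sketch of the opportunistic-cleanup pattern used by callers such as heap_page_prune_opt(); the pruning step is a placeholder, and buf is assumed to be pinned by this backend:

    if (ConditionalLockBufferForCleanup(buf))
    {
        /* exclusive lock held and we are the sole pinner: safe to prune */
        /* ... prune / defragment the page ... */
        UnlockReleaseBuffer(buf);
    }
    else
        ReleaseBuffer(buf);    /* page is busy elsewhere; skip it */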

◆ CreateAndCopyRelationData()

void CreateAndCopyRelationData ( RelFileLocator  src_rlocator,
RelFileLocator  dst_rlocator,
bool  permanent 
)

Definition at line 4780 of file bufmgr.c.

4782 {
4783  char relpersistence;
4784  SMgrRelation src_rel;
4785  SMgrRelation dst_rel;
4786 
4787  /* Set the relpersistence. */
4788  relpersistence = permanent ?
4789  RELPERSISTENCE_PERMANENT : RELPERSISTENCE_UNLOGGED;
4790 
4791  src_rel = smgropen(src_rlocator, INVALID_PROC_NUMBER);
4792  dst_rel = smgropen(dst_rlocator, INVALID_PROC_NUMBER);
4793 
4794  /*
4795  * Create and copy all forks of the relation. During create database we
4796  * have a separate cleanup mechanism which deletes complete database
4797  * directory. Therefore, each individual relation doesn't need to be
4798  * registered for cleanup.
4799  */
4800  RelationCreateStorage(dst_rlocator, relpersistence, false);
4801 
4802  /* copy main fork. */
4803  RelationCopyStorageUsingBuffer(src_rlocator, dst_rlocator, MAIN_FORKNUM,
4804  permanent);
4805 
4806  /* copy those extra forks that exist */
4807  for (ForkNumber forkNum = MAIN_FORKNUM + 1;
4808  forkNum <= MAX_FORKNUM; forkNum++)
4809  {
4810  if (smgrexists(src_rel, forkNum))
4811  {
4812  smgrcreate(dst_rel, forkNum, false);
4813 
4814  /*
4815  * WAL log creation if the relation is persistent, or this is the
4816  * init fork of an unlogged relation.
4817  */
4818  if (permanent || forkNum == INIT_FORKNUM)
4819  log_smgrcreate(&dst_rlocator, forkNum);
4820 
4821  /* Copy a fork's data, block by block. */
4822  RelationCopyStorageUsingBuffer(src_rlocator, dst_rlocator, forkNum,
4823  permanent);
4824  }
4825  }
4826 }
static void RelationCopyStorageUsingBuffer(RelFileLocator srclocator, RelFileLocator dstlocator, ForkNumber forkNum, bool permanent)
Definition: bufmgr.c:4672
#define INVALID_PROC_NUMBER
Definition: procnumber.h:26
ForkNumber
Definition: relpath.h:56
@ MAIN_FORKNUM
Definition: relpath.h:58
#define MAX_FORKNUM
Definition: relpath.h:70
SMgrRelation smgropen(RelFileLocator rlocator, ProcNumber backend)
Definition: smgr.c:198
void smgrcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo)
Definition: smgr.c:411
bool smgrexists(SMgrRelation reln, ForkNumber forknum)
Definition: smgr.c:398
SMgrRelation RelationCreateStorage(RelFileLocator rlocator, char relpersistence, bool register_delete)
Definition: storage.c:121
void log_smgrcreate(const RelFileLocator *rlocator, ForkNumber forkNum)
Definition: storage.c:186

References INIT_FORKNUM, INVALID_PROC_NUMBER, log_smgrcreate(), MAIN_FORKNUM, MAX_FORKNUM, RelationCopyStorageUsingBuffer(), RelationCreateStorage(), smgrcreate(), smgrexists(), and smgropen().

Referenced by CreateDatabaseUsingWalLog().

◆ DebugPrintBufferRefcount()

char* DebugPrintBufferRefcount ( Buffer  buffer)

Definition at line 3647 of file bufmgr.c.

3648 {
3649  BufferDesc *buf;
3650  int32 loccount;
3651  char *path;
3652  char *result;
3653  ProcNumber backend;
3654  uint32 buf_state;
3655 
3656  Assert(BufferIsValid(buffer));
3657  if (BufferIsLocal(buffer))
3658  {
3659  buf = GetLocalBufferDescriptor(-buffer - 1);
3660  loccount = LocalRefCount[-buffer - 1];
3661  backend = MyProcNumber;
3662  }
3663  else
3664  {
3665  buf = GetBufferDescriptor(buffer - 1);
3666  loccount = GetPrivateRefCount(buffer);
3667  backend = INVALID_PROC_NUMBER;
3668  }
3669 
3670  /* theoretically we should lock the bufhdr here */
3671  path = relpathbackend(BufTagGetRelFileLocator(&buf->tag), backend,
3672  BufTagGetForkNum(&buf->tag));
3673  buf_state = pg_atomic_read_u32(&buf->state);
3674 
3675  result = psprintf("[%03d] (rel=%s, blockNum=%u, flags=0x%x, refcount=%u %d)",
3676  buffer, path,
3677  buf->tag.blockNum, buf_state & BUF_FLAG_MASK,
3678  BUF_STATE_GET_REFCOUNT(buf_state), loccount);
3679  pfree(path);
3680  return result;
3681 }
#define BUF_FLAG_MASK
Definition: buf_internals.h:48
ProcNumber MyProcNumber
Definition: globals.c:89
int ProcNumber
Definition: procnumber.h:24
char * psprintf(const char *fmt,...)
Definition: psprintf.c:46
#define relpathbackend(rlocator, backend, forknum)
Definition: relpath.h:93

References Assert, buf, BUF_FLAG_MASK, BUF_STATE_GET_REFCOUNT, PrivateRefCountEntry::buffer, BufferIsLocal, BufferIsValid(), BufTagGetForkNum(), BufTagGetRelFileLocator(), GetBufferDescriptor(), GetLocalBufferDescriptor(), GetPrivateRefCount(), INVALID_PROC_NUMBER, LocalRefCount, MyProcNumber, pfree(), pg_atomic_read_u32(), psprintf(), and relpathbackend.

Referenced by CheckForBufferLeaks(), CheckForLocalBufferLeaks(), and ResOwnerPrintBufferPin().
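
A minimal sketch of how the result is consumed (mirroring the leak checks above); the string is palloc'd and must be freed by the caller:

    char *s = DebugPrintBufferRefcount(buffer);

    elog(WARNING, "buffer refcount leak: %s", s);
    pfree(s);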

◆ DropDatabaseBuffers()

void DropDatabaseBuffers ( Oid  dbid)

Definition at line 4368 of file bufmgr.c.

4369 {
4370  int i;
4371 
4372  /*
4373  * We needn't consider local buffers, since by assumption the target
4374  * database isn't our own.
4375  */
4376 
4377  for (i = 0; i < NBuffers; i++)
4378  {
4379  BufferDesc *bufHdr = GetBufferDescriptor(i);
4380  uint32 buf_state;
4381 
4382  /*
4383  * As in DropRelationBuffers, an unlocked precheck should be safe and
4384  * saves some cycles.
4385  */
4386  if (bufHdr->tag.dbOid != dbid)
4387  continue;
4388 
4389  buf_state = LockBufHdr(bufHdr);
4390  if (bufHdr->tag.dbOid == dbid)
4391  InvalidateBuffer(bufHdr); /* releases spinlock */
4392  else
4393  UnlockBufHdr(bufHdr, buf_state);
4394  }
4395 }
static void InvalidateBuffer(BufferDesc *buf)
Definition: bufmgr.c:1766
Oid dbOid
Definition: buf_internals.h:95

References buftag::dbOid, GetBufferDescriptor(), i, InvalidateBuffer(), LockBufHdr(), NBuffers, BufferDesc::tag, and UnlockBufHdr().

Referenced by createdb_failure_callback(), dbase_redo(), dropdb(), and movedb().

◆ DropRelationBuffers()

void DropRelationBuffers ( SMgrRelation  smgr_reln,
ForkNumber forkNum,
int  nforks,
BlockNumber firstDelBlock 
)

Definition at line 4013 of file bufmgr.c.

4015 {
4016  int i;
4017  int j;
4018  RelFileLocatorBackend rlocator;
4019  BlockNumber nForkBlock[MAX_FORKNUM];
4020  uint64 nBlocksToInvalidate = 0;
4021 
4022  rlocator = smgr_reln->smgr_rlocator;
4023 
4024  /* If it's a local relation, it's localbuf.c's problem. */
4025  if (RelFileLocatorBackendIsTemp(rlocator))
4026  {
4027  if (rlocator.backend == MyProcNumber)
4028  {
4029  for (j = 0; j < nforks; j++)
4030  DropRelationLocalBuffers(rlocator.locator, forkNum[j],
4031  firstDelBlock[j]);
4032  }
4033  return;
4034  }
4035 
4036  /*
4037  * To remove all the pages of the specified relation forks from the buffer
4038  * pool, we need to scan the entire buffer pool but we can optimize it by
4039  * finding the buffers from BufMapping table provided we know the exact
4040  * size of each fork of the relation. The exact size is required to ensure
4041  * that we don't leave any buffer for the relation being dropped as
4042  * otherwise the background writer or checkpointer can lead to a PANIC
4043  * error while flushing buffers corresponding to files that don't exist.
4044  *
4045  * To know the exact size, we rely on the size cached for each fork by us
4046  * during recovery which limits the optimization to recovery and on
4047  * standbys but we can easily extend it once we have shared cache for
4048  * relation size.
4049  *
4050  * In recovery, we cache the value returned by the first lseek(SEEK_END)
4051  * and the future writes keeps the cached value up-to-date. See
4052  * smgrextend. It is possible that the value of the first lseek is smaller
4053  * than the actual number of existing blocks in the file due to buggy
4054  * Linux kernels that might not have accounted for the recent write. But
4055  * that should be fine because there must not be any buffers after that
4056  * file size.
4057  */
4058  for (i = 0; i < nforks; i++)
4059  {
4060  /* Get the number of blocks for a relation's fork */
4061  nForkBlock[i] = smgrnblocks_cached(smgr_reln, forkNum[i]);
4062 
4063  if (nForkBlock[i] == InvalidBlockNumber)
4064  {
4065  nBlocksToInvalidate = InvalidBlockNumber;
4066  break;
4067  }
4068 
4069  /* calculate the number of blocks to be invalidated */
4070  nBlocksToInvalidate += (nForkBlock[i] - firstDelBlock[i]);
4071  }
4072 
4073  /*
4074  * We apply the optimization iff the total number of blocks to invalidate
4075  * is below the BUF_DROP_FULL_SCAN_THRESHOLD.
4076  */
4077  if (BlockNumberIsValid(nBlocksToInvalidate) &&
4078  nBlocksToInvalidate < BUF_DROP_FULL_SCAN_THRESHOLD)
4079  {
4080  for (j = 0; j < nforks; j++)
4081  FindAndDropRelationBuffers(rlocator.locator, forkNum[j],
4082  nForkBlock[j], firstDelBlock[j]);
4083  return;
4084  }
4085 
4086  for (i = 0; i < NBuffers; i++)
4087  {
4088  BufferDesc *bufHdr = GetBufferDescriptor(i);
4089  uint32 buf_state;
4090 
4091  /*
4092  * We can make this a tad faster by prechecking the buffer tag before
4093  * we attempt to lock the buffer; this saves a lot of lock
4094  * acquisitions in typical cases. It should be safe because the
4095  * caller must have AccessExclusiveLock on the relation, or some other
4096  * reason to be certain that no one is loading new pages of the rel
4097  * into the buffer pool. (Otherwise we might well miss such pages
4098  * entirely.) Therefore, while the tag might be changing while we
4099  * look at it, it can't be changing *to* a value we care about, only
4100  * *away* from such a value. So false negatives are impossible, and
4101  * false positives are safe because we'll recheck after getting the
4102  * buffer lock.
4103  *
4104  * We could check forkNum and blockNum as well as the rlocator, but
4105  * the incremental win from doing so seems small.
4106  */
4107  if (!BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator.locator))
4108  continue;
4109 
4110  buf_state = LockBufHdr(bufHdr);
4111 
4112  for (j = 0; j < nforks; j++)
4113  {
4114  if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator.locator) &&
4115  BufTagGetForkNum(&bufHdr->tag) == forkNum[j] &&
4116  bufHdr->tag.blockNum >= firstDelBlock[j])
4117  {
4118  InvalidateBuffer(bufHdr); /* releases spinlock */
4119  break;
4120  }
4121  }
4122  if (j >= nforks)
4123  UnlockBufHdr(bufHdr, buf_state);
4124  }
4125 }
uint32 BlockNumber
Definition: block.h:31
#define InvalidBlockNumber
Definition: block.h:33
static bool BlockNumberIsValid(BlockNumber blockNumber)
Definition: block.h:71
static bool BufTagMatchesRelFileLocator(const BufferTag *tag, const RelFileLocator *rlocator)
#define BUF_DROP_FULL_SCAN_THRESHOLD
Definition: bufmgr.c:87
static void FindAndDropRelationBuffers(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber nForkBlock, BlockNumber firstDelBlock)
Definition: bufmgr.c:4307
int j
Definition: isn.c:74
void DropRelationLocalBuffers(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber firstDelBlock)
Definition: localbuf.c:489
#define RelFileLocatorBackendIsTemp(rlocator)
BlockNumber smgrnblocks_cached(SMgrRelation reln, ForkNumber forknum)
Definition: smgr.c:679

References RelFileLocatorBackend::backend, buftag::blockNum, BlockNumberIsValid(), BUF_DROP_FULL_SCAN_THRESHOLD, BufTagGetForkNum(), BufTagMatchesRelFileLocator(), DropRelationLocalBuffers(), FindAndDropRelationBuffers(), GetBufferDescriptor(), i, InvalidateBuffer(), InvalidBlockNumber, j, RelFileLocatorBackend::locator, LockBufHdr(), MAX_FORKNUM, MyProcNumber, NBuffers, RelFileLocatorBackendIsTemp, SMgrRelationData::smgr_rlocator, smgrnblocks_cached(), BufferDesc::tag, and UnlockBufHdr().

Referenced by smgrtruncate().

◆ DropRelationsAllBuffers()

void DropRelationsAllBuffers ( SMgrRelation smgr_reln,
int  nlocators 
)

Definition at line 4136 of file bufmgr.c.

4137 {
4138  int i;
4139  int n = 0;
4140  SMgrRelation *rels;
4141  BlockNumber (*block)[MAX_FORKNUM + 1];
4142  uint64 nBlocksToInvalidate = 0;
4143  RelFileLocator *locators;
4144  bool cached = true;
4145  bool use_bsearch;
4146 
4147  if (nlocators == 0)
4148  return;
4149 
4150  rels = palloc(sizeof(SMgrRelation) * nlocators); /* non-local relations */
4151 
4152  /* If it's a local relation, it's localbuf.c's problem. */
4153  for (i = 0; i < nlocators; i++)
4154  {
4155  if (RelFileLocatorBackendIsTemp(smgr_reln[i]->smgr_rlocator))
4156  {
4157  if (smgr_reln[i]->smgr_rlocator.backend == MyProcNumber)
4158  DropRelationAllLocalBuffers(smgr_reln[i]->smgr_rlocator.locator);
4159  }
4160  else
4161  rels[n++] = smgr_reln[i];
4162  }
4163 
4164  /*
4165  * If there are no non-local relations, then we're done. Release the
4166  * memory and return.
4167  */
4168  if (n == 0)
4169  {
4170  pfree(rels);
4171  return;
4172  }
4173 
4174  /*
4175  * This is used to remember the number of blocks for all the relations
4176  * forks.
4177  */
4178  block = (BlockNumber (*)[MAX_FORKNUM + 1])
4179  palloc(sizeof(BlockNumber) * n * (MAX_FORKNUM + 1));
4180 
4181  /*
4182  * We can avoid scanning the entire buffer pool if we know the exact size
4183  * of each of the given relation forks. See DropRelationBuffers.
4184  */
4185  for (i = 0; i < n && cached; i++)
4186  {
4187  for (int j = 0; j <= MAX_FORKNUM; j++)
4188  {
4189  /* Get the number of blocks for a relation's fork. */
4190  block[i][j] = smgrnblocks_cached(rels[i], j);
4191 
4192  /* We need to only consider the relation forks that exists. */
4193  if (block[i][j] == InvalidBlockNumber)
4194  {
4195  if (!smgrexists(rels[i], j))
4196  continue;
4197  cached = false;
4198  break;
4199  }
4200 
4201  /* calculate the total number of blocks to be invalidated */
4202  nBlocksToInvalidate += block[i][j];
4203  }
4204  }
4205 
4206  /*
4207  * We apply the optimization iff the total number of blocks to invalidate
4208  * is below the BUF_DROP_FULL_SCAN_THRESHOLD.
4209  */
4210  if (cached && nBlocksToInvalidate < BUF_DROP_FULL_SCAN_THRESHOLD)
4211  {
4212  for (i = 0; i < n; i++)
4213  {
4214  for (int j = 0; j <= MAX_FORKNUM; j++)
4215  {
4216  /* ignore relation forks that doesn't exist */
4217  if (!BlockNumberIsValid(block[i][j]))
4218  continue;
4219 
4220  /* drop all the buffers for a particular relation fork */
4221  FindAndDropRelationBuffers(rels[i]->smgr_rlocator.locator,
4222  j, block[i][j], 0);
4223  }
4224  }
4225 
4226  pfree(block);
4227  pfree(rels);
4228  return;
4229  }
4230 
4231  pfree(block);
4232  locators = palloc(sizeof(RelFileLocator) * n); /* non-local relations */
4233  for (i = 0; i < n; i++)
4234  locators[i] = rels[i]->smgr_rlocator.locator;
4235 
4236  /*
4237  * For low number of relations to drop just use a simple walk through, to
4238  * save the bsearch overhead. The threshold to use is rather a guess than
4239  * an exactly determined value, as it depends on many factors (CPU and RAM
4240  * speeds, amount of shared buffers etc.).
4241  */
4242  use_bsearch = n > RELS_BSEARCH_THRESHOLD;
4243 
4244  /* sort the list of rlocators if necessary */
4245  if (use_bsearch)
4246  qsort(locators, n, sizeof(RelFileLocator), rlocator_comparator);
4247 
4248  for (i = 0; i < NBuffers; i++)
4249  {
4250  RelFileLocator *rlocator = NULL;
4251  BufferDesc *bufHdr = GetBufferDescriptor(i);
4252  uint32 buf_state;
4253 
4254  /*
4255  * As in DropRelationBuffers, an unlocked precheck should be safe and
4256  * saves some cycles.
4257  */
4258 
4259  if (!use_bsearch)
4260  {
4261  int j;
4262 
4263  for (j = 0; j < n; j++)
4264  {
4265  if (BufTagMatchesRelFileLocator(&bufHdr->tag, &locators[j]))
4266  {
4267  rlocator = &locators[j];
4268  break;
4269  }
4270  }
4271  }
4272  else
4273  {
4274  RelFileLocator locator;
4275 
4276  locator = BufTagGetRelFileLocator(&bufHdr->tag);
4277  rlocator = bsearch((const void *) &(locator),
4278  locators, n, sizeof(RelFileLocator),
4279  rlocator_comparator);
4280  }
4281 
4282  /* buffer doesn't belong to any of the given relfilelocators; skip it */
4283  if (rlocator == NULL)
4284  continue;
4285 
4286  buf_state = LockBufHdr(bufHdr);
4287  if (BufTagMatchesRelFileLocator(&bufHdr->tag, rlocator))
4288  InvalidateBuffer(bufHdr); /* releases spinlock */
4289  else
4290  UnlockBufHdr(bufHdr, buf_state);
4291  }
4292 
4293  pfree(locators);
4294  pfree(rels);
4295 }
#define RELS_BSEARCH_THRESHOLD
Definition: bufmgr.c:79
if(TABLE==NULL||TABLE_index==NULL)
Definition: isn.c:77
void DropRelationAllLocalBuffers(RelFileLocator rlocator)
Definition: localbuf.c:537
#define qsort(a, b, c, d)
Definition: port.h:447

References BlockNumberIsValid(), BUF_DROP_FULL_SCAN_THRESHOLD, BufTagGetRelFileLocator(), BufTagMatchesRelFileLocator(), DropRelationAllLocalBuffers(), FindAndDropRelationBuffers(), GetBufferDescriptor(), i, if(), InvalidateBuffer(), InvalidBlockNumber, j, LockBufHdr(), MAX_FORKNUM, MyProcNumber, NBuffers, palloc(), pfree(), qsort, RelFileLocatorBackendIsTemp, RELS_BSEARCH_THRESHOLD, rlocator_comparator(), smgrexists(), smgrnblocks_cached(), BufferDesc::tag, and UnlockBufHdr().

Referenced by smgrdounlinkall().

◆ EvictUnpinnedBuffer()

bool EvictUnpinnedBuffer ( Buffer  buf)

Definition at line 6078 of file bufmgr.c.

6079 {
6080  BufferDesc *desc;
6081  uint32 buf_state;
6082  bool result;
6083 
6084  /* Make sure we can pin the buffer. */
6085  ResourceOwnerEnlarge(CurrentResourceOwner);
6086  ReservePrivateRefCountEntry();
6087 
6088  Assert(!BufferIsLocal(buf));
6089  desc = GetBufferDescriptor(buf - 1);
6090 
6091  /* Lock the header and check if it's valid. */
6092  buf_state = LockBufHdr(desc);
6093  if ((buf_state & BM_VALID) == 0)
6094  {
6095  UnlockBufHdr(desc, buf_state);
6096  return false;
6097  }
6098 
6099  /* Check that it's not pinned already. */
6100  if (BUF_STATE_GET_REFCOUNT(buf_state) > 0)
6101  {
6102  UnlockBufHdr(desc, buf_state);
6103  return false;
6104  }
6105 
6106  PinBuffer_Locked(desc); /* releases spinlock */
6107 
6108  /* If it was dirty, try to clean it once. */
6109  if (buf_state & BM_DIRTY)
6110  {
6111  LWLockAcquire(BufferDescriptorGetContentLock(desc), LW_SHARED);
6112  FlushBuffer(desc, NULL, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
6113  LWLockRelease(BufferDescriptorGetContentLock(desc));
6114  }
6115 
6116  /* This will return false if it becomes dirty or someone else pins it. */
6117  result = InvalidateVictimBuffer(desc);
6118 
6119  UnpinBuffer(desc);
6120 
6121  return result;
6122 }
static void FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object, IOContext io_context)
Definition: bufmgr.c:3766
static void PinBuffer_Locked(BufferDesc *buf)
Definition: bufmgr.c:2745
static bool InvalidateVictimBuffer(BufferDesc *buf_hdr)
Definition: bufmgr.c:1864
@ IOOBJECT_RELATION
Definition: pgstat.h:312

References Assert, BM_DIRTY, BM_VALID, buf, BUF_STATE_GET_REFCOUNT, BufferDescriptorGetContentLock(), BufferIsLocal, CurrentResourceOwner, FlushBuffer(), GetBufferDescriptor(), InvalidateVictimBuffer(), IOCONTEXT_NORMAL, IOOBJECT_RELATION, LockBufHdr(), LW_SHARED, LWLockAcquire(), LWLockRelease(), PinBuffer_Locked(), ReservePrivateRefCountEntry(), ResourceOwnerEnlarge(), UnlockBufHdr(), and UnpinBuffer().

Referenced by pg_buffercache_evict().
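
A hedged sketch of a caller sweeping the whole buffer pool, similar in spirit to pg_buffercache_evict(); the loop and counter are illustrative only:

    /* Try to evict every shared buffer that is currently unpinned. */
    int evicted = 0;

    for (Buffer buf = 1; buf <= NBuffers; buf++)
    {
        if (EvictUnpinnedBuffer(buf))
            evicted++;
    }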

◆ ExtendBufferedRel()

Buffer ExtendBufferedRel ( BufferManagerRelation  bmr,
ForkNumber  forkNum,
BufferAccessStrategy  strategy,
uint32  flags 
)

Definition at line 846 of file bufmgr.c.

850 {
851  Buffer buf;
852  uint32 extend_by = 1;
853 
854  ExtendBufferedRelBy(bmr, forkNum, strategy, flags, extend_by,
855  &buf, &extend_by);
856 
857  return buf;
858 }
BlockNumber ExtendBufferedRelBy(BufferManagerRelation bmr, ForkNumber fork, BufferAccessStrategy strategy, uint32 flags, uint32 extend_by, Buffer *buffers, uint32 *extended_by)
Definition: bufmgr.c:878

References buf, and ExtendBufferedRelBy().

Referenced by _bt_allocbuf(), _hash_getnewbuf(), BloomNewBuffer(), brinbuild(), brinbuildempty(), fill_seq_fork_with_data(), ginbuildempty(), GinNewBuffer(), gistbuildempty(), gistNewBuffer(), ReadBuffer_common(), revmap_physical_extend(), and SpGistNewBuffer().
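
A minimal sketch of single-block extension; rel is assumed to be an open, suitably locked relation, and WAL logging of the new page is omitted:

    /* Add one block and get it back exclusively locked (EB_LOCK_FIRST). */
    Buffer buf = ExtendBufferedRel(BMR_REL(rel), MAIN_FORKNUM, NULL,
                                   EB_LOCK_FIRST);
    Page   page = BufferGetPage(buf);

    PageInit(page, BufferGetPageSize(buf), 0);
    MarkBufferDirty(buf);
    UnlockReleaseBuffer(buf);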

◆ ExtendBufferedRelBy()

BlockNumber ExtendBufferedRelBy ( BufferManagerRelation  bmr,
ForkNumber  fork,
BufferAccessStrategy  strategy,
uint32  flags,
uint32  extend_by,
Buffer buffers,
uint32 extended_by 
)

Definition at line 878 of file bufmgr.c.

885 {
886  Assert((bmr.rel != NULL) != (bmr.smgr != NULL));
887  Assert(bmr.smgr == NULL || bmr.relpersistence != 0);
888  Assert(extend_by > 0);
889 
890  if (bmr.smgr == NULL)
891  {
892  bmr.smgr = RelationGetSmgr(bmr.rel);
893  bmr.relpersistence = bmr.rel->rd_rel->relpersistence;
894  }
895 
896  return ExtendBufferedRelCommon(bmr, fork, strategy, flags,
897  extend_by, InvalidBlockNumber,
898  buffers, extended_by);
899 }
static BlockNumber ExtendBufferedRelCommon(BufferManagerRelation bmr, ForkNumber fork, BufferAccessStrategy strategy, uint32 flags, uint32 extend_by, BlockNumber extend_upto, Buffer *buffers, uint32 *extended_by)
Definition: bufmgr.c:2129
static SMgrRelation RelationGetSmgr(Relation rel)
Definition: rel.h:567
struct SMgrRelationData * smgr
Definition: bufmgr.h:103
Form_pg_class rd_rel
Definition: rel.h:111

References Assert, ExtendBufferedRelCommon(), InvalidBlockNumber, RelationData::rd_rel, BufferManagerRelation::rel, RelationGetSmgr(), BufferManagerRelation::relpersistence, and BufferManagerRelation::smgr.

Referenced by ExtendBufferedRel(), and RelationAddBlocks().
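
A sketch of bulk extension in the style of RelationAddBlocks(); the request size of 64 pages is illustrative, and the returned buffers are simply released here:

    Buffer      victim_buffers[64];
    uint32      extended_by = 0;
    BlockNumber first_block;

    first_block = ExtendBufferedRelBy(BMR_REL(rel), MAIN_FORKNUM, NULL,
                                      0, 64, victim_buffers, &extended_by);

    /* extended_by reports how many blocks were actually added */
    for (uint32 i = 0; i < extended_by; i++)
        ReleaseBuffer(victim_buffers[i]);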

◆ ExtendBufferedRelCommon()

static BlockNumber ExtendBufferedRelCommon ( BufferManagerRelation  bmr,
ForkNumber  fork,
BufferAccessStrategy  strategy,
uint32  flags,
uint32  extend_by,
BlockNumber  extend_upto,
Buffer buffers,
uint32 extended_by 
)
static

Definition at line 2129 of file bufmgr.c.

2137 {
2138  BlockNumber first_block;
2139 
2140  TRACE_POSTGRESQL_BUFFER_EXTEND_START(fork,
2141  bmr.smgr->smgr_rlocator.locator.spcOid,
2142  bmr.smgr->smgr_rlocator.locator.dbOid,
2143  bmr.smgr->smgr_rlocator.locator.relNumber,
2144  bmr.smgr->smgr_rlocator.backend,
2145  extend_by);
2146 
2147  if (bmr.relpersistence == RELPERSISTENCE_TEMP)
2148  first_block = ExtendBufferedRelLocal(bmr, fork, flags,
2149  extend_by, extend_upto,
2150  buffers, &extend_by);
2151  else
2152  first_block = ExtendBufferedRelShared(bmr, fork, strategy, flags,
2153  extend_by, extend_upto,
2154  buffers, &extend_by);
2155  *extended_by = extend_by;
2156 
2157  TRACE_POSTGRESQL_BUFFER_EXTEND_DONE(fork,
2158  bmr.smgr->smgr_rlocator.locator.spcOid,
2159  bmr.smgr->smgr_rlocator.locator.dbOid,
2160  bmr.smgr->smgr_rlocator.locator.relNumber,
2161  bmr.smgr->smgr_rlocator.backend,
2162  *extended_by,
2163  first_block);
2164 
2165  return first_block;
2166 }
static BlockNumber ExtendBufferedRelShared(BufferManagerRelation bmr, ForkNumber fork, BufferAccessStrategy strategy, uint32 flags, uint32 extend_by, BlockNumber extend_upto, Buffer *buffers, uint32 *extended_by)
Definition: bufmgr.c:2173
BlockNumber ExtendBufferedRelLocal(BufferManagerRelation bmr, ForkNumber fork, uint32 flags, uint32 extend_by, BlockNumber extend_upto, Buffer *buffers, uint32 *extended_by)
Definition: localbuf.c:313
RelFileNumber relNumber

References RelFileLocatorBackend::backend, RelFileLocator::dbOid, ExtendBufferedRelLocal(), ExtendBufferedRelShared(), RelFileLocatorBackend::locator, RelFileLocator::relNumber, BufferManagerRelation::relpersistence, BufferManagerRelation::smgr, SMgrRelationData::smgr_rlocator, and RelFileLocator::spcOid.

Referenced by ExtendBufferedRelBy(), and ExtendBufferedRelTo().

◆ ExtendBufferedRelShared()

static BlockNumber ExtendBufferedRelShared ( BufferManagerRelation  bmr,
ForkNumber  fork,
BufferAccessStrategy  strategy,
uint32  flags,
uint32  extend_by,
BlockNumber  extend_upto,
Buffer buffers,
uint32 extended_by 
)
static

Definition at line 2173 of file bufmgr.c.

2181 {
2182  BlockNumber first_block;
2183  IOContext io_context = IOContextForStrategy(strategy);
2184  instr_time io_start;
2185 
2186  LimitAdditionalPins(&extend_by);
2187 
2188  /*
2189  * Acquire victim buffers for extension without holding extension lock.
2190  * Writing out victim buffers is the most expensive part of extending the
2191  * relation, particularly when doing so requires WAL flushes. Zeroing out
2192  * the buffers is also quite expensive, so do that before holding the
2193  * extension lock as well.
2194  *
2195  * These pages are pinned by us and not valid. While we hold the pin they
2196  * can't be acquired as victim buffers by another backend.
2197  */
2198  for (uint32 i = 0; i < extend_by; i++)
2199  {
2200  Block buf_block;
2201 
2202  buffers[i] = GetVictimBuffer(strategy, io_context);
2203  buf_block = BufHdrGetBlock(GetBufferDescriptor(buffers[i] - 1));
2204 
2205  /* new buffers are zero-filled */
2206  MemSet((char *) buf_block, 0, BLCKSZ);
2207  }
2208 
2209  /*
2210  * Lock relation against concurrent extensions, unless requested not to.
2211  *
2212  * We use the same extension lock for all forks. That's unnecessarily
2213  * restrictive, but currently extensions for forks don't happen often
2214  * enough to make it worth locking more granularly.
2215  *
2216  * Note that another backend might have extended the relation by the time
2217  * we get the lock.
2218  */
2219  if (!(flags & EB_SKIP_EXTENSION_LOCK))
2220  LockRelationForExtension(bmr.rel, ExclusiveLock);
2221 
2222  /*
2223  * If requested, invalidate size cache, so that smgrnblocks asks the
2224  * kernel.
2225  */
2226  if (flags & EB_CLEAR_SIZE_CACHE)
2227  bmr.smgr->smgr_cached_nblocks[fork] = InvalidBlockNumber;
2228 
2229  first_block = smgrnblocks(bmr.smgr, fork);
2230 
2231  /*
2232  * Now that we have the accurate relation size, check if the caller wants
2233  * us to extend to only up to a specific size. If there were concurrent
2234  * extensions, we might have acquired too many buffers and need to release
2235  * them.
2236  */
2237  if (extend_upto != InvalidBlockNumber)
2238  {
2239  uint32 orig_extend_by = extend_by;
2240 
2241  if (first_block > extend_upto)
2242  extend_by = 0;
2243  else if ((uint64) first_block + extend_by > extend_upto)
2244  extend_by = extend_upto - first_block;
2245 
2246  for (uint32 i = extend_by; i < orig_extend_by; i++)
2247  {
2248  BufferDesc *buf_hdr = GetBufferDescriptor(buffers[i] - 1);
2249 
2250  /*
2251  * The victim buffer we acquired previously is clean and unused,
2252  * let it be found again quickly
2253  */
2254  StrategyFreeBuffer(buf_hdr);
2255  UnpinBuffer(buf_hdr);
2256  }
2257 
2258  if (extend_by == 0)
2259  {
2260  if (!(flags & EB_SKIP_EXTENSION_LOCK))
2261  UnlockRelationForExtension(bmr.rel, ExclusiveLock);
2262  *extended_by = extend_by;
2263  return first_block;
2264  }
2265  }
2266 
2267  /* Fail if relation is already at maximum possible length */
2268  if ((uint64) first_block + extend_by >= MaxBlockNumber)
2269  ereport(ERROR,
2270  (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
2271  errmsg("cannot extend relation %s beyond %u blocks",
2272  relpath(bmr.smgr->smgr_rlocator, fork),
2273  MaxBlockNumber)));
2274 
2275  /*
2276  * Insert buffers into buffer table, mark as IO_IN_PROGRESS.
2277  *
2278  * This needs to happen before we extend the relation, because as soon as
2279  * we do, other backends can start to read in those pages.
2280  */
2281  for (uint32 i = 0; i < extend_by; i++)
2282  {
2283  Buffer victim_buf = buffers[i];
2284  BufferDesc *victim_buf_hdr = GetBufferDescriptor(victim_buf - 1);
2285  BufferTag tag;
2286  uint32 hash;
2287  LWLock *partition_lock;
2288  int existing_id;
2289 
2290  /* in case we need to pin an existing buffer below */
2291  ResourceOwnerEnlarge(CurrentResourceOwner);
2292  ReservePrivateRefCountEntry();
2293 
2294  InitBufferTag(&tag, &bmr.smgr->smgr_rlocator.locator, fork, first_block + i);
2295  hash = BufTableHashCode(&tag);
2296  partition_lock = BufMappingPartitionLock(hash);
2297 
2298  LWLockAcquire(partition_lock, LW_EXCLUSIVE);
2299 
2300  existing_id = BufTableInsert(&tag, hash, victim_buf_hdr->buf_id);
2301 
2302  /*
2303  * We get here only in the corner case where we are trying to extend
2304  * the relation but we found a pre-existing buffer. This can happen
2305  * because a prior attempt at extending the relation failed, and
2306  * because mdread doesn't complain about reads beyond EOF (when
2307  * zero_damaged_pages is ON) and so a previous attempt to read a block
2308  * beyond EOF could have left a "valid" zero-filled buffer.
2309  * Unfortunately, we have also seen this case occurring because of
2310  * buggy Linux kernels that sometimes return an lseek(SEEK_END) result
2311  * that doesn't account for a recent write. In that situation, the
2312  * pre-existing buffer would contain valid data that we don't want to
2313  * overwrite. Since the legitimate cases should always have left a
2314  * zero-filled buffer, complain if not PageIsNew.
2315  */
2316  if (existing_id >= 0)
2317  {
2318  BufferDesc *existing_hdr = GetBufferDescriptor(existing_id);
2319  Block buf_block;
2320  bool valid;
2321 
2322  /*
2323  * Pin the existing buffer before releasing the partition lock,
2324  * preventing it from being evicted.
2325  */
2326  valid = PinBuffer(existing_hdr, strategy);
2327 
2328  LWLockRelease(partition_lock);
2329 
2330  /*
2331  * The victim buffer we acquired previously is clean and unused,
2332  * let it be found again quickly
2333  */
2334  StrategyFreeBuffer(victim_buf_hdr);
2335  UnpinBuffer(victim_buf_hdr);
2336 
2337  buffers[i] = BufferDescriptorGetBuffer(existing_hdr);
2338  buf_block = BufHdrGetBlock(existing_hdr);
2339 
2340  if (valid && !PageIsNew((Page) buf_block))
2341  ereport(ERROR,
2342  (errmsg("unexpected data beyond EOF in block %u of relation %s",
2343  existing_hdr->tag.blockNum, relpath(bmr.smgr->smgr_rlocator, fork)),
2344  errhint("This has been seen to occur with buggy kernels; consider updating your system.")));
2345 
2346  /*
2347  * We *must* do smgr[zero]extend before succeeding, else the page
2348  * will not be reserved by the kernel, and the next P_NEW call
2349  * will decide to return the same page. Clear the BM_VALID bit,
2350  * do StartBufferIO() and proceed.
2351  *
2352  * Loop to handle the very small possibility that someone re-sets
2353  * BM_VALID between our clearing it and StartBufferIO inspecting
2354  * it.
2355  */
2356  do
2357  {
2358  uint32 buf_state = LockBufHdr(existing_hdr);
2359 
2360  buf_state &= ~BM_VALID;
2361  UnlockBufHdr(existing_hdr, buf_state);
2362  } while (!StartBufferIO(existing_hdr, true, false));
2363  }
2364  else
2365  {
2366  uint32 buf_state;
2367 
2368  buf_state = LockBufHdr(victim_buf_hdr);
2369 
2370  /* some sanity checks while we hold the buffer header lock */
2371  Assert(!(buf_state & (BM_VALID | BM_TAG_VALID | BM_DIRTY | BM_JUST_DIRTIED)));
2372  Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 1);
2373 
2374  victim_buf_hdr->tag = tag;
2375 
2376  buf_state |= BM_TAG_VALID | BUF_USAGECOUNT_ONE;
2377  if (bmr.relpersistence == RELPERSISTENCE_PERMANENT || fork == INIT_FORKNUM)
2378  buf_state |= BM_PERMANENT;
2379 
2380  UnlockBufHdr(victim_buf_hdr, buf_state);
2381 
2382  LWLockRelease(partition_lock);
2383 
2384  /* XXX: could combine the locked operations in it with the above */
2385  StartBufferIO(victim_buf_hdr, true, false);
2386  }
2387  }
2388 
2389  io_start = pgstat_prepare_io_time(track_io_timing);
2390 
2391  /*
2392  * Note: if smgrzeroextend fails, we will end up with buffers that are
2393  * allocated but not marked BM_VALID. The next relation extension will
2394  * still select the same block number (because the relation didn't get any
2395  * longer on disk) and so future attempts to extend the relation will find
2396  * the same buffers (if they have not been recycled) but come right back
2397  * here to try smgrzeroextend again.
2398  *
2399  * We don't need to set checksum for all-zero pages.
2400  */
2401  smgrzeroextend(bmr.smgr, fork, first_block, extend_by, false);
2402 
2403  /*
2404  * Release the file-extension lock; it's now OK for someone else to extend
2405  * the relation some more.
2406  *
2407  * We remove IO_IN_PROGRESS after this, as waking up waiting backends can
2408  * take noticeable time.
2409  */
2410  if (!(flags & EB_SKIP_EXTENSION_LOCK))
2411  UnlockRelationForExtension(bmr.rel, ExclusiveLock);
2412 
2413  pgstat_count_io_op_time(IOOBJECT_RELATION, io_context, IOOP_EXTEND,
2414  io_start, extend_by);
2415 
2416  /* Set BM_VALID, terminate IO, and wake up any waiters */
2417  for (uint32 i = 0; i < extend_by; i++)
2418  {
2419  Buffer buf = buffers[i];
2420  BufferDesc *buf_hdr = GetBufferDescriptor(buf - 1);
2421  bool lock = false;
2422 
2423  if (flags & EB_LOCK_FIRST && i == 0)
2424  lock = true;
2425  else if (flags & EB_LOCK_TARGET)
2426  {
2427  Assert(extend_upto != InvalidBlockNumber);
2428  if (first_block + i + 1 == extend_upto)
2429  lock = true;
2430  }
2431 
2432  if (lock)
2433  LWLockAcquire(BufferDescriptorGetContentLock(buf_hdr), LW_EXCLUSIVE);
2434 
2435  TerminateBufferIO(buf_hdr, false, BM_VALID, true);
2436  }
2437 
2438  pgBufferUsage.shared_blks_written += extend_by;
2439 
2440  *extended_by = extend_by;
2441 
2442  return first_block;
2443 }
#define MaxBlockNumber
Definition: block.h:35
#define BM_JUST_DIRTIED
Definition: buf_internals.h:66
static Buffer BufferDescriptorGetBuffer(const BufferDesc *bdesc)
bool track_io_timing
Definition: bufmgr.c:143
#define BufHdrGetBlock(bufHdr)
Definition: bufmgr.c:68
void LimitAdditionalPins(uint32 *additional_pins)
Definition: bufmgr.c:2098
static bool StartBufferIO(BufferDesc *buf, bool forInput, bool nowait)
Definition: bufmgr.c:5540
void * Block
Definition: bufmgr.h:25
@ EB_LOCK_TARGET
Definition: bufmgr.h:92
@ EB_CLEAR_SIZE_CACHE
Definition: bufmgr.h:89
@ EB_SKIP_EXTENSION_LOCK
Definition: bufmgr.h:74
@ EB_LOCK_FIRST
Definition: bufmgr.h:86
Pointer Page
Definition: bufpage.h:81
static bool PageIsNew(Page page)
Definition: bufpage.h:233
#define MemSet(start, val, len)
Definition: c.h:1020
int errhint(const char *fmt,...)
Definition: elog.c:1317
IOContext IOContextForStrategy(BufferAccessStrategy strategy)
Definition: freelist.c:758
BufferUsage pgBufferUsage
Definition: instrument.c:20
void LockRelationForExtension(Relation relation, LOCKMODE lockmode)
Definition: lmgr.c:420
void UnlockRelationForExtension(Relation relation, LOCKMODE lockmode)
Definition: lmgr.c:470
#define ExclusiveLock
Definition: lockdefs.h:42
IOContext
Definition: pgstat.h:319
@ IOOP_EXTEND
Definition: pgstat.h:331
instr_time pgstat_prepare_io_time(bool track_io_guc)
Definition: pgstat_io.c:100
void pgstat_count_io_op_time(IOObject io_object, IOContext io_context, IOOp io_op, instr_time start_time, uint32 cnt)
Definition: pgstat_io.c:122
static unsigned hash(unsigned *uv, int n)
Definition: rege_dfa.c:715
#define relpath(rlocator, forknum)
Definition: relpath.h:102
BlockNumber smgrnblocks(SMgrRelation reln, ForkNumber forknum)
Definition: smgr.c:655
void smgrzeroextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, int nblocks, bool skipFsync)
Definition: smgr.c:560
int64 shared_blks_written
Definition: instrument.h:29
BlockNumber smgr_cached_nblocks[MAX_FORKNUM+1]
Definition: smgr.h:46

References Assert, buftag::blockNum, BM_DIRTY, BM_JUST_DIRTIED, BM_PERMANENT, BM_TAG_VALID, BM_VALID, buf, BufferDesc::buf_id, BUF_STATE_GET_REFCOUNT, BUF_USAGECOUNT_ONE, BufferDescriptorGetBuffer(), BufferDescriptorGetContentLock(), BufHdrGetBlock, BufMappingPartitionLock(), BufTableHashCode(), BufTableInsert(), CurrentResourceOwner, EB_CLEAR_SIZE_CACHE, EB_LOCK_FIRST, EB_LOCK_TARGET, EB_SKIP_EXTENSION_LOCK, ereport, errcode(), errhint(), errmsg(), ERROR, ExclusiveLock, GetBufferDescriptor(), GetVictimBuffer(), hash(), i, INIT_FORKNUM, InitBufferTag(), InvalidBlockNumber, IOContextForStrategy(), IOOBJECT_RELATION, IOOP_EXTEND, LimitAdditionalPins(), RelFileLocatorBackend::locator, LockBufHdr(), LockRelationForExtension(), LW_EXCLUSIVE, LWLockAcquire(), LWLockRelease(), MaxBlockNumber, MemSet, PageIsNew(), pgBufferUsage, pgstat_count_io_op_time(), pgstat_prepare_io_time(), PinBuffer(), BufferManagerRelation::rel, relpath, BufferManagerRelation::relpersistence, ReservePrivateRefCountEntry(), ResourceOwnerEnlarge(), BufferUsage::shared_blks_written, BufferManagerRelation::smgr, SMgrRelationData::smgr_cached_nblocks, SMgrRelationData::smgr_rlocator, smgrnblocks(), smgrzeroextend(), StartBufferIO(), StrategyFreeBuffer(), BufferDesc::tag, TerminateBufferIO(), track_io_timing, UnlockBufHdr(), UnlockRelationForExtension(), and UnpinBuffer().

Referenced by ExtendBufferedRelCommon().

◆ ExtendBufferedRelTo()

Buffer ExtendBufferedRelTo ( BufferManagerRelation  bmr,
ForkNumber  fork,
BufferAccessStrategy  strategy,
uint32  flags,
BlockNumber  extend_to,
ReadBufferMode  mode 
)

Definition at line 910 of file bufmgr.c.

916 {
917  BlockNumber current_size;
918  uint32 extended_by = 0;
919  Buffer buffer = InvalidBuffer;
920  Buffer buffers[64];
921 
922  Assert((bmr.rel != NULL) != (bmr.smgr != NULL));
923  Assert(bmr.smgr == NULL || bmr.relpersistence != 0);
924  Assert(extend_to != InvalidBlockNumber && extend_to > 0);
925 
926  if (bmr.smgr == NULL)
927  {
928  bmr.smgr = RelationGetSmgr(bmr.rel);
929  bmr.relpersistence = bmr.rel->rd_rel->relpersistence;
930  }
931 
932  /*
933  * If desired, create the file if it doesn't exist. If
934  * smgr_cached_nblocks[fork] is positive then it must exist, no need for
935  * an smgrexists call.
936  */
937  if ((flags & EB_CREATE_FORK_IF_NEEDED) &&
938  (bmr.smgr->smgr_cached_nblocks[fork] == 0 ||
939  bmr.smgr->smgr_cached_nblocks[fork] == InvalidBlockNumber) &&
940  !smgrexists(bmr.smgr, fork))
941  {
942  LockRelationForExtension(bmr.rel, ExclusiveLock);
943 
944  /* recheck, fork might have been created concurrently */
945  if (!smgrexists(bmr.smgr, fork))
946  smgrcreate(bmr.smgr, fork, flags & EB_PERFORMING_RECOVERY);
947 
948  UnlockRelationForExtension(bmr.rel, ExclusiveLock);
949  }
950 
951  /*
952  * If requested, invalidate size cache, so that smgrnblocks asks the
953  * kernel.
954  */
955  if (flags & EB_CLEAR_SIZE_CACHE)
956  bmr.smgr->smgr_cached_nblocks[fork] = InvalidBlockNumber;
957 
958  /*
959  * Estimate how many pages we'll need to extend by. This avoids acquiring
960  * unnecessarily many victim buffers.
961  */
962  current_size = smgrnblocks(bmr.smgr, fork);
963 
964  /*
965  * Since no-one else can be looking at the page contents yet, there is no
966  * difference between an exclusive lock and a cleanup-strength lock. Note
967  * that we pass the original mode to ReadBuffer_common() below, when
968  * falling back to reading the buffer to a concurrent relation extension.
969  */
970  if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK)
971  flags |= EB_LOCK_TARGET;
972 
973  while (current_size < extend_to)
974  {
975  uint32 num_pages = lengthof(buffers);
976  BlockNumber first_block;
977 
978  if ((uint64) current_size + num_pages > extend_to)
979  num_pages = extend_to - current_size;
980 
981  first_block = ExtendBufferedRelCommon(bmr, fork, strategy, flags,
982  num_pages, extend_to,
983  buffers, &extended_by);
984 
985  current_size = first_block + extended_by;
986  Assert(num_pages != 0 || current_size >= extend_to);
987 
988  for (uint32 i = 0; i < extended_by; i++)
989  {
990  if (first_block + i != extend_to - 1)
991  ReleaseBuffer(buffers[i]);
992  else
993  buffer = buffers[i];
994  }
995  }
996 
997  /*
998  * It's possible that another backend concurrently extended the relation.
999  * In that case read the buffer.
1000  *
1001  * XXX: Should we control this via a flag?
1002  */
1003  if (buffer == InvalidBuffer)
1004  {
1005  Assert(extended_by == 0);
1006  buffer = ReadBuffer_common(bmr.rel, bmr.smgr, bmr.relpersistence,
1007  fork, extend_to - 1, mode, strategy);
1008  }
1009 
1010  return buffer;
1011 }
static Buffer ReadBuffer_common(Relation rel, SMgrRelation smgr, char smgr_persistence, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy)
Definition: bufmgr.c:1189
void ReleaseBuffer(Buffer buffer)
Definition: bufmgr.c:4906
@ EB_PERFORMING_RECOVERY
Definition: bufmgr.h:77
@ EB_CREATE_FORK_IF_NEEDED
Definition: bufmgr.h:83
@ RBM_ZERO_AND_CLEANUP_LOCK
Definition: bufmgr.h:48
@ RBM_ZERO_AND_LOCK
Definition: bufmgr.h:46
#define lengthof(array)
Definition: c.h:788
static PgChecksumMode mode
Definition: pg_checksums.c:56
static int64 current_size
Definition: pg_checksums.c:64

References Assert, PrivateRefCountEntry::buffer, current_size, EB_CLEAR_SIZE_CACHE, EB_CREATE_FORK_IF_NEEDED, EB_LOCK_TARGET, EB_PERFORMING_RECOVERY, ExclusiveLock, ExtendBufferedRelCommon(), i, InvalidBlockNumber, InvalidBuffer, lengthof, LockRelationForExtension(), mode, RBM_ZERO_AND_CLEANUP_LOCK, RBM_ZERO_AND_LOCK, RelationData::rd_rel, ReadBuffer_common(), BufferManagerRelation::rel, RelationGetSmgr(), ReleaseBuffer(), BufferManagerRelation::relpersistence, BufferManagerRelation::smgr, SMgrRelationData::smgr_cached_nblocks, smgrcreate(), smgrexists(), smgrnblocks(), and UnlockRelationForExtension().

Referenced by fsm_extend(), vm_extend(), and XLogReadBufferExtended().
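
A hedged sketch of extend-to-a-target-size, loosely following the fsm/vm extension callers; the fork, flag combination, and target size are illustrative:

    /* Ensure the fork holds at least target_nblocks blocks, creating the
     * fork first if it does not exist yet; the returned buffer is the last
     * requested block. */
    Buffer buf = ExtendBufferedRelTo(BMR_REL(rel), VISIBILITYMAP_FORKNUM, NULL,
                                     EB_CREATE_FORK_IF_NEEDED | EB_CLEAR_SIZE_CACHE,
                                     target_nblocks, RBM_ZERO_ON_ERROR);
    ReleaseBuffer(buf);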

◆ FindAndDropRelationBuffers()

static void FindAndDropRelationBuffers ( RelFileLocator  rlocator,
ForkNumber  forkNum,
BlockNumber  nForkBlock,
BlockNumber  firstDelBlock 
)
static

Definition at line 4307 of file bufmgr.c.

4310 {
4311  BlockNumber curBlock;
4312 
4313  for (curBlock = firstDelBlock; curBlock < nForkBlock; curBlock++)
4314  {
4315  uint32 bufHash; /* hash value for tag */
4316  BufferTag bufTag; /* identity of requested block */
4317  LWLock *bufPartitionLock; /* buffer partition lock for it */
4318  int buf_id;
4319  BufferDesc *bufHdr;
4320  uint32 buf_state;
4321 
4322  /* create a tag so we can lookup the buffer */
4323  InitBufferTag(&bufTag, &rlocator, forkNum, curBlock);
4324 
4325  /* determine its hash code and partition lock ID */
4326  bufHash = BufTableHashCode(&bufTag);
4327  bufPartitionLock = BufMappingPartitionLock(bufHash);
4328 
4329  /* Check that it is in the buffer pool. If not, do nothing. */
4330  LWLockAcquire(bufPartitionLock, LW_SHARED);
4331  buf_id = BufTableLookup(&bufTag, bufHash);
4332  LWLockRelease(bufPartitionLock);
4333 
4334  if (buf_id < 0)
4335  continue;
4336 
4337  bufHdr = GetBufferDescriptor(buf_id);
4338 
4339  /*
4340  * We need to lock the buffer header and recheck if the buffer is
4341  * still associated with the same block because the buffer could be
4342  * evicted by some other backend loading blocks for a different
4343  * relation after we release lock on the BufMapping table.
4344  */
4345  buf_state = LockBufHdr(bufHdr);
4346 
4347  if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator) &&
4348  BufTagGetForkNum(&bufHdr->tag) == forkNum &&
4349  bufHdr->tag.blockNum >= firstDelBlock)
4350  InvalidateBuffer(bufHdr); /* releases spinlock */
4351  else
4352  UnlockBufHdr(bufHdr, buf_state);
4353  }
4354 }

References buftag::blockNum, BufMappingPartitionLock(), BufTableHashCode(), BufTableLookup(), BufTagGetForkNum(), BufTagMatchesRelFileLocator(), GetBufferDescriptor(), InitBufferTag(), InvalidateBuffer(), LockBufHdr(), LW_SHARED, LWLockAcquire(), LWLockRelease(), BufferDesc::tag, and UnlockBufHdr().

Referenced by DropRelationBuffers(), and DropRelationsAllBuffers().

◆ FlushBuffer()

static void FlushBuffer ( BufferDesc buf,
SMgrRelation  reln,
IOObject  io_object,
IOContext  io_context 
)
static

Definition at line 3766 of file bufmgr.c.

3768 {
3769  XLogRecPtr recptr;
3770  ErrorContextCallback errcallback;
3771  instr_time io_start;
3772  Block bufBlock;
3773  char *bufToWrite;
3774  uint32 buf_state;
3775 
3776  /*
3777  * Try to start an I/O operation. If StartBufferIO returns false, then
3778  * someone else flushed the buffer before we could, so we need not do
3779  * anything.
3780  */
3781  if (!StartBufferIO(buf, false, false))
3782  return;
3783 
3784  /* Setup error traceback support for ereport() */
3785  errcallback.callback = shared_buffer_write_error_callback;
3786  errcallback.arg = (void *) buf;
3787  errcallback.previous = error_context_stack;
3788  error_context_stack = &errcallback;
3789 
3790  /* Find smgr relation for buffer */
3791  if (reln == NULL)
3792  reln = smgropen(BufTagGetRelFileLocator(&buf->tag), INVALID_PROC_NUMBER);
3793 
3794  TRACE_POSTGRESQL_BUFFER_FLUSH_START(BufTagGetForkNum(&buf->tag),
3795  buf->tag.blockNum,
3796  reln->smgr_rlocator.locator.spcOid,
3797  reln->smgr_rlocator.locator.dbOid,
3798  reln->smgr_rlocator.locator.relNumber);
3799 
3800  buf_state = LockBufHdr(buf);
3801 
3802  /*
3803  * Run PageGetLSN while holding header lock, since we don't have the
3804  * buffer locked exclusively in all cases.
3805  */
3806  recptr = BufferGetLSN(buf);
3807 
3808  /* To check if block content changes while flushing. - vadim 01/17/97 */
3809  buf_state &= ~BM_JUST_DIRTIED;
3810  UnlockBufHdr(buf, buf_state);
3811 
3812  /*
3813  * Force XLOG flush up to buffer's LSN. This implements the basic WAL
3814  * rule that log updates must hit disk before any of the data-file changes
3815  * they describe do.
3816  *
3817  * However, this rule does not apply to unlogged relations, which will be
3818  * lost after a crash anyway. Most unlogged relation pages do not bear
3819  * LSNs since we never emit WAL records for them, and therefore flushing
3820  * up through the buffer LSN would be useless, but harmless. However,
3821  * GiST indexes use LSNs internally to track page-splits, and therefore
3822  * unlogged GiST pages bear "fake" LSNs generated by
3823  * GetFakeLSNForUnloggedRel. It is unlikely but possible that the fake
3824  * LSN counter could advance past the WAL insertion point; and if it did
3825  * happen, attempting to flush WAL through that location would fail, with
3826  * disastrous system-wide consequences. To make sure that can't happen,
3827  * skip the flush if the buffer isn't permanent.
3828  */
3829  if (buf_state & BM_PERMANENT)
3830  XLogFlush(recptr);
3831 
3832  /*
3833  * Now it's safe to write buffer to disk. Note that no one else should
3834  * have been able to write it while we were busy with log flushing because
3835  * only one process at a time can set the BM_IO_IN_PROGRESS bit.
3836  */
3837  bufBlock = BufHdrGetBlock(buf);
3838 
3839  /*
3840  * Update page checksum if desired. Since we have only shared lock on the
3841  * buffer, other processes might be updating hint bits in it, so we must
3842  * copy the page to private storage if we do checksumming.
3843  */
3844  bufToWrite = PageSetChecksumCopy((Page) bufBlock, buf->tag.blockNum);
3845 
3846  io_start = pgstat_prepare_io_time(track_io_timing);
3847 
3848  /*
3849  * bufToWrite is either the shared buffer or a copy, as appropriate.
3850  */
3851  smgrwrite(reln,
3852  BufTagGetForkNum(&buf->tag),
3853  buf->tag.blockNum,
3854  bufToWrite,
3855  false);
3856 
3857  /*
3858  * When a strategy is in use, only flushes of dirty buffers already in the
3859  * strategy ring are counted as strategy writes (IOCONTEXT
3860  * [BULKREAD|BULKWRITE|VACUUM] IOOP_WRITE) for the purpose of IO
3861  * statistics tracking.
3862  *
3863  * If a shared buffer initially added to the ring must be flushed before
3864  * being used, this is counted as an IOCONTEXT_NORMAL IOOP_WRITE.
3865  *
3866  * If a shared buffer which was added to the ring later because the
3867  * current strategy buffer is pinned or in use or because all strategy
3868  * buffers were dirty and rejected (for BAS_BULKREAD operations only)
3869  * requires flushing, this is counted as an IOCONTEXT_NORMAL IOOP_WRITE
3870  * (from_ring will be false).
3871  *
3872  * When a strategy is not in use, the write can only be a "regular" write
3873  * of a dirty shared buffer (IOCONTEXT_NORMAL IOOP_WRITE).
3874  */
3875  pgstat_count_io_op_time(IOOBJECT_RELATION, io_context,
3876  IOOP_WRITE, io_start, 1);
3877 
3878  pgBufferUsage.shared_blks_written++;
3879 
3880  /*
3881  * Mark the buffer as clean (unless BM_JUST_DIRTIED has become set) and
3882  * end the BM_IO_IN_PROGRESS state.
3883  */
3884  TerminateBufferIO(buf, true, 0, true);
3885 
3886  TRACE_POSTGRESQL_BUFFER_FLUSH_DONE(BufTagGetForkNum(&buf->tag),
3887  buf->tag.blockNum,
3888  reln->smgr_rlocator.locator.spcOid,
3889  reln->smgr_rlocator.locator.dbOid,
3890  reln->smgr_rlocator.locator.relNumber);
3891 
3892  /* Pop the error context stack */
3893  error_context_stack = errcallback.previous;
3894 }
#define BufferGetLSN(bufHdr)
Definition: bufmgr.c:69
static void shared_buffer_write_error_callback(void *arg)
Definition: bufmgr.c:5676
char * PageSetChecksumCopy(Page page, BlockNumber blkno)
Definition: bufpage.c:1510
ErrorContextCallback * error_context_stack
Definition: elog.c:94
@ IOOP_WRITE
Definition: pgstat.h:336
static void smgrwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const void *buffer, bool skipFsync)
Definition: smgr.h:121
struct ErrorContextCallback * previous
Definition: elog.h:296
void(* callback)(void *arg)
Definition: elog.h:297
void XLogFlush(XLogRecPtr record)
Definition: xlog.c:2795

References ErrorContextCallback::arg, BM_JUST_DIRTIED, BM_PERMANENT, buf, BufferGetLSN, BufHdrGetBlock, BufTagGetForkNum(), BufTagGetRelFileLocator(), ErrorContextCallback::callback, RelFileLocator::dbOid, error_context_stack, INVALID_PROC_NUMBER, IOOBJECT_RELATION, IOOP_WRITE, RelFileLocatorBackend::locator, LockBufHdr(), PageSetChecksumCopy(), pgBufferUsage, pgstat_count_io_op_time(), pgstat_prepare_io_time(), ErrorContextCallback::previous, RelFileLocator::relNumber, BufferUsage::shared_blks_written, shared_buffer_write_error_callback(), SMgrRelationData::smgr_rlocator, smgropen(), smgrwrite(), RelFileLocator::spcOid, StartBufferIO(), TerminateBufferIO(), track_io_timing, UnlockBufHdr(), and XLogFlush().

Referenced by EvictUnpinnedBuffer(), FlushDatabaseBuffers(), FlushOneBuffer(), FlushRelationBuffers(), FlushRelationsAllBuffers(), GetVictimBuffer(), and SyncOneBuffer().

◆ FlushDatabaseBuffers()

void FlushDatabaseBuffers ( Oid  dbid)

Definition at line 4844 of file bufmgr.c.

4845 {
4846  int i;
4847  BufferDesc *bufHdr;
4848 
4849  for (i = 0; i < NBuffers; i++)
4850  {
4851  uint32 buf_state;
4852 
4853  bufHdr = GetBufferDescriptor(i);
4854 
4855  /*
4856  * As in DropRelationBuffers, an unlocked precheck should be safe and
4857  * saves some cycles.
4858  */
4859  if (bufHdr->tag.dbOid != dbid)
4860  continue;
4861 
4862  /* Make sure we can handle the pin */
4863  ReservePrivateRefCountEntry();
4864  ResourceOwnerEnlarge(CurrentResourceOwner);
4865 
4866  buf_state = LockBufHdr(bufHdr);
4867  if (bufHdr->tag.dbOid == dbid &&
4868  (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
4869  {
4870  PinBuffer_Locked(bufHdr);
4871  LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
4872  FlushBuffer(bufHdr, NULL, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
4873  LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
4874  UnpinBuffer(bufHdr);
4875  }
4876  else
4877  UnlockBufHdr(bufHdr, buf_state);
4878  }
4879 }

References BM_DIRTY, BM_VALID, BufferDescriptorGetContentLock(), CurrentResourceOwner, buftag::dbOid, FlushBuffer(), GetBufferDescriptor(), i, IOCONTEXT_NORMAL, IOOBJECT_RELATION, LockBufHdr(), LW_SHARED, LWLockAcquire(), LWLockRelease(), NBuffers, PinBuffer_Locked(), ReservePrivateRefCountEntry(), ResourceOwnerEnlarge(), BufferDesc::tag, UnlockBufHdr(), and UnpinBuffer().

Referenced by dbase_redo().

◆ FlushOneBuffer()

void FlushOneBuffer ( Buffer  buffer)

Definition at line 4886 of file bufmgr.c.

4887 {
4888  BufferDesc *bufHdr;
4889 
4890  /* currently not needed, but no fundamental reason not to support */
4891  Assert(!BufferIsLocal(buffer));
4892 
4893  Assert(BufferIsPinned(buffer));
4894 
4895  bufHdr = GetBufferDescriptor(buffer - 1);
4896 
4897  Assert(LWLockHeldByMe(BufferDescriptorGetContentLock(bufHdr)));
4898 
4899  FlushBuffer(bufHdr, NULL, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
4900 }
bool LWLockHeldByMe(LWLock *lock)
Definition: lwlock.c:1893

References Assert, PrivateRefCountEntry::buffer, BufferDescriptorGetContentLock(), BufferIsLocal, BufferIsPinned, FlushBuffer(), GetBufferDescriptor(), IOCONTEXT_NORMAL, IOOBJECT_RELATION, and LWLockHeldByMe().

Referenced by hash_xlog_init_bitmap_page(), hash_xlog_init_meta_page(), and XLogReadBufferForRedoExtended().
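
A minimal sketch; the caller must already hold a pin and the buffer's content lock (per the Assert above), so the lock calls here are only illustrative framing:

    LockBuffer(buffer, BUFFER_LOCK_SHARED);
    FlushOneBuffer(buffer);
    LockBuffer(buffer, BUFFER_LOCK_UNLOCK);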

◆ FlushRelationBuffers()

void FlushRelationBuffers ( Relation  rel)

Definition at line 4474 of file bufmgr.c.

4475 {
4476  int i;
4477  BufferDesc *bufHdr;
4478  SMgrRelation srel = RelationGetSmgr(rel);
4479 
4480  if (RelationUsesLocalBuffers(rel))
4481  {
4482  for (i = 0; i < NLocBuffer; i++)
4483  {
4484  uint32 buf_state;
4485  instr_time io_start;
4486 
4487  bufHdr = GetLocalBufferDescriptor(i);
4488  if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rel->rd_locator) &&
4489  ((buf_state = pg_atomic_read_u32(&bufHdr->state)) &
4490  (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
4491  {
4492  ErrorContextCallback errcallback;
4493  Page localpage;
4494 
4495  localpage = (char *) LocalBufHdrGetBlock(bufHdr);
4496 
4497  /* Setup error traceback support for ereport() */
4498  errcallback.callback = local_buffer_write_error_callback;
4499  errcallback.arg = (void *) bufHdr;
4500  errcallback.previous = error_context_stack;
4501  error_context_stack = &errcallback;
4502 
4503  PageSetChecksumInplace(localpage, bufHdr->tag.blockNum);
4504 
4505  io_start = pgstat_prepare_io_time(track_io_timing);
4506 
4507  smgrwrite(srel,
4508  BufTagGetForkNum(&bufHdr->tag),
4509  bufHdr->tag.blockNum,
4510  localpage,
4511  false);
4512 
4513  pgstat_count_io_op_time(IOOBJECT_TEMP_RELATION,
4514  IOCONTEXT_NORMAL, IOOP_WRITE,
4515  io_start, 1);
4516 
4517  buf_state &= ~(BM_DIRTY | BM_JUST_DIRTIED);
4518  pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
4519 
4520  pgBufferUsage.local_blks_written++;
4521 
4522  /* Pop the error context stack */
4523  error_context_stack = errcallback.previous;
4524  }
4525  }
4526 
4527  return;
4528  }
4529 
4530  for (i = 0; i < NBuffers; i++)
4531  {
4532  uint32 buf_state;
4533 
4534  bufHdr = GetBufferDescriptor(i);
4535 
4536  /*
4537  * As in DropRelationBuffers, an unlocked precheck should be safe and
4538  * saves some cycles.
4539  */
4540  if (!BufTagMatchesRelFileLocator(&bufHdr->tag, &rel->rd_locator))
4541  continue;
4542 
4543  /* Make sure we can handle the pin */
4544  ReservePrivateRefCountEntry();
4545  ResourceOwnerEnlarge(CurrentResourceOwner);
4546 
4547  buf_state = LockBufHdr(bufHdr);
4548  if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rel->rd_locator) &&
4549  (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
4550  {
4551  PinBuffer_Locked(bufHdr);
4552  LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
4553  FlushBuffer(bufHdr, srel, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
4554  LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
4555  UnpinBuffer(bufHdr);
4556  }
4557  else
4558  UnlockBufHdr(bufHdr, buf_state);
4559  }
4560 }
static void pg_atomic_unlocked_write_u32(volatile pg_atomic_uint32 *ptr, uint32 val)
Definition: atomics.h:295
#define LocalBufHdrGetBlock(bufHdr)
Definition: bufmgr.c:72
static void local_buffer_write_error_callback(void *arg)
Definition: bufmgr.c:5696
void PageSetChecksumInplace(Page page, BlockNumber blkno)
Definition: bufpage.c:1542
int NLocBuffer
Definition: localbuf.c:42
@ IOOBJECT_TEMP_RELATION
Definition: pgstat.h:313
#define RelationUsesLocalBuffers(relation)
Definition: rel.h:637
int64 local_blks_written
Definition: instrument.h:33
RelFileLocator rd_locator
Definition: rel.h:57

References ErrorContextCallback::arg, buftag::blockNum, BM_DIRTY, BM_JUST_DIRTIED, BM_VALID, BufferDescriptorGetContentLock(), BufTagGetForkNum(), BufTagMatchesRelFileLocator(), ErrorContextCallback::callback, CurrentResourceOwner, error_context_stack, FlushBuffer(), GetBufferDescriptor(), GetLocalBufferDescriptor(), i, IOCONTEXT_NORMAL, IOOBJECT_RELATION, IOOBJECT_TEMP_RELATION, IOOP_WRITE, BufferUsage::local_blks_written, local_buffer_write_error_callback(), LocalBufHdrGetBlock, LockBufHdr(), LW_SHARED, LWLockAcquire(), LWLockRelease(), NBuffers, NLocBuffer, PageSetChecksumInplace(), pg_atomic_read_u32(), pg_atomic_unlocked_write_u32(), pgBufferUsage, pgstat_count_io_op_time(), pgstat_prepare_io_time(), PinBuffer_Locked(), ErrorContextCallback::previous, RelationData::rd_locator, RelationGetSmgr(), RelationUsesLocalBuffers, ReservePrivateRefCountEntry(), ResourceOwnerEnlarge(), smgrwrite(), BufferDesc::state, BufferDesc::tag, track_io_timing, UnlockBufHdr(), and UnpinBuffer().

Referenced by fill_seq_with_data(), heapam_relation_copy_data(), and index_copy_data().
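
A hedged sketch (the wrapper name is illustrative): before a relation's storage file is copied wholesale, its dirty pages, whether in local or shared buffers, must first reach smgr, which is roughly what heapam_relation_copy_data() and index_copy_data() rely on:

#include "postgres.h"
#include "storage/bufmgr.h"
#include "utils/rel.h"

/* Push rel's dirty buffers down to its storage manager before copying the file. */
static void
prepare_rel_for_copy(Relation rel)
{
    FlushRelationBuffers(rel);
}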

◆ FlushRelationsAllBuffers()

void FlushRelationsAllBuffers ( SMgrRelation smgrs,
int  nrels 
)

Definition at line 4572 of file bufmgr.c.

4573 {
4574  int i;
4575  SMgrSortArray *srels;
4576  bool use_bsearch;
4577 
4578  if (nrels == 0)
4579  return;
4580 
4581  /* fill-in array for qsort */
4582  srels = palloc(sizeof(SMgrSortArray) * nrels);
4583 
4584  for (i = 0; i < nrels; i++)
4585  {
4586  Assert(!RelFileLocatorBackendIsTemp(smgrs[i]->smgr_rlocator));
4587 
4588  srels[i].rlocator = smgrs[i]->smgr_rlocator.locator;
4589  srels[i].srel = smgrs[i];
4590  }
4591 
4592  /*
4593  * Save the bsearch overhead for low number of relations to sync. See
4594  * DropRelationsAllBuffers for details.
4595  */
4596  use_bsearch = nrels > RELS_BSEARCH_THRESHOLD;
4597 
4598  /* sort the list of SMgrRelations if necessary */
4599  if (use_bsearch)
4600  qsort(srels, nrels, sizeof(SMgrSortArray), rlocator_comparator);
4601 
4602  for (i = 0; i < NBuffers; i++)
4603  {
4604  SMgrSortArray *srelent = NULL;
4605  BufferDesc *bufHdr = GetBufferDescriptor(i);
4606  uint32 buf_state;
4607 
4608  /*
4609  * As in DropRelationBuffers, an unlocked precheck should be safe and
4610  * saves some cycles.
4611  */
4612 
4613  if (!use_bsearch)
4614  {
4615  int j;
4616 
4617  for (j = 0; j < nrels; j++)
4618  {
4619  if (BufTagMatchesRelFileLocator(&bufHdr->tag, &srels[j].rlocator))
4620  {
4621  srelent = &srels[j];
4622  break;
4623  }
4624  }
4625  }
4626  else
4627  {
4628  RelFileLocator rlocator;
4629 
4630  rlocator = BufTagGetRelFileLocator(&bufHdr->tag);
4631  srelent = bsearch((const void *) &(rlocator),
4632  srels, nrels, sizeof(SMgrSortArray),
4633  rlocator_comparator);
4634  }
4635 
4636  /* buffer doesn't belong to any of the given relfilelocators; skip it */
4637  if (srelent == NULL)
4638  continue;
4639 
4640  /* Make sure we can handle the pin */
4641  ReservePrivateRefCountEntry();
4642  ResourceOwnerEnlarge(CurrentResourceOwner);
4643 
4644  buf_state = LockBufHdr(bufHdr);
4645  if (BufTagMatchesRelFileLocator(&bufHdr->tag, &srelent->rlocator) &&
4646  (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
4647  {
4648  PinBuffer_Locked(bufHdr);
4649  LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
4650  FlushBuffer(bufHdr, srelent->srel, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
4651  LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
4652  UnpinBuffer(bufHdr);
4653  }
4654  else
4655  UnlockBufHdr(bufHdr, buf_state);
4656  }
4657 
4658  pfree(srels);
4659 }
SMgrRelation srel
Definition: bufmgr.c:136
RelFileLocator rlocator
Definition: bufmgr.c:135

References Assert, BM_DIRTY, BM_VALID, BufferDescriptorGetContentLock(), BufTagGetRelFileLocator(), BufTagMatchesRelFileLocator(), CurrentResourceOwner, FlushBuffer(), GetBufferDescriptor(), i, IOCONTEXT_NORMAL, IOOBJECT_RELATION, j, RelFileLocatorBackend::locator, LockBufHdr(), LW_SHARED, LWLockAcquire(), LWLockRelease(), NBuffers, palloc(), pfree(), PinBuffer_Locked(), qsort, RelFileLocatorBackendIsTemp, RELS_BSEARCH_THRESHOLD, ReservePrivateRefCountEntry(), ResourceOwnerEnlarge(), SMgrSortArray::rlocator, rlocator_comparator(), SMgrRelationData::smgr_rlocator, SMgrSortArray::srel, BufferDesc::tag, UnlockBufHdr(), and UnpinBuffer().

Referenced by smgrdosyncall().
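
A hedged sketch (names are illustrative, and the relations are assumed to be non-temporary, as the function asserts): several relations can be flushed with a single scan of shared buffers, which is how smgrdosyncall() uses it:

#include "postgres.h"
#include "storage/bufmgr.h"
#include "storage/smgr.h"

/* Flush the dirty shared buffers of two already-opened relations in one pass. */
static void
flush_two_rels(SMgrRelation a, SMgrRelation b)
{
    SMgrRelation rels[2];

    rels[0] = a;
    rels[1] = b;
    FlushRelationsAllBuffers(rels, 2);
}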

◆ ForgetPrivateRefCountEntry()

static void ForgetPrivateRefCountEntry ( PrivateRefCountEntry ref)
static

Definition at line 439 of file bufmgr.c.

440 {
441  Assert(ref->refcount == 0);
442 
443  if (ref >= &PrivateRefCountArray[0] &&
444  ref < &PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES])
445  {
446  ref->buffer = InvalidBuffer;
447 
448  /*
449  * Mark the just used entry as reserved - in many scenarios that
450  * allows us to avoid ever having to search the array/hash for free
451  * entries.
452  */
453  ReservedRefCountEntry = ref;
454  }
455  else
456  {
457  bool found;
458  Buffer buffer = ref->buffer;
459 
460  hash_search(PrivateRefCountHash, &buffer, HASH_REMOVE, &found);
461  Assert(found);
462  Assert(PrivateRefCountOverflowed > 0);
463  PrivateRefCountOverflowed--;
464  }
465 }
static PrivateRefCountEntry * ReservedRefCountEntry
Definition: bufmgr.c:212
void * hash_search(HTAB *hashp, const void *keyPtr, HASHACTION action, bool *foundPtr)
Definition: dynahash.c:955
@ HASH_REMOVE
Definition: hsearch.h:115

References Assert, PrivateRefCountEntry::buffer, HASH_REMOVE, hash_search(), InvalidBuffer, PrivateRefCountArray, PrivateRefCountHash, PrivateRefCountOverflowed, PrivateRefCountEntry::refcount, REFCOUNT_ARRAY_ENTRIES, and ReservedRefCountEntry.

Referenced by UnpinBufferNoOwner().

◆ GetPrivateRefCount()

static int32 GetPrivateRefCount ( Buffer  buffer)
inlinestatic

Definition at line 416 of file bufmgr.c.

417 {
418  PrivateRefCountEntry *ref;
419 
420  Assert(BufferIsValid(buffer));
421  Assert(!BufferIsLocal(buffer));
422 
423  /*
424  * Not moving the entry - that's ok for the current users, but we might
425  * want to change this one day.
426  */
427  ref = GetPrivateRefCountEntry(buffer, false);
428 
429  if (ref == NULL)
430  return 0;
431  return ref->refcount;
432 }
static PrivateRefCountEntry * GetPrivateRefCountEntry(Buffer buffer, bool do_move)
Definition: bufmgr.c:342

References Assert, PrivateRefCountEntry::buffer, BufferIsLocal, BufferIsValid(), GetPrivateRefCountEntry(), and PrivateRefCountEntry::refcount.

Referenced by CheckBufferIsPinnedOnce(), ConditionalLockBufferForCleanup(), DebugPrintBufferRefcount(), HoldingBufferPinThatDelaysRecovery(), InvalidateBuffer(), InvalidateVictimBuffer(), IsBufferCleanupOK(), MarkBufferDirtyHint(), and ReadRecentBuffer().

◆ GetPrivateRefCountEntry()

static PrivateRefCountEntry * GetPrivateRefCountEntry ( Buffer  buffer,
bool  do_move 
)
static

Definition at line 342 of file bufmgr.c.

343 {
344  PrivateRefCountEntry *res;
345  int i;
346 
347  Assert(BufferIsValid(buffer));
348  Assert(!BufferIsLocal(buffer));
349 
350  /*
351  * First search for references in the array, that'll be sufficient in the
352  * majority of cases.
353  */
354  for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
355  {
356  res = &PrivateRefCountArray[i];
357 
358  if (res->buffer == buffer)
359  return res;
360  }
361 
362  /*
363  * By here we know that the buffer, if already pinned, isn't residing in
364  * the array.
365  *
366  * Only look up the buffer in the hashtable if we've previously overflowed
367  * into it.
368  */
369  if (PrivateRefCountOverflowed == 0)
370  return NULL;
371 
372  res = hash_search(PrivateRefCountHash, &buffer, HASH_FIND, NULL);
373 
374  if (res == NULL)
375  return NULL;
376  else if (!do_move)
377  {
378  /* caller doesn't want us to move the hash entry into the array */
379  return res;
380  }
381  else
382  {
383  /* move buffer from hashtable into the free array slot */
384  bool found;
385  PrivateRefCountEntry *free;
386 
387  /* Ensure there's a free array slot */
388  ReservePrivateRefCountEntry();
389 
390  /* Use up the reserved slot */
391  Assert(ReservedRefCountEntry != NULL);
392  free = ReservedRefCountEntry;
393  ReservedRefCountEntry = NULL;
394  Assert(free->buffer == InvalidBuffer);
395 
396  /* and fill it */
397  free->buffer = buffer;
398  free->refcount = res->refcount;
399 
400  /* delete from hashtable */
401  hash_search(PrivateRefCountHash, &buffer, HASH_REMOVE, &found);
402  Assert(found);
403  Assert(PrivateRefCountOverflowed > 0);
404  PrivateRefCountOverflowed--;
405 
406  return free;
407  }
408 }
#define free(a)
Definition: header.h:65
@ HASH_FIND
Definition: hsearch.h:113

References Assert, PrivateRefCountEntry::buffer, BufferIsLocal, BufferIsValid(), free, HASH_FIND, HASH_REMOVE, hash_search(), i, InvalidBuffer, PrivateRefCountArray, PrivateRefCountHash, PrivateRefCountOverflowed, REFCOUNT_ARRAY_ENTRIES, res, ReservedRefCountEntry, and ReservePrivateRefCountEntry().

Referenced by GetPrivateRefCount(), IncrBufferRefCount(), PinBuffer(), PinBuffer_Locked(), and UnpinBufferNoOwner().

◆ GetVictimBuffer()

static Buffer GetVictimBuffer ( BufferAccessStrategy  strategy,
IOContext  io_context 
)
static

Definition at line 1932 of file bufmgr.c.

1933 {
1934  BufferDesc *buf_hdr;
1935  Buffer buf;
1936  uint32 buf_state;
1937  bool from_ring;
1938 
1939  /*
1940  * Ensure, while the spinlock's not yet held, that there's a free refcount
1941  * entry, and a resource owner slot for the pin.
1942  */
1943  ReservePrivateRefCountEntry();
1944  ResourceOwnerEnlarge(CurrentResourceOwner);
1945 
1946  /* we return here if a prospective victim buffer gets used concurrently */
1947 again:
1948 
1949  /*
1950  * Select a victim buffer. The buffer is returned with its header
1951  * spinlock still held!
1952  */
1953  buf_hdr = StrategyGetBuffer(strategy, &buf_state, &from_ring);
1954  buf = BufferDescriptorGetBuffer(buf_hdr);
1955 
1956  Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 0);
1957 
1958  /* Pin the buffer and then release the buffer spinlock */
1959  PinBuffer_Locked(buf_hdr);
1960 
1961  /*
1962  * We shouldn't have any other pins for this buffer.
1963  */
1964  CheckBufferIsPinnedOnce(buf);
1965 
1966  /*
1967  * If the buffer was dirty, try to write it out. There is a race
1968  * condition here, in that someone might dirty it after we released the
1969  * buffer header lock above, or even while we are writing it out (since
1970  * our share-lock won't prevent hint-bit updates). We will recheck the
1971  * dirty bit after re-locking the buffer header.
1972  */
1973  if (buf_state & BM_DIRTY)
1974  {
1975  LWLock *content_lock;
1976 
1977  Assert(buf_state & BM_TAG_VALID);
1978  Assert(buf_state & BM_VALID);
1979 
1980  /*
1981  * We need a share-lock on the buffer contents to write it out (else
1982  * we might write invalid data, eg because someone else is compacting
1983  * the page contents while we write). We must use a conditional lock
1984  * acquisition here to avoid deadlock. Even though the buffer was not
1985  * pinned (and therefore surely not locked) when StrategyGetBuffer
1986  * returned it, someone else could have pinned and exclusive-locked it
1987  * by the time we get here. If we try to get the lock unconditionally,
1988  * we'd block waiting for them; if they later block waiting for us,
1989  * deadlock ensues. (This has been observed to happen when two
1990  * backends are both trying to split btree index pages, and the second
1991  * one just happens to be trying to split the page the first one got
1992  * from StrategyGetBuffer.)
1993  */
1994  content_lock = BufferDescriptorGetContentLock(buf_hdr);
1995  if (!LWLockConditionalAcquire(content_lock, LW_SHARED))
1996  {
1997  /*
1998  * Someone else has locked the buffer, so give it up and loop back
1999  * to get another one.
2000  */
2001  UnpinBuffer(buf_hdr);
2002  goto again;
2003  }
2004 
2005  /*
2006  * If using a nondefault strategy, and writing the buffer would
2007  * require a WAL flush, let the strategy decide whether to go ahead
2008  * and write/reuse the buffer or to choose another victim. We need a
2009  * lock to inspect the page LSN, so this can't be done inside
2010  * StrategyGetBuffer.
2011  */
2012  if (strategy != NULL)
2013  {
2014  XLogRecPtr lsn;
2015 
2016  /* Read the LSN while holding buffer header lock */
2017  buf_state = LockBufHdr(buf_hdr);
2018  lsn = BufferGetLSN(buf_hdr);
2019  UnlockBufHdr(buf_hdr, buf_state);
2020 
2021  if (XLogNeedsFlush(lsn)
2022  && StrategyRejectBuffer(strategy, buf_hdr, from_ring))
2023  {
2024  LWLockRelease(content_lock);
2025  UnpinBuffer(buf_hdr);
2026  goto again;
2027  }
2028  }
2029 
2030  /* OK, do the I/O */
2031  FlushBuffer(buf_hdr, NULL, IOOBJECT_RELATION, io_context);
2032  LWLockRelease(content_lock);
2033 
2034  ScheduleBufferTagForWriteback(&BackendWritebackContext, io_context,
2035  &buf_hdr->tag);
2036  }
2037 
2038 
2039  if (buf_state & BM_VALID)
2040  {
2041  /*
2042  * When a BufferAccessStrategy is in use, blocks evicted from shared
2043  * buffers are counted as IOOP_EVICT in the corresponding context
2044  * (e.g. IOCONTEXT_BULKWRITE). Shared buffers are evicted by a
2045  * strategy in two cases: 1) while initially claiming buffers for the
2046  * strategy ring 2) to replace an existing strategy ring buffer
2047  * because it is pinned or in use and cannot be reused.
2048  *
2049  * Blocks evicted from buffers already in the strategy ring are
2050  * counted as IOOP_REUSE in the corresponding strategy context.
2051  *
2052  * At this point, we can accurately count evictions and reuses,
2053  * because we have successfully claimed the valid buffer. Previously,
2054  * we may have been forced to release the buffer due to concurrent
2055  * pinners or erroring out.
2056  */
2057  pgstat_count_io_op(IOOBJECT_RELATION, io_context,
2058  from_ring ? IOOP_REUSE : IOOP_EVICT);
2059  }
2060 
2061  /*
2062  * If the buffer has an entry in the buffer mapping table, delete it. This
2063  * can fail because another backend could have pinned or dirtied the
2064  * buffer.
2065  */
2066  if ((buf_state & BM_TAG_VALID) && !InvalidateVictimBuffer(buf_hdr))
2067  {
2068  UnpinBuffer(buf_hdr);
2069  goto again;
2070  }
2071 
2072  /* a final set of sanity checks */
2073 #ifdef USE_ASSERT_CHECKING
2074  buf_state = pg_atomic_read_u32(&buf_hdr->state);
2075 
2076  Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 1);
2077  Assert(!(buf_state & (BM_TAG_VALID | BM_VALID | BM_DIRTY)));
2078 
2079  CheckBufferIsPinnedOnce(buf);
2080 #endif
2081 
2082  return buf;
2083 }
WritebackContext BackendWritebackContext
Definition: buf_init.c:24
void CheckBufferIsPinnedOnce(Buffer buffer)
Definition: bufmgr.c:5187
void ScheduleBufferTagForWriteback(WritebackContext *wb_context, IOContext io_context, BufferTag *tag)
Definition: bufmgr.c:5897
BufferDesc * StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state, bool *from_ring)
Definition: freelist.c:196
bool StrategyRejectBuffer(BufferAccessStrategy strategy, BufferDesc *buf, bool from_ring)
Definition: freelist.c:798
@ IOOP_EVICT
Definition: pgstat.h:330
@ IOOP_REUSE
Definition: pgstat.h:335
void pgstat_count_io_op(IOObject io_object, IOContext io_context, IOOp io_op)
Definition: pgstat_io.c:77
bool XLogNeedsFlush(XLogRecPtr record)
Definition: xlog.c:3126

References Assert, BackendWritebackContext, BM_DIRTY, BM_TAG_VALID, BM_VALID, buf, BUF_STATE_GET_REFCOUNT, BufferDescriptorGetBuffer(), BufferDescriptorGetContentLock(), BufferGetLSN, CheckBufferIsPinnedOnce(), CurrentResourceOwner, FlushBuffer(), InvalidateVictimBuffer(), IOOBJECT_RELATION, IOOP_EVICT, IOOP_REUSE, LockBufHdr(), LW_SHARED, LWLockConditionalAcquire(), LWLockRelease(), pg_atomic_read_u32(), pgstat_count_io_op(), PinBuffer_Locked(), ReservePrivateRefCountEntry(), ResourceOwnerEnlarge(), ScheduleBufferTagForWriteback(), BufferDesc::state, StrategyGetBuffer(), StrategyRejectBuffer(), BufferDesc::tag, UnlockBufHdr(), UnpinBuffer(), and XLogNeedsFlush().

Referenced by BufferAlloc(), and ExtendBufferedRelShared().

◆ HoldingBufferPinThatDelaysRecovery()

bool HoldingBufferPinThatDelaysRecovery ( void  )

Definition at line 5355 of file bufmgr.c.

5356 {
5357  int bufid = GetStartupBufferPinWaitBufId();
5358 
5359  /*
5360  * If we get woken slowly then it's possible that the Startup process was
5361  * already woken by other backends before we got here. Also possible that
5362  * we get here by multiple interrupts or interrupts at inappropriate
5363  * times, so make sure we do nothing if the bufid is not set.
5364  */
5365  if (bufid < 0)
5366  return false;
5367 
5368  if (GetPrivateRefCount(bufid + 1) > 0)
5369  return true;
5370 
5371  return false;
5372 }
int GetStartupBufferPinWaitBufId(void)
Definition: proc.c:672

References GetPrivateRefCount(), and GetStartupBufferPinWaitBufId().

Referenced by CheckRecoveryConflictDeadlock(), and ProcessRecoveryConflictInterrupt().

◆ IncrBufferRefCount()

void IncrBufferRefCount ( Buffer  buffer)

Definition at line 4938 of file bufmgr.c.

4939 {
4940  Assert(BufferIsPinned(buffer));
4941  ResourceOwnerEnlarge(CurrentResourceOwner);
4942  if (BufferIsLocal(buffer))
4943  LocalRefCount[-buffer - 1]++;
4944  else
4945  {
4946  PrivateRefCountEntry *ref;
4947 
4948  ref = GetPrivateRefCountEntry(buffer, true);
4949  Assert(ref != NULL);
4950  ref->refcount++;
4951  }
4952  ResourceOwnerRememberBuffer(CurrentResourceOwner, buffer);
4953 }
static void ResourceOwnerRememberBuffer(ResourceOwner owner, Buffer buffer)

References Assert, PrivateRefCountEntry::buffer, BufferIsLocal, BufferIsPinned, CurrentResourceOwner, GetPrivateRefCountEntry(), LocalRefCount, PrivateRefCountEntry::refcount, ResourceOwnerEnlarge(), and ResourceOwnerRememberBuffer().

Referenced by _bt_steppage(), btrestrpos(), entryLoadMoreItems(), ReadBufferBI(), RelationAddBlocks(), scanPostingTree(), startScanEntry(), and tts_buffer_heap_store_tuple().
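
A hedged sketch (the helper name is illustrative): an extra reference may be taken on a buffer that is already pinned, for example so that a second data structure can release its copy independently, as tts_buffer_heap_store_tuple() does:

#include "postgres.h"
#include "storage/bufmgr.h"

/* Take a second reference to an already-pinned buffer; both references
 * must eventually be dropped with ReleaseBuffer(). */
static Buffer
keep_extra_pin(Buffer buf)
{
    Assert(BufferIsValid(buf));
    IncrBufferRefCount(buf);
    return buf;
}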

◆ InitBufferManagerAccess()

void InitBufferManagerAccess ( void  )

Definition at line 3558 of file bufmgr.c.

3559 {
3560  HASHCTL hash_ctl;
3561 
3562  memset(&PrivateRefCountArray, 0, sizeof(PrivateRefCountArray));
3563 
3564  hash_ctl.keysize = sizeof(int32);
3565  hash_ctl.entrysize = sizeof(PrivateRefCountEntry);
3566 
3567  PrivateRefCountHash = hash_create("PrivateRefCount", 100, &hash_ctl,
3568  HASH_ELEM | HASH_BLOBS);
3569 
3570  /*
3571  * AtProcExit_Buffers needs LWLock access, and thereby has to be called at
3572  * the corresponding phase of backend shutdown.
3573  */
3574  Assert(MyProc != NULL);
3575  on_shmem_exit(AtProcExit_Buffers, 0);
3576 }
static void AtProcExit_Buffers(int code, Datum arg)
Definition: bufmgr.c:3583
struct PrivateRefCountEntry PrivateRefCountEntry
HTAB * hash_create(const char *tabname, long nelem, const HASHCTL *info, int flags)
Definition: dynahash.c:352
#define HASH_ELEM
Definition: hsearch.h:95
#define HASH_BLOBS
Definition: hsearch.h:97
void on_shmem_exit(pg_on_exit_callback function, Datum arg)
Definition: ipc.c:365
PGPROC * MyProc
Definition: proc.c:67
Size keysize
Definition: hsearch.h:75
Size entrysize
Definition: hsearch.h:76

References Assert, AtProcExit_Buffers(), HASHCTL::entrysize, HASH_BLOBS, hash_create(), HASH_ELEM, HASHCTL::keysize, MyProc, on_shmem_exit(), PrivateRefCountArray, and PrivateRefCountHash.

Referenced by BaseInit().

◆ InvalidateBuffer()

static void InvalidateBuffer ( BufferDesc buf)
static

Definition at line 1766 of file bufmgr.c.

1767 {
1768  BufferTag oldTag;
1769  uint32 oldHash; /* hash value for oldTag */
1770  LWLock *oldPartitionLock; /* buffer partition lock for it */
1771  uint32 oldFlags;
1772  uint32 buf_state;
1773 
1774  /* Save the original buffer tag before dropping the spinlock */
1775  oldTag = buf->tag;
1776 
1777  buf_state = pg_atomic_read_u32(&buf->state);
1778  Assert(buf_state & BM_LOCKED);
1779  UnlockBufHdr(buf, buf_state);
1780 
1781  /*
1782  * Need to compute the old tag's hashcode and partition lock ID. XXX is it
1783  * worth storing the hashcode in BufferDesc so we need not recompute it
1784  * here? Probably not.
1785  */
1786  oldHash = BufTableHashCode(&oldTag);
1787  oldPartitionLock = BufMappingPartitionLock(oldHash);
1788 
1789 retry:
1790 
1791  /*
1792  * Acquire exclusive mapping lock in preparation for changing the buffer's
1793  * association.
1794  */
1795  LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
1796 
1797  /* Re-lock the buffer header */
1798  buf_state = LockBufHdr(buf);
1799 
1800  /* If it's changed while we were waiting for lock, do nothing */
1801  if (!BufferTagsEqual(&buf->tag, &oldTag))
1802  {
1803  UnlockBufHdr(buf, buf_state);
1804  LWLockRelease(oldPartitionLock);
1805  return;
1806  }
1807 
1808  /*
1809  * We assume the only reason for it to be pinned is that someone else is
1810  * flushing the page out. Wait for them to finish. (This could be an
1811  * infinite loop if the refcount is messed up... it would be nice to time
1812  * out after awhile, but there seems no way to be sure how many loops may
1813  * be needed. Note that if the other guy has pinned the buffer but not
1814  * yet done StartBufferIO, WaitIO will fall through and we'll effectively
1815  * be busy-looping here.)
1816  */
1817  if (BUF_STATE_GET_REFCOUNT(buf_state) != 0)
1818  {
1819  UnlockBufHdr(buf, buf_state);
1820  LWLockRelease(oldPartitionLock);
1821  /* safety check: should definitely not be our *own* pin */
1822  if (GetPrivateRefCount(BufferDescriptorGetBuffer(buf)) > 0)
1823  elog(ERROR, "buffer is pinned in InvalidateBuffer");
1824  WaitIO(buf);
1825  goto retry;
1826  }
1827 
1828  /*
1829  * Clear out the buffer's tag and flags. We must do this to ensure that
1830  * linear scans of the buffer array don't think the buffer is valid.
1831  */
1832  oldFlags = buf_state & BUF_FLAG_MASK;
1833  ClearBufferTag(&buf->tag);
1834  buf_state &= ~(BUF_FLAG_MASK | BUF_USAGECOUNT_MASK);
1835  UnlockBufHdr(buf, buf_state);
1836 
1837  /*
1838  * Remove the buffer from the lookup hashtable, if it was in there.
1839  */
1840  if (oldFlags & BM_TAG_VALID)
1841  BufTableDelete(&oldTag, oldHash);
1842 
1843  /*
1844  * Done with mapping lock.
1845  */
1846  LWLockRelease(oldPartitionLock);
1847 
1848  /*
1849  * Insert the buffer at the head of the list of free buffers.
1850  */
1851  StrategyFreeBuffer(buf);
1852 }
#define BUF_USAGECOUNT_MASK
Definition: buf_internals.h:45
static bool BufferTagsEqual(const BufferTag *tag1, const BufferTag *tag2)
#define BM_LOCKED
Definition: buf_internals.h:60
static void ClearBufferTag(BufferTag *tag)
void BufTableDelete(BufferTag *tagPtr, uint32 hashcode)
Definition: buf_table.c:148
static void WaitIO(BufferDesc *buf)
Definition: bufmgr.c:5491

References Assert, BM_LOCKED, BM_TAG_VALID, buf, BUF_FLAG_MASK, BUF_STATE_GET_REFCOUNT, BUF_USAGECOUNT_MASK, BufferDescriptorGetBuffer(), BufferTagsEqual(), BufMappingPartitionLock(), BufTableDelete(), BufTableHashCode(), ClearBufferTag(), elog, ERROR, GetPrivateRefCount(), LockBufHdr(), LW_EXCLUSIVE, LWLockAcquire(), LWLockRelease(), pg_atomic_read_u32(), StrategyFreeBuffer(), UnlockBufHdr(), and WaitIO().

Referenced by DropDatabaseBuffers(), DropRelationBuffers(), DropRelationsAllBuffers(), and FindAndDropRelationBuffers().

◆ InvalidateVictimBuffer()

static bool InvalidateVictimBuffer ( BufferDesc buf_hdr)
static

Definition at line 1864 of file bufmgr.c.

1865 {
1866  uint32 buf_state;
1867  uint32 hash;
1868  LWLock *partition_lock;
1869  BufferTag tag;
1870 
1871  Assert(GetPrivateRefCount(BufferDescriptorGetBuffer(buf_hdr)) > 0);
1872 
1873  /* have buffer pinned, so it's safe to read tag without lock */
1874  tag = buf_hdr->tag;
1875 
1876  hash = BufTableHashCode(&tag);
1877  partition_lock = BufMappingPartitionLock(hash);
1878 
1879  LWLockAcquire(partition_lock, LW_EXCLUSIVE);
1880 
1881  /* lock the buffer header */
1882  buf_state = LockBufHdr(buf_hdr);
1883 
1884  /*
1885  * We have the buffer pinned nobody else should have been able to unset
1886  * this concurrently.
1887  */
1888  Assert(buf_state & BM_TAG_VALID);
1889  Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
1890  Assert(BufferTagsEqual(&buf_hdr->tag, &tag));
1891 
1892  /*
1893  * If somebody else pinned the buffer since, or even worse, dirtied it,
1894  * give up on this buffer: It's clearly in use.
1895  */
1896  if (BUF_STATE_GET_REFCOUNT(buf_state) != 1 || (buf_state & BM_DIRTY))
1897  {
1898  Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
1899 
1900  UnlockBufHdr(buf_hdr, buf_state);
1901  LWLockRelease(partition_lock);
1902 
1903  return false;
1904  }
1905 
1906  /*
1907  * Clear out the buffer's tag and flags and usagecount. This is not
1908  * strictly required, as BM_TAG_VALID/BM_VALID needs to be checked before
1909  * doing anything with the buffer. But currently it's beneficial, as the
1910  * cheaper pre-check for several linear scans of shared buffers use the
1911  * tag (see e.g. FlushDatabaseBuffers()).
1912  */
1913  ClearBufferTag(&buf_hdr->tag);
1914  buf_state &= ~(BUF_FLAG_MASK | BUF_USAGECOUNT_MASK);
1915  UnlockBufHdr(buf_hdr, buf_state);
1916 
1917  Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
1918 
1919  /* finally delete buffer from the buffer mapping table */
1920  BufTableDelete(&tag, hash);
1921 
1922  LWLockRelease(partition_lock);
1923 
1924  Assert(!(buf_state & (BM_DIRTY | BM_VALID | BM_TAG_VALID)));
1925  Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
1926  Assert(BUF_STATE_GET_REFCOUNT(pg_atomic_read_u32(&buf_hdr->state)) > 0);
1927 
1928  return true;
1929 }

References Assert, BM_DIRTY, BM_TAG_VALID, BM_VALID, BUF_FLAG_MASK, BUF_STATE_GET_REFCOUNT, BUF_USAGECOUNT_MASK, BufferDescriptorGetBuffer(), BufferTagsEqual(), BufMappingPartitionLock(), BufTableDelete(), BufTableHashCode(), ClearBufferTag(), GetPrivateRefCount(), hash(), LockBufHdr(), LW_EXCLUSIVE, LWLockAcquire(), LWLockRelease(), pg_atomic_read_u32(), BufferDesc::state, BufferDesc::tag, and UnlockBufHdr().

Referenced by EvictUnpinnedBuffer(), and GetVictimBuffer().

◆ IsBufferCleanupOK()

bool IsBufferCleanupOK ( Buffer  buffer)

Definition at line 5437 of file bufmgr.c.

5438 {
5439  BufferDesc *bufHdr;
5440  uint32 buf_state;
5441 
5442  Assert(BufferIsValid(buffer));
5443 
5444  if (BufferIsLocal(buffer))
5445  {
5446  /* There should be exactly one pin */
5447  if (LocalRefCount[-buffer - 1] != 1)
5448  return false;
5449  /* Nobody else to wait for */
5450  return true;
5451  }
5452 
5453  /* There should be exactly one local pin */
5454  if (GetPrivateRefCount(buffer) != 1)
5455  return false;
5456 
5457  bufHdr = GetBufferDescriptor(buffer - 1);
5458 
5459  /* caller must hold exclusive lock on buffer */
5460  Assert(LWLockHeldByMeInMode(BufferDescriptorGetContentLock(bufHdr),
5461  LW_EXCLUSIVE));
5462 
5463  buf_state = LockBufHdr(bufHdr);
5464 
5465  Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
5466  if (BUF_STATE_GET_REFCOUNT(buf_state) == 1)
5467  {
5468  /* pincount is OK. */
5469  UnlockBufHdr(bufHdr, buf_state);
5470  return true;
5471  }
5472 
5473  UnlockBufHdr(bufHdr, buf_state);
5474  return false;
5475 }

References Assert, BUF_STATE_GET_REFCOUNT, PrivateRefCountEntry::buffer, BufferDescriptorGetContentLock(), BufferIsLocal, BufferIsValid(), GetBufferDescriptor(), GetPrivateRefCount(), LocalRefCount, LockBufHdr(), LW_EXCLUSIVE, LWLockHeldByMeInMode(), and UnlockBufHdr().

Referenced by _hash_doinsert(), _hash_expandtable(), _hash_splitbucket(), and hashbucketcleanup().
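
A hedged sketch of the hash-index pattern (the helper name is illustrative; the caller is assumed to hold a pin already): take the exclusive lock first, then only perform a destructive page rearrangement if no other backend has the page pinned:

#include "postgres.h"
#include "storage/bufmgr.h"

/* Attempt a page rearrangement that is only safe with a solitary pin. */
static bool
try_page_cleanup(Buffer buf)
{
    bool        ok;

    LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
    ok = IsBufferCleanupOK(buf);
    if (ok)
    {
        /* ... shuffle tuples, reuse the page, etc. ... */
    }
    LockBuffer(buf, BUFFER_LOCK_UNLOCK);
    return ok;
}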

◆ IssuePendingWritebacks()

void IssuePendingWritebacks ( WritebackContext wb_context,
IOContext  io_context 
)

Definition at line 5942 of file bufmgr.c.

5943 {
5944  instr_time io_start;
5945  int i;
5946 
5947  if (wb_context->nr_pending == 0)
5948  return;
5949 
5950  /*
5951  * Executing the writes in-order can make them a lot faster, and allows to
5952  * merge writeback requests to consecutive blocks into larger writebacks.
5953  */
5954  sort_pending_writebacks(wb_context->pending_writebacks,
5955  wb_context->nr_pending);
5956 
5957  io_start = pgstat_prepare_io_time(track_io_timing);
5958 
5959  /*
5960  * Coalesce neighbouring writes, but nothing else. For that we iterate
5961  * through the, now sorted, array of pending flushes, and look forward to
5962  * find all neighbouring (or identical) writes.
5963  */
5964  for (i = 0; i < wb_context->nr_pending; i++)
5965  {
5966  PendingWriteback *cur;
5967  PendingWriteback *next;
5968  SMgrRelation reln;
5969  int ahead;
5970  BufferTag tag;
5971  RelFileLocator currlocator;
5972  Size nblocks = 1;
5973 
5974  cur = &wb_context->pending_writebacks[i];
5975  tag = cur->tag;
5976  currlocator = BufTagGetRelFileLocator(&tag);
5977 
5978  /*
5979  * Peek ahead, into following writeback requests, to see if they can
5980  * be combined with the current one.
5981  */
5982  for (ahead = 0; i + ahead + 1 < wb_context->nr_pending; ahead++)
5983  {
5984 
5985  next = &wb_context->pending_writebacks[i + ahead + 1];
5986 
5987  /* different file, stop */
5988  if (!RelFileLocatorEquals(currlocator,
5989  BufTagGetRelFileLocator(&next->tag)) ||
5990  BufTagGetForkNum(&cur->tag) != BufTagGetForkNum(&next->tag))
5991  break;
5992 
5993  /* ok, block queued twice, skip */
5994  if (cur->tag.blockNum == next->tag.blockNum)
5995  continue;
5996 
5997  /* only merge consecutive writes */
5998  if (cur->tag.blockNum + 1 != next->tag.blockNum)
5999  break;
6000 
6001  nblocks++;
6002  cur = next;
6003  }
6004 
6005  i += ahead;
6006 
6007  /* and finally tell the kernel to write the data to storage */
6008  reln = smgropen(currlocator, INVALID_PROC_NUMBER);
6009  smgrwriteback(reln, BufTagGetForkNum(&tag), tag.blockNum, nblocks);
6010  }
6011 
6012  /*
6013  * Assume that writeback requests are only issued for buffers containing
6014  * blocks of permanent relations.
6015  */
6016  pgstat_count_io_op_time(IOOBJECT_RELATION, IOCONTEXT_NORMAL,
6017  IOOP_WRITEBACK, io_start, wb_context->nr_pending);
6018 
6019  wb_context->nr_pending = 0;
6020 }
static int32 next
Definition: blutils.c:222
struct cursor * cur
Definition: ecpg.c:28
@ IOOP_WRITEBACK
Definition: pgstat.h:337
#define RelFileLocatorEquals(locator1, locator2)
void smgrwriteback(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, BlockNumber nblocks)
Definition: smgr.c:643
PendingWriteback pending_writebacks[WRITEBACK_MAX_PENDING_FLUSHES]

References buftag::blockNum, BufTagGetForkNum(), BufTagGetRelFileLocator(), cur, i, INVALID_PROC_NUMBER, IOOBJECT_RELATION, IOOP_WRITEBACK, next, WritebackContext::nr_pending, WritebackContext::pending_writebacks, pgstat_count_io_op_time(), pgstat_prepare_io_time(), RelFileLocatorEquals, smgropen(), smgrwriteback(), and track_io_timing.

Referenced by BufferSync(), and ScheduleBufferTagForWriteback().
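
A hedged sketch (the wrapper name is illustrative; a populated WritebackContext is assumed): after buffer tags have been queued with ScheduleBufferTagForWriteback(), the accumulated, sorted and coalesced writeback hints are handed to the kernel in one go:

#include "postgres.h"
#include "pgstat.h"
#include "storage/buf_internals.h"

/* Issue all queued writeback requests for the normal I/O context. */
static void
flush_writeback_queue(WritebackContext *wb_context)
{
    IssuePendingWritebacks(wb_context, IOCONTEXT_NORMAL);
}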

◆ LimitAdditionalPins()

void LimitAdditionalPins ( uint32 additional_pins)

Definition at line 2098 of file bufmgr.c.

2099 {
2100  uint32 max_backends;
2101  int max_proportional_pins;
2102 
2103  if (*additional_pins <= 1)
2104  return;
2105 
2106  max_backends = MaxBackends + NUM_AUXILIARY_PROCS;
2107  max_proportional_pins = NBuffers / max_backends;
2108 
2109  /*
2110  * Subtract the approximate number of buffers already pinned by this
2111  * backend. We get the number of "overflowed" pins for free, but don't
2112  * know the number of pins in PrivateRefCountArray. The cost of
2113  * calculating that exactly doesn't seem worth it, so just assume the max.
2114  */
2115  max_proportional_pins -= PrivateRefCountOverflowed + REFCOUNT_ARRAY_ENTRIES;
2116 
2117  if (max_proportional_pins <= 0)
2118  max_proportional_pins = 1;
2119 
2120  if (*additional_pins > max_proportional_pins)
2121  *additional_pins = max_proportional_pins;
2122 }
int MaxBackends
Definition: globals.c:145
#define NUM_AUXILIARY_PROCS
Definition: proc.h:439

References MaxBackends, NBuffers, NUM_AUXILIARY_PROCS, PrivateRefCountOverflowed, and REFCOUNT_ARRAY_ENTRIES.

Referenced by ExtendBufferedRelShared(), and read_stream_begin_impl().
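
A hedged sketch (the function name and batch size are illustrative): a caller that wants to pin a batch of buffers first clamps the request to this backend's proportional share, as read_stream_begin_impl() does:

#include "postgres.h"
#include "storage/bufmgr.h"

/* Ask for up to 64 additional pins, accepting whatever fair share this
 * backend is allowed; the result is always at least 1. */
static uint32
clamp_batch_size(void)
{
    uint32      npins = 64;

    LimitAdditionalPins(&npins);
    return npins;
}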

◆ local_buffer_write_error_callback()

static void local_buffer_write_error_callback ( void *  arg)
static

Definition at line 5696 of file bufmgr.c.

5697 {
5698  BufferDesc *bufHdr = (BufferDesc *) arg;
5699 
5700  if (bufHdr != NULL)
5701  {
5702  char *path = relpathbackend(BufTagGetRelFileLocator(&bufHdr->tag),
5703  MyProcNumber,
5704  BufTagGetForkNum(&bufHdr->tag));
5705 
5706  errcontext("writing block %u of relation %s",
5707  bufHdr->tag.blockNum, path);
5708  pfree(path);
5709  }
5710 }
#define errcontext
Definition: elog.h:196
void * arg

References arg, buftag::blockNum, BufTagGetForkNum(), BufTagGetRelFileLocator(), errcontext, MyProcNumber, pfree(), relpathbackend, and BufferDesc::tag.

Referenced by FlushRelationBuffers().

◆ LockBuffer()

void LockBuffer ( Buffer  buffer,
int  mode 
)

Definition at line 5140 of file bufmgr.c.

5141 {
5142  BufferDesc *buf;
5143 
5144  Assert(BufferIsPinned(buffer));
5145  if (BufferIsLocal(buffer))
5146  return; /* local buffers need no lock */
5147 
5148  buf = GetBufferDescriptor(buffer - 1);
5149 
5150  if (mode == BUFFER_LOCK_UNLOCK)
5151  LWLockRelease(BufferDescriptorGetContentLock(buf));
5152  else if (mode == BUFFER_LOCK_SHARE)
5153  LWLockAcquire(BufferDescriptorGetContentLock(buf), LW_SHARED);
5154  else if (mode == BUFFER_LOCK_EXCLUSIVE)
5155  LWLockAcquire(BufferDescriptorGetContentLock(buf), LW_EXCLUSIVE);
5156  else
5157  elog(ERROR, "unrecognized buffer lock mode: %d", mode);
5158 }
#define BUFFER_LOCK_SHARE
Definition: bufmgr.h:190
#define BUFFER_LOCK_EXCLUSIVE
Definition: bufmgr.h:191

References Assert, buf, PrivateRefCountEntry::buffer, BUFFER_LOCK_EXCLUSIVE, BUFFER_LOCK_SHARE, BUFFER_LOCK_UNLOCK, BufferDescriptorGetContentLock(), BufferIsLocal, BufferIsPinned, elog, ERROR, GetBufferDescriptor(), LW_EXCLUSIVE, LW_SHARED, LWLockAcquire(), LWLockRelease(), and mode.

Referenced by _bt_lockbuf(), _bt_unlockbuf(), _bt_upgradelockbufcleanup(), _hash_addovflpage(), _hash_doinsert(), _hash_expandtable(), _hash_finish_split(), _hash_first(), _hash_freeovflpage(), _hash_getbuf(), _hash_getbuf_with_strategy(), _hash_getcachedmetap(), _hash_init(), _hash_kill_items(), _hash_readnext(), _hash_readpage(), _hash_readprev(), _hash_splitbucket(), _hash_squeezebucket(), _hash_vacuum_one_page(), blbulkdelete(), blgetbitmap(), blinsert(), BloomInitMetapage(), BloomNewBuffer(), blvacuumcleanup(), brin_doinsert(), brin_doupdate(), brin_evacuate_page(), brin_getinsertbuffer(), brin_page_cleanup(), bringetbitmap(), brinGetStats(), brinGetTupleForHeapBlock(), brininsert(), brinLockRevmapPageForUpdate(), brinRevmapDesummarizeRange(), brinRevmapInitialize(), brinsummarize(), bt_metap(), bt_multi_page_stats(), bt_page_items_internal(), bt_page_stats_internal(), bt_recheck_sibling_links(), collect_corrupt_items(), collect_visibility_data(), collectMatchBitmap(), ConditionalLockBufferForCleanup(), count_nondeletable_pages(), entryLoadMoreItems(), FreeSpaceMapPrepareTruncateRel(), fsm_readbuf(), fsm_search(), fsm_search_avail(), fsm_set_and_search(), fsm_vacuum_page(), get_raw_page_internal(), GetVisibilityMapPins(), ginbulkdelete(), ginEntryInsert(), ginFindLeafPage(), ginFindParents(), ginFinishOldSplit(), ginFinishSplit(), ginGetStats(), ginHeapTupleFastInsert(), ginInsertCleanup(), ginInsertValue(), GinNewBuffer(), ginScanToDelete(), ginStepRight(), ginTraverseLock(), ginUpdateStats(), ginvacuumcleanup(), ginVacuumPostingTreeLeaves(), gistBufferingFindCorrectParent(), gistbufferinginserttuples(), gistdoinsert(), gistFindCorrectParent(), gistFindPath(), gistfinishsplit(), gistfixsplit(), gistformdownlink(), gistGetMaxLevel(), gistinserttuples(), gistkillitems(), gistNewBuffer(), gistProcessItup(), gistScanPage(), gistvacuum_delete_empty_pages(), gistvacuumpage(), hashbucketcleanup(), hashbulkdelete(), heap_abort_speculative(), heap_delete(), heap_fetch(), heap_finish_speculative(), heap_get_latest_tid(), heap_index_delete_tuples(), heap_inplace_update(), heap_lock_tuple(), heap_lock_updated_tuple_rec(), heap_page_prune_opt(), heap_prepare_pagescan(), heap_update(), heap_xlog_visible(), heapam_index_build_range_scan(), heapam_index_fetch_tuple(), heapam_index_validate_scan(), heapam_relation_copy_for_cluster(), heapam_scan_analyze_next_block(), heapam_scan_bitmap_next_block(), heapam_scan_sample_next_tuple(), heapam_tuple_satisfies_snapshot(), heapgettup(), initBloomState(), lazy_scan_heap(), lazy_scan_new_or_empty(), lazy_vacuum_heap_rel(), LockBufferForCleanup(), log_newpage_range(), palloc_btree_page(), pg_visibility(), pgrowlocks(), pgstat_btree_page(), pgstat_gist_page(), pgstat_heap(), pgstatginindex_internal(), pgstathashindex(), pgstatindex_impl(), read_seq_tuple(), RelationAddBlocks(), RelationCopyStorageUsingBuffer(), RelationGetBufferForTuple(), revmap_physical_extend(), scanGetCandidate(), scanPendingInsert(), ScanSourceDatabasePgClass(), shiftList(), spgdoinsert(), spgGetCache(), SpGistNewBuffer(), spgprocesspending(), spgvacuumpage(), spgWalk(), startScanEntry(), statapprox_heap(), summarize_range(), UnlockReleaseBuffer(), verify_heapam(), verifyBackupPageConsistency(), visibilitymap_clear(), visibilitymap_prepare_truncate(), visibilitymap_set(), vm_readbuf(), XLogReadBufferForRedoExtended(), XLogRecordPageWithFreeSpace(), and ZeroAndLockBuffer().
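
A hedged sketch of the canonical pin / share-lock / read / unlock / unpin cycle (the function and variable names are illustrative):

#include "postgres.h"
#include "storage/bufmgr.h"
#include "storage/bufpage.h"
#include "utils/rel.h"

/* Read one page of 'rel' under a share lock and release everything again. */
static void
read_one_page(Relation rel, BlockNumber blkno)
{
    Buffer      buf = ReadBuffer(rel, blkno);
    Page        page;

    LockBuffer(buf, BUFFER_LOCK_SHARE);
    page = BufferGetPage(buf);
    /* ... examine 'page' while the share lock is held ... */
    LockBuffer(buf, BUFFER_LOCK_UNLOCK);
    ReleaseBuffer(buf);
}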

◆ LockBufferForCleanup()

void LockBufferForCleanup ( Buffer  buffer)

Definition at line 5220 of file bufmgr.c.

5221 {
5222  BufferDesc *bufHdr;
5223  TimestampTz waitStart = 0;
5224  bool waiting = false;
5225  bool logged_recovery_conflict = false;
5226 
5227  Assert(BufferIsPinned(buffer));
5228  Assert(PinCountWaitBuf == NULL);
5229 
5230  CheckBufferIsPinnedOnce(buffer);
5231 
5232  /* Nobody else to wait for */
5233  if (BufferIsLocal(buffer))
5234  return;
5235 
5236  bufHdr = GetBufferDescriptor(buffer - 1);
5237 
5238  for (;;)
5239  {
5240  uint32 buf_state;
5241 
5242  /* Try to acquire lock */
5243  LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
5244  buf_state = LockBufHdr(bufHdr);
5245 
5246  Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
5247  if (BUF_STATE_GET_REFCOUNT(buf_state) == 1)
5248  {
5249  /* Successfully acquired exclusive lock with pincount 1 */
5250  UnlockBufHdr(bufHdr, buf_state);
5251 
5252  /*
5253  * Emit the log message if recovery conflict on buffer pin was
5254  * resolved but the startup process waited longer than
5255  * deadlock_timeout for it.
5256  */
5257  if (logged_recovery_conflict)
5258  LogRecoveryConflict(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN,
5259  waitStart, GetCurrentTimestamp(),
5260  NULL, false);
5261 
5262  if (waiting)
5263  {
5264  /* reset ps display to remove the suffix if we added one */
5265  set_ps_display_remove_suffix();
5266  waiting = false;
5267  }
5268  return;
5269  }
5270  /* Failed, so mark myself as waiting for pincount 1 */
5271  if (buf_state & BM_PIN_COUNT_WAITER)
5272  {
5273  UnlockBufHdr(bufHdr, buf_state);
5274  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
5275  elog(ERROR, "multiple backends attempting to wait for pincount 1");
5276  }
5277  bufHdr->wait_backend_pgprocno = MyProcNumber;
5278  PinCountWaitBuf = bufHdr;
5279  buf_state |= BM_PIN_COUNT_WAITER;
5280  UnlockBufHdr(bufHdr, buf_state);
5281  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
5282 
5283  /* Wait to be signaled by UnpinBuffer() */
5284  if (InHotStandby)
5285  {
5286  if (!waiting)
5287  {
5288  /* adjust the process title to indicate that it's waiting */
5289  set_ps_display_suffix("waiting");
5290  waiting = true;
5291  }
5292 
5293  /*
5294  * Emit the log message if the startup process is waiting longer
5295  * than deadlock_timeout for recovery conflict on buffer pin.
5296  *
5297  * Skip this if first time through because the startup process has
5298  * not started waiting yet in this case. So, the wait start
5299  * timestamp is set after this logic.
5300  */
5301  if (waitStart != 0 && !logged_recovery_conflict)
5302  {
5303  TimestampTz now = GetCurrentTimestamp();
5304 
5305  if (TimestampDifferenceExceeds(waitStart, now,
5306  DeadlockTimeout))
5307  {
5308  LogRecoveryConflict(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN,
5309  waitStart, now, NULL, true);
5310  logged_recovery_conflict = true;
5311  }
5312  }
5313 
5314  /*
5315  * Set the wait start timestamp if logging is enabled and first
5316  * time through.
5317  */
5318  if (log_recovery_conflict_waits && waitStart == 0)
5319  waitStart = GetCurrentTimestamp();
5320 
5321  /* Publish the bufid that Startup process waits on */
5322  SetStartupBufferPinWaitBufId(buffer - 1);
5323  /* Set alarm and then wait to be signaled by UnpinBuffer() */
5324  ResolveRecoveryConflictWithBufferPin();
5325  /* Reset the published bufid */
5326  SetStartupBufferPinWaitBufId(-1);
5327  }
5328  else
5329  ProcWaitForSignal(WAIT_EVENT_BUFFER_PIN);
5330 
5331  /*
5332  * Remove flag marking us as waiter. Normally this will not be set
5333  * anymore, but ProcWaitForSignal() can return for other signals as
5334  * well. We take care to only reset the flag if we're the waiter, as
5335  * theoretically another backend could have started waiting. That's
5336  * impossible with the current usages due to table level locking, but
5337  * better be safe.
5338  */
5339  buf_state = LockBufHdr(bufHdr);
5340  if ((buf_state & BM_PIN_COUNT_WAITER) != 0 &&
5341  bufHdr->wait_backend_pgprocno == MyProcNumber)
5342  buf_state &= ~BM_PIN_COUNT_WAITER;
5343  UnlockBufHdr(bufHdr, buf_state);
5344 
5345  PinCountWaitBuf = NULL;
5346  /* Loop back and try again */
5347  }
5348 }
bool TimestampDifferenceExceeds(TimestampTz start_time, TimestampTz stop_time, int msec)
Definition: timestamp.c:1780
TimestampTz GetCurrentTimestamp(void)
Definition: timestamp.c:1644
Datum now(PG_FUNCTION_ARGS)
Definition: timestamp.c:1608
#define BM_PIN_COUNT_WAITER
Definition: buf_internals.h:67
static BufferDesc * PinCountWaitBuf
Definition: bufmgr.c:176
int64 TimestampTz
Definition: timestamp.h:39
static volatile sig_atomic_t waiting
Definition: latch.c:162
@ PROCSIG_RECOVERY_CONFLICT_BUFFERPIN
Definition: procsignal.h:47
void set_ps_display_remove_suffix(void)
Definition: ps_status.c:421
void set_ps_display_suffix(const char *suffix)
Definition: ps_status.c:369
int DeadlockTimeout
Definition: proc.c:58
void SetStartupBufferPinWaitBufId(int bufid)
Definition: proc.c:660
void ProcWaitForSignal(uint32 wait_event_info)
Definition: proc.c:1872
void ResolveRecoveryConflictWithBufferPin(void)
Definition: standby.c:792
bool log_recovery_conflict_waits
Definition: standby.c:41
void LogRecoveryConflict(ProcSignalReason reason, TimestampTz wait_start, TimestampTz now, VirtualTransactionId *wait_list, bool still_waiting)
Definition: standby.c:273
int wait_backend_pgprocno
#define InHotStandby
Definition: xlogutils.h:60

References Assert, BM_PIN_COUNT_WAITER, BUF_STATE_GET_REFCOUNT, PrivateRefCountEntry::buffer, BUFFER_LOCK_EXCLUSIVE, BUFFER_LOCK_UNLOCK, BufferIsLocal, BufferIsPinned, CheckBufferIsPinnedOnce(), DeadlockTimeout, elog, ERROR, GetBufferDescriptor(), GetCurrentTimestamp(), InHotStandby, LockBuffer(), LockBufHdr(), log_recovery_conflict_waits, LogRecoveryConflict(), MyProcNumber, now(), PinCountWaitBuf, PROCSIG_RECOVERY_CONFLICT_BUFFERPIN, ProcWaitForSignal(), ResolveRecoveryConflictWithBufferPin(), set_ps_display_remove_suffix(), set_ps_display_suffix(), SetStartupBufferPinWaitBufId(), TimestampDifferenceExceeds(), UnlockBufHdr(), BufferDesc::wait_backend_pgprocno, and waiting.

Referenced by _bt_upgradelockbufcleanup(), ginVacuumPostingTree(), hashbulkdelete(), heap_force_common(), lazy_scan_heap(), XLogReadBufferForRedoExtended(), and ZeroAndLockBuffer().
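
A hedged sketch of the VACUUM-style pattern (names are illustrative; the caller is assumed to hold exactly one pin, which ReadBufferExtended() provides here): a cleanup lock is an exclusive content lock plus the guarantee that no other backend holds a pin:

#include "postgres.h"
#include "storage/bufmgr.h"
#include "utils/rel.h"

/* Acquire a cleanup lock on one heap page, waiting out other pins if needed. */
static void
cleanup_one_page(Relation rel, BlockNumber blkno, BufferAccessStrategy strategy)
{
    Buffer      buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno,
                                         RBM_NORMAL, strategy);

    LockBufferForCleanup(buf);      /* may sleep until all other pins are gone */
    /* ... prune and defragment the page ... */
    UnlockReleaseBuffer(buf);
}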

◆ LockBufHdr()

uint32 LockBufHdr ( BufferDesc desc)

Definition at line 5743 of file bufmgr.c.

5744 {
5745  SpinDelayStatus delayStatus;
5746  uint32 old_buf_state;
5747 
5748  Assert(!BufferIsLocal(BufferDescriptorGetBuffer(desc)));
5749 
5750  init_local_spin_delay(&delayStatus);
5751 
5752  while (true)
5753  {
5754  /* set BM_LOCKED flag */
5755  old_buf_state = pg_atomic_fetch_or_u32(&desc->state, BM_LOCKED);
5756  /* if it wasn't set before we're OK */
5757  if (!(old_buf_state & BM_LOCKED))
5758  break;
5759  perform_spin_delay(&delayStatus);
5760  }
5761  finish_spin_delay(&delayStatus);
5762  return old_buf_state | BM_LOCKED;
5763 }
static uint32 pg_atomic_fetch_or_u32(volatile pg_atomic_uint32 *ptr, uint32 or_)
Definition: atomics.h:410
void perform_spin_delay(SpinDelayStatus *status)
Definition: s_lock.c:127
void finish_spin_delay(SpinDelayStatus *status)
Definition: s_lock.c:187
#define init_local_spin_delay(status)
Definition: s_lock.h:745

References Assert, BM_LOCKED, BufferDescriptorGetBuffer(), BufferIsLocal, finish_spin_delay(), init_local_spin_delay, perform_spin_delay(), pg_atomic_fetch_or_u32(), and BufferDesc::state.

Referenced by AbortBufferIO(), apw_dump_now(), BufferAlloc(), BufferGetLSNAtomic(), BufferSync(), ConditionalLockBufferForCleanup(), DropDatabaseBuffers(), DropRelationBuffers(), DropRelationsAllBuffers(), EvictUnpinnedBuffer(), ExtendBufferedRelShared(), FindAndDropRelationBuffers(), FlushBuffer(), FlushDatabaseBuffers(), FlushRelationBuffers(), FlushRelationsAllBuffers(), GetBufferFromRing(), GetVictimBuffer(), InvalidateBuffer(), InvalidateVictimBuffer(), IsBufferCleanupOK(), LockBufferForCleanup(), MarkBufferDirtyHint(), pg_buffercache_pages(), ReadRecentBuffer(), StartBufferIO(), StrategyGetBuffer(), SyncOneBuffer(), TerminateBufferIO(), UnlockBuffers(), UnpinBufferNoOwner(), and WaitIO().
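
A hedged sketch of the header-spinlock pattern used inside the buffer manager (the helper name is illustrative; this is internal code that needs storage/buf_internals.h): take the returned state word, inspect or modify it, then publish it again with UnlockBufHdr():

#include "postgres.h"
#include "storage/buf_internals.h"

/* Report whether a buffer currently has no pins, under the header spinlock. */
static bool
header_is_unpinned(BufferDesc *bufHdr)
{
    uint32      buf_state = LockBufHdr(bufHdr);
    bool        unpinned = (BUF_STATE_GET_REFCOUNT(buf_state) == 0);

    UnlockBufHdr(bufHdr, buf_state);
    return unpinned;
}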

◆ MarkBufferDirty()

void MarkBufferDirty ( Buffer  buffer)

Definition at line 2514 of file bufmgr.c.

2515 {
2516  BufferDesc *bufHdr;
2517  uint32 buf_state;
2518  uint32 old_buf_state;
2519 
2520  if (!BufferIsValid(buffer))
2521  elog(ERROR, "bad buffer ID: %d", buffer);
2522 
2523  if (BufferIsLocal(buffer))
2524  {
2525  MarkLocalBufferDirty(buffer);
2526  return;
2527  }
2528 
2529  bufHdr = GetBufferDescriptor(buffer - 1);
2530 
2531  Assert(BufferIsPinned(buffer));
2532  Assert(LWLockHeldByMeInMode(BufferDescriptorGetContentLock(bufHdr),
2533  LW_EXCLUSIVE));
2534 
2535  old_buf_state = pg_atomic_read_u32(&bufHdr->state);
2536  for (;;)
2537  {
2538  if (old_buf_state & BM_LOCKED)
2539  old_buf_state = WaitBufHdrUnlocked(bufHdr);
2540 
2541  buf_state = old_buf_state;
2542 
2543  Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
2544  buf_state |= BM_DIRTY | BM_JUST_DIRTIED;
2545 
2546  if (pg_atomic_compare_exchange_u32(&bufHdr->state, &old_buf_state,
2547  buf_state))
2548  break;
2549  }
2550 
2551  /*
2552  * If the buffer was not dirty already, do vacuum accounting.
2553  */
2554  if (!(old_buf_state & BM_DIRTY))
2555  {
2556  pgBufferUsage.shared_blks_dirtied++;
2557  if (VacuumCostActive)
2558  VacuumCostBalance += VacuumCostPageDirty;
2559  }
2560 }
static bool pg_atomic_compare_exchange_u32(volatile pg_atomic_uint32 *ptr, uint32 *expected, uint32 newval)
Definition: atomics.h:349
static uint32 WaitBufHdrUnlocked(BufferDesc *buf)
Definition: bufmgr.c:5773
bool VacuumCostActive
Definition: globals.c:157
int VacuumCostBalance
Definition: globals.c:156
int VacuumCostPageDirty
Definition: globals.c:152
void MarkLocalBufferDirty(Buffer buffer)
Definition: localbuf.c:449
int64 shared_blks_dirtied
Definition: instrument.h:28

References Assert, BM_DIRTY, BM_JUST_DIRTIED, BM_LOCKED, BUF_STATE_GET_REFCOUNT, PrivateRefCountEntry::buffer, BufferDescriptorGetContentLock(), BufferIsLocal, BufferIsPinned, BufferIsValid(), elog, ERROR, GetBufferDescriptor(), LW_EXCLUSIVE, LWLockHeldByMeInMode(), MarkLocalBufferDirty(), pg_atomic_compare_exchange_u32(), pg_atomic_read_u32(), pgBufferUsage, BufferUsage::shared_blks_dirtied, BufferDesc::state, VacuumCostActive, VacuumCostBalance, VacuumCostPageDirty, and WaitBufHdrUnlocked().

Referenced by _bt_clear_incomplete_split(), _bt_dedup_pass(), _bt_delitems_delete(), _bt_delitems_vacuum(), _bt_getroot(), _bt_insertonpg(), _bt_mark_page_halfdead(), _bt_newlevel(), _bt_restore_meta(), _bt_set_cleanup_info(), _bt_split(), _bt_unlink_halfdead_page(), _hash_addovflpage(), _hash_doinsert(), _hash_expandtable(), _hash_freeovflpage(), _hash_init(), _hash_splitbucket(), _hash_squeezebucket(), _hash_vacuum_one_page(), addLeafTuple(), brin_doinsert(), brin_doupdate(), brin_initialize_empty_new_buffer(), brin_xlog_createidx(), brin_xlog_desummarize_page(), brin_xlog_insert_update(), brin_xlog_revmap_extend(), brin_xlog_samepage_update(), brin_xlog_update(), brinbuild(), brinbuildempty(), brinRevmapDesummarizeRange(), btree_xlog_dedup(), btree_xlog_delete(), btree_xlog_insert(), btree_xlog_mark_page_halfdead(), btree_xlog_newroot(), btree_xlog_split(), btree_xlog_unlink_page(), btree_xlog_vacuum(), createPostingTree(), dataExecPlaceToPageInternal(), dataExecPlaceToPageLeaf(), do_setval(), doPickSplit(), entryExecPlaceToPage(), fill_seq_fork_with_data(), FreeSpaceMapPrepareTruncateRel(), generic_redo(), GenericXLogFinish(), ginbuild(), ginbuildempty(), ginbulkdelete(), ginDeletePage(), ginHeapTupleFastInsert(), ginPlaceToPage(), ginRedoClearIncompleteSplit(), ginRedoCreatePTree(), ginRedoDeleteListPages(), ginRedoDeletePage(), ginRedoInsert(), ginRedoInsertListPage(), ginRedoUpdateMetapage(), ginRedoVacuumDataLeafPage(), ginUpdateStats(), ginVacuumPostingTreeLeaf(), gistbuild(), gistbuildempty(), gistdeletepage(), gistplacetopage(), gistprunepage(), gistRedoClearFollowRight(), gistRedoDeleteRecord(), gistRedoPageDelete(), gistRedoPageSplitRecord(), gistRedoPageUpdateRecord(), gistvacuumpage(), hash_xlog_add_ovfl_page(), hash_xlog_delete(), hash_xlog_init_bitmap_page(), hash_xlog_init_meta_page(), hash_xlog_insert(), hash_xlog_move_page_contents(), hash_xlog_split_allocate_page(), hash_xlog_split_cleanup(), hash_xlog_split_complete(), hash_xlog_squeeze_page(), hash_xlog_update_meta_page(), hash_xlog_vacuum_one_page(), hashbucketcleanup(), hashbulkdelete(), heap_abort_speculative(), heap_delete(), heap_finish_speculative(), heap_force_common(), heap_inplace_update(), heap_insert(), heap_lock_tuple(), heap_lock_updated_tuple_rec(), heap_multi_insert(), heap_page_prune_and_freeze(), heap_update(), heap_xlog_confirm(), heap_xlog_delete(), heap_xlog_inplace(), heap_xlog_insert(), heap_xlog_lock(), heap_xlog_lock_updated(), heap_xlog_multi_insert(), heap_xlog_prune_freeze(), heap_xlog_update(), heap_xlog_visible(), lazy_scan_new_or_empty(), lazy_scan_prune(), lazy_vacuum_heap_page(), log_newpage_range(), moveLeafs(), nextval_internal(), RelationAddBlocks(), RelationCopyStorageUsingBuffer(), RelationGetBufferForTuple(), revmap_physical_extend(), saveNodeLink(), seq_redo(), shiftList(), spgAddNodeAction(), spgbuild(), SpGistUpdateMetaPage(), spgRedoAddLeaf(), spgRedoAddNode(), spgRedoMoveLeafs(), spgRedoPickSplit(), spgRedoSplitTuple(), spgRedoVacuumLeaf(), spgRedoVacuumRedirect(), spgRedoVacuumRoot(), spgSplitNodeAction(), vacuumLeafPage(), vacuumLeafRoot(), vacuumRedirectAndPlaceholder(), visibilitymap_clear(), visibilitymap_prepare_truncate(), visibilitymap_set(), writeListPage(), and XLogReadBufferForRedoExtended().
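
A hedged sketch of the canonical ordering for a WAL-logged page change (the function name is illustrative and the WAL details are elided): the page is modified and the buffer marked dirty inside one critical section, before the WAL record is inserted and the page LSN stamped:

#include "postgres.h"
#include "miscadmin.h"
#include "storage/bufmgr.h"

/* Modify a pinned page under an exclusive lock and mark the buffer dirty. */
static void
modify_page(Buffer buf)
{
    LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
    START_CRIT_SECTION();
    /* ... apply the change to BufferGetPage(buf) ... */
    MarkBufferDirty(buf);
    /* ... XLogInsert() the record and PageSetLSN() the page ... */
    END_CRIT_SECTION();
    UnlockReleaseBuffer(buf);
}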

◆ MarkBufferDirtyHint()

void MarkBufferDirtyHint ( Buffer  buffer,
bool  buffer_std 
)

Definition at line 4970 of file bufmgr.c.

4971 {
4972  BufferDesc *bufHdr;
4973  Page page = BufferGetPage(buffer);
4974 
4975  if (!BufferIsValid(buffer))
4976  elog(ERROR, "bad buffer ID: %d", buffer);
4977 
4978  if (BufferIsLocal(buffer))
4979  {
4980  MarkLocalBufferDirty(buffer);
4981  return;
4982  }
4983 
4984  bufHdr = GetBufferDescriptor(buffer - 1);
4985 
4986  Assert(GetPrivateRefCount(buffer) > 0);
4987  /* here, either share or exclusive lock is OK */
4988  Assert(LWLockHeldByMe(BufferDescriptorGetContentLock(bufHdr)));
4989 
4990  /*
4991  * This routine might get called many times on the same page, if we are
4992  * making the first scan after commit of an xact that added/deleted many
4993  * tuples. So, be as quick as we can if the buffer is already dirty. We
4994  * do this by not acquiring spinlock if it looks like the status bits are
4995  * already set. Since we make this test unlocked, there's a chance we
4996  * might fail to notice that the flags have just been cleared, and failed
4997  * to reset them, due to memory-ordering issues. But since this function
4998  * is only intended to be used in cases where failing to write out the
4999  * data would be harmless anyway, it doesn't really matter.
5000  */
5001  if ((pg_atomic_read_u32(&bufHdr->state) & (BM_DIRTY | BM_JUST_DIRTIED)) !=
5002  (BM_DIRTY | BM_JUST_DIRTIED))
5003  {
5004  XLogRecPtr lsn = InvalidXLogRecPtr;
5005  bool dirtied = false;
5006  bool delayChkptFlags = false;
5007  uint32 buf_state;
5008 
5009  /*
5010  * If we need to protect hint bit updates from torn writes, WAL-log a
5011  * full page image of the page. This full page image is only necessary
5012  * if the hint bit update is the first change to the page since the
5013  * last checkpoint.
5014  *
5015  * We don't check full_page_writes here because that logic is included
5016  * when we call XLogInsert() since the value changes dynamically.
5017  */
5018  if (XLogHintBitIsNeeded() &&
5019  (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT))
5020  {
5021  /*
5022  * If we must not write WAL, due to a relfilelocator-specific
5023  * condition or being in recovery, don't dirty the page. We can
5024  * set the hint, just not dirty the page as a result so the hint
5025  * is lost when we evict the page or shutdown.
5026  *
5027  * See src/backend/storage/page/README for longer discussion.
5028  */
5029  if (RecoveryInProgress() ||
5030  RelFileLocatorSkippingWAL(BufTagGetRelFileLocator(&bufHdr->tag)))
5031  return;
5032 
5033  /*
5034  * If the block is already dirty because we either made a change
5035  * or set a hint already, then we don't need to write a full page
5036  * image. Note that aggressive cleaning of blocks dirtied by hint
5037  * bit setting would increase the call rate. Bulk setting of hint
5038  * bits would reduce the call rate...
5039  *
5040  * We must issue the WAL record before we mark the buffer dirty.
5041  * Otherwise we might write the page before we write the WAL. That
5042  * causes a race condition, since a checkpoint might occur between
5043  * writing the WAL record and marking the buffer dirty. We solve
5044  * that with a kluge, but one that is already in use during
5045  * transaction commit to prevent race conditions. Basically, we
5046  * simply prevent the checkpoint WAL record from being written
5047  * until we have marked the buffer dirty. We don't start the
5048  * checkpoint flush until we have marked dirty, so our checkpoint
5049  * must flush the change to disk successfully or the checkpoint
5050  * never gets written, so crash recovery will fix.
5051  *
5052  * It's possible we may enter here without an xid, so it is
5053  * essential that CreateCheckPoint waits for virtual transactions
5054  * rather than full transactionids.
5055  */
5056  Assert((MyProc->delayChkptFlags & DELAY_CHKPT_START) == 0);
5057  MyProc->delayChkptFlags |= DELAY_CHKPT_START;
5058  delayChkptFlags = true;
5059  lsn = XLogSaveBufferForHint(buffer, buffer_std);
5060  }
5061 
5062  buf_state = LockBufHdr(bufHdr);
5063 
5064  Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
5065 
5066  if (!(buf_state & BM_DIRTY))
5067  {
5068  dirtied = true; /* Means "will be dirtied by this action" */
5069 
5070  /*
5071  * Set the page LSN if we wrote a backup block. We aren't supposed
5072  * to set this when only holding a share lock but as long as we
5073  * serialise it somehow we're OK. We choose to set LSN while
5074  * holding the buffer header lock, which causes any reader of an
5075  * LSN who holds only a share lock to also obtain a buffer header
5076  * lock before using PageGetLSN(), which is enforced in
5077  * BufferGetLSNAtomic().
5078  *
5079  * If checksums are enabled, you might think we should reset the
5080  * checksum here. That will happen when the page is written
5081  * sometime later in this checkpoint cycle.
5082  */
5083  if (!XLogRecPtrIsInvalid(lsn))
5084  PageSetLSN(page, lsn);
5085  }
5086 
5087  buf_state |= BM_DIRTY | BM_JUST_DIRTIED;
5088  UnlockBufHdr(bufHdr, buf_state);
5089 
5090  if (delayChkptFlags)
5091  MyProc->delayChkptFlags &= ~DELAY_CHKPT_START;
5092 
5093  if (dirtied)
5094  {
5095  pgBufferUsage.shared_blks_dirtied++;
5096  if (VacuumCostActive)
5097  VacuumCostBalance += VacuumCostPageDirty;
5098  }
5099  }
5100 }
static void PageSetLSN(Page page, XLogRecPtr lsn)
Definition: bufpage.h:391
#define DELAY_CHKPT_START
Definition: proc.h:114
bool RelFileLocatorSkippingWAL(RelFileLocator rlocator)
Definition: storage.c:532
int delayChkptFlags
Definition: proc.h:235
bool RecoveryInProgress(void)
Definition: xlog.c:6333
#define XLogRecPtrIsInvalid(r)
Definition: xlogdefs.h:29
#define InvalidXLogRecPtr
Definition: xlogdefs.h:28
XLogRecPtr XLogSaveBufferForHint(Buffer buffer, bool buffer_std)
Definition: xloginsert.c:1065

References Assert, BM_DIRTY, BM_JUST_DIRTIED, BM_PERMANENT, BUF_STATE_GET_REFCOUNT, PrivateRefCountEntry::buffer, BufferDescriptorGetContentLock(), BufferGetPage(), BufferIsLocal, BufferIsValid(), BufTagGetRelFileLocator(), DELAY_CHKPT_START, PGPROC::delayChkptFlags, elog, ERROR, GetBufferDescriptor(), GetPrivateRefCount(), InvalidXLogRecPtr, LockBufHdr(), LWLockHeldByMe(), MarkLocalBufferDirty(), MyProc, PageSetLSN(), pg_atomic_read_u32(), pgBufferUsage, RecoveryInProgress(), RelFileLocatorSkippingWAL(), BufferUsage::shared_blks_dirtied, BufferDesc::state, BufferDesc::tag, UnlockBufHdr(), VacuumCostActive, VacuumCostBalance, VacuumCostPageDirty, XLogHintBitIsNeeded, XLogRecPtrIsInvalid, and XLogSaveBufferForHint().

Referenced by _bt_check_unique(), _bt_killitems(), _hash_kill_items(), brin_start_evacuating_page(), btvacuumpage(), fsm_search(), fsm_search_avail(), fsm_set_and_search(), fsm_vacuum_page(), gistkillitems(), heap_page_prune_and_freeze(), read_seq_tuple(), SetHintBits(), and XLogRecordPageWithFreeSpace().
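
A hedged usage sketch (not part of bufmgr.c): a typical caller, in the spirit of SetHintBits(), already holds a pin and at least a share lock on the buffer, updates hint-only data on the page, and then reports the change. ExampleSetPageHint() and the use of PD_PAGE_FULL as the hint are hypothetical, chosen only to illustrate the calling convention.

/*
 * Hypothetical caller of MarkBufferDirtyHint() (illustrative only).
 * The buffer must be pinned and locked in at least share mode, exactly
 * as the function asserts, and losing the update after a crash must be
 * harmless.
 */
static void
ExampleSetPageHint(Buffer buffer)
{
    Page        page = BufferGetPage(buffer);

    /* update hint-only state on the page */
    ((PageHeader) page)->pd_flags |= PD_PAGE_FULL;

    /* true: the page uses the standard page layout */
    MarkBufferDirtyHint(buffer, true);
}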

◆ NewPrivateRefCountEntry()

static PrivateRefCountEntry * NewPrivateRefCountEntry ( Buffer  buffer)
static

Definition at line 316 of file bufmgr.c.

317 {
318  PrivateRefCountEntry *res;
319 
320  /* only allowed to be called when a reservation has been made */
321  Assert(ReservedRefCountEntry != NULL);
322 
323  /* use up the reserved entry */
324  res = ReservedRefCountEntry;
325  ReservedRefCountEntry = NULL;
326 
327  /* and fill it */
328  res->buffer = buffer;
329  res->refcount = 0;
330 
331  return res;
332 }

References Assert, PrivateRefCountEntry::buffer, res, and ReservedRefCountEntry.

Referenced by PinBuffer(), and PinBuffer_Locked().
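
The reservation rule exists because this function can be reached while a buffer header spinlock is held, where growing the private refcount hash table would be unsafe. The fragment below is an illustrative sketch of the protocol as its callers use it, not verbatim bufmgr.c code.

/* Illustrative fragment: reserve before any spinlock is taken. */
ReservePrivateRefCountEntry();      /* may allocate/grow; no spinlock held yet */

buf_state = LockBufHdr(bufHdr);     /* header spinlock held: no allocation allowed */
PinBuffer_Locked(bufHdr);           /* consumes the reservation via NewPrivateRefCountEntry() */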

◆ PinBuffer()

static bool PinBuffer ( BufferDesc buf,
BufferAccessStrategy  strategy 
)
static

Definition at line 2634 of file bufmgr.c.

2635 {
2636  Buffer b = BufferDescriptorGetBuffer(buf);
2637  bool result;
2638  PrivateRefCountEntry *ref;
2639 
2640  Assert(!BufferIsLocal(b));
2641  Assert(ReservedRefCountEntry != NULL);
2642 
2643  ref = GetPrivateRefCountEntry(b, true);
2644 
2645  if (ref == NULL)
2646  {
2647  uint32 buf_state;
2648  uint32 old_buf_state;
2649 
2650  ref = NewPrivateRefCountEntry(b);
2651 
2652  old_buf_state = pg_atomic_read_u32(&buf->state);
2653  for (;;)
2654  {
2655  if (old_buf_state & BM_LOCKED)
2656  old_buf_state = WaitBufHdrUnlocked(buf);
2657 
2658  buf_state = old_buf_state;
2659 
2660  /* increase refcount */
2661  buf_state += BUF_REFCOUNT_ONE;
2662 
2663  if (strategy == NULL)
2664  {
2665  /* Default case: increase usagecount unless already max. */
2666  if (BUF_STATE_GET_USAGECOUNT(buf_state) != BM_MAX_USAGE_COUNT)
2667  buf_state += BUF_USAGECOUNT_ONE;
2668  }
2669  else
2670  {
2671  /*
2672  * Ring buffers shouldn't evict others from pool. Thus we
2673  * don't make usagecount more than 1.
2674  */
2675  if (BUF_STATE_GET_USAGECOUNT(buf_state) == 0)
2676  buf_state += BUF_USAGECOUNT_ONE;
2677  }
2678 
2679  if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state,
2680  buf_state))
2681  {
2682  result = (buf_state & BM_VALID) != 0;
2683 
2684  /*
2685  * Assume that we acquired a buffer pin for the purposes of
2686  * Valgrind buffer client checks (even in !result case) to
2687  * keep things simple. Buffers that are unsafe to access are
2688  * not generally guaranteed to be marked undefined or
2689  * non-accessible in any case.
2690  */
2691  VALGRIND_MAKE_MEM_DEFINED(BufHdrGetBlock(buf), BLCKSZ);
2692  break;
2693  }
2694  }
2695  }
2696  else
2697  {
2698  /*
2699  * If we previously pinned the buffer, it is likely to be valid, but
2700  * it may not be if StartReadBuffers() was called and
2701  * WaitReadBuffers() hasn't been called yet. We'll check by loading
2702  * the flags without locking. This is racy, but it's OK to return
2703  * false spuriously: when WaitReadBuffers() calls StartBufferIO(),
2704  * it'll see that it's now valid.
2705  *
2706  * Note: We deliberately avoid a Valgrind client request here.
2707  * Individual access methods can optionally superimpose buffer page
2708  * client requests on top of our client requests to enforce that
2709  * buffers are only accessed while locked (and pinned). It's possible
2710  * that the buffer page is legitimately non-accessible here. We
2711  * cannot meddle with that.
2712  */
2713  result = (pg_atomic_read_u32(&buf->state) & BM_VALID) != 0;
2714  }
2715 
2716  ref->refcount++;
2717  Assert(ref->refcount > 0);
2718  ResourceOwnerRememberBuffer(CurrentResourceOwner, b);
2719  return result;
2720 }
#define BM_MAX_USAGE_COUNT
Definition: buf_internals.h:78
#define BUF_REFCOUNT_ONE
Definition: buf_internals.h:43
#define BUF_STATE_GET_USAGECOUNT(state)
Definition: buf_internals.h:52
static PrivateRefCountEntry * NewPrivateRefCountEntry(Buffer buffer)
Definition: bufmgr.c:316
#define VALGRIND_MAKE_MEM_DEFINED(addr, size)
Definition: memdebug.h:26

References Assert, b, BM_LOCKED, BM_MAX_USAGE_COUNT, BM_VALID, buf, BUF_REFCOUNT_ONE, BUF_STATE_GET_USAGECOUNT, BUF_USAGECOUNT_ONE, BufferDescriptorGetBuffer(), BufferIsLocal, BufHdrGetBlock, CurrentResourceOwner, GetPrivateRefCountEntry(), NewPrivateRefCountEntry(), pg_atomic_compare_exchange_u32(), pg_atomic_read_u32(), PrivateRefCountEntry::refcount, ReservedRefCountEntry, ResourceOwnerRememberBuffer(), VALGRIND_MAKE_MEM_DEFINED, and WaitBufHdrUnlocked().

Referenced by BufferAlloc(), ExtendBufferedRelShared(), and ReadRecentBuffer().
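
PinBuffer() works on a single packed atomic state word, so taking a pin is a plain addition of BUF_REFCOUNT_ONE and a usagecount bump an addition of BUF_USAGECOUNT_ONE. The standalone sketch below illustrates that arithmetic; the bit layout (18 refcount bits, 4 usagecount bits) is an assumption copied from buf_internals.h and the program is illustrative only.

#include <stdio.h>
#include <stdint.h>

/* Assumed layout, mirroring buf_internals.h. */
#define BUF_REFCOUNT_ONE        1u
#define BUF_REFCOUNT_MASK       ((1u << 18) - 1)
#define BUF_USAGECOUNT_SHIFT    18
#define BUF_USAGECOUNT_ONE      (1u << BUF_USAGECOUNT_SHIFT)
#define BUF_USAGECOUNT_MASK     (0xFu << BUF_USAGECOUNT_SHIFT)

#define BUF_STATE_GET_REFCOUNT(state)   ((state) & BUF_REFCOUNT_MASK)
#define BUF_STATE_GET_USAGECOUNT(state) (((state) & BUF_USAGECOUNT_MASK) >> BUF_USAGECOUNT_SHIFT)

int
main(void)
{
    uint32_t    buf_state = 0;

    /* Default PinBuffer() path: one pin, one usagecount bump. */
    buf_state += BUF_REFCOUNT_ONE;
    buf_state += BUF_USAGECOUNT_ONE;

    /* prints: refcount=1 usagecount=1 */
    printf("refcount=%u usagecount=%u\n",
           BUF_STATE_GET_REFCOUNT(buf_state),
           BUF_STATE_GET_USAGECOUNT(buf_state));
    return 0;
}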

◆ PinBuffer_Locked()

static void PinBuffer_Locked ( BufferDesc buf)
static

Definition at line 2745 of file bufmgr.c.

2746 {
2747  Buffer b;
2748  PrivateRefCountEntry *ref;
2749  uint32 buf_state;
2750 
2751  /*
2752  * As explained, We don't expect any preexisting pins. That allows us to
2753  * manipulate the PrivateRefCount after releasing the spinlock
2754  */
2755  Assert(GetPrivateRefCountEntry(BufferDescriptorGetBuffer(buf), false) == NULL);
2756 
2757  /*
2758  * Buffer can't have a preexisting pin, so mark its page as defined to
2759  * Valgrind (this is similar to the PinBuffer() case where the backend
2760  * doesn't already have a buffer pin)
2761  */
2762  VALGRIND_MAKE_MEM_DEFINED(BufHdrGetBlock(buf), BLCKSZ);
2763 
2764  /*
2765  * Since we hold the buffer spinlock, we can update the buffer state and
2766  * release the lock in one operation.
2767  */
2768  buf_state = pg_atomic_read_u32(&buf->state);
2769  Assert(buf_state & BM_LOCKED);
2770  buf_state += BUF_REFCOUNT_ONE;
2771  UnlockBufHdr(buf, buf_state);
2772 
2773  b = BufferDescriptorGetBuffer(buf);
2774 
2775  ref = NewPrivateRefCountEntry(b);
2776  ref->refcount++;
2777 
2778  ResourceOwnerRememberBuffer(CurrentResourceOwner, b);
2779  }

References Assert, b, BM_LOCKED, buf, BUF_REFCOUNT_ONE, BufferDescriptorGetBuffer(), BufHdrGetBlock, CurrentResourceOwner, GetPrivateRefCountEntry(), NewPrivateRefCountEntry(), pg_atomic_read_u32(), PrivateRefCountEntry::refcount, ResourceOwnerRememberBuffer(), UnlockBufHdr(), and VALGRIND_MAKE_MEM_DEFINED.

Referenced by EvictUnpinnedBuffer(), FlushDatabaseBuffers(), FlushRelationBuffers(), FlushRelationsAllBuffers(), GetVictimBuffer(), ReadRecentBuffer(), and SyncOneBuffer().
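
Callers of PinBuffer_Locked() follow a reserve/lock/pin sequence because the function is entered with the buffer header spinlock already held and releases it itself via UnlockBufHdr(). The fragment below is a hedged sketch loosely modeled on SyncOneBuffer(), not verbatim bufmgr.c code; error handling and the skip conditions are omitted.

/* Illustrative fragment, loosely after SyncOneBuffer(). */
ReservePrivateRefCountEntry();      /* resources reserved before the spinlock */
ResourceOwnerEnlarge(CurrentResourceOwner);

buf_state = LockBufHdr(bufHdr);

if (!(buf_state & BM_VALID) || !(buf_state & BM_DIRTY))
{
    /* nothing to write: just drop the header spinlock */
    UnlockBufHdr(bufHdr, buf_state);
}
else
{
    PinBuffer_Locked(bufHdr);       /* pins and drops the header spinlock */
    LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
    FlushBuffer(bufHdr, NULL, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
    LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
    UnpinBuffer(bufHdr);
}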

◆ PinBufferForBlock()

static pg_attribute_always_inline Buffer PinBufferForBlock ( Relation  rel,
SMgrRelation  smgr,
char  persistence,
ForkNumber  forkNum,
BlockNumber  blockNum,
BufferAccessStrategy  strategy,
bool *  foundPtr 
)
static

Definition at line 1106 of file bufmgr.c.

1113 {
1114  BufferDesc *bufHdr;
1115  IOContext io_context;
1116  IOObject io_object;
1117 
1118  Assert(blockNum != P_NEW);
1119 
1120  /* Persistence should be set before */
1121  Assert((persistence == RELPERSISTENCE_TEMP ||
1122  persistence == RELPERSISTENCE_PERMANENT ||
1123  persistence == RELPERSISTENCE_UNLOGGED));
1124 
1125  if (persistence == RELPERSISTENCE_TEMP)
1126  {
1127  io_context = IOCONTEXT_NORMAL;
1128  io_object = IOOBJECT_TEMP_RELATION;
1129  }
1130  else
1131  {
1132  io_context = IOContextForStrategy(strategy);
1133  io_object = IOOBJECT_RELATION;
1134  }
1135 
1136  TRACE_POSTGRESQL_BUFFER_READ_START(forkNum, blockNum,
1137  smgr->smgr_rlocator.locator.spcOid,
1138  smgr->smgr_rlocator.locator.dbOid,
1139  smgr->smgr_rlocator.locator.relNumber,
1140  smgr->smgr_rlocator.backend);
1141 
1142  if (persistence == RELPERSISTENCE_TEMP)
1143  {
1144  bufHdr = LocalBufferAlloc(smgr, forkNum, blockNum, foundPtr);
1145  if (*foundPtr)
1146  pgBufferUsage.local_blks_hit++;
1147  }
1148  else
1149  {
1150  bufHdr = BufferAlloc(smgr, persistence, forkNum, blockNum,
1151  strategy, foundPtr, io_context);
1152  if (*foundPtr)
1153  pgBufferUsage.shared_blks_hit++;
1154  }
1155  if (rel)
1156  {
1157  /*
1158  * While pgBufferUsage's "read" counter isn't bumped unless we reach
1159  * WaitReadBuffers() (so, not for hits, and not for buffers that are
1160  * zeroed instead), the per-relation stats always count them.
1161  */
1162  pgstat_count_buffer_read(rel);
1163  if (*foundPtr)
1164  pgstat_count_buffer_hit(rel);
1165  }
1166  if (*foundPtr)
1167  {
1168  pgstat_count_io_op(io_object, io_context, IOOP_HIT);
1169  if (VacuumCostActive)
1170  VacuumCostBalance += VacuumCostPageHit;
1171 
1172  TRACE_POSTGRESQL_BUFFER_READ_DONE(forkNum, blockNum,
1173  smgr->smgr_rlocator.locator.spcOid,
1174  smgr->smgr_rlocator.locator.dbOid,
1175  smgr->smgr_rlocator.locator.relNumber,
1176  smgr->smgr_rlocator.backend,
1177  true);
1178  }
1179 
1180  return BufferDescriptorGetBuffer(bufHdr);
1181 }
static BufferDesc * BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, BlockNumber blockNum, BufferAccessStrategy strategy, bool *foundPtr, IOContext io_context)
Definition: bufmgr.c:1588
#define P_NEW
Definition: bufmgr.h:184
int VacuumCostPageHit
Definition: globals.c:150
BufferDesc * LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum, bool *foundPtr)
Definition: localbuf.c:116
IOObject
Definition: pgstat.h:311
#define pgstat_count_buffer_read(rel)
Definition: pgstat.h:668
@ IOOP_HIT
Definition: pgstat.h:333
#define pgstat_count_buffer_hit(rel)
Definition: pgstat.h:673
int64 local_blks_hit
Definition: instrument.h:30
int64 shared_blks_hit
Definition: instrument.h:26