PostgreSQL Source Code  git master
bufmgr.c File Reference
#include "postgres.h"
#include <sys/file.h>
#include <unistd.h>
#include "access/tableam.h"
#include "access/xloginsert.h"
#include "access/xlogutils.h"
#include "catalog/storage.h"
#include "catalog/storage_xlog.h"
#include "executor/instrument.h"
#include "lib/binaryheap.h"
#include "miscadmin.h"
#include "pg_trace.h"
#include "pgstat.h"
#include "postmaster/bgwriter.h"
#include "storage/buf_internals.h"
#include "storage/bufmgr.h"
#include "storage/fd.h"
#include "storage/ipc.h"
#include "storage/lmgr.h"
#include "storage/proc.h"
#include "storage/smgr.h"
#include "storage/standby.h"
#include "utils/memdebug.h"
#include "utils/ps_status.h"
#include "utils/rel.h"
#include "utils/resowner.h"
#include "utils/timestamp.h"
#include "lib/sort_template.h"

Go to the source code of this file.

Data Structures

struct  PrivateRefCountEntry
 
struct  CkptTsStatus
 
struct  SMgrSortArray
 

Macros

#define BufHdrGetBlock(bufHdr)   ((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ))
 
#define BufferGetLSN(bufHdr)   (PageGetLSN(BufHdrGetBlock(bufHdr)))
 
#define LocalBufHdrGetBlock(bufHdr)    LocalBufferBlockPointers[-((bufHdr)->buf_id + 2)]
 
#define BUF_WRITTEN   0x01
 
#define BUF_REUSABLE   0x02
 
#define RELS_BSEARCH_THRESHOLD   20
 
#define BUF_DROP_FULL_SCAN_THRESHOLD   (uint64) (NBuffers / 32)
 
#define REFCOUNT_ARRAY_ENTRIES   8
 
#define BufferIsPinned(bufnum)
 
#define ST_SORT   sort_checkpoint_bufferids
 
#define ST_ELEMENT_TYPE   CkptSortItem
 
#define ST_COMPARE(a, b)   ckpt_buforder_comparator(a, b)
 
#define ST_SCOPE   static
 
#define ST_DEFINE
 
#define ST_SORT   sort_pending_writebacks
 
#define ST_ELEMENT_TYPE   PendingWriteback
 
#define ST_COMPARE(a, b)   buffertag_comparator(&a->tag, &b->tag)
 
#define ST_SCOPE   static
 
#define ST_DEFINE
 

Typedefs

typedef struct PrivateRefCountEntry PrivateRefCountEntry
 
typedef struct CkptTsStatus CkptTsStatus
 
typedef struct SMgrSortArray SMgrSortArray
 

Functions

static void ReservePrivateRefCountEntry (void)
 
static PrivateRefCountEntry * NewPrivateRefCountEntry (Buffer buffer)
 
static PrivateRefCountEntry * GetPrivateRefCountEntry (Buffer buffer, bool do_move)
 
static int32 GetPrivateRefCount (Buffer buffer)
 
static void ForgetPrivateRefCountEntry (PrivateRefCountEntry *ref)
 
static void ResOwnerReleaseBufferIO (Datum res)
 
static char * ResOwnerPrintBufferIO (Datum res)
 
static void ResOwnerReleaseBufferPin (Datum res)
 
static char * ResOwnerPrintBufferPin (Datum res)
 
static Buffer ReadBuffer_common (Relation rel, SMgrRelation smgr, char smgr_persistence, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy)
 
static BlockNumber ExtendBufferedRelCommon (BufferManagerRelation bmr, ForkNumber fork, BufferAccessStrategy strategy, uint32 flags, uint32 extend_by, BlockNumber extend_upto, Buffer *buffers, uint32 *extended_by)
 
static BlockNumber ExtendBufferedRelShared (BufferManagerRelation bmr, ForkNumber fork, BufferAccessStrategy strategy, uint32 flags, uint32 extend_by, BlockNumber extend_upto, Buffer *buffers, uint32 *extended_by)
 
static bool PinBuffer (BufferDesc *buf, BufferAccessStrategy strategy)
 
static void PinBuffer_Locked (BufferDesc *buf)
 
static void UnpinBuffer (BufferDesc *buf)
 
static void UnpinBufferNoOwner (BufferDesc *buf)
 
static void BufferSync (int flags)
 
static uint32 WaitBufHdrUnlocked (BufferDesc *buf)
 
static int SyncOneBuffer (int buf_id, bool skip_recently_used, WritebackContext *wb_context)
 
static void WaitIO (BufferDesc *buf)
 
static bool StartBufferIO (BufferDesc *buf, bool forInput, bool nowait)
 
static void TerminateBufferIO (BufferDesc *buf, bool clear_dirty, uint32 set_flag_bits, bool forget_owner)
 
static void AbortBufferIO (Buffer buffer)
 
static void shared_buffer_write_error_callback (void *arg)
 
static void local_buffer_write_error_callback (void *arg)
 
static BufferDesc * BufferAlloc (SMgrRelation smgr, char relpersistence, ForkNumber forkNum, BlockNumber blockNum, BufferAccessStrategy strategy, bool *foundPtr, IOContext io_context)
 
static Buffer GetVictimBuffer (BufferAccessStrategy strategy, IOContext io_context)
 
static void FlushBuffer (BufferDesc *buf, SMgrRelation reln, IOObject io_object, IOContext io_context)
 
static void FindAndDropRelationBuffers (RelFileLocator rlocator, ForkNumber forkNum, BlockNumber nForkBlock, BlockNumber firstDelBlock)
 
static void RelationCopyStorageUsingBuffer (RelFileLocator srclocator, RelFileLocator dstlocator, ForkNumber forkNum, bool permanent)
 
static void AtProcExit_Buffers (int code, Datum arg)
 
static void CheckForBufferLeaks (void)
 
static int rlocator_comparator (const void *p1, const void *p2)
 
static int buffertag_comparator (const BufferTag *ba, const BufferTag *bb)
 
static int ckpt_buforder_comparator (const CkptSortItem *a, const CkptSortItem *b)
 
static int ts_ckpt_progress_comparator (Datum a, Datum b, void *arg)
 
PrefetchBufferResult PrefetchSharedBuffer (SMgrRelation smgr_reln, ForkNumber forkNum, BlockNumber blockNum)
 
PrefetchBufferResult PrefetchBuffer (Relation reln, ForkNumber forkNum, BlockNumber blockNum)
 
bool ReadRecentBuffer (RelFileLocator rlocator, ForkNumber forkNum, BlockNumber blockNum, Buffer recent_buffer)
 
Buffer ReadBuffer (Relation reln, BlockNumber blockNum)
 
Buffer ReadBufferExtended (Relation reln, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy)
 
Buffer ReadBufferWithoutRelcache (RelFileLocator rlocator, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy, bool permanent)
 
Buffer ExtendBufferedRel (BufferManagerRelation bmr, ForkNumber forkNum, BufferAccessStrategy strategy, uint32 flags)
 
BlockNumber ExtendBufferedRelBy (BufferManagerRelation bmr, ForkNumber fork, BufferAccessStrategy strategy, uint32 flags, uint32 extend_by, Buffer *buffers, uint32 *extended_by)
 
Buffer ExtendBufferedRelTo (BufferManagerRelation bmr, ForkNumber fork, BufferAccessStrategy strategy, uint32 flags, BlockNumber extend_to, ReadBufferMode mode)
 
static void ZeroAndLockBuffer (Buffer buffer, ReadBufferMode mode, bool already_valid)
 
static pg_attribute_always_inline Buffer PinBufferForBlock (Relation rel, SMgrRelation smgr, char smgr_persistence, ForkNumber forkNum, BlockNumber blockNum, BufferAccessStrategy strategy, bool *foundPtr)
 
static pg_attribute_always_inline bool StartReadBuffersImpl (ReadBuffersOperation *operation, Buffer *buffers, BlockNumber blockNum, int *nblocks, int flags)
 
bool StartReadBuffers (ReadBuffersOperation *operation, Buffer *buffers, BlockNumber blockNum, int *nblocks, int flags)
 
bool StartReadBuffer (ReadBuffersOperation *operation, Buffer *buffer, BlockNumber blocknum, int flags)
 
static bool WaitReadBuffersCanStartIO (Buffer buffer, bool nowait)
 
void WaitReadBuffers (ReadBuffersOperation *operation)
 
static void InvalidateBuffer (BufferDesc *buf)
 
static bool InvalidateVictimBuffer (BufferDesc *buf_hdr)
 
void LimitAdditionalPins (uint32 *additional_pins)
 
bool BufferIsExclusiveLocked (Buffer buffer)
 
bool BufferIsDirty (Buffer buffer)
 
void MarkBufferDirty (Buffer buffer)
 
Buffer ReleaseAndReadBuffer (Buffer buffer, Relation relation, BlockNumber blockNum)
 
bool BgBufferSync (WritebackContext *wb_context)
 
void AtEOXact_Buffers (bool isCommit)
 
void InitBufferPoolAccess (void)
 
char * DebugPrintBufferRefcount (Buffer buffer)
 
void CheckPointBuffers (int flags)
 
BlockNumber BufferGetBlockNumber (Buffer buffer)
 
void BufferGetTag (Buffer buffer, RelFileLocator *rlocator, ForkNumber *forknum, BlockNumber *blknum)
 
BlockNumber RelationGetNumberOfBlocksInFork (Relation relation, ForkNumber forkNum)
 
bool BufferIsPermanent (Buffer buffer)
 
XLogRecPtr BufferGetLSNAtomic (Buffer buffer)
 
void DropRelationBuffers (SMgrRelation smgr_reln, ForkNumber *forkNum, int nforks, BlockNumber *firstDelBlock)
 
void DropRelationsAllBuffers (SMgrRelation *smgr_reln, int nlocators)
 
void DropDatabaseBuffers (Oid dbid)
 
void FlushRelationBuffers (Relation rel)
 
void FlushRelationsAllBuffers (SMgrRelation *smgrs, int nrels)
 
void CreateAndCopyRelationData (RelFileLocator src_rlocator, RelFileLocator dst_rlocator, bool permanent)
 
void FlushDatabaseBuffers (Oid dbid)
 
void FlushOneBuffer (Buffer buffer)
 
void ReleaseBuffer (Buffer buffer)
 
void UnlockReleaseBuffer (Buffer buffer)
 
void IncrBufferRefCount (Buffer buffer)
 
void MarkBufferDirtyHint (Buffer buffer, bool buffer_std)
 
void UnlockBuffers (void)
 
void LockBuffer (Buffer buffer, int mode)
 
bool ConditionalLockBuffer (Buffer buffer)
 
void CheckBufferIsPinnedOnce (Buffer buffer)
 
void LockBufferForCleanup (Buffer buffer)
 
bool HoldingBufferPinThatDelaysRecovery (void)
 
bool ConditionalLockBufferForCleanup (Buffer buffer)
 
bool IsBufferCleanupOK (Buffer buffer)
 
uint32 LockBufHdr (BufferDesc *desc)
 
void WritebackContextInit (WritebackContext *context, int *max_pending)
 
void ScheduleBufferTagForWriteback (WritebackContext *wb_context, IOContext io_context, BufferTag *tag)
 
void IssuePendingWritebacks (WritebackContext *wb_context, IOContext io_context)
 
bool EvictUnpinnedBuffer (Buffer buf)
 

Variables

bool zero_damaged_pages = false
 
int bgwriter_lru_maxpages = 100
 
double bgwriter_lru_multiplier = 2.0
 
bool track_io_timing = false
 
int effective_io_concurrency = DEFAULT_EFFECTIVE_IO_CONCURRENCY
 
int maintenance_io_concurrency = DEFAULT_MAINTENANCE_IO_CONCURRENCY
 
int io_combine_limit = DEFAULT_IO_COMBINE_LIMIT
 
int checkpoint_flush_after = DEFAULT_CHECKPOINT_FLUSH_AFTER
 
int bgwriter_flush_after = DEFAULT_BGWRITER_FLUSH_AFTER
 
int backend_flush_after = DEFAULT_BACKEND_FLUSH_AFTER
 
static BufferDesc * PinCountWaitBuf = NULL
 
static struct PrivateRefCountEntry PrivateRefCountArray [REFCOUNT_ARRAY_ENTRIES]
 
static HTAB * PrivateRefCountHash = NULL
 
static int32 PrivateRefCountOverflowed = 0
 
static uint32 PrivateRefCountClock = 0
 
static PrivateRefCountEntry * ReservedRefCountEntry = NULL
 
const ResourceOwnerDesc buffer_io_resowner_desc
 
const ResourceOwnerDesc buffer_pin_resowner_desc
 

Macro Definition Documentation

◆ BUF_DROP_FULL_SCAN_THRESHOLD

#define BUF_DROP_FULL_SCAN_THRESHOLD   (uint64) (NBuffers / 32)

Definition at line 86 of file bufmgr.c.

◆ BUF_REUSABLE

#define BUF_REUSABLE   0x02

Definition at line 76 of file bufmgr.c.

◆ BUF_WRITTEN

#define BUF_WRITTEN   0x01

Definition at line 75 of file bufmgr.c.

◆ BufferGetLSN

#define BufferGetLSN (   bufHdr)    (PageGetLSN(BufHdrGetBlock(bufHdr)))

Definition at line 68 of file bufmgr.c.

◆ BufferIsPinned

#define BufferIsPinned (   bufnum)
Value:
( \
!BufferIsValid(bufnum) ? \
false \
: \
BufferIsLocal(bufnum) ? \
(LocalRefCount[-(bufnum) - 1] > 0) \
: \
(GetPrivateRefCount(bufnum) > 0) \
)
static int32 GetPrivateRefCount(Buffer buffer)
Definition: bufmgr.c:415
static bool BufferIsValid(Buffer bufnum)
Definition: bufmgr.h:355
int32 * LocalRefCount
Definition: localbuf.c:46

Definition at line 473 of file bufmgr.c.
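
Note that BufferIsPinned() only consults this backend's own pin bookkeeping (LocalRefCount for local buffers, the private refcount entries for shared ones), so within bufmgr.c it is used almost exclusively inside assertions. A hypothetical caller-side sketch (the buffer and blkno variables are assumed):

/* Sketch: code that reads a buffer's tag asserts the pin first, because a
 * pinned buffer cannot be evicted or retagged underneath us. */
Assert(BufferIsPinned(buffer));
blkno = BufferGetBlockNumber(buffer);   /* tag is stable while the pin is held */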

◆ BufHdrGetBlock

#define BufHdrGetBlock (   bufHdr)    ((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ))

Definition at line 67 of file bufmgr.c.

◆ LocalBufHdrGetBlock

#define LocalBufHdrGetBlock (   bufHdr)     LocalBufferBlockPointers[-((bufHdr)->buf_id + 2)]

Definition at line 71 of file bufmgr.c.

◆ REFCOUNT_ARRAY_ENTRIES

#define REFCOUNT_ARRAY_ENTRIES   8

Definition at line 95 of file bufmgr.c.

◆ RELS_BSEARCH_THRESHOLD

#define RELS_BSEARCH_THRESHOLD   20

Definition at line 78 of file bufmgr.c.

◆ ST_COMPARE [1/2]

#define ST_COMPARE (   a,
  b 
)    ckpt_buforder_comparator(a, b)

Definition at line 5920 of file bufmgr.c.

◆ ST_COMPARE [2/2]

#define ST_COMPARE (   a,
  b 
)    buffertag_comparator(&a->tag, &b->tag)

Definition at line 5920 of file bufmgr.c.

◆ ST_DEFINE [1/2]

#define ST_DEFINE

Definition at line 5922 of file bufmgr.c.

◆ ST_DEFINE [2/2]

#define ST_DEFINE

Definition at line 5922 of file bufmgr.c.

◆ ST_ELEMENT_TYPE [1/2]

#define ST_ELEMENT_TYPE   CkptSortItem

Definition at line 5919 of file bufmgr.c.

◆ ST_ELEMENT_TYPE [2/2]

#define ST_ELEMENT_TYPE   PendingWriteback

Definition at line 5919 of file bufmgr.c.

◆ ST_SCOPE [1/2]

#define ST_SCOPE   static

Definition at line 5921 of file bufmgr.c.

◆ ST_SCOPE [2/2]

#define ST_SCOPE   static

Definition at line 5921 of file bufmgr.c.

◆ ST_SORT [1/2]

#define ST_SORT   sort_checkpoint_bufferids

Definition at line 5918 of file bufmgr.c.

◆ ST_SORT [2/2]

#define ST_SORT   sort_pending_writebacks

Definition at line 5918 of file bufmgr.c.
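
These two groups of ST_* macros parameterize lib/sort_template.h, which bufmgr.c includes once per specialization to generate a dedicated sort routine. A condensed sketch of the first specialization, using the macro values listed above (the generated signature follows the sort_template.h convention):

/* Condensed sketch of how bufmgr.c instantiates a specialized sort. */
#define ST_SORT sort_checkpoint_bufferids
#define ST_ELEMENT_TYPE CkptSortItem
#define ST_COMPARE(a, b) ckpt_buforder_comparator(a, b)
#define ST_SCOPE static
#define ST_DEFINE
#include "lib/sort_template.h"

/*
 * The include above expands to roughly:
 *   static void sort_checkpoint_bufferids(CkptSortItem *first, size_t n);
 * which BufferSync() then calls as
 *   sort_checkpoint_bufferids(CkptBufferIds, num_to_scan);
 */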

Typedef Documentation

◆ CkptTsStatus

typedef struct CkptTsStatus CkptTsStatus

◆ PrivateRefCountEntry

typedef struct PrivateRefCountEntry PrivateRefCountEntry

◆ SMgrSortArray

typedef struct SMgrSortArray SMgrSortArray

Function Documentation

◆ AbortBufferIO()

static void AbortBufferIO ( Buffer  buffer)
static

Definition at line 5625 of file bufmgr.c.

5626 {
5627  BufferDesc *buf_hdr = GetBufferDescriptor(buffer - 1);
5628  uint32 buf_state;
5629 
5630  buf_state = LockBufHdr(buf_hdr);
5631  Assert(buf_state & (BM_IO_IN_PROGRESS | BM_TAG_VALID));
5632 
5633  if (!(buf_state & BM_VALID))
5634  {
5635  Assert(!(buf_state & BM_DIRTY));
5636  UnlockBufHdr(buf_hdr, buf_state);
5637  }
5638  else
5639  {
5640  Assert(buf_state & BM_DIRTY);
5641  UnlockBufHdr(buf_hdr, buf_state);
5642 
5643  /* Issue notice if this is not the first failure... */
5644  if (buf_state & BM_IO_ERROR)
5645  {
5646  /* Buffer is pinned, so we can read tag without spinlock */
5647  char *path;
5648 
5649  path = relpathperm(BufTagGetRelFileLocator(&buf_hdr->tag),
5650  BufTagGetForkNum(&buf_hdr->tag));
5651  ereport(WARNING,
5652  (errcode(ERRCODE_IO_ERROR),
5653  errmsg("could not write block %u of %s",
5654  buf_hdr->tag.blockNum, path),
5655  errdetail("Multiple failures --- write error might be permanent.")));
5656  pfree(path);
5657  }
5658  }
5659 
5660  TerminateBufferIO(buf_hdr, false, BM_IO_ERROR, false);
5661 }
#define BM_TAG_VALID
Definition: buf_internals.h:63
static ForkNumber BufTagGetForkNum(const BufferTag *tag)
static BufferDesc * GetBufferDescriptor(uint32 id)
static void UnlockBufHdr(BufferDesc *desc, uint32 buf_state)
#define BM_DIRTY
Definition: buf_internals.h:61
#define BM_IO_IN_PROGRESS
Definition: buf_internals.h:64
static RelFileLocator BufTagGetRelFileLocator(const BufferTag *tag)
#define BM_VALID
Definition: buf_internals.h:62
#define BM_IO_ERROR
Definition: buf_internals.h:65
static void TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint32 set_flag_bits, bool forget_owner)
Definition: bufmgr.c:5588
uint32 LockBufHdr(BufferDesc *desc)
Definition: bufmgr.c:5734
unsigned int uint32
Definition: c.h:506
#define Assert(condition)
Definition: c.h:858
int errdetail(const char *fmt,...)
Definition: elog.c:1203
int errcode(int sqlerrcode)
Definition: elog.c:857
int errmsg(const char *fmt,...)
Definition: elog.c:1070
#define WARNING
Definition: elog.h:36
#define ereport(elevel,...)
Definition: elog.h:149
void pfree(void *pointer)
Definition: mcxt.c:1520
#define relpathperm(rlocator, forknum)
Definition: relpath.h:90
BufferTag tag
BlockNumber blockNum
Definition: buf_internals.h:98

References Assert, buftag::blockNum, BM_DIRTY, BM_IO_ERROR, BM_IO_IN_PROGRESS, BM_TAG_VALID, BM_VALID, PrivateRefCountEntry::buffer, BufTagGetForkNum(), BufTagGetRelFileLocator(), ereport, errcode(), errdetail(), errmsg(), GetBufferDescriptor(), LockBufHdr(), pfree(), relpathperm, BufferDesc::tag, TerminateBufferIO(), UnlockBufHdr(), and WARNING.

Referenced by ResOwnerReleaseBufferIO().

◆ AtEOXact_Buffers()

void AtEOXact_Buffers ( bool  isCommit)

Definition at line 3548 of file bufmgr.c.

3549 {
3550  CheckForBufferLeaks();
3551 
3552  AtEOXact_LocalBuffers(isCommit);
3553 
3554  Assert(PrivateRefCountOverflowed == 0);
3555 }
static void CheckForBufferLeaks(void)
Definition: bufmgr.c:3608
static int32 PrivateRefCountOverflowed
Definition: bufmgr.c:209
void AtEOXact_LocalBuffers(bool isCommit)
Definition: localbuf.c:819

References Assert, AtEOXact_LocalBuffers(), CheckForBufferLeaks(), and PrivateRefCountOverflowed.

Referenced by AbortTransaction(), BackgroundWriterMain(), CheckpointerMain(), CommitTransaction(), PrepareTransaction(), and WalWriterMain().

◆ AtProcExit_Buffers()

static void AtProcExit_Buffers ( int  code,
Datum  arg 
)
static

Definition at line 3590 of file bufmgr.c.

3591 {
3592  UnlockBuffers();
3593 
3594  CheckForBufferLeaks();
3595 
3596  /* localbuf.c needs a chance too */
3597  AtProcExit_LocalBuffers();
3598 }
void UnlockBuffers(void)
Definition: bufmgr.c:5103
void AtProcExit_LocalBuffers(void)
Definition: localbuf.c:830

References AtProcExit_LocalBuffers(), CheckForBufferLeaks(), and UnlockBuffers().

Referenced by InitBufferPoolAccess().

◆ BgBufferSync()

bool BgBufferSync ( WritebackContext *  wb_context)

Definition at line 3177 of file bufmgr.c.

3178 {
3179  /* info obtained from freelist.c */
3180  int strategy_buf_id;
3181  uint32 strategy_passes;
3182  uint32 recent_alloc;
3183 
3184  /*
3185  * Information saved between calls so we can determine the strategy
3186  * point's advance rate and avoid scanning already-cleaned buffers.
3187  */
3188  static bool saved_info_valid = false;
3189  static int prev_strategy_buf_id;
3190  static uint32 prev_strategy_passes;
3191  static int next_to_clean;
3192  static uint32 next_passes;
3193 
3194  /* Moving averages of allocation rate and clean-buffer density */
3195  static float smoothed_alloc = 0;
3196  static float smoothed_density = 10.0;
3197 
3198  /* Potentially these could be tunables, but for now, not */
3199  float smoothing_samples = 16;
3200  float scan_whole_pool_milliseconds = 120000.0;
3201 
3202  /* Used to compute how far we scan ahead */
3203  long strategy_delta;
3204  int bufs_to_lap;
3205  int bufs_ahead;
3206  float scans_per_alloc;
3207  int reusable_buffers_est;
3208  int upcoming_alloc_est;
3209  int min_scan_buffers;
3210 
3211  /* Variables for the scanning loop proper */
3212  int num_to_scan;
3213  int num_written;
3214  int reusable_buffers;
3215 
3216  /* Variables for final smoothed_density update */
3217  long new_strategy_delta;
3218  uint32 new_recent_alloc;
3219 
3220  /*
3221  * Find out where the freelist clock sweep currently is, and how many
3222  * buffer allocations have happened since our last call.
3223  */
3224  strategy_buf_id = StrategySyncStart(&strategy_passes, &recent_alloc);
3225 
3226  /* Report buffer alloc counts to pgstat */
3227  PendingBgWriterStats.buf_alloc += recent_alloc;
3228 
3229  /*
3230  * If we're not running the LRU scan, just stop after doing the stats
3231  * stuff. We mark the saved state invalid so that we can recover sanely
3232  * if LRU scan is turned back on later.
3233  */
3234  if (bgwriter_lru_maxpages <= 0)
3235  {
3236  saved_info_valid = false;
3237  return true;
3238  }
3239 
3240  /*
3241  * Compute strategy_delta = how many buffers have been scanned by the
3242  * clock sweep since last time. If first time through, assume none. Then
3243  * see if we are still ahead of the clock sweep, and if so, how many
3244  * buffers we could scan before we'd catch up with it and "lap" it. Note:
3245  * weird-looking coding of xxx_passes comparisons are to avoid bogus
3246  * behavior when the passes counts wrap around.
3247  */
3248  if (saved_info_valid)
3249  {
3250  int32 passes_delta = strategy_passes - prev_strategy_passes;
3251 
3252  strategy_delta = strategy_buf_id - prev_strategy_buf_id;
3253  strategy_delta += (long) passes_delta * NBuffers;
3254 
3255  Assert(strategy_delta >= 0);
3256 
3257  if ((int32) (next_passes - strategy_passes) > 0)
3258  {
3259  /* we're one pass ahead of the strategy point */
3260  bufs_to_lap = strategy_buf_id - next_to_clean;
3261 #ifdef BGW_DEBUG
3262  elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
3263  next_passes, next_to_clean,
3264  strategy_passes, strategy_buf_id,
3265  strategy_delta, bufs_to_lap);
3266 #endif
3267  }
3268  else if (next_passes == strategy_passes &&
3269  next_to_clean >= strategy_buf_id)
3270  {
3271  /* on same pass, but ahead or at least not behind */
3272  bufs_to_lap = NBuffers - (next_to_clean - strategy_buf_id);
3273 #ifdef BGW_DEBUG
3274  elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
3275  next_passes, next_to_clean,
3276  strategy_passes, strategy_buf_id,
3277  strategy_delta, bufs_to_lap);
3278 #endif
3279  }
3280  else
3281  {
3282  /*
3283  * We're behind, so skip forward to the strategy point and start
3284  * cleaning from there.
3285  */
3286 #ifdef BGW_DEBUG
3287  elog(DEBUG2, "bgwriter behind: bgw %u-%u strategy %u-%u delta=%ld",
3288  next_passes, next_to_clean,
3289  strategy_passes, strategy_buf_id,
3290  strategy_delta);
3291 #endif
3292  next_to_clean = strategy_buf_id;
3293  next_passes = strategy_passes;
3294  bufs_to_lap = NBuffers;
3295  }
3296  }
3297  else
3298  {
3299  /*
3300  * Initializing at startup or after LRU scanning had been off. Always
3301  * start at the strategy point.
3302  */
3303 #ifdef BGW_DEBUG
3304  elog(DEBUG2, "bgwriter initializing: strategy %u-%u",
3305  strategy_passes, strategy_buf_id);
3306 #endif
3307  strategy_delta = 0;
3308  next_to_clean = strategy_buf_id;
3309  next_passes = strategy_passes;
3310  bufs_to_lap = NBuffers;
3311  }
3312 
3313  /* Update saved info for next time */
3314  prev_strategy_buf_id = strategy_buf_id;
3315  prev_strategy_passes = strategy_passes;
3316  saved_info_valid = true;
3317 
3318  /*
3319  * Compute how many buffers had to be scanned for each new allocation, ie,
3320  * 1/density of reusable buffers, and track a moving average of that.
3321  *
3322  * If the strategy point didn't move, we don't update the density estimate
3323  */
3324  if (strategy_delta > 0 && recent_alloc > 0)
3325  {
3326  scans_per_alloc = (float) strategy_delta / (float) recent_alloc;
3327  smoothed_density += (scans_per_alloc - smoothed_density) /
3328  smoothing_samples;
3329  }
3330 
3331  /*
3332  * Estimate how many reusable buffers there are between the current
3333  * strategy point and where we've scanned ahead to, based on the smoothed
3334  * density estimate.
3335  */
3336  bufs_ahead = NBuffers - bufs_to_lap;
3337  reusable_buffers_est = (float) bufs_ahead / smoothed_density;
3338 
3339  /*
3340  * Track a moving average of recent buffer allocations. Here, rather than
3341  * a true average we want a fast-attack, slow-decline behavior: we
3342  * immediately follow any increase.
3343  */
3344  if (smoothed_alloc <= (float) recent_alloc)
3345  smoothed_alloc = recent_alloc;
3346  else
3347  smoothed_alloc += ((float) recent_alloc - smoothed_alloc) /
3348  smoothing_samples;
3349 
3350  /* Scale the estimate by a GUC to allow more aggressive tuning. */
3351  upcoming_alloc_est = (int) (smoothed_alloc * bgwriter_lru_multiplier);
3352 
3353  /*
3354  * If recent_alloc remains at zero for many cycles, smoothed_alloc will
3355  * eventually underflow to zero, and the underflows produce annoying
3356  * kernel warnings on some platforms. Once upcoming_alloc_est has gone to
3357  * zero, there's no point in tracking smaller and smaller values of
3358  * smoothed_alloc, so just reset it to exactly zero to avoid this
3359  * syndrome. It will pop back up as soon as recent_alloc increases.
3360  */
3361  if (upcoming_alloc_est == 0)
3362  smoothed_alloc = 0;
3363 
3364  /*
3365  * Even in cases where there's been little or no buffer allocation
3366  * activity, we want to make a small amount of progress through the buffer
3367  * cache so that as many reusable buffers as possible are clean after an
3368  * idle period.
3369  *
3370  * (scan_whole_pool_milliseconds / BgWriterDelay) computes how many times
3371  * the BGW will be called during the scan_whole_pool time; slice the
3372  * buffer pool into that many sections.
3373  */
3374  min_scan_buffers = (int) (NBuffers / (scan_whole_pool_milliseconds / BgWriterDelay));
3375 
3376  if (upcoming_alloc_est < (min_scan_buffers + reusable_buffers_est))
3377  {
3378 #ifdef BGW_DEBUG
3379  elog(DEBUG2, "bgwriter: alloc_est=%d too small, using min=%d + reusable_est=%d",
3380  upcoming_alloc_est, min_scan_buffers, reusable_buffers_est);
3381 #endif
3382  upcoming_alloc_est = min_scan_buffers + reusable_buffers_est;
3383  }
3384 
3385  /*
3386  * Now write out dirty reusable buffers, working forward from the
3387  * next_to_clean point, until we have lapped the strategy scan, or cleaned
3388  * enough buffers to match our estimate of the next cycle's allocation
3389  * requirements, or hit the bgwriter_lru_maxpages limit.
3390  */
3391 
3392  num_to_scan = bufs_to_lap;
3393  num_written = 0;
3394  reusable_buffers = reusable_buffers_est;
3395 
3396  /* Execute the LRU scan */
3397  while (num_to_scan > 0 && reusable_buffers < upcoming_alloc_est)
3398  {
3399  int sync_state = SyncOneBuffer(next_to_clean, true,
3400  wb_context);
3401 
3402  if (++next_to_clean >= NBuffers)
3403  {
3404  next_to_clean = 0;
3405  next_passes++;
3406  }
3407  num_to_scan--;
3408 
3409  if (sync_state & BUF_WRITTEN)
3410  {
3411  reusable_buffers++;
3412  if (++num_written >= bgwriter_lru_maxpages)
3413  {
3414  PendingBgWriterStats.maxwritten_clean++;
3415  break;
3416  }
3417  }
3418  else if (sync_state & BUF_REUSABLE)
3419  reusable_buffers++;
3420  }
3421 
3422  PendingBgWriterStats.buf_written_clean += num_written;
3423 
3424 #ifdef BGW_DEBUG
3425  elog(DEBUG1, "bgwriter: recent_alloc=%u smoothed=%.2f delta=%ld ahead=%d density=%.2f reusable_est=%d upcoming_est=%d scanned=%d wrote=%d reusable=%d",
3426  recent_alloc, smoothed_alloc, strategy_delta, bufs_ahead,
3427  smoothed_density, reusable_buffers_est, upcoming_alloc_est,
3428  bufs_to_lap - num_to_scan,
3429  num_written,
3430  reusable_buffers - reusable_buffers_est);
3431 #endif
3432 
3433  /*
3434  * Consider the above scan as being like a new allocation scan.
3435  * Characterize its density and update the smoothed one based on it. This
3436  * effectively halves the moving average period in cases where both the
3437  * strategy and the background writer are doing some useful scanning,
3438  * which is helpful because a long memory isn't as desirable on the
3439  * density estimates.
3440  */
3441  new_strategy_delta = bufs_to_lap - num_to_scan;
3442  new_recent_alloc = reusable_buffers - reusable_buffers_est;
3443  if (new_strategy_delta > 0 && new_recent_alloc > 0)
3444  {
3445  scans_per_alloc = (float) new_strategy_delta / (float) new_recent_alloc;
3446  smoothed_density += (scans_per_alloc - smoothed_density) /
3447  smoothing_samples;
3448 
3449 #ifdef BGW_DEBUG
3450  elog(DEBUG2, "bgwriter: cleaner density alloc=%u scan=%ld density=%.2f new smoothed=%.2f",
3451  new_recent_alloc, new_strategy_delta,
3452  scans_per_alloc, smoothed_density);
3453 #endif
3454  }
3455 
3456  /* Return true if OK to hibernate */
3457  return (bufs_to_lap == 0 && recent_alloc == 0);
3458 }
int BgWriterDelay
Definition: bgwriter.c:57
#define BUF_REUSABLE
Definition: bufmgr.c:76
double bgwriter_lru_multiplier
Definition: bufmgr.c:141
static int SyncOneBuffer(int buf_id, bool skip_recently_used, WritebackContext *wb_context)
Definition: bufmgr.c:3475
int bgwriter_lru_maxpages
Definition: bufmgr.c:140
#define BUF_WRITTEN
Definition: bufmgr.c:75
signed int int32
Definition: c.h:494
#define DEBUG2
Definition: elog.h:29
#define DEBUG1
Definition: elog.h:30
#define elog(elevel,...)
Definition: elog.h:224
int StrategySyncStart(uint32 *complete_passes, uint32 *num_buf_alloc)
Definition: freelist.c:394
int NBuffers
Definition: globals.c:139
PgStat_BgWriterStats PendingBgWriterStats
PgStat_Counter buf_written_clean
Definition: pgstat.h:255
PgStat_Counter maxwritten_clean
Definition: pgstat.h:256
PgStat_Counter buf_alloc
Definition: pgstat.h:257

References Assert, bgwriter_lru_maxpages, bgwriter_lru_multiplier, BgWriterDelay, PgStat_BgWriterStats::buf_alloc, BUF_REUSABLE, BUF_WRITTEN, PgStat_BgWriterStats::buf_written_clean, DEBUG1, DEBUG2, elog, PgStat_BgWriterStats::maxwritten_clean, NBuffers, PendingBgWriterStats, StrategySyncStart(), and SyncOneBuffer().

Referenced by BackgroundWriterMain().
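
BgBufferSync() returns true when there has been no recent allocation activity and the scan has lapped the strategy point, i.e. the background writer may hibernate. A simplified, hypothetical driver loop (the real BackgroundWriterMain() in bgwriter.c waits on a latch and handles signals and hibernation; wb_context is initialized the same way there):

/* Simplified sketch of a bgwriter-style driver loop; not the real BackgroundWriterMain(). */
WritebackContext wb_context;

WritebackContextInit(&wb_context, &bgwriter_flush_after);

for (;;)
{
    bool    can_hibernate;

    /* Clean dirty buffers just ahead of the clock sweep. */
    can_hibernate = BgBufferSync(&wb_context);

    /* The real code waits on a latch and extends the timeout when
     * can_hibernate is true; a plain sleep stands in for that here. */
    pg_usleep(BgWriterDelay * 1000L);
    (void) can_hibernate;
}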

◆ BufferAlloc()

static pg_attribute_always_inline BufferDesc * BufferAlloc ( SMgrRelation  smgr,
char  relpersistence,
ForkNumber  forkNum,
BlockNumber  blockNum,
BufferAccessStrategy  strategy,
bool *  foundPtr,
IOContext  io_context 
)
inlinestatic

Definition at line 1594 of file bufmgr.c.

1598 {
1599  BufferTag newTag; /* identity of requested block */
1600  uint32 newHash; /* hash value for newTag */
1601  LWLock *newPartitionLock; /* buffer partition lock for it */
1602  int existing_buf_id;
1603  Buffer victim_buffer;
1604  BufferDesc *victim_buf_hdr;
1605  uint32 victim_buf_state;
1606 
1607  /* Make sure we will have room to remember the buffer pin */
1608  ResourceOwnerEnlarge(CurrentResourceOwner);
1609  ReservePrivateRefCountEntry();
1610 
1611  /* create a tag so we can lookup the buffer */
1612  InitBufferTag(&newTag, &smgr->smgr_rlocator.locator, forkNum, blockNum);
1613 
1614  /* determine its hash code and partition lock ID */
1615  newHash = BufTableHashCode(&newTag);
1616  newPartitionLock = BufMappingPartitionLock(newHash);
1617 
1618  /* see if the block is in the buffer pool already */
1619  LWLockAcquire(newPartitionLock, LW_SHARED);
1620  existing_buf_id = BufTableLookup(&newTag, newHash);
1621  if (existing_buf_id >= 0)
1622  {
1623  BufferDesc *buf;
1624  bool valid;
1625 
1626  /*
1627  * Found it. Now, pin the buffer so no one can steal it from the
1628  * buffer pool, and check to see if the correct data has been loaded
1629  * into the buffer.
1630  */
1631  buf = GetBufferDescriptor(existing_buf_id);
1632 
1633  valid = PinBuffer(buf, strategy);
1634 
1635  /* Can release the mapping lock as soon as we've pinned it */
1636  LWLockRelease(newPartitionLock);
1637 
1638  *foundPtr = true;
1639 
1640  if (!valid)
1641  {
1642  /*
1643  * We can only get here if (a) someone else is still reading in
1644  * the page, (b) a previous read attempt failed, or (c) someone
1645  * called StartReadBuffers() but not yet WaitReadBuffers().
1646  */
1647  *foundPtr = false;
1648  }
1649 
1650  return buf;
1651  }
1652 
1653  /*
1654  * Didn't find it in the buffer pool. We'll have to initialize a new
1655  * buffer. Remember to unlock the mapping lock while doing the work.
1656  */
1657  LWLockRelease(newPartitionLock);
1658 
1659  /*
1660  * Acquire a victim buffer. Somebody else might try to do the same, we
1661  * don't hold any conflicting locks. If so we'll have to undo our work
1662  * later.
1663  */
1664  victim_buffer = GetVictimBuffer(strategy, io_context);
1665  victim_buf_hdr = GetBufferDescriptor(victim_buffer - 1);
1666 
1667  /*
1668  * Try to make a hashtable entry for the buffer under its new tag. If
1669  * somebody else inserted another buffer for the tag, we'll release the
1670  * victim buffer we acquired and use the already inserted one.
1671  */
1672  LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
1673  existing_buf_id = BufTableInsert(&newTag, newHash, victim_buf_hdr->buf_id);
1674  if (existing_buf_id >= 0)
1675  {
1676  BufferDesc *existing_buf_hdr;
1677  bool valid;
1678 
1679  /*
1680  * Got a collision. Someone has already done what we were about to do.
1681  * We'll just handle this as if it were found in the buffer pool in
1682  * the first place. First, give up the buffer we were planning to
1683  * use.
1684  *
1685  * We could do this after releasing the partition lock, but then we'd
1686  * have to call ResourceOwnerEnlarge() & ReservePrivateRefCountEntry()
1687  * before acquiring the lock, for the rare case of such a collision.
1688  */
1689  UnpinBuffer(victim_buf_hdr);
1690 
1691  /*
1692  * The victim buffer we acquired previously is clean and unused, let
1693  * it be found again quickly
1694  */
1695  StrategyFreeBuffer(victim_buf_hdr);
1696 
1697  /* remaining code should match code at top of routine */
1698 
1699  existing_buf_hdr = GetBufferDescriptor(existing_buf_id);
1700 
1701  valid = PinBuffer(existing_buf_hdr, strategy);
1702 
1703  /* Can release the mapping lock as soon as we've pinned it */
1704  LWLockRelease(newPartitionLock);
1705 
1706  *foundPtr = true;
1707 
1708  if (!valid)
1709  {
1710  /*
1711  * We can only get here if (a) someone else is still reading in
1712  * the page, (b) a previous read attempt failed, or (c) someone
1713  * called StartReadBuffers() but not yet WaitReadBuffers().
1714  */
1715  *foundPtr = false;
1716  }
1717 
1718  return existing_buf_hdr;
1719  }
1720 
1721  /*
1722  * Need to lock the buffer header too in order to change its tag.
1723  */
1724  victim_buf_state = LockBufHdr(victim_buf_hdr);
1725 
1726  /* some sanity checks while we hold the buffer header lock */
1727  Assert(BUF_STATE_GET_REFCOUNT(victim_buf_state) == 1);
1728  Assert(!(victim_buf_state & (BM_TAG_VALID | BM_VALID | BM_DIRTY | BM_IO_IN_PROGRESS)));
1729 
1730  victim_buf_hdr->tag = newTag;
1731 
1732  /*
1733  * Make sure BM_PERMANENT is set for buffers that must be written at every
1734  * checkpoint. Unlogged buffers only need to be written at shutdown
1735  * checkpoints, except for their "init" forks, which need to be treated
1736  * just like permanent relations.
1737  */
1738  victim_buf_state |= BM_TAG_VALID | BUF_USAGECOUNT_ONE;
1739  if (relpersistence == RELPERSISTENCE_PERMANENT || forkNum == INIT_FORKNUM)
1740  victim_buf_state |= BM_PERMANENT;
1741 
1742  UnlockBufHdr(victim_buf_hdr, victim_buf_state);
1743 
1744  LWLockRelease(newPartitionLock);
1745 
1746  /*
1747  * Buffer contents are currently invalid.
1748  */
1749  *foundPtr = false;
1750 
1751  return victim_buf_hdr;
1752 }
int Buffer
Definition: buf.h:23
static void InitBufferTag(BufferTag *tag, const RelFileLocator *rlocator, ForkNumber forkNum, BlockNumber blockNum)
#define BM_PERMANENT
Definition: buf_internals.h:69
static LWLock * BufMappingPartitionLock(uint32 hashcode)
#define BUF_USAGECOUNT_ONE
Definition: buf_internals.h:46
#define BUF_STATE_GET_REFCOUNT(state)
Definition: buf_internals.h:51
int BufTableLookup(BufferTag *tagPtr, uint32 hashcode)
Definition: buf_table.c:90
uint32 BufTableHashCode(BufferTag *tagPtr)
Definition: buf_table.c:78
int BufTableInsert(BufferTag *tagPtr, uint32 hashcode, int buf_id)
Definition: buf_table.c:118
static bool PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy)
Definition: bufmgr.c:2641
static Buffer GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context)
Definition: bufmgr.c:1938
static void ReservePrivateRefCountEntry(void)
Definition: bufmgr.c:249
static void UnpinBuffer(BufferDesc *buf)
Definition: bufmgr.c:2795
void StrategyFreeBuffer(BufferDesc *buf)
Definition: freelist.c:363
bool LWLockAcquire(LWLock *lock, LWLockMode mode)
Definition: lwlock.c:1170
void LWLockRelease(LWLock *lock)
Definition: lwlock.c:1783
@ LW_SHARED
Definition: lwlock.h:115
@ LW_EXCLUSIVE
Definition: lwlock.h:114
static char * buf
Definition: pg_test_fsync.c:73
@ INIT_FORKNUM
Definition: relpath.h:53
ResourceOwner CurrentResourceOwner
Definition: resowner.c:165
void ResourceOwnerEnlarge(ResourceOwner owner)
Definition: resowner.c:442
Definition: lwlock.h:42
RelFileLocator locator
RelFileLocatorBackend smgr_rlocator
Definition: smgr.h:37

References Assert, BM_DIRTY, BM_IO_IN_PROGRESS, BM_PERMANENT, BM_TAG_VALID, BM_VALID, buf, BufferDesc::buf_id, BUF_STATE_GET_REFCOUNT, BUF_USAGECOUNT_ONE, BufMappingPartitionLock(), BufTableHashCode(), BufTableInsert(), BufTableLookup(), CurrentResourceOwner, GetBufferDescriptor(), GetVictimBuffer(), INIT_FORKNUM, InitBufferTag(), RelFileLocatorBackend::locator, LockBufHdr(), LW_EXCLUSIVE, LW_SHARED, LWLockAcquire(), LWLockRelease(), PinBuffer(), ReservePrivateRefCountEntry(), ResourceOwnerEnlarge(), SMgrRelationData::smgr_rlocator, StrategyFreeBuffer(), BufferDesc::tag, UnlockBufHdr(), and UnpinBuffer().

Referenced by PinBufferForBlock().
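
From the caller's perspective (PinBufferForBlock()), the contract is: the returned descriptor is always pinned, and *foundPtr reports whether the page contents are already valid. A rough caller-side sketch (abbreviated; the hit counter mirrors what PinBufferForBlock() does, while the actual read of a not-found page is driven later by the StartReadBuffers()/WaitReadBuffers() machinery):

/* Rough caller-side sketch, modelled on PinBufferForBlock(). */
bool        found;
BufferDesc *bufHdr;

bufHdr = BufferAlloc(smgr, persistence, forkNum, blockNum, strategy,
                     &found, io_context);
if (found)
    pgBufferUsage.shared_blks_hit++;    /* a valid copy is already in the pool */
/* else: contents are invalid; the caller arranges to read the block in */

return BufferDescriptorGetBuffer(bufHdr);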

◆ BufferGetBlockNumber()

BlockNumber BufferGetBlockNumber ( Buffer  buffer)

Definition at line 3713 of file bufmgr.c.

3714 {
3715  BufferDesc *bufHdr;
3716 
3717  Assert(BufferIsPinned(buffer));
3718 
3719  if (BufferIsLocal(buffer))
3720  bufHdr = GetLocalBufferDescriptor(-buffer - 1);
3721  else
3722  bufHdr = GetBufferDescriptor(buffer - 1);
3723 
3724  /* pinned, so OK to read tag without spinlock */
3725  return bufHdr->tag.blockNum;
3726 }
#define BufferIsLocal(buffer)
Definition: buf.h:37
static BufferDesc * GetLocalBufferDescriptor(uint32 id)
#define BufferIsPinned(bufnum)
Definition: bufmgr.c:473

References Assert, buftag::blockNum, PrivateRefCountEntry::buffer, BufferIsLocal, BufferIsPinned, GetBufferDescriptor(), GetLocalBufferDescriptor(), and BufferDesc::tag.

Referenced by _bt_binsrch_insert(), _bt_bottomupdel_pass(), _bt_check_unique(), _bt_checkpage(), _bt_delitems_delete(), _bt_delitems_vacuum(), _bt_doinsert(), _bt_endpoint(), _bt_finish_split(), _bt_first(), _bt_getroot(), _bt_insert_parent(), _bt_insertonpg(), _bt_mark_page_halfdead(), _bt_moveright(), _bt_newlevel(), _bt_pagedel(), _bt_readnextpage(), _bt_readpage(), _bt_restore_meta(), _bt_search(), _bt_simpledel_pass(), _bt_split(), _bt_unlink_halfdead_page(), _bt_walk_left(), _hash_addovflpage(), _hash_checkpage(), _hash_doinsert(), _hash_first(), _hash_freeovflpage(), _hash_getnewbuf(), _hash_readnext(), _hash_readpage(), _hash_splitbucket(), allocNewBuffer(), blinsert(), BloomInitMetapage(), brin_doinsert(), brin_doupdate(), brin_getinsertbuffer(), brin_initialize_empty_new_buffer(), brin_page_cleanup(), brin_xlog_insert_update(), brinbuild(), brinGetTupleForHeapBlock(), collectMatchBitmap(), createPostingTree(), dataBeginPlaceToPageLeaf(), dataPrepareDownlink(), doPickSplit(), entryPrepareDownlink(), fill_seq_fork_with_data(), ginEntryInsert(), ginFindParents(), ginFinishSplit(), ginPlaceToPage(), ginRedoDeleteListPages(), ginRedoUpdateMetapage(), ginScanToDelete(), gistbufferinginserttuples(), gistbuild(), gistcheckpage(), gistdeletepage(), gistformdownlink(), gistinserttuples(), gistMemorizeAllDownlinks(), gistplacetopage(), gistRelocateBuildBuffersOnSplit(), gistScanPage(), hash_xlog_add_ovfl_page(), heap_delete(), heap_fetch_next_buffer(), heap_hot_search_buffer(), heap_insert(), heap_multi_insert(), heap_page_is_all_visible(), heap_page_prune_and_freeze(), heap_prepare_pagescan(), heap_update(), heap_xlog_confirm(), heap_xlog_lock(), heapam_scan_analyze_next_block(), heapgettup(), heapgettup_pagemode(), index_compute_xid_horizon_for_tuples(), lazy_scan_noprune(), lazy_scan_prune(), makeSublist(), moveLeafs(), moveRightIfItNeeded(), pgstathashindex(), ReadBufferBI(), RelationAddBlocks(), RelationGetBufferForTuple(), RelationPutHeapTuple(), revmap_get_buffer(), revmap_physical_extend(), ScanSourceDatabasePgClassPage(), spgAddNodeAction(), spgbuild(), spgdoinsert(), SpGistSetLastUsedPage(), spgSplitNodeAction(), spgWalk(), startScanEntry(), terminate_brin_buildstate(), vacuumLeafPage(), visibilitymap_clear(), visibilitymap_get_status(), visibilitymap_pin(), visibilitymap_pin_ok(), visibilitymap_set(), and WaitReadBuffers().

◆ BufferGetLSNAtomic()

XLogRecPtr BufferGetLSNAtomic ( Buffer  buffer)

Definition at line 3974 of file bufmgr.c.

3975 {
3976  BufferDesc *bufHdr = GetBufferDescriptor(buffer - 1);
3977  char *page = BufferGetPage(buffer);
3978  XLogRecPtr lsn;
3979  uint32 buf_state;
3980 
3981  /*
3982  * If we don't need locking for correctness, fastpath out.
3983  */
3984  if (!XLogHintBitIsNeeded() || BufferIsLocal(buffer))
3985  return PageGetLSN(page);
3986 
3987  /* Make sure we've got a real buffer, and that we hold a pin on it. */
3988  Assert(BufferIsValid(buffer));
3989  Assert(BufferIsPinned(buffer));
3990 
3991  buf_state = LockBufHdr(bufHdr);
3992  lsn = PageGetLSN(page);
3993  UnlockBufHdr(bufHdr, buf_state);
3994 
3995  return lsn;
3996 }
static Page BufferGetPage(Buffer buffer)
Definition: bufmgr.h:404
static XLogRecPtr PageGetLSN(Page page)
Definition: bufpage.h:383
#define XLogHintBitIsNeeded()
Definition: xlog.h:118
uint64 XLogRecPtr
Definition: xlogdefs.h:21

References Assert, PrivateRefCountEntry::buffer, BufferGetPage(), BufferIsLocal, BufferIsPinned, BufferIsValid(), GetBufferDescriptor(), LockBufHdr(), PageGetLSN(), UnlockBufHdr(), and XLogHintBitIsNeeded.

Referenced by _bt_killitems(), _bt_readpage(), gistdoinsert(), gistFindPath(), gistkillitems(), gistScanPage(), SetHintBits(), and XLogSaveBufferForHint().
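
The atomic variant only matters when XLogHintBitIsNeeded() is true (data checksums or wal_log_hints); otherwise it degenerates to PageGetLSN(). An abbreviated sketch in the style of SetHintBits() in heapam_visibility.c, one of the callers listed above (variable names are taken from that caller but the surrounding logic is omitted):

/* Abbreviated sketch after SetHintBits(): don't set a hint bit for a committed
 * xid unless the commit record is flushed or the page LSN can enforce that. */
XLogRecPtr  commitLSN = TransactionIdGetCommitLSN(xid);

if (BufferIsPermanent(buffer) && XLogNeedsFlush(commitLSN) &&
    BufferGetLSNAtomic(buffer) < commitLSN)
    return;                     /* not safe yet; skip setting the hint bit */

tuple->t_infomask |= infomask;
MarkBufferDirtyHint(buffer, true);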

◆ BufferGetTag()

void BufferGetTag ( Buffer  buffer,
RelFileLocator *  rlocator,
ForkNumber *  forknum,
BlockNumber *  blknum 
)

Definition at line 3734 of file bufmgr.c.

3736 {
3737  BufferDesc *bufHdr;
3738 
3739  /* Do the same checks as BufferGetBlockNumber. */
3740  Assert(BufferIsPinned(buffer));
3741 
3742  if (BufferIsLocal(buffer))
3743  bufHdr = GetLocalBufferDescriptor(-buffer - 1);
3744  else
3745  bufHdr = GetBufferDescriptor(buffer - 1);
3746 
3747  /* pinned, so OK to read tag without spinlock */
3748  *rlocator = BufTagGetRelFileLocator(&bufHdr->tag);
3749  *forknum = BufTagGetForkNum(&bufHdr->tag);
3750  *blknum = bufHdr->tag.blockNum;
3751 }

References Assert, buftag::blockNum, PrivateRefCountEntry::buffer, BufferIsLocal, BufferIsPinned, BufTagGetForkNum(), BufTagGetRelFileLocator(), GetBufferDescriptor(), GetLocalBufferDescriptor(), and BufferDesc::tag.

Referenced by fsm_search_avail(), ginRedoInsertEntry(), log_newpage_buffer(), ResolveCminCmaxDuringDecoding(), XLogRegisterBuffer(), and XLogSaveBufferForHint().
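
Both BufferGetBlockNumber() and BufferGetTag() rely on the caller's pin: a pinned buffer cannot be evicted or retagged, which is what makes it safe to read the tag without the header spinlock. A hypothetical caller (buffer is assumed to be pinned by this backend):

/* Hypothetical: identify the page held in a pinned buffer, e.g. for WAL or logging. */
RelFileLocator rlocator;
ForkNumber  forknum;
BlockNumber blknum;

blknum = BufferGetBlockNumber(buffer);                /* just the block number */
BufferGetTag(buffer, &rlocator, &forknum, &blknum);   /* or the complete tag */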

◆ BufferIsDirty()

bool BufferIsDirty ( Buffer  buffer)

Definition at line 2488 of file bufmgr.c.

2489 {
2490  BufferDesc *bufHdr;
2491 
2492  if (BufferIsLocal(buffer))
2493  {
2494  int bufid = -buffer - 1;
2495 
2496  bufHdr = GetLocalBufferDescriptor(bufid);
2497  }
2498  else
2499  {
2500  bufHdr = GetBufferDescriptor(buffer - 1);
2501  }
2502 
2503  Assert(BufferIsPinned(buffer));
2504  Assert(LWLockHeldByMeInMode(BufferDescriptorGetContentLock(bufHdr),
2505  LW_EXCLUSIVE));
2506 
2507  return pg_atomic_read_u32(&bufHdr->state) & BM_DIRTY;
2508 }
static uint32 pg_atomic_read_u32(volatile pg_atomic_uint32 *ptr)
Definition: atomics.h:234
static LWLock * BufferDescriptorGetContentLock(const BufferDesc *bdesc)
bool LWLockHeldByMeInMode(LWLock *lock, LWLockMode mode)
Definition: lwlock.c:1939
pg_atomic_uint32 state

References Assert, BM_DIRTY, PrivateRefCountEntry::buffer, BufferDescriptorGetContentLock(), BufferIsLocal, BufferIsPinned, GetBufferDescriptor(), GetLocalBufferDescriptor(), LW_EXCLUSIVE, LWLockHeldByMeInMode(), pg_atomic_read_u32(), and BufferDesc::state.

Referenced by XLogRegisterBuffer().

◆ BufferIsExclusiveLocked()

bool BufferIsExclusiveLocked ( Buffer  buffer)

Definition at line 2459 of file bufmgr.c.

2460 {
2461  BufferDesc *bufHdr;
2462 
2463  if (BufferIsLocal(buffer))
2464  {
2465  int bufid = -buffer - 1;
2466 
2467  bufHdr = GetLocalBufferDescriptor(bufid);
2468  }
2469  else
2470  {
2471  bufHdr = GetBufferDescriptor(buffer - 1);
2472  }
2473 
2474  Assert(BufferIsPinned(buffer));
2475  return LWLockHeldByMeInMode(BufferDescriptorGetContentLock(bufHdr),
2476  LW_EXCLUSIVE);
2477 }

References Assert, PrivateRefCountEntry::buffer, BufferDescriptorGetContentLock(), BufferIsLocal, BufferIsPinned, GetBufferDescriptor(), GetLocalBufferDescriptor(), LW_EXCLUSIVE, and LWLockHeldByMeInMode().

Referenced by XLogRegisterBuffer().
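
BufferIsDirty() and BufferIsExclusiveLocked() exist chiefly so that XLogRegisterBuffer() can assert its preconditions in assert-enabled builds. A hedged sketch of that kind of check (the exact assertions in xloginsert.c also depend on the REGBUF_* flags passed):

/* Sketch: a buffer registered for a WAL-logged change is normally already
 * exclusively locked and marked dirty by this backend. */
Assert(BufferIsExclusiveLocked(buffer));
Assert(BufferIsDirty(buffer));

XLogBeginInsert();
XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);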

◆ BufferIsPermanent()

bool BufferIsPermanent ( Buffer  buffer)

Definition at line 3944 of file bufmgr.c.

3945 {
3946  BufferDesc *bufHdr;
3947 
3948  /* Local buffers are used only for temp relations. */
3949  if (BufferIsLocal(buffer))
3950  return false;
3951 
3952  /* Make sure we've got a real buffer, and that we hold a pin on it. */
3953  Assert(BufferIsValid(buffer));
3954  Assert(BufferIsPinned(buffer));
3955 
3956  /*
3957  * BM_PERMANENT can't be changed while we hold a pin on the buffer, so we
3958  * need not bother with the buffer header spinlock. Even if someone else
3959  * changes the buffer header state while we're doing this, the state is
3960  * changed atomically, so we'll read the old value or the new value, but
3961  * not random garbage.
3962  */
3963  bufHdr = GetBufferDescriptor(buffer - 1);
3964  return (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT) != 0;
3965 }

References Assert, BM_PERMANENT, PrivateRefCountEntry::buffer, BufferIsLocal, BufferIsPinned, BufferIsValid(), GetBufferDescriptor(), pg_atomic_read_u32(), and BufferDesc::state.

Referenced by SetHintBits().

◆ BufferSync()

static void BufferSync ( int  flags)
static

Definition at line 2901 of file bufmgr.c.

2902 {
2903  uint32 buf_state;
2904  int buf_id;
2905  int num_to_scan;
2906  int num_spaces;
2907  int num_processed;
2908  int num_written;
2909  CkptTsStatus *per_ts_stat = NULL;
2910  Oid last_tsid;
2911  binaryheap *ts_heap;
2912  int i;
2913  int mask = BM_DIRTY;
2914  WritebackContext wb_context;
2915 
2916  /*
2917  * Unless this is a shutdown checkpoint or we have been explicitly told,
2918  * we write only permanent, dirty buffers. But at shutdown or end of
2919  * recovery, we write all dirty buffers.
2920  */
2921  if (!((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY |
2922  CHECKPOINT_FLUSH_ALL))))
2923  mask |= BM_PERMANENT;
2924 
2925  /*
2926  * Loop over all buffers, and mark the ones that need to be written with
2927  * BM_CHECKPOINT_NEEDED. Count them as we go (num_to_scan), so that we
2928  * can estimate how much work needs to be done.
2929  *
2930  * This allows us to write only those pages that were dirty when the
2931  * checkpoint began, and not those that get dirtied while it proceeds.
2932  * Whenever a page with BM_CHECKPOINT_NEEDED is written out, either by us
2933  * later in this function, or by normal backends or the bgwriter cleaning
2934  * scan, the flag is cleared. Any buffer dirtied after this point won't
2935  * have the flag set.
2936  *
2937  * Note that if we fail to write some buffer, we may leave buffers with
2938  * BM_CHECKPOINT_NEEDED still set. This is OK since any such buffer would
2939  * certainly need to be written for the next checkpoint attempt, too.
2940  */
2941  num_to_scan = 0;
2942  for (buf_id = 0; buf_id < NBuffers; buf_id++)
2943  {
2944  BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
2945 
2946  /*
2947  * Header spinlock is enough to examine BM_DIRTY, see comment in
2948  * SyncOneBuffer.
2949  */
2950  buf_state = LockBufHdr(bufHdr);
2951 
2952  if ((buf_state & mask) == mask)
2953  {
2954  CkptSortItem *item;
2955 
2956  buf_state |= BM_CHECKPOINT_NEEDED;
2957 
2958  item = &CkptBufferIds[num_to_scan++];
2959  item->buf_id = buf_id;
2960  item->tsId = bufHdr->tag.spcOid;
2961  item->relNumber = BufTagGetRelNumber(&bufHdr->tag);
2962  item->forkNum = BufTagGetForkNum(&bufHdr->tag);
2963  item->blockNum = bufHdr->tag.blockNum;
2964  }
2965 
2966  UnlockBufHdr(bufHdr, buf_state);
2967 
2968  /* Check for barrier events in case NBuffers is large. */
2969  if (ProcSignalBarrierPending)
2970  ProcessProcSignalBarrier();
2971  }
2972 
2973  if (num_to_scan == 0)
2974  return; /* nothing to do */
2975 
2976  WritebackContextInit(&wb_context, &checkpoint_flush_after);
2977 
2978  TRACE_POSTGRESQL_BUFFER_SYNC_START(NBuffers, num_to_scan);
2979 
2980  /*
2981  * Sort buffers that need to be written to reduce the likelihood of random
2982  * IO. The sorting is also important for the implementation of balancing
2983  * writes between tablespaces. Without balancing writes we'd potentially
2984  * end up writing to the tablespaces one-by-one; possibly overloading the
2985  * underlying system.
2986  */
2987  sort_checkpoint_bufferids(CkptBufferIds, num_to_scan);
2988 
2989  num_spaces = 0;
2990 
2991  /*
2992  * Allocate progress status for each tablespace with buffers that need to
2993  * be flushed. This requires the to-be-flushed array to be sorted.
2994  */
2995  last_tsid = InvalidOid;
2996  for (i = 0; i < num_to_scan; i++)
2997  {
2998  CkptTsStatus *s;
2999  Oid cur_tsid;
3000 
3001  cur_tsid = CkptBufferIds[i].tsId;
3002 
3003  /*
3004  * Grow array of per-tablespace status structs, every time a new
3005  * tablespace is found.
3006  */
3007  if (last_tsid == InvalidOid || last_tsid != cur_tsid)
3008  {
3009  Size sz;
3010 
3011  num_spaces++;
3012 
3013  /*
3014  * Not worth adding grow-by-power-of-2 logic here - even with a
3015  * few hundred tablespaces this should be fine.
3016  */
3017  sz = sizeof(CkptTsStatus) * num_spaces;
3018 
3019  if (per_ts_stat == NULL)
3020  per_ts_stat = (CkptTsStatus *) palloc(sz);
3021  else
3022  per_ts_stat = (CkptTsStatus *) repalloc(per_ts_stat, sz);
3023 
3024  s = &per_ts_stat[num_spaces - 1];
3025  memset(s, 0, sizeof(*s));
3026  s->tsId = cur_tsid;
3027 
3028  /*
3029  * The first buffer in this tablespace. As CkptBufferIds is sorted
3030  * by tablespace all (s->num_to_scan) buffers in this tablespace
3031  * will follow afterwards.
3032  */
3033  s->index = i;
3034 
3035  /*
3036  * progress_slice will be determined once we know how many buffers
3037  * are in each tablespace, i.e. after this loop.
3038  */
3039 
3040  last_tsid = cur_tsid;
3041  }
3042  else
3043  {
3044  s = &per_ts_stat[num_spaces - 1];
3045  }
3046 
3047  s->num_to_scan++;
3048 
3049  /* Check for barrier events. */
3050  if (ProcSignalBarrierPending)
3051  ProcessProcSignalBarrier();
3052  }
3053 
3054  Assert(num_spaces > 0);
3055 
3056  /*
3057  * Build a min-heap over the write-progress in the individual tablespaces,
3058  * and compute how large a portion of the total progress a single
3059  * processed buffer is.
3060  */
3061  ts_heap = binaryheap_allocate(num_spaces,
3062  ts_ckpt_progress_comparator,
3063  NULL);
3064 
3065  for (i = 0; i < num_spaces; i++)
3066  {
3067  CkptTsStatus *ts_stat = &per_ts_stat[i];
3068 
3069  ts_stat->progress_slice = (float8) num_to_scan / ts_stat->num_to_scan;
3070 
3071  binaryheap_add_unordered(ts_heap, PointerGetDatum(ts_stat));
3072  }
3073 
3074  binaryheap_build(ts_heap);
3075 
3076  /*
3077  * Iterate through to-be-checkpointed buffers and write the ones (still)
3078  * marked with BM_CHECKPOINT_NEEDED. The writes are balanced between
3079  * tablespaces; otherwise the sorting would lead to only one tablespace
3080  * receiving writes at a time, making inefficient use of the hardware.
3081  */
3082  num_processed = 0;
3083  num_written = 0;
3084  while (!binaryheap_empty(ts_heap))
3085  {
3086  BufferDesc *bufHdr = NULL;
3087  CkptTsStatus *ts_stat = (CkptTsStatus *)
3088  DatumGetPointer(binaryheap_first(ts_heap));
3089 
3090  buf_id = CkptBufferIds[ts_stat->index].buf_id;
3091  Assert(buf_id != -1);
3092 
3093  bufHdr = GetBufferDescriptor(buf_id);
3094 
3095  num_processed++;
3096 
3097  /*
3098  * We don't need to acquire the lock here, because we're only looking
3099  * at a single bit. It's possible that someone else writes the buffer
3100  * and clears the flag right after we check, but that doesn't matter
3101  * since SyncOneBuffer will then do nothing. However, there is a
3102  * further race condition: it's conceivable that between the time we
3103  * examine the bit here and the time SyncOneBuffer acquires the lock,
3104  * someone else not only wrote the buffer but replaced it with another
3105  * page and dirtied it. In that improbable case, SyncOneBuffer will
3106  * write the buffer though we didn't need to. It doesn't seem worth
3107  * guarding against this, though.
3108  */
3109  if (pg_atomic_read_u32(&bufHdr->state) & BM_CHECKPOINT_NEEDED)
3110  {
3111  if (SyncOneBuffer(buf_id, false, &wb_context) & BUF_WRITTEN)
3112  {
3113  TRACE_POSTGRESQL_BUFFER_SYNC_WRITTEN(buf_id);
3114  PendingCheckpointerStats.buffers_written++;
3115  num_written++;
3116  }
3117  }
3118 
3119  /*
3120  * Measure progress independent of actually having to flush the buffer
3121  * - otherwise writing become unbalanced.
3122  */
3123  ts_stat->progress += ts_stat->progress_slice;
3124  ts_stat->num_scanned++;
3125  ts_stat->index++;
3126 
3127  /* Have all the buffers from the tablespace been processed? */
3128  if (ts_stat->num_scanned == ts_stat->num_to_scan)
3129  {
3130  binaryheap_remove_first(ts_heap);
3131  }
3132  else
3133  {
3134  /* update heap with the new progress */
3135  binaryheap_replace_first(ts_heap, PointerGetDatum(ts_stat));
3136  }
3137 
3138  /*
3139  * Sleep to throttle our I/O rate.
3140  *
3141  * (This will check for barrier events even if it doesn't sleep.)
3142  */
3143  CheckpointWriteDelay(flags, (double) num_processed / num_to_scan);
3144  }
3145 
3146  /*
3147  * Issue all pending flushes. Only checkpointer calls BufferSync(), so
3148  * IOContext will always be IOCONTEXT_NORMAL.
3149  */
3150  IssuePendingWritebacks(&wb_context, IOCONTEXT_NORMAL);
3151 
3152  pfree(per_ts_stat);
3153  per_ts_stat = NULL;
3154  binaryheap_free(ts_heap);
3155 
3156  /*
3157  * Update checkpoint statistics. As noted above, this doesn't include
3158  * buffers written by other backends or bgwriter scan.
3159  */
3160  CheckpointStats.ckpt_bufs_written += num_written;
3161 
3162  TRACE_POSTGRESQL_BUFFER_SYNC_DONE(NBuffers, num_written, num_to_scan);
3163 }
void binaryheap_build(binaryheap *heap)
Definition: binaryheap.c:138
void binaryheap_replace_first(binaryheap *heap, bh_node_type d)
Definition: binaryheap.c:255
bh_node_type binaryheap_first(binaryheap *heap)
Definition: binaryheap.c:177
bh_node_type binaryheap_remove_first(binaryheap *heap)
Definition: binaryheap.c:192
binaryheap * binaryheap_allocate(int capacity, binaryheap_comparator compare, void *arg)
Definition: binaryheap.c:39
void binaryheap_free(binaryheap *heap)
Definition: binaryheap.c:75
void binaryheap_add_unordered(binaryheap *heap, bh_node_type d)
Definition: binaryheap.c:116
#define binaryheap_empty(h)
Definition: binaryheap.h:65
CkptSortItem * CkptBufferIds
Definition: buf_init.c:25
static RelFileNumber BufTagGetRelNumber(const BufferTag *tag)
#define BM_CHECKPOINT_NEEDED
Definition: buf_internals.h:68
static int ts_ckpt_progress_comparator(Datum a, Datum b, void *arg)
Definition: bufmgr.c:5853
int checkpoint_flush_after
Definition: bufmgr.c:170
void WritebackContextInit(WritebackContext *context, int *max_pending)
Definition: bufmgr.c:5876
void IssuePendingWritebacks(WritebackContext *wb_context, IOContext io_context)
Definition: bufmgr.c:5933
struct CkptTsStatus CkptTsStatus
double float8
Definition: c.h:630
size_t Size
Definition: c.h:605
void CheckpointWriteDelay(int flags, double progress)
Definition: checkpointer.c:711
volatile sig_atomic_t ProcSignalBarrierPending
Definition: globals.c:38
int i
Definition: isn.c:73
void * repalloc(void *pointer, Size size)
Definition: mcxt.c:1540
void * palloc(Size size)
Definition: mcxt.c:1316
@ IOCONTEXT_NORMAL
Definition: pgstat.h:290
PgStat_CheckpointerStats PendingCheckpointerStats
static Datum PointerGetDatum(const void *X)
Definition: postgres.h:322
static Pointer DatumGetPointer(Datum X)
Definition: postgres.h:312
#define InvalidOid
Definition: postgres_ext.h:36
unsigned int Oid
Definition: postgres_ext.h:31
void ProcessProcSignalBarrier(void)
Definition: procsignal.c:464
int ckpt_bufs_written
Definition: xlog.h:165
ForkNumber forkNum
RelFileNumber relNumber
BlockNumber blockNum
float8 progress_slice
Definition: bufmgr.c:114
int index
Definition: bufmgr.c:122
int num_scanned
Definition: bufmgr.c:119
float8 progress
Definition: bufmgr.c:113
int num_to_scan
Definition: bufmgr.c:117
Oid tsId
Definition: bufmgr.c:104
PgStat_Counter buffers_written
Definition: pgstat.h:270
Oid spcOid
Definition: buf_internals.h:94
CheckpointStatsData CheckpointStats
Definition: xlog.c:209
#define CHECKPOINT_END_OF_RECOVERY
Definition: xlog.h:138
#define CHECKPOINT_FLUSH_ALL
Definition: xlog.h:141
#define CHECKPOINT_IS_SHUTDOWN
Definition: xlog.h:137

References Assert, binaryheap_add_unordered(), binaryheap_allocate(), binaryheap_build(), binaryheap_empty, binaryheap_first(), binaryheap_free(), binaryheap_remove_first(), binaryheap_replace_first(), buftag::blockNum, CkptSortItem::blockNum, BM_CHECKPOINT_NEEDED, BM_DIRTY, BM_PERMANENT, CkptSortItem::buf_id, BUF_WRITTEN, PgStat_CheckpointerStats::buffers_written, BufTagGetForkNum(), BufTagGetRelNumber(), CHECKPOINT_END_OF_RECOVERY, checkpoint_flush_after, CHECKPOINT_FLUSH_ALL, CHECKPOINT_IS_SHUTDOWN, CheckpointStats, CheckpointWriteDelay(), CheckpointStatsData::ckpt_bufs_written, CkptBufferIds, DatumGetPointer(), CkptSortItem::forkNum, GetBufferDescriptor(), i, CkptTsStatus::index, InvalidOid, IOCONTEXT_NORMAL, IssuePendingWritebacks(), LockBufHdr(), NBuffers, CkptTsStatus::num_scanned, CkptTsStatus::num_to_scan, palloc(), PendingCheckpointerStats, pfree(), pg_atomic_read_u32(), PointerGetDatum(), ProcessProcSignalBarrier(), ProcSignalBarrierPending, CkptTsStatus::progress, CkptTsStatus::progress_slice, CkptSortItem::relNumber, repalloc(), buftag::spcOid, BufferDesc::state, SyncOneBuffer(), BufferDesc::tag, ts_ckpt_progress_comparator(), CkptTsStatus::tsId, CkptSortItem::tsId, UnlockBufHdr(), and WritebackContextInit().

Referenced by CheckPointBuffers().

◆ buffertag_comparator()

static int buffertag_comparator ( const BufferTag *  ba,
const BufferTag *  bb 
)
inlinestatic

Definition at line 5788 of file bufmgr.c.

5789 {
5790  int ret;
5791  RelFileLocator rlocatora;
5792  RelFileLocator rlocatorb;
5793 
5794  rlocatora = BufTagGetRelFileLocator(ba);
5795  rlocatorb = BufTagGetRelFileLocator(bb);
5796 
5797  ret = rlocator_comparator(&rlocatora, &rlocatorb);
5798 
5799  if (ret != 0)
5800  return ret;
5801 
5802  if (BufTagGetForkNum(ba) < BufTagGetForkNum(bb))
5803  return -1;
5804  if (BufTagGetForkNum(ba) > BufTagGetForkNum(bb))
5805  return 1;
5806 
5807  if (ba->blockNum < bb->blockNum)
5808  return -1;
5809  if (ba->blockNum > bb->blockNum)
5810  return 1;
5811 
5812  return 0;
5813 }
static int rlocator_comparator(const void *p1, const void *p2)
Definition: bufmgr.c:5707

References buftag::blockNum, BufTagGetForkNum(), BufTagGetRelFileLocator(), and rlocator_comparator().

◆ CheckBufferIsPinnedOnce()

void CheckBufferIsPinnedOnce ( Buffer  buffer)

Definition at line 5178 of file bufmgr.c.

5179 {
5180  if (BufferIsLocal(buffer))
5181  {
5182  if (LocalRefCount[-buffer - 1] != 1)
5183  elog(ERROR, "incorrect local pin count: %d",
5184  LocalRefCount[-buffer - 1]);
5185  }
5186  else
5187  {
5188  if (GetPrivateRefCount(buffer) != 1)
5189  elog(ERROR, "incorrect local pin count: %d",
5190  GetPrivateRefCount(buffer));
5191  }
5192 }
#define ERROR
Definition: elog.h:39

References PrivateRefCountEntry::buffer, BufferIsLocal, elog, ERROR, GetPrivateRefCount(), and LocalRefCount.

Referenced by GetVictimBuffer(), and LockBufferForCleanup().
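
A minimal usage sketch (not part of bufmgr.c; the helper name my_lock_for_cleanup_checked is hypothetical): the caller is expected to hold exactly one pin, verify that with CheckBufferIsPinnedOnce(), and only then wait for a cleanup lock.

#include "postgres.h"

#include "storage/bufmgr.h"

static void
my_lock_for_cleanup_checked(Relation rel, BlockNumber blkno)
{
    /* ReadBuffer() takes exactly one pin on the target page. */
    Buffer      buf = ReadBuffer(rel, blkno);

    /* elog(ERROR) if this backend holds more or fewer than one pin. */
    CheckBufferIsPinnedOnce(buf);

    /* Wait until we are the sole pinner, then lock for cleanup. */
    LockBufferForCleanup(buf);

    /* ... perform page cleanup here ... */

    UnlockReleaseBuffer(buf);
}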

◆ CheckForBufferLeaks()

static void CheckForBufferLeaks ( void  )
static

Definition at line 3608 of file bufmgr.c.

3609 {
3610 #ifdef USE_ASSERT_CHECKING
3611  int RefCountErrors = 0;
3612  PrivateRefCountEntry *res;
3613  int i;
3614  char *s;
3615 
3616  /* check the array */
3617  for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
3618  {
3619  res = &PrivateRefCountArray[i];
3620 
3621  if (res->buffer != InvalidBuffer)
3622  {
3623  s = DebugPrintBufferRefcount(res->buffer);
3624  elog(WARNING, "buffer refcount leak: %s", s);
3625  pfree(s);
3626 
3627  RefCountErrors++;
3628  }
3629  }
3630 
3631  /* if necessary search the hash */
3632  if (PrivateRefCountOverflowed)
3633  {
3634  HASH_SEQ_STATUS hstat;
3635 
3636  hash_seq_init(&hstat, PrivateRefCountHash);
3637  while ((res = (PrivateRefCountEntry *) hash_seq_search(&hstat)) != NULL)
3638  {
3639  s = DebugPrintBufferRefcount(res->buffer);
3640  elog(WARNING, "buffer refcount leak: %s", s);
3641  pfree(s);
3642  RefCountErrors++;
3643  }
3644  }
3645 
3646  Assert(RefCountErrors == 0);
3647 #endif
3648 }
#define InvalidBuffer
Definition: buf.h:25
char * DebugPrintBufferRefcount(Buffer buffer)
Definition: bufmgr.c:3654
#define REFCOUNT_ARRAY_ENTRIES
Definition: bufmgr.c:95
static struct PrivateRefCountEntry PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES]
Definition: bufmgr.c:207
static HTAB * PrivateRefCountHash
Definition: bufmgr.c:208
void * hash_seq_search(HASH_SEQ_STATUS *status)
Definition: dynahash.c:1395
void hash_seq_init(HASH_SEQ_STATUS *status, HTAB *hashp)
Definition: dynahash.c:1385

References Assert, DebugPrintBufferRefcount(), elog, hash_seq_init(), hash_seq_search(), i, InvalidBuffer, pfree(), PrivateRefCountArray, PrivateRefCountHash, PrivateRefCountOverflowed, REFCOUNT_ARRAY_ENTRIES, res, and WARNING.

Referenced by AtEOXact_Buffers(), and AtProcExit_Buffers().

◆ CheckPointBuffers()

void CheckPointBuffers ( int  flags)

Definition at line 3699 of file bufmgr.c.

3700 {
3701  BufferSync(flags);
3702 }
static void BufferSync(int flags)
Definition: bufmgr.c:2901

References BufferSync().

Referenced by CheckPointGuts().

◆ ckpt_buforder_comparator()

static int ckpt_buforder_comparator ( const CkptSortItem * a,
const CkptSortItem * b 
)
inline static

Definition at line 5822 of file bufmgr.c.

5823 {
5824  /* compare tablespace */
5825  if (a->tsId < b->tsId)
5826  return -1;
5827  else if (a->tsId > b->tsId)
5828  return 1;
5829  /* compare relation */
5830  if (a->relNumber < b->relNumber)
5831  return -1;
5832  else if (a->relNumber > b->relNumber)
5833  return 1;
5834  /* compare fork */
5835  else if (a->forkNum < b->forkNum)
5836  return -1;
5837  else if (a->forkNum > b->forkNum)
5838  return 1;
5839  /* compare block number */
5840  else if (a->blockNum < b->blockNum)
5841  return -1;
5842  else if (a->blockNum > b->blockNum)
5843  return 1;
5844  /* equal page IDs are unlikely, but not impossible */
5845  return 0;
5846 }
int b
Definition: isn.c:70
int a
Definition: isn.c:69

References a, and b.

◆ ConditionalLockBuffer()

bool ConditionalLockBuffer ( Buffer  buffer)

Definition at line 5157 of file bufmgr.c.

5158 {
5159  BufferDesc *buf;
5160 
5161  Assert(BufferIsPinned(buffer));
5162  if (BufferIsLocal(buffer))
5163  return true; /* act as though we got it */
5164 
5165  buf = GetBufferDescriptor(buffer - 1);
5166 
5167  return LWLockConditionalAcquire(BufferDescriptorGetContentLock(buf),
5168  LW_EXCLUSIVE);
5169 }
bool LWLockConditionalAcquire(LWLock *lock, LWLockMode mode)
Definition: lwlock.c:1341

References Assert, buf, PrivateRefCountEntry::buffer, BufferDescriptorGetContentLock(), BufferIsLocal, BufferIsPinned, GetBufferDescriptor(), LW_EXCLUSIVE, and LWLockConditionalAcquire().

Referenced by _bt_conditionallockbuf(), BloomNewBuffer(), ConditionalLockBufferForCleanup(), GinNewBuffer(), gistNewBuffer(), RelationGetBufferForTuple(), spgdoinsert(), SpGistGetBuffer(), SpGistNewBuffer(), and SpGistUpdateMetaPage().
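
A hedged sketch of the usual calling pattern (my_try_update_page is a hypothetical caller, not a bufmgr function): try for the exclusive content lock and skip the work rather than wait if another backend holds it.

#include "postgres.h"

#include "storage/bufmgr.h"

static bool
my_try_update_page(Relation rel, BlockNumber blkno)
{
    Buffer      buf = ReadBuffer(rel, blkno);

    /* Give up instead of blocking if someone else holds the content lock. */
    if (!ConditionalLockBuffer(buf))
    {
        ReleaseBuffer(buf);
        return false;
    }

    /* ... exclusive lock held: modify the page, MarkBufferDirty(buf) ... */

    UnlockReleaseBuffer(buf);
    return true;
}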

◆ ConditionalLockBufferForCleanup()

bool ConditionalLockBufferForCleanup ( Buffer  buffer)

Definition at line 5372 of file bufmgr.c.

5373 {
5374  BufferDesc *bufHdr;
5375  uint32 buf_state,
5376  refcount;
5377 
5378  Assert(BufferIsValid(buffer));
5379 
5380  if (BufferIsLocal(buffer))
5381  {
5382  refcount = LocalRefCount[-buffer - 1];
5383  /* There should be exactly one pin */
5384  Assert(refcount > 0);
5385  if (refcount != 1)
5386  return false;
5387  /* Nobody else to wait for */
5388  return true;
5389  }
5390 
5391  /* There should be exactly one local pin */
5392  refcount = GetPrivateRefCount(buffer);
5393  Assert(refcount);
5394  if (refcount != 1)
5395  return false;
5396 
5397  /* Try to acquire lock */
5398  if (!ConditionalLockBuffer(buffer))
5399  return false;
5400 
5401  bufHdr = GetBufferDescriptor(buffer - 1);
5402  buf_state = LockBufHdr(bufHdr);
5403  refcount = BUF_STATE_GET_REFCOUNT(buf_state);
5404 
5405  Assert(refcount > 0);
5406  if (refcount == 1)
5407  {
5408  /* Successfully acquired exclusive lock with pincount 1 */
5409  UnlockBufHdr(bufHdr, buf_state);
5410  return true;
5411  }
5412 
5413  /* Failed, so release the lock */
5414  UnlockBufHdr(bufHdr, buf_state);
5415  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
5416  return false;
5417 }
bool ConditionalLockBuffer(Buffer buffer)
Definition: bufmgr.c:5157
void LockBuffer(Buffer buffer, int mode)
Definition: bufmgr.c:5131
#define BUFFER_LOCK_UNLOCK
Definition: bufmgr.h:193

References Assert, BUF_STATE_GET_REFCOUNT, PrivateRefCountEntry::buffer, BUFFER_LOCK_UNLOCK, BufferIsLocal, BufferIsValid(), ConditionalLockBuffer(), GetBufferDescriptor(), GetPrivateRefCount(), LocalRefCount, LockBuffer(), LockBufHdr(), PrivateRefCountEntry::refcount, and UnlockBufHdr().

Referenced by _hash_finish_split(), _hash_getbuf_with_condlock_cleanup(), heap_page_prune_opt(), and lazy_scan_heap().
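
A sketch of the opportunistic-cleanup pattern used by callers such as heap_page_prune_opt(); my_maybe_cleanup is a hypothetical helper shown only to illustrate the control flow.

#include "postgres.h"

#include "storage/bufmgr.h"

static void
my_maybe_cleanup(Relation rel, BlockNumber blkno)
{
    Buffer      buf = ReadBuffer(rel, blkno);

    /* Returns false immediately (no waiting) unless a cleanup lock is free. */
    if (ConditionalLockBufferForCleanup(buf))
    {
        /* Sole pinner with exclusive lock: safe to defragment the page. */
        /* ... prune / compact here ... */

        LockBuffer(buf, BUFFER_LOCK_UNLOCK);
    }
    ReleaseBuffer(buf);
}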

◆ CreateAndCopyRelationData()

void CreateAndCopyRelationData ( RelFileLocator  src_rlocator,
RelFileLocator  dst_rlocator,
bool  permanent 
)

Definition at line 4770 of file bufmgr.c.

4772 {
4773  char relpersistence;
4774  SMgrRelation src_rel;
4775  SMgrRelation dst_rel;
4776 
4777  /* Set the relpersistence. */
4778  relpersistence = permanent ?
4779  RELPERSISTENCE_PERMANENT : RELPERSISTENCE_UNLOGGED;
4780 
4781  src_rel = smgropen(src_rlocator, INVALID_PROC_NUMBER);
4782  dst_rel = smgropen(dst_rlocator, INVALID_PROC_NUMBER);
4783 
4784  /*
4785  * Create and copy all forks of the relation. During create database we
4786  * have a separate cleanup mechanism which deletes complete database
4787  * directory. Therefore, each individual relation doesn't need to be
4788  * registered for cleanup.
4789  */
4790  RelationCreateStorage(dst_rlocator, relpersistence, false);
4791 
4792  /* copy main fork. */
4793  RelationCopyStorageUsingBuffer(src_rlocator, dst_rlocator, MAIN_FORKNUM,
4794  permanent);
4795 
4796  /* copy those extra forks that exist */
4797  for (ForkNumber forkNum = MAIN_FORKNUM + 1;
4798  forkNum <= MAX_FORKNUM; forkNum++)
4799  {
4800  if (smgrexists(src_rel, forkNum))
4801  {
4802  smgrcreate(dst_rel, forkNum, false);
4803 
4804  /*
4805  * WAL log creation if the relation is persistent, or this is the
4806  * init fork of an unlogged relation.
4807  */
4808  if (permanent || forkNum == INIT_FORKNUM)
4809  log_smgrcreate(&dst_rlocator, forkNum);
4810 
4811  /* Copy a fork's data, block by block. */
4812  RelationCopyStorageUsingBuffer(src_rlocator, dst_rlocator, forkNum,
4813  permanent);
4814  }
4815  }
4816 }
static void RelationCopyStorageUsingBuffer(RelFileLocator srclocator, RelFileLocator dstlocator, ForkNumber forkNum, bool permanent)
Definition: bufmgr.c:4679
#define INVALID_PROC_NUMBER
Definition: procnumber.h:26
ForkNumber
Definition: relpath.h:48
@ MAIN_FORKNUM
Definition: relpath.h:50
#define MAX_FORKNUM
Definition: relpath.h:62
SMgrRelation smgropen(RelFileLocator rlocator, ProcNumber backend)
Definition: smgr.c:198
void smgrcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo)
Definition: smgr.c:411
bool smgrexists(SMgrRelation reln, ForkNumber forknum)
Definition: smgr.c:398
SMgrRelation RelationCreateStorage(RelFileLocator rlocator, char relpersistence, bool register_delete)
Definition: storage.c:121
void log_smgrcreate(const RelFileLocator *rlocator, ForkNumber forkNum)
Definition: storage.c:186

References INIT_FORKNUM, INVALID_PROC_NUMBER, log_smgrcreate(), MAIN_FORKNUM, MAX_FORKNUM, RelationCopyStorageUsingBuffer(), RelationCreateStorage(), smgrcreate(), smgrexists(), and smgropen().

Referenced by CreateDatabaseUsingWalLog().

◆ DebugPrintBufferRefcount()

char* DebugPrintBufferRefcount ( Buffer  buffer)

Definition at line 3654 of file bufmgr.c.

3655 {
3656  BufferDesc *buf;
3657  int32 loccount;
3658  char *path;
3659  char *result;
3660  ProcNumber backend;
3661  uint32 buf_state;
3662 
3663  Assert(BufferIsValid(buffer));
3664  if (BufferIsLocal(buffer))
3665  {
3666  buf = GetLocalBufferDescriptor(-buffer - 1);
3667  loccount = LocalRefCount[-buffer - 1];
3668  backend = MyProcNumber;
3669  }
3670  else
3671  {
3672  buf = GetBufferDescriptor(buffer - 1);
3673  loccount = GetPrivateRefCount(buffer);
3674  backend = INVALID_PROC_NUMBER;
3675  }
3676 
3677  /* theoretically we should lock the bufhdr here */
3678  path = relpathbackend(BufTagGetRelFileLocator(&buf->tag), backend,
3679  BufTagGetForkNum(&buf->tag));
3680  buf_state = pg_atomic_read_u32(&buf->state);
3681 
3682  result = psprintf("[%03d] (rel=%s, blockNum=%u, flags=0x%x, refcount=%u %d)",
3683  buffer, path,
3684  buf->tag.blockNum, buf_state & BUF_FLAG_MASK,
3685  BUF_STATE_GET_REFCOUNT(buf_state), loccount);
3686  pfree(path);
3687  return result;
3688 }
#define BUF_FLAG_MASK
Definition: buf_internals.h:48
ProcNumber MyProcNumber
Definition: globals.c:87
int ProcNumber
Definition: procnumber.h:24
char * psprintf(const char *fmt,...)
Definition: psprintf.c:46
#define relpathbackend(rlocator, backend, forknum)
Definition: relpath.h:85

References Assert, buf, BUF_FLAG_MASK, BUF_STATE_GET_REFCOUNT, PrivateRefCountEntry::buffer, BufferIsLocal, BufferIsValid(), BufTagGetForkNum(), BufTagGetRelFileLocator(), GetBufferDescriptor(), GetLocalBufferDescriptor(), GetPrivateRefCount(), INVALID_PROC_NUMBER, LocalRefCount, MyProcNumber, pfree(), pg_atomic_read_u32(), psprintf(), and relpathbackend.

Referenced by CheckForBufferLeaks(), CheckForLocalBufferLeaks(), and ResOwnerPrintBufferPin().
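
A minimal sketch of how a debugging call site could use the result (the DEBUG1 call and helper name are illustrative only): the returned string is palloc'd and must be freed by the caller, as CheckForBufferLeaks() does above.

#include "postgres.h"

#include "storage/bufmgr.h"

static void
my_log_buffer_state(Buffer buf)
{
    char       *s = DebugPrintBufferRefcount(buf);

    elog(DEBUG1, "buffer state: %s", s);
    pfree(s);                   /* result is palloc'd; caller must free it */
}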

◆ DropDatabaseBuffers()

void DropDatabaseBuffers ( Oid  dbid)

Definition at line 4375 of file bufmgr.c.

4376 {
4377  int i;
4378 
4379  /*
4380  * We needn't consider local buffers, since by assumption the target
4381  * database isn't our own.
4382  */
4383 
4384  for (i = 0; i < NBuffers; i++)
4385  {
4386  BufferDesc *bufHdr = GetBufferDescriptor(i);
4387  uint32 buf_state;
4388 
4389  /*
4390  * As in DropRelationBuffers, an unlocked precheck should be safe and
4391  * saves some cycles.
4392  */
4393  if (bufHdr->tag.dbOid != dbid)
4394  continue;
4395 
4396  buf_state = LockBufHdr(bufHdr);
4397  if (bufHdr->tag.dbOid == dbid)
4398  InvalidateBuffer(bufHdr); /* releases spinlock */
4399  else
4400  UnlockBufHdr(bufHdr, buf_state);
4401  }
4402 }
static void InvalidateBuffer(BufferDesc *buf)
Definition: bufmgr.c:1772
Oid dbOid
Definition: buf_internals.h:95

References buftag::dbOid, GetBufferDescriptor(), i, InvalidateBuffer(), LockBufHdr(), NBuffers, BufferDesc::tag, and UnlockBufHdr().

Referenced by createdb_failure_callback(), dbase_redo(), dropdb(), and movedb().
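
A sketch of the drop-database pattern (my_drop_database_storage is hypothetical): once the database is committed dead, its pages must be discarded from shared buffers before the files are removed, so that no later write-back targets missing files.

#include "postgres.h"

#include "storage/bufmgr.h"

static void
my_drop_database_storage(Oid db_id)
{
    /* No page of this database may be flushed once its files are gone. */
    DropDatabaseBuffers(db_id);

    /* ... then unlink the on-disk database directory, as dropdb() does ... */
}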

◆ DropRelationBuffers()

void DropRelationBuffers ( SMgrRelation  smgr_reln,
ForkNumber * forkNum,
int  nforks,
BlockNumber * firstDelBlock 
)

Definition at line 4020 of file bufmgr.c.

4022 {
4023  int i;
4024  int j;
4025  RelFileLocatorBackend rlocator;
4026  BlockNumber nForkBlock[MAX_FORKNUM];
4027  uint64 nBlocksToInvalidate = 0;
4028 
4029  rlocator = smgr_reln->smgr_rlocator;
4030 
4031  /* If it's a local relation, it's localbuf.c's problem. */
4032  if (RelFileLocatorBackendIsTemp(rlocator))
4033  {
4034  if (rlocator.backend == MyProcNumber)
4035  {
4036  for (j = 0; j < nforks; j++)
4037  DropRelationLocalBuffers(rlocator.locator, forkNum[j],
4038  firstDelBlock[j]);
4039  }
4040  return;
4041  }
4042 
4043  /*
4044  * To remove all the pages of the specified relation forks from the buffer
4045  * pool, we need to scan the entire buffer pool but we can optimize it by
4046  * finding the buffers from BufMapping table provided we know the exact
4047  * size of each fork of the relation. The exact size is required to ensure
4048  * that we don't leave any buffer for the relation being dropped as
4049  * otherwise the background writer or checkpointer can lead to a PANIC
4050  * error while flushing buffers corresponding to files that don't exist.
4051  *
4052  * To know the exact size, we rely on the size cached for each fork by us
4053  * during recovery which limits the optimization to recovery and on
4054  * standbys but we can easily extend it once we have shared cache for
4055  * relation size.
4056  *
4057  * In recovery, we cache the value returned by the first lseek(SEEK_END)
4058  * and the future writes keeps the cached value up-to-date. See
4059  * smgrextend. It is possible that the value of the first lseek is smaller
4060  * than the actual number of existing blocks in the file due to buggy
4061  * Linux kernels that might not have accounted for the recent write. But
4062  * that should be fine because there must not be any buffers after that
4063  * file size.
4064  */
4065  for (i = 0; i < nforks; i++)
4066  {
4067  /* Get the number of blocks for a relation's fork */
4068  nForkBlock[i] = smgrnblocks_cached(smgr_reln, forkNum[i]);
4069 
4070  if (nForkBlock[i] == InvalidBlockNumber)
4071  {
4072  nBlocksToInvalidate = InvalidBlockNumber;
4073  break;
4074  }
4075 
4076  /* calculate the number of blocks to be invalidated */
4077  nBlocksToInvalidate += (nForkBlock[i] - firstDelBlock[i]);
4078  }
4079 
4080  /*
4081  * We apply the optimization iff the total number of blocks to invalidate
4082  * is below the BUF_DROP_FULL_SCAN_THRESHOLD.
4083  */
4084  if (BlockNumberIsValid(nBlocksToInvalidate) &&
4085  nBlocksToInvalidate < BUF_DROP_FULL_SCAN_THRESHOLD)
4086  {
4087  for (j = 0; j < nforks; j++)
4088  FindAndDropRelationBuffers(rlocator.locator, forkNum[j],
4089  nForkBlock[j], firstDelBlock[j]);
4090  return;
4091  }
4092 
4093  for (i = 0; i < NBuffers; i++)
4094  {
4095  BufferDesc *bufHdr = GetBufferDescriptor(i);
4096  uint32 buf_state;
4097 
4098  /*
4099  * We can make this a tad faster by prechecking the buffer tag before
4100  * we attempt to lock the buffer; this saves a lot of lock
4101  * acquisitions in typical cases. It should be safe because the
4102  * caller must have AccessExclusiveLock on the relation, or some other
4103  * reason to be certain that no one is loading new pages of the rel
4104  * into the buffer pool. (Otherwise we might well miss such pages
4105  * entirely.) Therefore, while the tag might be changing while we
4106  * look at it, it can't be changing *to* a value we care about, only
4107  * *away* from such a value. So false negatives are impossible, and
4108  * false positives are safe because we'll recheck after getting the
4109  * buffer lock.
4110  *
4111  * We could check forkNum and blockNum as well as the rlocator, but
4112  * the incremental win from doing so seems small.
4113  */
4114  if (!BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator.locator))
4115  continue;
4116 
4117  buf_state = LockBufHdr(bufHdr);
4118 
4119  for (j = 0; j < nforks; j++)
4120  {
4121  if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator.locator) &&
4122  BufTagGetForkNum(&bufHdr->tag) == forkNum[j] &&
4123  bufHdr->tag.blockNum >= firstDelBlock[j])
4124  {
4125  InvalidateBuffer(bufHdr); /* releases spinlock */
4126  break;
4127  }
4128  }
4129  if (j >= nforks)
4130  UnlockBufHdr(bufHdr, buf_state);
4131  }
4132 }
uint32 BlockNumber
Definition: block.h:31
#define InvalidBlockNumber
Definition: block.h:33
static bool BlockNumberIsValid(BlockNumber blockNumber)
Definition: block.h:71
static bool BufTagMatchesRelFileLocator(const BufferTag *tag, const RelFileLocator *rlocator)
#define BUF_DROP_FULL_SCAN_THRESHOLD
Definition: bufmgr.c:86
static void FindAndDropRelationBuffers(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber nForkBlock, BlockNumber firstDelBlock)
Definition: bufmgr.c:4314
int j
Definition: isn.c:74
void DropRelationLocalBuffers(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber firstDelBlock)
Definition: localbuf.c:489
#define RelFileLocatorBackendIsTemp(rlocator)
BlockNumber smgrnblocks_cached(SMgrRelation reln, ForkNumber forknum)
Definition: smgr.c:679

References RelFileLocatorBackend::backend, buftag::blockNum, BlockNumberIsValid(), BUF_DROP_FULL_SCAN_THRESHOLD, BufTagGetForkNum(), BufTagMatchesRelFileLocator(), DropRelationLocalBuffers(), FindAndDropRelationBuffers(), GetBufferDescriptor(), i, InvalidateBuffer(), InvalidBlockNumber, j, RelFileLocatorBackend::locator, LockBufHdr(), MAX_FORKNUM, MyProcNumber, NBuffers, RelFileLocatorBackendIsTemp, SMgrRelationData::smgr_rlocator, smgrnblocks_cached(), BufferDesc::tag, and UnlockBufHdr().

Referenced by smgrtruncate().
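
A hedged sketch of the truncation use case (assumes the caller already holds AccessExclusiveLock on the relation, as smgrtruncate() does; my_truncate_forks is hypothetical): invalidate every buffered page at or beyond the new end of each fork.

#include "postgres.h"

#include "common/relpath.h"
#include "storage/bufmgr.h"
#include "storage/smgr.h"

static void
my_truncate_forks(SMgrRelation reln, BlockNumber new_main_blocks,
                  BlockNumber new_fsm_blocks)
{
    ForkNumber  forks[2] = {MAIN_FORKNUM, FSM_FORKNUM};
    BlockNumber first_del[2] = {new_main_blocks, new_fsm_blocks};

    /* Drop every buffered page past the new fork sizes. */
    DropRelationBuffers(reln, forks, 2, first_del);
}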

◆ DropRelationsAllBuffers()

void DropRelationsAllBuffers ( SMgrRelation * smgr_reln,
int  nlocators 
)

Definition at line 4143 of file bufmgr.c.

4144 {
4145  int i;
4146  int n = 0;
4147  SMgrRelation *rels;
4148  BlockNumber (*block)[MAX_FORKNUM + 1];
4149  uint64 nBlocksToInvalidate = 0;
4150  RelFileLocator *locators;
4151  bool cached = true;
4152  bool use_bsearch;
4153 
4154  if (nlocators == 0)
4155  return;
4156 
4157  rels = palloc(sizeof(SMgrRelation) * nlocators); /* non-local relations */
4158 
4159  /* If it's a local relation, it's localbuf.c's problem. */
4160  for (i = 0; i < nlocators; i++)
4161  {
4162  if (RelFileLocatorBackendIsTemp(smgr_reln[i]->smgr_rlocator))
4163  {
4164  if (smgr_reln[i]->smgr_rlocator.backend == MyProcNumber)
4165  DropRelationAllLocalBuffers(smgr_reln[i]->smgr_rlocator.locator);
4166  }
4167  else
4168  rels[n++] = smgr_reln[i];
4169  }
4170 
4171  /*
4172  * If there are no non-local relations, then we're done. Release the
4173  * memory and return.
4174  */
4175  if (n == 0)
4176  {
4177  pfree(rels);
4178  return;
4179  }
4180 
4181  /*
4182  * This is used to remember the number of blocks for all the relations
4183  * forks.
4184  */
4185  block = (BlockNumber (*)[MAX_FORKNUM + 1])
4186  palloc(sizeof(BlockNumber) * n * (MAX_FORKNUM + 1));
4187 
4188  /*
4189  * We can avoid scanning the entire buffer pool if we know the exact size
4190  * of each of the given relation forks. See DropRelationBuffers.
4191  */
4192  for (i = 0; i < n && cached; i++)
4193  {
4194  for (int j = 0; j <= MAX_FORKNUM; j++)
4195  {
4196  /* Get the number of blocks for a relation's fork. */
4197  block[i][j] = smgrnblocks_cached(rels[i], j);
4198 
4199  /* We need to only consider the relation forks that exists. */
4200  if (block[i][j] == InvalidBlockNumber)
4201  {
4202  if (!smgrexists(rels[i], j))
4203  continue;
4204  cached = false;
4205  break;
4206  }
4207 
4208  /* calculate the total number of blocks to be invalidated */
4209  nBlocksToInvalidate += block[i][j];
4210  }
4211  }
4212 
4213  /*
4214  * We apply the optimization iff the total number of blocks to invalidate
4215  * is below the BUF_DROP_FULL_SCAN_THRESHOLD.
4216  */
4217  if (cached && nBlocksToInvalidate < BUF_DROP_FULL_SCAN_THRESHOLD)
4218  {
4219  for (i = 0; i < n; i++)
4220  {
4221  for (int j = 0; j <= MAX_FORKNUM; j++)
4222  {
4223  /* ignore relation forks that doesn't exist */
4224  if (!BlockNumberIsValid(block[i][j]))
4225  continue;
4226 
4227  /* drop all the buffers for a particular relation fork */
4228  FindAndDropRelationBuffers(rels[i]->smgr_rlocator.locator,
4229  j, block[i][j], 0);
4230  }
4231  }
4232 
4233  pfree(block);
4234  pfree(rels);
4235  return;
4236  }
4237 
4238  pfree(block);
4239  locators = palloc(sizeof(RelFileLocator) * n); /* non-local relations */
4240  for (i = 0; i < n; i++)
4241  locators[i] = rels[i]->smgr_rlocator.locator;
4242 
4243  /*
4244  * For low number of relations to drop just use a simple walk through, to
4245  * save the bsearch overhead. The threshold to use is rather a guess than
4246  * an exactly determined value, as it depends on many factors (CPU and RAM
4247  * speeds, amount of shared buffers etc.).
4248  */
4249  use_bsearch = n > RELS_BSEARCH_THRESHOLD;
4250 
4251  /* sort the list of rlocators if necessary */
4252  if (use_bsearch)
4253  qsort(locators, n, sizeof(RelFileLocator), rlocator_comparator);
4254 
4255  for (i = 0; i < NBuffers; i++)
4256  {
4257  RelFileLocator *rlocator = NULL;
4258  BufferDesc *bufHdr = GetBufferDescriptor(i);
4259  uint32 buf_state;
4260 
4261  /*
4262  * As in DropRelationBuffers, an unlocked precheck should be safe and
4263  * saves some cycles.
4264  */
4265 
4266  if (!use_bsearch)
4267  {
4268  int j;
4269 
4270  for (j = 0; j < n; j++)
4271  {
4272  if (BufTagMatchesRelFileLocator(&bufHdr->tag, &locators[j]))
4273  {
4274  rlocator = &locators[j];
4275  break;
4276  }
4277  }
4278  }
4279  else
4280  {
4281  RelFileLocator locator;
4282 
4283  locator = BufTagGetRelFileLocator(&bufHdr->tag);
4284  rlocator = bsearch((const void *) &(locator),
4285  locators, n, sizeof(RelFileLocator),
4286  rlocator_comparator);
4287 
4288 
4289  /* buffer doesn't belong to any of the given relfilelocators; skip it */
4290  if (rlocator == NULL)
4291  continue;
4292 
4293  buf_state = LockBufHdr(bufHdr);
4294  if (BufTagMatchesRelFileLocator(&bufHdr->tag, rlocator))
4295  InvalidateBuffer(bufHdr); /* releases spinlock */
4296  else
4297  UnlockBufHdr(bufHdr, buf_state);
4298  }
4299 
4300  pfree(locators);
4301  pfree(rels);
4302 }
#define RELS_BSEARCH_THRESHOLD
Definition: bufmgr.c:78
if(TABLE==NULL||TABLE_index==NULL)
Definition: isn.c:77
void DropRelationAllLocalBuffers(RelFileLocator rlocator)
Definition: localbuf.c:537
#define qsort(a, b, c, d)
Definition: port.h:449

References BlockNumberIsValid(), BUF_DROP_FULL_SCAN_THRESHOLD, BufTagGetRelFileLocator(), BufTagMatchesRelFileLocator(), DropRelationAllLocalBuffers(), FindAndDropRelationBuffers(), GetBufferDescriptor(), i, if(), InvalidateBuffer(), InvalidBlockNumber, j, LockBufHdr(), MAX_FORKNUM, MyProcNumber, NBuffers, palloc(), pfree(), qsort, RelFileLocatorBackendIsTemp, RELS_BSEARCH_THRESHOLD, rlocator_comparator(), smgrexists(), smgrnblocks_cached(), BufferDesc::tag, and UnlockBufHdr().

Referenced by smgrdounlinkall().

◆ EvictUnpinnedBuffer()

bool EvictUnpinnedBuffer ( Buffer  buf)

Definition at line 6069 of file bufmgr.c.

6070 {
6071  BufferDesc *desc;
6072  uint32 buf_state;
6073  bool result;
6074 
6075  /* Make sure we can pin the buffer. */
6076  ResourceOwnerEnlarge(CurrentResourceOwner);
6077  ReservePrivateRefCountEntry();
6078 
6079  Assert(!BufferIsLocal(buf));
6080  desc = GetBufferDescriptor(buf - 1);
6081 
6082  /* Lock the header and check if it's valid. */
6083  buf_state = LockBufHdr(desc);
6084  if ((buf_state & BM_VALID) == 0)
6085  {
6086  UnlockBufHdr(desc, buf_state);
6087  return false;
6088  }
6089 
6090  /* Check that it's not pinned already. */
6091  if (BUF_STATE_GET_REFCOUNT(buf_state) > 0)
6092  {
6093  UnlockBufHdr(desc, buf_state);
6094  return false;
6095  }
6096 
6097  PinBuffer_Locked(desc); /* releases spinlock */
6098 
6099  /* If it was dirty, try to clean it once. */
6100  if (buf_state & BM_DIRTY)
6101  {
6102  LWLockAcquire(BufferDescriptorGetContentLock(desc), LW_SHARED);
6103  FlushBuffer(desc, NULL, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
6104  LWLockRelease(BufferDescriptorGetContentLock(desc));
6105  }
6106 
6107  /* This will return false if it becomes dirty or someone else pins it. */
6108  result = InvalidateVictimBuffer(desc);
6109 
6110  UnpinBuffer(desc);
6111 
6112  return result;
6113 }
static void FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object, IOContext io_context)
Definition: bufmgr.c:3773
static void PinBuffer_Locked(BufferDesc *buf)
Definition: bufmgr.c:2752
static bool InvalidateVictimBuffer(BufferDesc *buf_hdr)
Definition: bufmgr.c:1870
@ IOOBJECT_RELATION
Definition: pgstat.h:280

References Assert, BM_DIRTY, BM_VALID, buf, BUF_STATE_GET_REFCOUNT, BufferDescriptorGetContentLock(), BufferIsLocal, CurrentResourceOwner, FlushBuffer(), GetBufferDescriptor(), InvalidateVictimBuffer(), IOCONTEXT_NORMAL, IOOBJECT_RELATION, LockBufHdr(), LW_SHARED, LWLockAcquire(), LWLockRelease(), PinBuffer_Locked(), ReservePrivateRefCountEntry(), ResourceOwnerEnlarge(), UnlockBufHdr(), and UnpinBuffer().

Referenced by pg_buffercache_evict().
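
A sketch resembling the pg_buffercache_evict() caller (assumes NBuffers from miscadmin.h gives the shared_buffers count; my_evict_all_unpinned is hypothetical): shared buffer IDs run from 1 to NBuffers, and EvictUnpinnedBuffer() simply returns false for buffers that are pinned, invalid, or become dirty again.

#include "postgres.h"

#include "miscadmin.h"
#include "storage/bufmgr.h"

static int
my_evict_all_unpinned(void)
{
    int         evicted = 0;

    /* Try each shared buffer once; skip anything pinned or re-dirtied. */
    for (int i = 1; i <= NBuffers; i++)
    {
        if (EvictUnpinnedBuffer(i))
            evicted++;
    }
    return evicted;
}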

◆ ExtendBufferedRel()

Buffer ExtendBufferedRel ( BufferManagerRelation  bmr,
ForkNumber  forkNum,
BufferAccessStrategy  strategy,
uint32  flags 
)

Definition at line 845 of file bufmgr.c.

849 {
850  Buffer buf;
851  uint32 extend_by = 1;
852 
853  ExtendBufferedRelBy(bmr, forkNum, strategy, flags, extend_by,
854  &buf, &extend_by);
855 
856  return buf;
857 }
BlockNumber ExtendBufferedRelBy(BufferManagerRelation bmr, ForkNumber fork, BufferAccessStrategy strategy, uint32 flags, uint32 extend_by, Buffer *buffers, uint32 *extended_by)
Definition: bufmgr.c:877

References buf, and ExtendBufferedRelBy().

Referenced by _bt_allocbuf(), _hash_getnewbuf(), BloomNewBuffer(), brinbuild(), brinbuildempty(), fill_seq_fork_with_data(), ginbuildempty(), GinNewBuffer(), gistbuildempty(), gistNewBuffer(), ReadBuffer_common(), revmap_physical_extend(), and SpGistNewBuffer().
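
A minimal sketch of the single-page extension pattern used by index AMs such as _bt_allocbuf() (my_add_page is hypothetical): BMR_REL() wraps the Relation into a BufferManagerRelation, and EB_LOCK_FIRST returns the new, zero-filled page already exclusively locked.

#include "postgres.h"

#include "common/relpath.h"
#include "storage/bufmgr.h"

static Buffer
my_add_page(Relation rel)
{
    Buffer      buf;

    buf = ExtendBufferedRel(BMR_REL(rel), MAIN_FORKNUM, NULL,
                            EB_LOCK_FIRST);

    /* Caller initializes the page, marks it dirty, then unlocks/releases. */
    return buf;
}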

◆ ExtendBufferedRelBy()

BlockNumber ExtendBufferedRelBy ( BufferManagerRelation  bmr,
ForkNumber  fork,
BufferAccessStrategy  strategy,
uint32  flags,
uint32  extend_by,
Buffer * buffers,
uint32 * extended_by 
)

Definition at line 877 of file bufmgr.c.

884 {
885  Assert((bmr.rel != NULL) != (bmr.smgr != NULL));
886  Assert(bmr.smgr == NULL || bmr.relpersistence != 0);
887  Assert(extend_by > 0);
888 
889  if (bmr.smgr == NULL)
890  {
891  bmr.smgr = RelationGetSmgr(bmr.rel);
892  bmr.relpersistence = bmr.rel->rd_rel->relpersistence;
893  }
894 
895  return ExtendBufferedRelCommon(bmr, fork, strategy, flags,
896  extend_by, InvalidBlockNumber,
897  buffers, extended_by);
898 }
static BlockNumber ExtendBufferedRelCommon(BufferManagerRelation bmr, ForkNumber fork, BufferAccessStrategy strategy, uint32 flags, uint32 extend_by, BlockNumber extend_upto, Buffer *buffers, uint32 *extended_by)
Definition: bufmgr.c:2135
static SMgrRelation RelationGetSmgr(Relation rel)
Definition: rel.h:567
struct SMgrRelationData * smgr
Definition: bufmgr.h:103
Form_pg_class rd_rel
Definition: rel.h:111

References Assert, ExtendBufferedRelCommon(), InvalidBlockNumber, RelationData::rd_rel, BufferManagerRelation::rel, RelationGetSmgr(), BufferManagerRelation::relpersistence, and BufferManagerRelation::smgr.

Referenced by ExtendBufferedRel(), and RelationAddBlocks().
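
A hedged sketch of bulk extension in the style of RelationAddBlocks() (my_bulk_extend is hypothetical): request several pages at once and read back how many were actually granted, since the count can be clamped by the per-backend pin limit.

#include "postgres.h"

#include "common/relpath.h"
#include "storage/bufmgr.h"

static BlockNumber
my_bulk_extend(Relation rel, BufferAccessStrategy strategy)
{
    Buffer      new_buffers[16];
    uint32      extended_by = 0;
    BlockNumber first_block;

    /* Ask for 16 new pages; fewer may come back under pin limits. */
    first_block = ExtendBufferedRelBy(BMR_REL(rel), MAIN_FORKNUM, strategy,
                                      0 /* flags */ , 16,
                                      new_buffers, &extended_by);

    /* Release the pins on pages we are not going to use right away. */
    for (uint32 i = 0; i < extended_by; i++)
        ReleaseBuffer(new_buffers[i]);

    return first_block;
}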

◆ ExtendBufferedRelCommon()

static BlockNumber ExtendBufferedRelCommon ( BufferManagerRelation  bmr,
ForkNumber  fork,
BufferAccessStrategy  strategy,
uint32  flags,
uint32  extend_by,
BlockNumber  extend_upto,
Buffer * buffers,
uint32 * extended_by 
)
static

Definition at line 2135 of file bufmgr.c.

2143 {
2144  BlockNumber first_block;
2145 
2146  TRACE_POSTGRESQL_BUFFER_EXTEND_START(fork,
2150  bmr.smgr->smgr_rlocator.backend,
2151  extend_by);
2152 
2153  if (bmr.relpersistence == RELPERSISTENCE_TEMP)
2154  first_block = ExtendBufferedRelLocal(bmr, fork, flags,
2155  extend_by, extend_upto,
2156  buffers, &extend_by);
2157  else
2158  first_block = ExtendBufferedRelShared(bmr, fork, strategy, flags,
2159  extend_by, extend_upto,
2160  buffers, &extend_by);
2161  *extended_by = extend_by;
2162 
2163  TRACE_POSTGRESQL_BUFFER_EXTEND_DONE(fork,
2167  bmr.smgr->smgr_rlocator.backend,
2168  *extended_by,
2169  first_block);
2170 
2171  return first_block;
2172 }
static BlockNumber ExtendBufferedRelShared(BufferManagerRelation bmr, ForkNumber fork, BufferAccessStrategy strategy, uint32 flags, uint32 extend_by, BlockNumber extend_upto, Buffer *buffers, uint32 *extended_by)
Definition: bufmgr.c:2179
BlockNumber ExtendBufferedRelLocal(BufferManagerRelation bmr, ForkNumber fork, uint32 flags, uint32 extend_by, BlockNumber extend_upto, Buffer *buffers, uint32 *extended_by)
Definition: localbuf.c:313
RelFileNumber relNumber

References RelFileLocatorBackend::backend, RelFileLocator::dbOid, ExtendBufferedRelLocal(), ExtendBufferedRelShared(), RelFileLocatorBackend::locator, RelFileLocator::relNumber, BufferManagerRelation::relpersistence, BufferManagerRelation::smgr, SMgrRelationData::smgr_rlocator, and RelFileLocator::spcOid.

Referenced by ExtendBufferedRelBy(), and ExtendBufferedRelTo().

◆ ExtendBufferedRelShared()

static BlockNumber ExtendBufferedRelShared ( BufferManagerRelation  bmr,
ForkNumber  fork,
BufferAccessStrategy  strategy,
uint32  flags,
uint32  extend_by,
BlockNumber  extend_upto,
Buffer * buffers,
uint32 * extended_by 
)
static

Definition at line 2179 of file bufmgr.c.

2187 {
2188  BlockNumber first_block;
2189  IOContext io_context = IOContextForStrategy(strategy);
2190  instr_time io_start;
2191 
2192  LimitAdditionalPins(&extend_by);
2193 
2194  /*
2195  * Acquire victim buffers for extension without holding extension lock.
2196  * Writing out victim buffers is the most expensive part of extending the
2197  * relation, particularly when doing so requires WAL flushes. Zeroing out
2198  * the buffers is also quite expensive, so do that before holding the
2199  * extension lock as well.
2200  *
2201  * These pages are pinned by us and not valid. While we hold the pin they
2202  * can't be acquired as victim buffers by another backend.
2203  */
2204  for (uint32 i = 0; i < extend_by; i++)
2205  {
2206  Block buf_block;
2207 
2208  buffers[i] = GetVictimBuffer(strategy, io_context);
2209  buf_block = BufHdrGetBlock(GetBufferDescriptor(buffers[i] - 1));
2210 
2211  /* new buffers are zero-filled */
2212  MemSet((char *) buf_block, 0, BLCKSZ);
2213  }
2214 
2215  /*
2216  * Lock relation against concurrent extensions, unless requested not to.
2217  *
2218  * We use the same extension lock for all forks. That's unnecessarily
2219  * restrictive, but currently extensions for forks don't happen often
2220  * enough to make it worth locking more granularly.
2221  *
2222  * Note that another backend might have extended the relation by the time
2223  * we get the lock.
2224  */
2225  if (!(flags & EB_SKIP_EXTENSION_LOCK))
2226  LockRelationForExtension(bmr.rel, ExclusiveLock);
2227 
2228  /*
2229  * If requested, invalidate size cache, so that smgrnblocks asks the
2230  * kernel.
2231  */
2232  if (flags & EB_CLEAR_SIZE_CACHE)
2233  bmr.smgr->smgr_cached_nblocks[fork] = InvalidBlockNumber;
2234 
2235  first_block = smgrnblocks(bmr.smgr, fork);
2236 
2237  /*
2238  * Now that we have the accurate relation size, check if the caller wants
2239  * us to extend to only up to a specific size. If there were concurrent
2240  * extensions, we might have acquired too many buffers and need to release
2241  * them.
2242  */
2243  if (extend_upto != InvalidBlockNumber)
2244  {
2245  uint32 orig_extend_by = extend_by;
2246 
2247  if (first_block > extend_upto)
2248  extend_by = 0;
2249  else if ((uint64) first_block + extend_by > extend_upto)
2250  extend_by = extend_upto - first_block;
2251 
2252  for (uint32 i = extend_by; i < orig_extend_by; i++)
2253  {
2254  BufferDesc *buf_hdr = GetBufferDescriptor(buffers[i] - 1);
2255 
2256  /*
2257  * The victim buffer we acquired previously is clean and unused,
2258  * let it be found again quickly
2259  */
2260  StrategyFreeBuffer(buf_hdr);
2261  UnpinBuffer(buf_hdr);
2262  }
2263 
2264  if (extend_by == 0)
2265  {
2266  if (!(flags & EB_SKIP_EXTENSION_LOCK))
2267  UnlockRelationForExtension(bmr.rel, ExclusiveLock);
2268  *extended_by = extend_by;
2269  return first_block;
2270  }
2271  }
2272 
2273  /* Fail if relation is already at maximum possible length */
2274  if ((uint64) first_block + extend_by >= MaxBlockNumber)
2275  ereport(ERROR,
2276  (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
2277  errmsg("cannot extend relation %s beyond %u blocks",
2278  relpath(bmr.smgr->smgr_rlocator, fork),
2279  MaxBlockNumber)));
2280 
2281  /*
2282  * Insert buffers into buffer table, mark as IO_IN_PROGRESS.
2283  *
2284  * This needs to happen before we extend the relation, because as soon as
2285  * we do, other backends can start to read in those pages.
2286  */
2287  for (uint32 i = 0; i < extend_by; i++)
2288  {
2289  Buffer victim_buf = buffers[i];
2290  BufferDesc *victim_buf_hdr = GetBufferDescriptor(victim_buf - 1);
2291  BufferTag tag;
2292  uint32 hash;
2293  LWLock *partition_lock;
2294  int existing_id;
2295 
2296  /* in case we need to pin an existing buffer below */
2297  ResourceOwnerEnlarge(CurrentResourceOwner);
2298  ReservePrivateRefCountEntry();
2299 
2300  InitBufferTag(&tag, &bmr.smgr->smgr_rlocator.locator, fork, first_block + i);
2301  hash = BufTableHashCode(&tag);
2302  partition_lock = BufMappingPartitionLock(hash);
2303 
2304  LWLockAcquire(partition_lock, LW_EXCLUSIVE);
2305 
2306  existing_id = BufTableInsert(&tag, hash, victim_buf_hdr->buf_id);
2307 
2308  /*
2309  * We get here only in the corner case where we are trying to extend
2310  * the relation but we found a pre-existing buffer. This can happen
2311  * because a prior attempt at extending the relation failed, and
2312  * because mdread doesn't complain about reads beyond EOF (when
2313  * zero_damaged_pages is ON) and so a previous attempt to read a block
2314  * beyond EOF could have left a "valid" zero-filled buffer.
2315  * Unfortunately, we have also seen this case occurring because of
2316  * buggy Linux kernels that sometimes return an lseek(SEEK_END) result
2317  * that doesn't account for a recent write. In that situation, the
2318  * pre-existing buffer would contain valid data that we don't want to
2319  * overwrite. Since the legitimate cases should always have left a
2320  * zero-filled buffer, complain if not PageIsNew.
2321  */
2322  if (existing_id >= 0)
2323  {
2324  BufferDesc *existing_hdr = GetBufferDescriptor(existing_id);
2325  Block buf_block;
2326  bool valid;
2327 
2328  /*
2329  * Pin the existing buffer before releasing the partition lock,
2330  * preventing it from being evicted.
2331  */
2332  valid = PinBuffer(existing_hdr, strategy);
2333 
2334  LWLockRelease(partition_lock);
2335 
2336  /*
2337  * The victim buffer we acquired previously is clean and unused,
2338  * let it be found again quickly
2339  */
2340  StrategyFreeBuffer(victim_buf_hdr);
2341  UnpinBuffer(victim_buf_hdr);
2342 
2343  buffers[i] = BufferDescriptorGetBuffer(existing_hdr);
2344  buf_block = BufHdrGetBlock(existing_hdr);
2345 
2346  if (valid && !PageIsNew((Page) buf_block))
2347  ereport(ERROR,
2348  (errmsg("unexpected data beyond EOF in block %u of relation %s",
2349  existing_hdr->tag.blockNum, relpath(bmr.smgr->smgr_rlocator, fork)),
2350  errhint("This has been seen to occur with buggy kernels; consider updating your system.")));
2351 
2352  /*
2353  * We *must* do smgr[zero]extend before succeeding, else the page
2354  * will not be reserved by the kernel, and the next P_NEW call
2355  * will decide to return the same page. Clear the BM_VALID bit,
2356  * do StartBufferIO() and proceed.
2357  *
2358  * Loop to handle the very small possibility that someone re-sets
2359  * BM_VALID between our clearing it and StartBufferIO inspecting
2360  * it.
2361  */
2362  do
2363  {
2364  uint32 buf_state = LockBufHdr(existing_hdr);
2365 
2366  buf_state &= ~BM_VALID;
2367  UnlockBufHdr(existing_hdr, buf_state);
2368  } while (!StartBufferIO(existing_hdr, true, false));
2369  }
2370  else
2371  {
2372  uint32 buf_state;
2373 
2374  buf_state = LockBufHdr(victim_buf_hdr);
2375 
2376  /* some sanity checks while we hold the buffer header lock */
2377  Assert(!(buf_state & (BM_VALID | BM_TAG_VALID | BM_DIRTY | BM_JUST_DIRTIED)));
2378  Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 1);
2379 
2380  victim_buf_hdr->tag = tag;
2381 
2382  buf_state |= BM_TAG_VALID | BUF_USAGECOUNT_ONE;
2383  if (bmr.relpersistence == RELPERSISTENCE_PERMANENT || fork == INIT_FORKNUM)
2384  buf_state |= BM_PERMANENT;
2385 
2386  UnlockBufHdr(victim_buf_hdr, buf_state);
2387 
2388  LWLockRelease(partition_lock);
2389 
2390  /* XXX: could combine the locked operations in it with the above */
2391  StartBufferIO(victim_buf_hdr, true, false);
2392  }
2393  }
2394 
2396 
2397  /*
2398  * Note: if smgrzeroextend fails, we will end up with buffers that are
2399  * allocated but not marked BM_VALID. The next relation extension will
2400  * still select the same block number (because the relation didn't get any
2401  * longer on disk) and so future attempts to extend the relation will find
2402  * the same buffers (if they have not been recycled) but come right back
2403  * here to try smgrzeroextend again.
2404  *
2405  * We don't need to set checksum for all-zero pages.
2406  */
2407  smgrzeroextend(bmr.smgr, fork, first_block, extend_by, false);
2408 
2409  /*
2410  * Release the file-extension lock; it's now OK for someone else to extend
2411  * the relation some more.
2412  *
2413  * We remove IO_IN_PROGRESS after this, as waking up waiting backends can
2414  * take noticeable time.
2415  */
2416  if (!(flags & EB_SKIP_EXTENSION_LOCK))
2417  UnlockRelationForExtension(bmr.rel, ExclusiveLock);
2418 
2419  pgstat_count_io_op_time(IOOBJECT_RELATION, io_context, IOOP_EXTEND,
2420  io_start, extend_by);
2421 
2422  /* Set BM_VALID, terminate IO, and wake up any waiters */
2423  for (uint32 i = 0; i < extend_by; i++)
2424  {
2425  Buffer buf = buffers[i];
2426  BufferDesc *buf_hdr = GetBufferDescriptor(buf - 1);
2427  bool lock = false;
2428 
2429  if (flags & EB_LOCK_FIRST && i == 0)
2430  lock = true;
2431  else if (flags & EB_LOCK_TARGET)
2432  {
2433  Assert(extend_upto != InvalidBlockNumber);
2434  if (first_block + i + 1 == extend_upto)
2435  lock = true;
2436  }
2437 
2438  if (lock)
2439  LWLockAcquire(BufferDescriptorGetContentLock(buf_hdr), LW_EXCLUSIVE);
2440 
2441  TerminateBufferIO(buf_hdr, false, BM_VALID, true);
2442  }
2443 
2444  pgBufferUsage.shared_blks_written += extend_by;
2445 
2446  *extended_by = extend_by;
2447 
2448  return first_block;
2449 }
#define MaxBlockNumber
Definition: block.h:35
#define BM_JUST_DIRTIED
Definition: buf_internals.h:66
static Buffer BufferDescriptorGetBuffer(const BufferDesc *bdesc)
bool track_io_timing
Definition: bufmgr.c:142
#define BufHdrGetBlock(bufHdr)
Definition: bufmgr.c:67
void LimitAdditionalPins(uint32 *additional_pins)
Definition: bufmgr.c:2104
static bool StartBufferIO(BufferDesc *buf, bool forInput, bool nowait)
Definition: bufmgr.c:5531
void * Block
Definition: bufmgr.h:25
@ EB_LOCK_TARGET
Definition: bufmgr.h:92
@ EB_CLEAR_SIZE_CACHE
Definition: bufmgr.h:89
@ EB_SKIP_EXTENSION_LOCK
Definition: bufmgr.h:74
@ EB_LOCK_FIRST
Definition: bufmgr.h:86
Pointer Page
Definition: bufpage.h:78
static bool PageIsNew(Page page)
Definition: bufpage.h:230
#define MemSet(start, val, len)
Definition: c.h:1020
int errhint(const char *fmt,...)
Definition: elog.c:1317
IOContext IOContextForStrategy(BufferAccessStrategy strategy)
Definition: freelist.c:758
BufferUsage pgBufferUsage
Definition: instrument.c:20
void LockRelationForExtension(Relation relation, LOCKMODE lockmode)
Definition: lmgr.c:430
void UnlockRelationForExtension(Relation relation, LOCKMODE lockmode)
Definition: lmgr.c:480
#define ExclusiveLock
Definition: lockdefs.h:42
IOContext
Definition: pgstat.h:287
@ IOOP_EXTEND
Definition: pgstat.h:299
instr_time pgstat_prepare_io_time(bool track_io_guc)
Definition: pgstat_io.c:100
void pgstat_count_io_op_time(IOObject io_object, IOContext io_context, IOOp io_op, instr_time start_time, uint32 cnt)
Definition: pgstat_io.c:122
static unsigned hash(unsigned *uv, int n)
Definition: rege_dfa.c:715
#define relpath(rlocator, forknum)
Definition: relpath.h:94
BlockNumber smgrnblocks(SMgrRelation reln, ForkNumber forknum)
Definition: smgr.c:655
void smgrzeroextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, int nblocks, bool skipFsync)
Definition: smgr.c:560
int64 shared_blks_written
Definition: instrument.h:29
BlockNumber smgr_cached_nblocks[MAX_FORKNUM+1]
Definition: smgr.h:46

References Assert, buftag::blockNum, BM_DIRTY, BM_JUST_DIRTIED, BM_PERMANENT, BM_TAG_VALID, BM_VALID, buf, BufferDesc::buf_id, BUF_STATE_GET_REFCOUNT, BUF_USAGECOUNT_ONE, BufferDescriptorGetBuffer(), BufferDescriptorGetContentLock(), BufHdrGetBlock, BufMappingPartitionLock(), BufTableHashCode(), BufTableInsert(), CurrentResourceOwner, EB_CLEAR_SIZE_CACHE, EB_LOCK_FIRST, EB_LOCK_TARGET, EB_SKIP_EXTENSION_LOCK, ereport, errcode(), errhint(), errmsg(), ERROR, ExclusiveLock, GetBufferDescriptor(), GetVictimBuffer(), hash(), i, INIT_FORKNUM, InitBufferTag(), InvalidBlockNumber, IOContextForStrategy(), IOOBJECT_RELATION, IOOP_EXTEND, LimitAdditionalPins(), RelFileLocatorBackend::locator, LockBufHdr(), LockRelationForExtension(), LW_EXCLUSIVE, LWLockAcquire(), LWLockRelease(), MaxBlockNumber, MemSet, PageIsNew(), pgBufferUsage, pgstat_count_io_op_time(), pgstat_prepare_io_time(), PinBuffer(), BufferManagerRelation::rel, relpath, BufferManagerRelation::relpersistence, ReservePrivateRefCountEntry(), ResourceOwnerEnlarge(), BufferUsage::shared_blks_written, BufferManagerRelation::smgr, SMgrRelationData::smgr_cached_nblocks, SMgrRelationData::smgr_rlocator, smgrnblocks(), smgrzeroextend(), StartBufferIO(), StrategyFreeBuffer(), BufferDesc::tag, TerminateBufferIO(), track_io_timing, UnlockBufHdr(), UnlockRelationForExtension(), and UnpinBuffer().

Referenced by ExtendBufferedRelCommon().

◆ ExtendBufferedRelTo()

Buffer ExtendBufferedRelTo ( BufferManagerRelation  bmr,
ForkNumber  fork,
BufferAccessStrategy  strategy,
uint32  flags,
BlockNumber  extend_to,
ReadBufferMode  mode 
)

Definition at line 909 of file bufmgr.c.

915 {
916  BlockNumber current_size;
917  uint32 extended_by = 0;
918  Buffer buffer = InvalidBuffer;
919  Buffer buffers[64];
920 
921  Assert((bmr.rel != NULL) != (bmr.smgr != NULL));
922  Assert(bmr.smgr == NULL || bmr.relpersistence != 0);
923  Assert(extend_to != InvalidBlockNumber && extend_to > 0);
924 
925  if (bmr.smgr == NULL)
926  {
927  bmr.smgr = RelationGetSmgr(bmr.rel);
928  bmr.relpersistence = bmr.rel->rd_rel->relpersistence;
929  }
930 
931  /*
932  * If desired, create the file if it doesn't exist. If
933  * smgr_cached_nblocks[fork] is positive then it must exist, no need for
934  * an smgrexists call.
935  */
936  if ((flags & EB_CREATE_FORK_IF_NEEDED) &&
937  (bmr.smgr->smgr_cached_nblocks[fork] == 0 ||
938  bmr.smgr->smgr_cached_nblocks[fork] == InvalidBlockNumber) &&
939  !smgrexists(bmr.smgr, fork))
940  {
941  LockRelationForExtension(bmr.rel, ExclusiveLock);
942 
943  /* recheck, fork might have been created concurrently */
944  if (!smgrexists(bmr.smgr, fork))
945  smgrcreate(bmr.smgr, fork, flags & EB_PERFORMING_RECOVERY);
946 
947  UnlockRelationForExtension(bmr.rel, ExclusiveLock);
948  }
949 
950  /*
951  * If requested, invalidate size cache, so that smgrnblocks asks the
952  * kernel.
953  */
954  if (flags & EB_CLEAR_SIZE_CACHE)
955  bmr.smgr->smgr_cached_nblocks[fork] = InvalidBlockNumber;
956 
957  /*
958  * Estimate how many pages we'll need to extend by. This avoids acquiring
959  * unnecessarily many victim buffers.
960  */
961  current_size = smgrnblocks(bmr.smgr, fork);
962 
963  /*
964  * Since no-one else can be looking at the page contents yet, there is no
965  * difference between an exclusive lock and a cleanup-strength lock. Note
966  * that we pass the original mode to ReadBuffer_common() below, when
967  * falling back to reading the buffer to a concurrent relation extension.
968  */
969  if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK)
970  flags |= EB_LOCK_TARGET;
971 
972  while (current_size < extend_to)
973  {
974  uint32 num_pages = lengthof(buffers);
975  BlockNumber first_block;
976 
977  if ((uint64) current_size + num_pages > extend_to)
978  num_pages = extend_to - current_size;
979 
980  first_block = ExtendBufferedRelCommon(bmr, fork, strategy, flags,
981  num_pages, extend_to,
982  buffers, &extended_by);
983 
984  current_size = first_block + extended_by;
985  Assert(num_pages != 0 || current_size >= extend_to);
986 
987  for (uint32 i = 0; i < extended_by; i++)
988  {
989  if (first_block + i != extend_to - 1)
990  ReleaseBuffer(buffers[i]);
991  else
992  buffer = buffers[i];
993  }
994  }
995 
996  /*
997  * It's possible that another backend concurrently extended the relation.
998  * In that case read the buffer.
999  *
1000  * XXX: Should we control this via a flag?
1001  */
1002  if (buffer == InvalidBuffer)
1003  {
1004  Assert(extended_by == 0);
1005  buffer = ReadBuffer_common(bmr.rel, bmr.smgr, 0,
1006  fork, extend_to - 1, mode, strategy);
1007  }
1008 
1009  return buffer;
1010 }
static Buffer ReadBuffer_common(Relation rel, SMgrRelation smgr, char smgr_persistence, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, BufferAccessStrategy strategy)
Definition: bufmgr.c:1198
void ReleaseBuffer(Buffer buffer)
Definition: bufmgr.c:4896
@ EB_PERFORMING_RECOVERY
Definition: bufmgr.h:77
@ EB_CREATE_FORK_IF_NEEDED
Definition: bufmgr.h:83
@ RBM_ZERO_AND_CLEANUP_LOCK
Definition: bufmgr.h:48
@ RBM_ZERO_AND_LOCK
Definition: bufmgr.h:46
#define lengthof(array)
Definition: c.h:788
static PgChecksumMode mode
Definition: pg_checksums.c:56
int64 current_size
Definition: pg_checksums.c:64

References Assert, PrivateRefCountEntry::buffer, current_size, EB_CLEAR_SIZE_CACHE, EB_CREATE_FORK_IF_NEEDED, EB_LOCK_TARGET, EB_PERFORMING_RECOVERY, ExclusiveLock, ExtendBufferedRelCommon(), i, InvalidBlockNumber, InvalidBuffer, lengthof, LockRelationForExtension(), mode, RBM_ZERO_AND_CLEANUP_LOCK, RBM_ZERO_AND_LOCK, RelationData::rd_rel, ReadBuffer_common(), BufferManagerRelation::rel, RelationGetSmgr(), ReleaseBuffer(), BufferManagerRelation::relpersistence, BufferManagerRelation::smgr, SMgrRelationData::smgr_cached_nblocks, smgrcreate(), smgrexists(), smgrnblocks(), and UnlockRelationForExtension().

Referenced by fsm_extend(), vm_extend(), and XLogReadBufferExtended().
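
A sketch of the "make sure block N exists" pattern used by fsm_extend() and vm_extend() (my_ensure_block is hypothetical): create the fork if it is missing, extend it up to the target block, and get that block back locked; extend_to is a block count, hence target + 1.

#include "postgres.h"

#include "common/relpath.h"
#include "storage/bufmgr.h"

static Buffer
my_ensure_block(Relation rel, ForkNumber fork, BlockNumber target)
{
    /* If another backend already extended past the target, the existing
     * block is simply read back instead. */
    return ExtendBufferedRelTo(BMR_REL(rel), fork, NULL,
                               EB_CREATE_FORK_IF_NEEDED | EB_CLEAR_SIZE_CACHE,
                               target + 1,
                               RBM_ZERO_AND_LOCK);
}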

◆ FindAndDropRelationBuffers()

static void FindAndDropRelationBuffers ( RelFileLocator  rlocator,
ForkNumber  forkNum,
BlockNumber  nForkBlock,
BlockNumber  firstDelBlock 
)
static

Definition at line 4314 of file bufmgr.c.

4317 {
4318  BlockNumber curBlock;
4319 
4320  for (curBlock = firstDelBlock; curBlock < nForkBlock; curBlock++)
4321  {
4322  uint32 bufHash; /* hash value for tag */
4323  BufferTag bufTag; /* identity of requested block */
4324  LWLock *bufPartitionLock; /* buffer partition lock for it */
4325  int buf_id;
4326  BufferDesc *bufHdr;
4327  uint32 buf_state;
4328 
4329  /* create a tag so we can lookup the buffer */
4330  InitBufferTag(&bufTag, &rlocator, forkNum, curBlock);
4331 
4332  /* determine its hash code and partition lock ID */
4333  bufHash = BufTableHashCode(&bufTag);
4334  bufPartitionLock = BufMappingPartitionLock(bufHash);
4335 
4336  /* Check that it is in the buffer pool. If not, do nothing. */
4337  LWLockAcquire(bufPartitionLock, LW_SHARED);
4338  buf_id = BufTableLookup(&bufTag, bufHash);
4339  LWLockRelease(bufPartitionLock);
4340 
4341  if (buf_id < 0)
4342  continue;
4343 
4344  bufHdr = GetBufferDescriptor(buf_id);
4345 
4346  /*
4347  * We need to lock the buffer header and recheck if the buffer is
4348  * still associated with the same block because the buffer could be
4349  * evicted by some other backend loading blocks for a different
4350  * relation after we release lock on the BufMapping table.
4351  */
4352  buf_state = LockBufHdr(bufHdr);
4353 
4354  if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator) &&
4355  BufTagGetForkNum(&bufHdr->tag) == forkNum &&
4356  bufHdr->tag.blockNum >= firstDelBlock)
4357  InvalidateBuffer(bufHdr); /* releases spinlock */
4358  else
4359  UnlockBufHdr(bufHdr, buf_state);
4360  }
4361 }

References buftag::blockNum, BufMappingPartitionLock(), BufTableHashCode(), BufTableLookup(), BufTagGetForkNum(), BufTagMatchesRelFileLocator(), GetBufferDescriptor(), InitBufferTag(), InvalidateBuffer(), LockBufHdr(), LW_SHARED, LWLockAcquire(), LWLockRelease(), BufferDesc::tag, and UnlockBufHdr().

Referenced by DropRelationBuffers(), and DropRelationsAllBuffers().

◆ FlushBuffer()

static void FlushBuffer ( BufferDesc * buf,
SMgrRelation  reln,
IOObject  io_object,
IOContext  io_context 
)
static

Definition at line 3773 of file bufmgr.c.

3775 {
3776  XLogRecPtr recptr;
3777  ErrorContextCallback errcallback;
3778  instr_time io_start;
3779  Block bufBlock;
3780  char *bufToWrite;
3781  uint32 buf_state;
3782 
3783  /*
3784  * Try to start an I/O operation. If StartBufferIO returns false, then
3785  * someone else flushed the buffer before we could, so we need not do
3786  * anything.
3787  */
3788  if (!StartBufferIO(buf, false, false))
3789  return;
3790 
3791  /* Setup error traceback support for ereport() */
3792  errcallback.callback = shared_buffer_write_error_callback;
3793  errcallback.arg = (void *) buf;
3794  errcallback.previous = error_context_stack;
3795  error_context_stack = &errcallback;
3796 
3797  /* Find smgr relation for buffer */
3798  if (reln == NULL)
3799  reln = smgropen(BufTagGetRelFileLocator(&buf->tag), INVALID_PROC_NUMBER);
3800 
3801  TRACE_POSTGRESQL_BUFFER_FLUSH_START(BufTagGetForkNum(&buf->tag),
3802  buf->tag.blockNum,
3804  reln->smgr_rlocator.locator.dbOid,
3806 
3807  buf_state = LockBufHdr(buf);
3808 
3809  /*
3810  * Run PageGetLSN while holding header lock, since we don't have the
3811  * buffer locked exclusively in all cases.
3812  */
3813  recptr = BufferGetLSN(buf);
3814 
3815  /* To check if block content changes while flushing. - vadim 01/17/97 */
3816  buf_state &= ~BM_JUST_DIRTIED;
3817  UnlockBufHdr(buf, buf_state);
3818 
3819  /*
3820  * Force XLOG flush up to buffer's LSN. This implements the basic WAL
3821  * rule that log updates must hit disk before any of the data-file changes
3822  * they describe do.
3823  *
3824  * However, this rule does not apply to unlogged relations, which will be
3825  * lost after a crash anyway. Most unlogged relation pages do not bear
3826  * LSNs since we never emit WAL records for them, and therefore flushing
3827  * up through the buffer LSN would be useless, but harmless. However,
3828  * GiST indexes use LSNs internally to track page-splits, and therefore
3829  * unlogged GiST pages bear "fake" LSNs generated by
3830  * GetFakeLSNForUnloggedRel. It is unlikely but possible that the fake
3831  * LSN counter could advance past the WAL insertion point; and if it did
3832  * happen, attempting to flush WAL through that location would fail, with
3833  * disastrous system-wide consequences. To make sure that can't happen,
3834  * skip the flush if the buffer isn't permanent.
3835  */
3836  if (buf_state & BM_PERMANENT)
3837  XLogFlush(recptr);
3838 
3839  /*
3840  * Now it's safe to write buffer to disk. Note that no one else should
3841  * have been able to write it while we were busy with log flushing because
3842  * only one process at a time can set the BM_IO_IN_PROGRESS bit.
3843  */
3844  bufBlock = BufHdrGetBlock(buf);
3845 
3846  /*
3847  * Update page checksum if desired. Since we have only shared lock on the
3848  * buffer, other processes might be updating hint bits in it, so we must
3849  * copy the page to private storage if we do checksumming.
3850  */
3851  bufToWrite = PageSetChecksumCopy((Page) bufBlock, buf->tag.blockNum);
3852 
3854 
3855  /*
3856  * bufToWrite is either the shared buffer or a copy, as appropriate.
3857  */
3858  smgrwrite(reln,
3859  BufTagGetForkNum(&buf->tag),
3860  buf->tag.blockNum,
3861  bufToWrite,
3862  false);
3863 
3864  /*
3865  * When a strategy is in use, only flushes of dirty buffers already in the
3866  * strategy ring are counted as strategy writes (IOCONTEXT
3867  * [BULKREAD|BULKWRITE|VACUUM] IOOP_WRITE) for the purpose of IO
3868  * statistics tracking.
3869  *
3870  * If a shared buffer initially added to the ring must be flushed before
3871  * being used, this is counted as an IOCONTEXT_NORMAL IOOP_WRITE.
3872  *
3873  * If a shared buffer which was added to the ring later because the
3874  * current strategy buffer is pinned or in use or because all strategy
3875  * buffers were dirty and rejected (for BAS_BULKREAD operations only)
3876  * requires flushing, this is counted as an IOCONTEXT_NORMAL IOOP_WRITE
3877  * (from_ring will be false).
3878  *
3879  * When a strategy is not in use, the write can only be a "regular" write
3880  * of a dirty shared buffer (IOCONTEXT_NORMAL IOOP_WRITE).
3881  */
3883  IOOP_WRITE, io_start, 1);
3884 
3886 
3887  /*
3888  * Mark the buffer as clean (unless BM_JUST_DIRTIED has become set) and
3889  * end the BM_IO_IN_PROGRESS state.
3890  */
3891  TerminateBufferIO(buf, true, 0, true);
3892 
3893  TRACE_POSTGRESQL_BUFFER_FLUSH_DONE(BufTagGetForkNum(&buf->tag),
3894  buf->tag.blockNum,
3896  reln->smgr_rlocator.locator.dbOid,
3898 
3899  /* Pop the error context stack */
3900  error_context_stack = errcallback.previous;
3901 }
#define BufferGetLSN(bufHdr)
Definition: bufmgr.c:68
static void shared_buffer_write_error_callback(void *arg)
Definition: bufmgr.c:5667
char * PageSetChecksumCopy(Page page, BlockNumber blkno)
Definition: bufpage.c:1510
ErrorContextCallback * error_context_stack
Definition: elog.c:94
@ IOOP_WRITE
Definition: pgstat.h:304
static void smgrwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const void *buffer, bool skipFsync)
Definition: smgr.h:121
struct ErrorContextCallback * previous
Definition: elog.h:295
void(* callback)(void *arg)
Definition: elog.h:296
void XLogFlush(XLogRecPtr record)
Definition: xlog.c:2791

References ErrorContextCallback::arg, BM_JUST_DIRTIED, BM_PERMANENT, buf, BufferGetLSN, BufHdrGetBlock, BufTagGetForkNum(), BufTagGetRelFileLocator(), ErrorContextCallback::callback, RelFileLocator::dbOid, error_context_stack, INVALID_PROC_NUMBER, IOOBJECT_RELATION, IOOP_WRITE, RelFileLocatorBackend::locator, LockBufHdr(), PageSetChecksumCopy(), pgBufferUsage, pgstat_count_io_op_time(), pgstat_prepare_io_time(), ErrorContextCallback::previous, RelFileLocator::relNumber, BufferUsage::shared_blks_written, shared_buffer_write_error_callback(), SMgrRelationData::smgr_rlocator, smgropen(), smgrwrite(), RelFileLocator::spcOid, StartBufferIO(), TerminateBufferIO(), track_io_timing, UnlockBufHdr(), and XLogFlush().

Referenced by EvictUnpinnedBuffer(), FlushDatabaseBuffers(), FlushOneBuffer(), FlushRelationBuffers(), FlushRelationsAllBuffers(), GetVictimBuffer(), and SyncOneBuffer().

◆ FlushDatabaseBuffers()

void FlushDatabaseBuffers ( Oid  dbid)

Definition at line 4834 of file bufmgr.c.

4835 {
4836  int i;
4837  BufferDesc *bufHdr;
4838 
4839  for (i = 0; i < NBuffers; i++)
4840  {
4841  uint32 buf_state;
4842 
4843  bufHdr = GetBufferDescriptor(i);
4844 
4845  /*
4846  * As in DropRelationBuffers, an unlocked precheck should be safe and
4847  * saves some cycles.
4848  */
4849  if (bufHdr->tag.dbOid != dbid)
4850  continue;
4851 
4852  /* Make sure we can handle the pin */
4853  ReservePrivateRefCountEntry();
4854  ResourceOwnerEnlarge(CurrentResourceOwner);
4855 
4856  buf_state = LockBufHdr(bufHdr);
4857  if (bufHdr->tag.dbOid == dbid &&
4858  (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
4859  {
4860  PinBuffer_Locked(bufHdr);
4861  LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
4862  FlushBuffer(bufHdr, NULL, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
4863  LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
4864  UnpinBuffer(bufHdr);
4865  }
4866  else
4867  UnlockBufHdr(bufHdr, buf_state);
4868  }
4869 }

References BM_DIRTY, BM_VALID, BufferDescriptorGetContentLock(), CurrentResourceOwner, buftag::dbOid, FlushBuffer(), GetBufferDescriptor(), i, IOCONTEXT_NORMAL, IOOBJECT_RELATION, LockBufHdr(), LW_SHARED, LWLockAcquire(), LWLockRelease(), NBuffers, PinBuffer_Locked(), ReservePrivateRefCountEntry(), ResourceOwnerEnlarge(), BufferDesc::tag, UnlockBufHdr(), and UnpinBuffer().

Referenced by dbase_redo().

◆ FlushOneBuffer()

void FlushOneBuffer ( Buffer  buffer)

Definition at line 4876 of file bufmgr.c.

4877 {
4878  BufferDesc *bufHdr;
4879 
4880  /* currently not needed, but no fundamental reason not to support */
4881  Assert(!BufferIsLocal(buffer));
4882 
4883  Assert(BufferIsPinned(buffer));
4884 
4885  bufHdr = GetBufferDescriptor(buffer - 1);
4886 
4887  Assert(LWLockHeldByMe(BufferDescriptorGetContentLock(bufHdr)));
4888 
4889  FlushBuffer(bufHdr, NULL, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
4890 }
bool LWLockHeldByMe(LWLock *lock)
Definition: lwlock.c:1895

References Assert, PrivateRefCountEntry::buffer, BufferDescriptorGetContentLock(), BufferIsLocal, BufferIsPinned, FlushBuffer(), GetBufferDescriptor(), IOCONTEXT_NORMAL, IOOBJECT_RELATION, and LWLockHeldByMe().

Referenced by hash_xlog_init_bitmap_page(), hash_xlog_init_meta_page(), and XLogReadBufferForRedoExtended().
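
A minimal sketch of the calling convention (my_flush_locked_page is hypothetical): FlushOneBuffer() expects the caller to hold the buffer's content lock, which is how the redo callers above force an init-fork page to disk; any content lock satisfies the assertion.

#include "postgres.h"

#include "storage/bufmgr.h"

static void
my_flush_locked_page(Buffer buf)
{
    LockBuffer(buf, BUFFER_LOCK_SHARE);     /* content lock required */
    FlushOneBuffer(buf);                    /* writes the page via smgr */
    LockBuffer(buf, BUFFER_LOCK_UNLOCK);
}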

◆ FlushRelationBuffers()

void FlushRelationBuffers ( Relation  rel)

Definition at line 4481 of file bufmgr.c.

4482 {
4483  int i;
4484  BufferDesc *bufHdr;
4485  SMgrRelation srel = RelationGetSmgr(rel);
4486 
4487  if (RelationUsesLocalBuffers(rel))
4488  {
4489  for (i = 0; i < NLocBuffer; i++)
4490  {
4491  uint32 buf_state;
4492  instr_time io_start;
4493 
4494  bufHdr = GetLocalBufferDescriptor(i);
4495  if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rel->rd_locator) &&
4496  ((buf_state = pg_atomic_read_u32(&bufHdr->state)) &
4497  (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
4498  {
4499  ErrorContextCallback errcallback;
4500  Page localpage;
4501 
4502  localpage = (char *) LocalBufHdrGetBlock(bufHdr);
4503 
4504  /* Setup error traceback support for ereport() */
4505  errcallback.callback = local_buffer_write_error_callback;
4506  errcallback.arg = (void *) bufHdr;
4507  errcallback.previous = error_context_stack;
4508  error_context_stack = &errcallback;
4509 
4510  PageSetChecksumInplace(localpage, bufHdr->tag.blockNum);
4511 
4512  io_start = pgstat_prepare_io_time(track_io_timing);
4513 
4514  smgrwrite(srel,
4515  BufTagGetForkNum(&bufHdr->tag),
4516  bufHdr->tag.blockNum,
4517  localpage,
4518  false);
4519 
4520  pgstat_count_io_op_time(IOOBJECT_TEMP_RELATION,
4521  IOCONTEXT_NORMAL, IOOP_WRITE,
4522  io_start, 1);
4523 
4524  buf_state &= ~(BM_DIRTY | BM_JUST_DIRTIED);
4525  pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
4526 
4527  pgBufferUsage.local_blks_written++;
4528 
4529  /* Pop the error context stack */
4530  error_context_stack = errcallback.previous;
4531  }
4532  }
4533 
4534  return;
4535  }
4536 
4537  for (i = 0; i < NBuffers; i++)
4538  {
4539  uint32 buf_state;
4540 
4541  bufHdr = GetBufferDescriptor(i);
4542 
4543  /*
4544  * As in DropRelationBuffers, an unlocked precheck should be safe and
4545  * saves some cycles.
4546  */
4547  if (!BufTagMatchesRelFileLocator(&bufHdr->tag, &rel->rd_locator))
4548  continue;
4549 
4550  /* Make sure we can handle the pin */
4551  ReservePrivateRefCountEntry();
4552  ResourceOwnerEnlarge(CurrentResourceOwner);
4553 
4554  buf_state = LockBufHdr(bufHdr);
4555  if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rel->rd_locator) &&
4556  (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
4557  {
4558  PinBuffer_Locked(bufHdr);
4559  LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
4560  FlushBuffer(bufHdr, srel, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
4561  LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
4562  UnpinBuffer(bufHdr);
4563  }
4564  else
4565  UnlockBufHdr(bufHdr, buf_state);
4566  }
4567 }
static void pg_atomic_unlocked_write_u32(volatile pg_atomic_uint32 *ptr, uint32 val)
Definition: atomics.h:290
#define LocalBufHdrGetBlock(bufHdr)
Definition: bufmgr.c:71
static void local_buffer_write_error_callback(void *arg)
Definition: bufmgr.c:5687
void PageSetChecksumInplace(Page page, BlockNumber blkno)
Definition: bufpage.c:1542
int NLocBuffer
Definition: localbuf.c:42
@ IOOBJECT_TEMP_RELATION
Definition: pgstat.h:281
#define RelationUsesLocalBuffers(relation)
Definition: rel.h:637
int64 local_blks_written
Definition: instrument.h:33
RelFileLocator rd_locator
Definition: rel.h:57

References ErrorContextCallback::arg, buftag::blockNum, BM_DIRTY, BM_JUST_DIRTIED, BM_VALID, BufferDescriptorGetContentLock(), BufTagGetForkNum(), BufTagMatchesRelFileLocator(), ErrorContextCallback::callback, CurrentResourceOwner, error_context_stack, FlushBuffer(), GetBufferDescriptor(), GetLocalBufferDescriptor(), i, IOCONTEXT_NORMAL, IOOBJECT_RELATION, IOOBJECT_TEMP_RELATION, IOOP_WRITE, BufferUsage::local_blks_written, local_buffer_write_error_callback(), LocalBufHdrGetBlock, LockBufHdr(), LW_SHARED, LWLockAcquire(), LWLockRelease(), NBuffers, NLocBuffer, PageSetChecksumInplace(), pg_atomic_read_u32(), pg_atomic_unlocked_write_u32(), pgBufferUsage, pgstat_count_io_op_time(), pgstat_prepare_io_time(), PinBuffer_Locked(), ErrorContextCallback::previous, RelationData::rd_locator, RelationGetSmgr(), RelationUsesLocalBuffers, ReservePrivateRefCountEntry(), ResourceOwnerEnlarge(), smgrwrite(), BufferDesc::state, BufferDesc::tag, track_io_timing, UnlockBufHdr(), and UnpinBuffer().

Referenced by fill_seq_with_data(), heapam_relation_copy_data(), and index_copy_data().
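
A hedged sketch of the pattern used by callers such as heapam_relation_copy_data(): push all dirty buffers of a relation down to storage before reading or copying its files behind the buffer manager's back. prepare_for_raw_copy is a hypothetical name.

    #include "postgres.h"
    #include "storage/bufmgr.h"

    static void
    prepare_for_raw_copy(Relation rel)
    {
        /* After this, smgr-level reads of the relation see current page contents. */
        FlushRelationBuffers(rel);
    }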

◆ FlushRelationsAllBuffers()

void FlushRelationsAllBuffers ( SMgrRelation smgrs,
int  nrels 
)

Definition at line 4579 of file bufmgr.c.

4580 {
4581  int i;
4582  SMgrSortArray *srels;
4583  bool use_bsearch;
4584 
4585  if (nrels == 0)
4586  return;
4587 
4588  /* fill-in array for qsort */
4589  srels = palloc(sizeof(SMgrSortArray) * nrels);
4590 
4591  for (i = 0; i < nrels; i++)
4592  {
4593  Assert(!RelFileLocatorBackendIsTemp(smgrs[i]->smgr_rlocator));
4594 
4595  srels[i].rlocator = smgrs[i]->smgr_rlocator.locator;
4596  srels[i].srel = smgrs[i];
4597  }
4598 
4599  /*
4600  * Save the bsearch overhead for low number of relations to sync. See
4601  * DropRelationsAllBuffers for details.
4602  */
4603  use_bsearch = nrels > RELS_BSEARCH_THRESHOLD;
4604 
4605  /* sort the list of SMgrRelations if necessary */
4606  if (use_bsearch)
4607  qsort(srels, nrels, sizeof(SMgrSortArray), rlocator_comparator);
4608 
4609  for (i = 0; i < NBuffers; i++)
4610  {
4611  SMgrSortArray *srelent = NULL;
4612  BufferDesc *bufHdr = GetBufferDescriptor(i);
4613  uint32 buf_state;
4614 
4615  /*
4616  * As in DropRelationBuffers, an unlocked precheck should be safe and
4617  * saves some cycles.
4618  */
4619 
4620  if (!use_bsearch)
4621  {
4622  int j;
4623 
4624  for (j = 0; j < nrels; j++)
4625  {
4626  if (BufTagMatchesRelFileLocator(&bufHdr->tag, &srels[j].rlocator))
4627  {
4628  srelent = &srels[j];
4629  break;
4630  }
4631  }
4632  }
4633  else
4634  {
4635  RelFileLocator rlocator;
4636 
4637  rlocator = BufTagGetRelFileLocator(&bufHdr->tag);
4638  srelent = bsearch((const void *) &(rlocator),
4639  srels, nrels, sizeof(SMgrSortArray),
4640  rlocator_comparator);
4641 
4642 
4643  /* buffer doesn't belong to any of the given relfilelocators; skip it */
4644  if (srelent == NULL)
4645  continue;
4646 
4647  /* Make sure we can handle the pin */
4648  ReservePrivateRefCountEntry();
4649  ResourceOwnerEnlarge(CurrentResourceOwner);
4650 
4651  buf_state = LockBufHdr(bufHdr);
4652  if (BufTagMatchesRelFileLocator(&bufHdr->tag, &srelent->rlocator) &&
4653  (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
4654  {
4655  PinBuffer_Locked(bufHdr);
4656  LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
4657  FlushBuffer(bufHdr, srelent->srel, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
4658  LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
4659  UnpinBuffer(bufHdr);
4660  }
4661  else
4662  UnlockBufHdr(bufHdr, buf_state);
4663  }
4664 
4665  pfree(srels);
4666 }
SMgrRelation srel
Definition: bufmgr.c:135
RelFileLocator rlocator
Definition: bufmgr.c:134

References Assert, BM_DIRTY, BM_VALID, BufferDescriptorGetContentLock(), BufTagGetRelFileLocator(), BufTagMatchesRelFileLocator(), CurrentResourceOwner, FlushBuffer(), GetBufferDescriptor(), i, IOCONTEXT_NORMAL, IOOBJECT_RELATION, j, RelFileLocatorBackend::locator, LockBufHdr(), LW_SHARED, LWLockAcquire(), LWLockRelease(), NBuffers, palloc(), pfree(), PinBuffer_Locked(), qsort, RelFileLocatorBackendIsTemp, RELS_BSEARCH_THRESHOLD, ReservePrivateRefCountEntry(), ResourceOwnerEnlarge(), SMgrSortArray::rlocator, rlocator_comparator(), SMgrRelationData::smgr_rlocator, SMgrSortArray::srel, BufferDesc::tag, UnlockBufHdr(), and UnpinBuffer().

Referenced by smgrdosyncall().
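
A hedged sketch of the smgrdosyncall()-style calling convention: the relations must not be temporary, and all of them are flushed in a single pass over shared buffers. flush_many_relations, srels, and nrels are hypothetical.

    #include "postgres.h"
    #include "storage/bufmgr.h"
    #include "storage/smgr.h"

    static void
    flush_many_relations(SMgrRelation *srels, int nrels)
    {
        /* One scan of NBuffers covers every relation in the array. */
        FlushRelationsAllBuffers(srels, nrels);
    }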

◆ ForgetPrivateRefCountEntry()

static void ForgetPrivateRefCountEntry ( PrivateRefCountEntry ref)
static

Definition at line 438 of file bufmgr.c.

439 {
440  Assert(ref->refcount == 0);
441 
442  if (ref >= &PrivateRefCountArray[0] &&
444  {
445  ref->buffer = InvalidBuffer;
446 
447  /*
448  * Mark the just used entry as reserved - in many scenarios that
449  * allows us to avoid ever having to search the array/hash for free
450  * entries.
451  */
452  ReservedRefCountEntry = ref;
453  }
454  else
455  {
456  bool found;
457  Buffer buffer = ref->buffer;
458 
459  hash_search(PrivateRefCountHash, &buffer, HASH_REMOVE, &found);
460  Assert(found);
463  }
464 }
static PrivateRefCountEntry * ReservedRefCountEntry
Definition: bufmgr.c:211
void * hash_search(HTAB *hashp, const void *keyPtr, HASHACTION action, bool *foundPtr)
Definition: dynahash.c:955
@ HASH_REMOVE
Definition: hsearch.h:115

References Assert, PrivateRefCountEntry::buffer, HASH_REMOVE, hash_search(), InvalidBuffer, PrivateRefCountArray, PrivateRefCountHash, PrivateRefCountOverflowed, PrivateRefCountEntry::refcount, REFCOUNT_ARRAY_ENTRIES, and ReservedRefCountEntry.

Referenced by UnpinBufferNoOwner().

◆ GetPrivateRefCount()

static int32 GetPrivateRefCount ( Buffer  buffer)
inlinestatic

Definition at line 415 of file bufmgr.c.

416 {
418 
419  Assert(BufferIsValid(buffer));
420  Assert(!BufferIsLocal(buffer));
421 
422  /*
423  * Not moving the entry - that's ok for the current users, but we might
424  * want to change this one day.
425  */
426  ref = GetPrivateRefCountEntry(buffer, false);
427 
428  if (ref == NULL)
429  return 0;
430  return ref->refcount;
431 }
static PrivateRefCountEntry * GetPrivateRefCountEntry(Buffer buffer, bool do_move)
Definition: bufmgr.c:341

References Assert, PrivateRefCountEntry::buffer, BufferIsLocal, BufferIsValid(), GetPrivateRefCountEntry(), and PrivateRefCountEntry::refcount.

Referenced by CheckBufferIsPinnedOnce(), ConditionalLockBufferForCleanup(), DebugPrintBufferRefcount(), HoldingBufferPinThatDelaysRecovery(), InvalidateBuffer(), InvalidateVictimBuffer(), IsBufferCleanupOK(), MarkBufferDirtyHint(), and ReadRecentBuffer().
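
A hedged sketch of how this static helper is used within bufmgr.c itself (it is not exported): because it reports only this backend's pins, a return value of 1 while we hold one pin means no other local reference exists. The wrapper name is hypothetical.

    /* Only compiles inside bufmgr.c, where GetPrivateRefCount() is visible. */
    static bool
    have_sole_local_pin(Buffer buffer)
    {
        Assert(!BufferIsLocal(buffer));
        return GetPrivateRefCount(buffer) == 1;
    }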

◆ GetPrivateRefCountEntry()

static PrivateRefCountEntry * GetPrivateRefCountEntry ( Buffer  buffer,
bool  do_move 
)
static

Definition at line 341 of file bufmgr.c.

342 {
344  int i;
345 
346  Assert(BufferIsValid(buffer));
347  Assert(!BufferIsLocal(buffer));
348 
349  /*
350  * First search for references in the array, that'll be sufficient in the
351  * majority of cases.
352  */
353  for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
354  {
356 
357  if (res->buffer == buffer)
358  return res;
359  }
360 
361  /*
362  * By here we know that the buffer, if already pinned, isn't residing in
363  * the array.
364  *
365  * Only look up the buffer in the hashtable if we've previously overflowed
366  * into it.
367  */
368  if (PrivateRefCountOverflowed == 0)
369  return NULL;
370 
371  res = hash_search(PrivateRefCountHash, &buffer, HASH_FIND, NULL);
372 
373  if (res == NULL)
374  return NULL;
375  else if (!do_move)
376  {
377  /* caller doesn't want us to move the hash entry into the array */
378  return res;
379  }
380  else
381  {
382  /* move buffer from hashtable into the free array slot */
383  bool found;
385 
386  /* Ensure there's a free array slot */
388 
389  /* Use up the reserved slot */
390  Assert(ReservedRefCountEntry != NULL);
392  ReservedRefCountEntry = NULL;
393  Assert(free->buffer == InvalidBuffer);
394 
395  /* and fill it */
396  free->buffer = buffer;
397  free->refcount = res->refcount;
398 
399  /* delete from hashtable */
400  hash_search(PrivateRefCountHash, &buffer, HASH_REMOVE, &found);
401  Assert(found);
404 
405  return free;
406  }
407 }
#define free(a)
Definition: header.h:65
@ HASH_FIND
Definition: hsearch.h:113

References Assert, PrivateRefCountEntry::buffer, BufferIsLocal, BufferIsValid(), free, HASH_FIND, HASH_REMOVE, hash_search(), i, InvalidBuffer, PrivateRefCountArray, PrivateRefCountHash, PrivateRefCountOverflowed, REFCOUNT_ARRAY_ENTRIES, res, ReservedRefCountEntry, and ReservePrivateRefCountEntry().

Referenced by GetPrivateRefCount(), IncrBufferRefCount(), PinBuffer(), PinBuffer_Locked(), and UnpinBufferNoOwner().

◆ GetVictimBuffer()

static Buffer GetVictimBuffer ( BufferAccessStrategy  strategy,
IOContext  io_context 
)
static

Definition at line 1938 of file bufmgr.c.

1939 {
1940  BufferDesc *buf_hdr;
1941  Buffer buf;
1942  uint32 buf_state;
1943  bool from_ring;
1944 
1945  /*
1946  * Ensure, while the spinlock's not yet held, that there's a free refcount
1947  * entry, and a resource owner slot for the pin.
1948  */
1951 
1952  /* we return here if a prospective victim buffer gets used concurrently */
1953 again:
1954 
1955  /*
1956  * Select a victim buffer. The buffer is returned with its header
1957  * spinlock still held!
1958  */
1959  buf_hdr = StrategyGetBuffer(strategy, &buf_state, &from_ring);
1960  buf = BufferDescriptorGetBuffer(buf_hdr);
1961 
1962  Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 0);
1963 
1964  /* Pin the buffer and then release the buffer spinlock */
1965  PinBuffer_Locked(buf_hdr);
1966 
1967  /*
1968  * We shouldn't have any other pins for this buffer.
1969  */
1971 
1972  /*
1973  * If the buffer was dirty, try to write it out. There is a race
1974  * condition here, in that someone might dirty it after we released the
1975  * buffer header lock above, or even while we are writing it out (since
1976  * our share-lock won't prevent hint-bit updates). We will recheck the
1977  * dirty bit after re-locking the buffer header.
1978  */
1979  if (buf_state & BM_DIRTY)
1980  {
1981  LWLock *content_lock;
1982 
1983  Assert(buf_state & BM_TAG_VALID);
1984  Assert(buf_state & BM_VALID);
1985 
1986  /*
1987  * We need a share-lock on the buffer contents to write it out (else
1988  * we might write invalid data, eg because someone else is compacting
1989  * the page contents while we write). We must use a conditional lock
1990  * acquisition here to avoid deadlock. Even though the buffer was not
1991  * pinned (and therefore surely not locked) when StrategyGetBuffer
1992  * returned it, someone else could have pinned and exclusive-locked it
1993  * by the time we get here. If we try to get the lock unconditionally,
1994  * we'd block waiting for them; if they later block waiting for us,
1995  * deadlock ensues. (This has been observed to happen when two
1996  * backends are both trying to split btree index pages, and the second
1997  * one just happens to be trying to split the page the first one got
1998  * from StrategyGetBuffer.)
1999  */
2000  content_lock = BufferDescriptorGetContentLock(buf_hdr);
2001  if (!LWLockConditionalAcquire(content_lock, LW_SHARED))
2002  {
2003  /*
2004  * Someone else has locked the buffer, so give it up and loop back
2005  * to get another one.
2006  */
2007  UnpinBuffer(buf_hdr);
2008  goto again;
2009  }
2010 
2011  /*
2012  * If using a nondefault strategy, and writing the buffer would
2013  * require a WAL flush, let the strategy decide whether to go ahead
2014  * and write/reuse the buffer or to choose another victim. We need a
2015  * lock to inspect the page LSN, so this can't be done inside
2016  * StrategyGetBuffer.
2017  */
2018  if (strategy != NULL)
2019  {
2020  XLogRecPtr lsn;
2021 
2022  /* Read the LSN while holding buffer header lock */
2023  buf_state = LockBufHdr(buf_hdr);
2024  lsn = BufferGetLSN(buf_hdr);
2025  UnlockBufHdr(buf_hdr, buf_state);
2026 
2027  if (XLogNeedsFlush(lsn)
2028  && StrategyRejectBuffer(strategy, buf_hdr, from_ring))
2029  {
2030  LWLockRelease(content_lock);
2031  UnpinBuffer(buf_hdr);
2032  goto again;
2033  }
2034  }
2035 
2036  /* OK, do the I/O */
2037  FlushBuffer(buf_hdr, NULL, IOOBJECT_RELATION, io_context);
2038  LWLockRelease(content_lock);
2039 
2041  &buf_hdr->tag);
2042  }
2043 
2044 
2045  if (buf_state & BM_VALID)
2046  {
2047  /*
2048  * When a BufferAccessStrategy is in use, blocks evicted from shared
2049  * buffers are counted as IOOP_EVICT in the corresponding context
2050  * (e.g. IOCONTEXT_BULKWRITE). Shared buffers are evicted by a
2051  * strategy in two cases: 1) while initially claiming buffers for the
2052  * strategy ring 2) to replace an existing strategy ring buffer
2053  * because it is pinned or in use and cannot be reused.
2054  *
2055  * Blocks evicted from buffers already in the strategy ring are
2056  * counted as IOOP_REUSE in the corresponding strategy context.
2057  *
2058  * At this point, we can accurately count evictions and reuses,
2059  * because we have successfully claimed the valid buffer. Previously,
2060  * we may have been forced to release the buffer due to concurrent
2061  * pinners or erroring out.
2062  */
2064  from_ring ? IOOP_REUSE : IOOP_EVICT);
2065  }
2066 
2067  /*
2068  * If the buffer has an entry in the buffer mapping table, delete it. This
2069  * can fail because another backend could have pinned or dirtied the
2070  * buffer.
2071  */
2072  if ((buf_state & BM_TAG_VALID) && !InvalidateVictimBuffer(buf_hdr))
2073  {
2074  UnpinBuffer(buf_hdr);
2075  goto again;
2076  }
2077 
2078  /* a final set of sanity checks */
2079 #ifdef USE_ASSERT_CHECKING
2080  buf_state = pg_atomic_read_u32(&buf_hdr->state);
2081 
2082  Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 1);
2083  Assert(!(buf_state & (BM_TAG_VALID | BM_VALID | BM_DIRTY)));
2084 
2086 #endif
2087 
2088  return buf;
2089 }
WritebackContext BackendWritebackContext
Definition: buf_init.c:24
void CheckBufferIsPinnedOnce(Buffer buffer)
Definition: bufmgr.c:5178
void ScheduleBufferTagForWriteback(WritebackContext *wb_context, IOContext io_context, BufferTag *tag)
Definition: bufmgr.c:5888
BufferDesc * StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state, bool *from_ring)
Definition: freelist.c:196
bool StrategyRejectBuffer(BufferAccessStrategy strategy, BufferDesc *buf, bool from_ring)
Definition: freelist.c:798
@ IOOP_EVICT
Definition: pgstat.h:298
@ IOOP_REUSE
Definition: pgstat.h:303
void pgstat_count_io_op(IOObject io_object, IOContext io_context, IOOp io_op)
Definition: pgstat_io.c:77
bool XLogNeedsFlush(XLogRecPtr record)
Definition: xlog.c:3122

References Assert, BackendWritebackContext, BM_DIRTY, BM_TAG_VALID, BM_VALID, buf, BUF_STATE_GET_REFCOUNT, BufferDescriptorGetBuffer(), BufferDescriptorGetContentLock(), BufferGetLSN, CheckBufferIsPinnedOnce(), CurrentResourceOwner, FlushBuffer(), InvalidateVictimBuffer(), IOOBJECT_RELATION, IOOP_EVICT, IOOP_REUSE, LockBufHdr(), LW_SHARED, LWLockConditionalAcquire(), LWLockRelease(), pg_atomic_read_u32(), pgstat_count_io_op(), PinBuffer_Locked(), ReservePrivateRefCountEntry(), ResourceOwnerEnlarge(), ScheduleBufferTagForWriteback(), BufferDesc::state, StrategyGetBuffer(), StrategyRejectBuffer(), BufferDesc::tag, UnlockBufHdr(), UnpinBuffer(), and XLogNeedsFlush().

Referenced by BufferAlloc(), and ExtendBufferedRelShared().

◆ HoldingBufferPinThatDelaysRecovery()

bool HoldingBufferPinThatDelaysRecovery ( void  )

Definition at line 5346 of file bufmgr.c.

5347 {
5348  int bufid = GetStartupBufferPinWaitBufId();
5349 
5350  /*
5351  * If we get woken slowly then it's possible that the Startup process was
5352  * already woken by other backends before we got here. Also possible that
5353  * we get here by multiple interrupts or interrupts at inappropriate
5354  * times, so make sure we do nothing if the bufid is not set.
5355  */
5356  if (bufid < 0)
5357  return false;
5358 
5359  if (GetPrivateRefCount(bufid + 1) > 0)
5360  return true;
5361 
5362  return false;
5363 }
int GetStartupBufferPinWaitBufId(void)
Definition: proc.c:671

References GetPrivateRefCount(), and GetStartupBufferPinWaitBufId().

Referenced by CheckRecoveryConflictDeadlock(), and ProcessRecoveryConflictInterrupt().

◆ IncrBufferRefCount()

void IncrBufferRefCount ( Buffer  buffer)

Definition at line 4928 of file bufmgr.c.

4929 {
4930  Assert(BufferIsPinned(buffer));
4932  if (BufferIsLocal(buffer))
4933  LocalRefCount[-buffer - 1]++;
4934  else
4935  {
4936  PrivateRefCountEntry *ref;
4937 
4938  ref = GetPrivateRefCountEntry(buffer, true);
4939  Assert(ref != NULL);
4940  ref->refcount++;
4941  }
4943 }
static void ResourceOwnerRememberBuffer(ResourceOwner owner, Buffer buffer)

References Assert, PrivateRefCountEntry::buffer, BufferIsLocal, BufferIsPinned, CurrentResourceOwner, GetPrivateRefCountEntry(), LocalRefCount, PrivateRefCountEntry::refcount, ResourceOwnerEnlarge(), and ResourceOwnerRememberBuffer().

Referenced by _bt_steppage(), btrestrpos(), entryLoadMoreItems(), ReadBufferBI(), RelationAddBlocks(), scanPostingTree(), startScanEntry(), and tts_buffer_heap_store_tuple().
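
A hedged sketch of the pattern behind callers such as tts_buffer_heap_store_tuple(): when a data structure remembers a buffer the caller already has pinned, it takes an extra pin so each holder can release its reference independently. MyBufferRef and the helper names are hypothetical.

    #include "postgres.h"
    #include "storage/bufmgr.h"

    typedef struct MyBufferRef
    {
        Buffer      buf;
    } MyBufferRef;

    static void
    ref_remember_buffer(MyBufferRef *ref, Buffer already_pinned)
    {
        ref->buf = already_pinned;
        IncrBufferRefCount(already_pinned); /* cheap: no buffer header lock needed */
    }

    static void
    ref_forget_buffer(MyBufferRef *ref)
    {
        ReleaseBuffer(ref->buf);
        ref->buf = InvalidBuffer;
    }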

◆ InitBufferPoolAccess()

void InitBufferPoolAccess ( void  )

Definition at line 3565 of file bufmgr.c.

3566 {
3567  HASHCTL hash_ctl;
3568 
3569  memset(&PrivateRefCountArray, 0, sizeof(PrivateRefCountArray));
3570 
3571  hash_ctl.keysize = sizeof(int32);
3572  hash_ctl.entrysize = sizeof(PrivateRefCountEntry);
3573 
3574  PrivateRefCountHash = hash_create("PrivateRefCount", 100, &hash_ctl,
3575  HASH_ELEM | HASH_BLOBS);
3576 
3577  /*
3578  * AtProcExit_Buffers needs LWLock access, and thereby has to be called at
3579  * the corresponding phase of backend shutdown.
3580  */
3581  Assert(MyProc != NULL);
3583 }
static void AtProcExit_Buffers(int code, Datum arg)
Definition: bufmgr.c:3590
struct PrivateRefCountEntry PrivateRefCountEntry
HTAB * hash_create(const char *tabname, long nelem, const HASHCTL *info, int flags)
Definition: dynahash.c:352
#define HASH_ELEM
Definition: hsearch.h:95
#define HASH_BLOBS
Definition: hsearch.h:97
void on_shmem_exit(pg_on_exit_callback function, Datum arg)
Definition: ipc.c:365
PGPROC * MyProc
Definition: proc.c:66
Size keysize
Definition: hsearch.h:75
Size entrysize
Definition: hsearch.h:76

References Assert, AtProcExit_Buffers(), HASHCTL::entrysize, HASH_BLOBS, hash_create(), HASH_ELEM, HASHCTL::keysize, MyProc, on_shmem_exit(), PrivateRefCountArray, and PrivateRefCountHash.

Referenced by BaseInit().

◆ InvalidateBuffer()

static void InvalidateBuffer ( BufferDesc buf)
static

Definition at line 1772 of file bufmgr.c.

1773 {
1774  BufferTag oldTag;
1775  uint32 oldHash; /* hash value for oldTag */
1776  LWLock *oldPartitionLock; /* buffer partition lock for it */
1777  uint32 oldFlags;
1778  uint32 buf_state;
1779 
1780  /* Save the original buffer tag before dropping the spinlock */
1781  oldTag = buf->tag;
1782 
1783  buf_state = pg_atomic_read_u32(&buf->state);
1784  Assert(buf_state & BM_LOCKED);
1785  UnlockBufHdr(buf, buf_state);
1786 
1787  /*
1788  * Need to compute the old tag's hashcode and partition lock ID. XXX is it
1789  * worth storing the hashcode in BufferDesc so we need not recompute it
1790  * here? Probably not.
1791  */
1792  oldHash = BufTableHashCode(&oldTag);
1793  oldPartitionLock = BufMappingPartitionLock(oldHash);
1794 
1795 retry:
1796 
1797  /*
1798  * Acquire exclusive mapping lock in preparation for changing the buffer's
1799  * association.
1800  */
1801  LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
1802 
1803  /* Re-lock the buffer header */
1804  buf_state = LockBufHdr(buf);
1805 
1806  /* If it's changed while we were waiting for lock, do nothing */
1807  if (!BufferTagsEqual(&buf->tag, &oldTag))
1808  {
1809  UnlockBufHdr(buf, buf_state);
1810  LWLockRelease(oldPartitionLock);
1811  return;
1812  }
1813 
1814  /*
1815  * We assume the only reason for it to be pinned is that someone else is
1816  * flushing the page out. Wait for them to finish. (This could be an
1817  * infinite loop if the refcount is messed up... it would be nice to time
1818  * out after awhile, but there seems no way to be sure how many loops may
1819  * be needed. Note that if the other guy has pinned the buffer but not
1820  * yet done StartBufferIO, WaitIO will fall through and we'll effectively
1821  * be busy-looping here.)
1822  */
1823  if (BUF_STATE_GET_REFCOUNT(buf_state) != 0)
1824  {
1825  UnlockBufHdr(buf, buf_state);
1826  LWLockRelease(oldPartitionLock);
1827  /* safety check: should definitely not be our *own* pin */
1829  elog(ERROR, "buffer is pinned in InvalidateBuffer");
1830  WaitIO(buf);
1831  goto retry;
1832  }
1833 
1834  /*
1835  * Clear out the buffer's tag and flags. We must do this to ensure that
1836  * linear scans of the buffer array don't think the buffer is valid.
1837  */
1838  oldFlags = buf_state & BUF_FLAG_MASK;
1839  ClearBufferTag(&buf->tag);
1840  buf_state &= ~(BUF_FLAG_MASK | BUF_USAGECOUNT_MASK);
1841  UnlockBufHdr(buf, buf_state);
1842 
1843  /*
1844  * Remove the buffer from the lookup hashtable, if it was in there.
1845  */
1846  if (oldFlags & BM_TAG_VALID)
1847  BufTableDelete(&oldTag, oldHash);
1848 
1849  /*
1850  * Done with mapping lock.
1851  */
1852  LWLockRelease(oldPartitionLock);
1853 
1854  /*
1855  * Insert the buffer at the head of the list of free buffers.
1856  */
1858 }
#define BUF_USAGECOUNT_MASK
Definition: buf_internals.h:45
static bool BufferTagsEqual(const BufferTag *tag1, const BufferTag *tag2)
#define BM_LOCKED
Definition: buf_internals.h:60
static void ClearBufferTag(BufferTag *tag)
void BufTableDelete(BufferTag *tagPtr, uint32 hashcode)
Definition: buf_table.c:148
static void WaitIO(BufferDesc *buf)
Definition: bufmgr.c:5482

References Assert, BM_LOCKED, BM_TAG_VALID, buf, BUF_FLAG_MASK, BUF_STATE_GET_REFCOUNT, BUF_USAGECOUNT_MASK, BufferDescriptorGetBuffer(), BufferTagsEqual(), BufMappingPartitionLock(), BufTableDelete(), BufTableHashCode(), ClearBufferTag(), elog, ERROR, GetPrivateRefCount(), LockBufHdr(), LW_EXCLUSIVE, LWLockAcquire(), LWLockRelease(), pg_atomic_read_u32(), StrategyFreeBuffer(), UnlockBufHdr(), and WaitIO().

Referenced by DropDatabaseBuffers(), DropRelationBuffers(), DropRelationsAllBuffers(), and FindAndDropRelationBuffers().

◆ InvalidateVictimBuffer()

static bool InvalidateVictimBuffer ( BufferDesc buf_hdr)
static

Definition at line 1870 of file bufmgr.c.

1871 {
1872  uint32 buf_state;
1873  uint32 hash;
1874  LWLock *partition_lock;
1875  BufferTag tag;
1876 
1878 
1879  /* have buffer pinned, so it's safe to read tag without lock */
1880  tag = buf_hdr->tag;
1881 
1882  hash = BufTableHashCode(&tag);
1883  partition_lock = BufMappingPartitionLock(hash);
1884 
1885  LWLockAcquire(partition_lock, LW_EXCLUSIVE);
1886 
1887  /* lock the buffer header */
1888  buf_state = LockBufHdr(buf_hdr);
1889 
1890  /*
1891  * We have the buffer pinned nobody else should have been able to unset
1892  * this concurrently.
1893  */
1894  Assert(buf_state & BM_TAG_VALID);
1895  Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
1896  Assert(BufferTagsEqual(&buf_hdr->tag, &tag));
1897 
1898  /*
1899  * If somebody else pinned the buffer since, or even worse, dirtied it,
1900  * give up on this buffer: It's clearly in use.
1901  */
1902  if (BUF_STATE_GET_REFCOUNT(buf_state) != 1 || (buf_state & BM_DIRTY))
1903  {
1904  Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
1905 
1906  UnlockBufHdr(buf_hdr, buf_state);
1907  LWLockRelease(partition_lock);
1908 
1909  return false;
1910  }
1911 
1912  /*
1913  * Clear out the buffer's tag and flags and usagecount. This is not
1914  * strictly required, as BM_TAG_VALID/BM_VALID needs to be checked before
1915  * doing anything with the buffer. But currently it's beneficial, as the
1916  * cheaper pre-check for several linear scans of shared buffers use the
1917  * tag (see e.g. FlushDatabaseBuffers()).
1918  */
1919  ClearBufferTag(&buf_hdr->tag);
1920  buf_state &= ~(BUF_FLAG_MASK | BUF_USAGECOUNT_MASK);
1921  UnlockBufHdr(buf_hdr, buf_state);
1922 
1923  Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
1924 
1925  /* finally delete buffer from the buffer mapping table */
1926  BufTableDelete(&tag, hash);
1927 
1928  LWLockRelease(partition_lock);
1929 
1930  Assert(!(buf_state & (BM_DIRTY | BM_VALID | BM_TAG_VALID)));
1931  Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
1933 
1934  return true;
1935 }

References Assert, BM_DIRTY, BM_TAG_VALID, BM_VALID, BUF_FLAG_MASK, BUF_STATE_GET_REFCOUNT, BUF_USAGECOUNT_MASK, BufferDescriptorGetBuffer(), BufferTagsEqual(), BufMappingPartitionLock(), BufTableDelete(), BufTableHashCode(), ClearBufferTag(), GetPrivateRefCount(), hash(), LockBufHdr(), LW_EXCLUSIVE, LWLockAcquire(), LWLockRelease(), pg_atomic_read_u32(), BufferDesc::state, BufferDesc::tag, and UnlockBufHdr().

Referenced by EvictUnpinnedBuffer(), and GetVictimBuffer().

◆ IsBufferCleanupOK()

bool IsBufferCleanupOK ( Buffer  buffer)

Definition at line 5428 of file bufmgr.c.

5429 {
5430  BufferDesc *bufHdr;
5431  uint32 buf_state;
5432 
5433  Assert(BufferIsValid(buffer));
5434 
5435  if (BufferIsLocal(buffer))
5436  {
5437  /* There should be exactly one pin */
5438  if (LocalRefCount[-buffer - 1] != 1)
5439  return false;
5440  /* Nobody else to wait for */
5441  return true;
5442  }
5443 
5444  /* There should be exactly one local pin */
5445  if (GetPrivateRefCount(buffer) != 1)
5446  return false;
5447 
5448  bufHdr = GetBufferDescriptor(buffer - 1);
5449 
5450  /* caller must hold exclusive lock on buffer */
5451  Assert(LWLockHeldByMeInMode(BufferDescriptorGetContentLock(bufHdr),
5452  LW_EXCLUSIVE));
5453 
5454  buf_state = LockBufHdr(bufHdr);
5455 
5456  Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
5457  if (BUF_STATE_GET_REFCOUNT(buf_state) == 1)
5458  {
5459  /* pincount is OK. */
5460  UnlockBufHdr(bufHdr, buf_state);
5461  return true;
5462  }
5463 
5464  UnlockBufHdr(bufHdr, buf_state);
5465  return false;
5466 }

References Assert, BUF_STATE_GET_REFCOUNT, PrivateRefCountEntry::buffer, BufferDescriptorGetContentLock(), BufferIsLocal, BufferIsValid(), GetBufferDescriptor(), GetPrivateRefCount(), LocalRefCount, LockBufHdr(), LW_EXCLUSIVE, LWLockHeldByMeInMode(), and UnlockBufHdr().

Referenced by _hash_doinsert(), _hash_expandtable(), _hash_splitbucket(), and hashbucketcleanup().
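
A hedged sketch of the hash-index style caller: with the exclusive content lock already held, cleanup work that moves or removes tuples is attempted only if no other backend holds a pin. maybe_cleanup_page and prune_page are hypothetical names.

    #include "postgres.h"
    #include "storage/bufmgr.h"

    static void
    maybe_cleanup_page(Buffer buf)
    {
        /* caller already holds BUFFER_LOCK_EXCLUSIVE on buf */
        if (IsBufferCleanupOK(buf))
        {
            /* sole pin: safe to defragment the page or remove dead tuples */
            /* prune_page(BufferGetPage(buf)); */
        }
    }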

◆ IssuePendingWritebacks()

void IssuePendingWritebacks ( WritebackContext wb_context,
IOContext  io_context 
)

Definition at line 5933 of file bufmgr.c.

5934 {
5935  instr_time io_start;
5936  int i;
5937 
5938  if (wb_context->nr_pending == 0)
5939  return;
5940 
5941  /*
5942  * Executing the writes in-order can make them a lot faster, and allows to
5943  * merge writeback requests to consecutive blocks into larger writebacks.
5944  */
5945  sort_pending_writebacks(wb_context->pending_writebacks,
5946  wb_context->nr_pending);
5947 
5948  io_start = pgstat_prepare_io_time(track_io_timing);
5949 
5950  /*
5951  * Coalesce neighbouring writes, but nothing else. For that we iterate
5952  * through the, now sorted, array of pending flushes, and look forward to
5953  * find all neighbouring (or identical) writes.
5954  */
5955  for (i = 0; i < wb_context->nr_pending; i++)
5956  {
5957  PendingWriteback *cur;
5958  PendingWriteback *next;
5959  SMgrRelation reln;
5960  int ahead;
5961  BufferTag tag;
5962  RelFileLocator currlocator;
5963  Size nblocks = 1;
5964 
5965  cur = &wb_context->pending_writebacks[i];
5966  tag = cur->tag;
5967  currlocator = BufTagGetRelFileLocator(&tag);
5968 
5969  /*
5970  * Peek ahead, into following writeback requests, to see if they can
5971  * be combined with the current one.
5972  */
5973  for (ahead = 0; i + ahead + 1 < wb_context->nr_pending; ahead++)
5974  {
5975 
5976  next = &wb_context->pending_writebacks[i + ahead + 1];
5977 
5978  /* different file, stop */
5979  if (!RelFileLocatorEquals(currlocator,
5980  BufTagGetRelFileLocator(&next->tag)) ||
5981  BufTagGetForkNum(&cur->tag) != BufTagGetForkNum(&next->tag))
5982  break;
5983 
5984  /* ok, block queued twice, skip */
5985  if (cur->tag.blockNum == next->tag.blockNum)
5986  continue;
5987 
5988  /* only merge consecutive writes */
5989  if (cur->tag.blockNum + 1 != next->tag.blockNum)
5990  break;
5991 
5992  nblocks++;
5993  cur = next;
5994  }
5995 
5996  i += ahead;
5997 
5998  /* and finally tell the kernel to write the data to storage */
5999  reln = smgropen(currlocator, INVALID_PROC_NUMBER);
6000  smgrwriteback(reln, BufTagGetForkNum(&tag), tag.blockNum, nblocks);
6001  }
6002 
6003  /*
6004  * Assume that writeback requests are only issued for buffers containing
6005  * blocks of permanent relations.
6006  */
6008  IOOP_WRITEBACK, io_start, wb_context->nr_pending);
6009 
6010  wb_context->nr_pending = 0;
6011 }
static int32 next
Definition: blutils.c:221
struct cursor * cur
Definition: ecpg.c:28
@ IOOP_WRITEBACK
Definition: pgstat.h:305
#define RelFileLocatorEquals(locator1, locator2)
void smgrwriteback(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, BlockNumber nblocks)
Definition: smgr.c:643
PendingWriteback pending_writebacks[WRITEBACK_MAX_PENDING_FLUSHES]

References buftag::blockNum, BufTagGetForkNum(), BufTagGetRelFileLocator(), cur, i, INVALID_PROC_NUMBER, IOOBJECT_RELATION, IOOP_WRITEBACK, next, WritebackContext::nr_pending, WritebackContext::pending_writebacks, pgstat_count_io_op_time(), pgstat_prepare_io_time(), RelFileLocatorEquals, smgropen(), smgrwriteback(), and track_io_timing.

Referenced by BufferSync(), and ScheduleBufferTagForWriteback().
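
A hedged sketch of the batching protocol used by BufferSync(): buffer tags are accumulated in a WritebackContext and later flushed as one sorted, coalesced batch. The function name, the tags array, and the limit value are hypothetical; note that ScheduleBufferTagForWriteback() may itself issue the batch once the pending limit is reached.

    #include "postgres.h"
    #include "pgstat.h"
    #include "storage/buf_internals.h"

    static void
    writeback_batch(BufferTag *tags, int ntags)
    {
        WritebackContext wb_context;
        int         limit = 32;    /* hypothetical pending-flush cap */

        WritebackContextInit(&wb_context, &limit);

        for (int i = 0; i < ntags; i++)
            ScheduleBufferTagForWriteback(&wb_context, IOCONTEXT_NORMAL, &tags[i]);

        /* sort, merge neighbouring blocks, then hand ranges to smgrwriteback() */
        IssuePendingWritebacks(&wb_context, IOCONTEXT_NORMAL);
    }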

◆ LimitAdditionalPins()

void LimitAdditionalPins ( uint32 additional_pins)

Definition at line 2104 of file bufmgr.c.

2105 {
2106  uint32 max_backends;
2107  int max_proportional_pins;
2108 
2109  if (*additional_pins <= 1)
2110  return;
2111 
2112  max_backends = MaxBackends + NUM_AUXILIARY_PROCS;
2113  max_proportional_pins = NBuffers / max_backends;
2114 
2115  /*
2116  * Subtract the approximate number of buffers already pinned by this
2117  * backend. We get the number of "overflowed" pins for free, but don't
2118  * know the number of pins in PrivateRefCountArray. The cost of
2119  * calculating that exactly doesn't seem worth it, so just assume the max.
2120  */
2121  max_proportional_pins -= PrivateRefCountOverflowed + REFCOUNT_ARRAY_ENTRIES;
2122 
2123  if (max_proportional_pins <= 0)
2124  max_proportional_pins = 1;
2125 
2126  if (*additional_pins > max_proportional_pins)
2127  *additional_pins = max_proportional_pins;
2128 }
int MaxBackends
Definition: globals.c:143
#define NUM_AUXILIARY_PROCS
Definition: proc.h:440

References MaxBackends, NBuffers, NUM_AUXILIARY_PROCS, PrivateRefCountOverflowed, and REFCOUNT_ARRAY_ENTRIES.

Referenced by ExtendBufferedRelShared(), and read_stream_begin_relation().
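
A hedged sketch of how bulk operations size their pin batches: ask for as many pins as you would like, then clamp the request to this backend's proportional share of shared buffers. choose_batch_size and requested are hypothetical.

    #include "postgres.h"
    #include "storage/bufmgr.h"

    static uint32
    choose_batch_size(uint32 requested)
    {
        uint32      pins = requested;

        LimitAdditionalPins(&pins);     /* result is never clamped below 1 */
        return pins;
    }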

◆ local_buffer_write_error_callback()

static void local_buffer_write_error_callback ( void *  arg)
static

Definition at line 5687 of file bufmgr.c.

5688 {
5689  BufferDesc *bufHdr = (BufferDesc *) arg;
5690 
5691  if (bufHdr != NULL)
5692  {
5693  char *path = relpathbackend(BufTagGetRelFileLocator(&bufHdr->tag),
5694  MyProcNumber,
5695  BufTagGetForkNum(&bufHdr->tag));
5696 
5697  errcontext("writing block %u of relation %s",
5698  bufHdr->tag.blockNum, path);
5699  pfree(path);
5700  }
5701 }
#define errcontext
Definition: elog.h:196
void * arg

References arg, buftag::blockNum, BufTagGetForkNum(), BufTagGetRelFileLocator(), errcontext, MyProcNumber, pfree(), relpathbackend, and BufferDesc::tag.

Referenced by FlushRelationBuffers().

◆ LockBuffer()

void LockBuffer ( Buffer  buffer,
int  mode 
)

Definition at line 5131 of file bufmgr.c.

5132 {
5133  BufferDesc *buf;
5134 
5135  Assert(BufferIsPinned(buffer));
5136  if (BufferIsLocal(buffer))
5137  return; /* local buffers need no lock */
5138 
5139  buf = GetBufferDescriptor(buffer - 1);
5140 
5141  if (mode == BUFFER_LOCK_UNLOCK)
5142  LWLockRelease(BufferDescriptorGetContentLock(buf));
5143  else if (mode == BUFFER_LOCK_SHARE)
5144  LWLockAcquire(BufferDescriptorGetContentLock(buf), LW_SHARED);
5145  else if (mode == BUFFER_LOCK_EXCLUSIVE)
5146  LWLockAcquire(BufferDescriptorGetContentLock(buf), LW_EXCLUSIVE);
5147  else
5148  elog(ERROR, "unrecognized buffer lock mode: %d", mode);
5149 }
#define BUFFER_LOCK_SHARE
Definition: bufmgr.h:194
#define BUFFER_LOCK_EXCLUSIVE
Definition: bufmgr.h:195

References Assert, buf, PrivateRefCountEntry::buffer, BUFFER_LOCK_EXCLUSIVE, BUFFER_LOCK_SHARE, BUFFER_LOCK_UNLOCK, BufferDescriptorGetContentLock(), BufferIsLocal, BufferIsPinned, elog, ERROR, GetBufferDescriptor(), LW_EXCLUSIVE, LW_SHARED, LWLockAcquire(), LWLockRelease(), and mode.

Referenced by _bt_lockbuf(), _bt_unlockbuf(), _bt_upgradelockbufcleanup(), _hash_addovflpage(), _hash_doinsert(), _hash_expandtable(), _hash_finish_split(), _hash_first(), _hash_freeovflpage(), _hash_getbuf(), _hash_getbuf_with_strategy(), _hash_getcachedmetap(), _hash_init(), _hash_kill_items(), _hash_readnext(), _hash_readpage(), _hash_readprev(), _hash_splitbucket(), _hash_squeezebucket(), _hash_vacuum_one_page(), blbulkdelete(), blgetbitmap(), blinsert(), BloomInitMetapage(), BloomNewBuffer(), blvacuumcleanup(), brin_doinsert(), brin_doupdate(), brin_evacuate_page(), brin_getinsertbuffer(), brin_page_cleanup(), bringetbitmap(), brinGetStats(), brinGetTupleForHeapBlock(), brininsert(), brinLockRevmapPageForUpdate(), brinRevmapDesummarizeRange(), brinRevmapInitialize(), brinsummarize(), bt_metap(), bt_multi_page_stats(), bt_page_items_internal(), bt_page_stats_internal(), bt_recheck_sibling_links(), collect_corrupt_items(), collect_visibility_data(), collectMatchBitmap(), ConditionalLockBufferForCleanup(), count_nondeletable_pages(), entryLoadMoreItems(), FreeSpaceMapPrepareTruncateRel(), fsm_readbuf(), fsm_search(), fsm_search_avail(), fsm_set_and_search(), fsm_vacuum_page(), get_raw_page_internal(), GetVisibilityMapPins(), ginbulkdelete(), ginEntryInsert(), ginFindLeafPage(), ginFindParents(), ginFinishOldSplit(), ginFinishSplit(), ginGetStats(), ginHeapTupleFastInsert(), ginInsertCleanup(), ginInsertValue(), GinNewBuffer(), ginScanToDelete(), ginStepRight(), ginTraverseLock(), ginUpdateStats(), ginvacuumcleanup(), ginVacuumPostingTreeLeaves(), gistBufferingFindCorrectParent(), gistbufferinginserttuples(), gistdoinsert(), gistFindCorrectParent(), gistFindPath(), gistfinishsplit(), gistfixsplit(), gistformdownlink(), gistGetMaxLevel(), gistinserttuples(), gistkillitems(), gistNewBuffer(), gistProcessItup(), gistScanPage(), gistvacuum_delete_empty_pages(), gistvacuumpage(), hashbucketcleanup(), hashbulkdelete(), heap_abort_speculative(), heap_delete(), heap_fetch(), heap_finish_speculative(), heap_get_latest_tid(), heap_index_delete_tuples(), heap_inplace_update(), heap_lock_tuple(), heap_lock_updated_tuple_rec(), heap_page_prune_opt(), heap_prepare_pagescan(), heap_update(), heap_xlog_visible(), heapam_index_build_range_scan(), heapam_index_fetch_tuple(), heapam_index_validate_scan(), heapam_relation_copy_for_cluster(), heapam_scan_analyze_next_block(), heapam_scan_bitmap_next_block(), heapam_scan_sample_next_tuple(), heapam_tuple_satisfies_snapshot(), heapgettup(), initBloomState(), lazy_scan_heap(), lazy_scan_new_or_empty(), lazy_vacuum_heap_rel(), LockBufferForCleanup(), log_newpage_range(), palloc_btree_page(), pg_visibility(), pgrowlocks(), pgstat_btree_page(), pgstat_gist_page(), pgstat_heap(), pgstatginindex_internal(), pgstathashindex(), pgstatindex_impl(), read_seq_tuple(), RelationAddBlocks(), RelationCopyStorageUsingBuffer(), RelationGetBufferForTuple(), revmap_physical_extend(), scanGetCandidate(), scanPendingInsert(), ScanSourceDatabasePgClass(), shiftList(), spgdoinsert(), spgGetCache(), SpGistNewBuffer(), spgprocesspending(), spgvacuumpage(), spgWalk(), startScanEntry(), statapprox_heap(), summarize_range(), UnlockReleaseBuffer(), verify_heapam(), verifyBackupPageConsistency(), visibilitymap_clear(), visibilitymap_prepare_truncate(), visibilitymap_set(), vm_readbuf(), XLogReadBufferForRedoExtended(), XLogRecordPageWithFreeSpace(), and ZeroAndLockBuffer().
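
A hedged sketch of the canonical read sequence built on this routine: pin the page, hold a shared content lock while examining it, then unlock and unpin. The function name, rel, and blkno are hypothetical.

    #include "postgres.h"
    #include "storage/bufmgr.h"

    static OffsetNumber
    count_line_pointers(Relation rel, BlockNumber blkno)
    {
        Buffer      buf = ReadBuffer(rel, blkno);
        OffsetNumber maxoff;

        LockBuffer(buf, BUFFER_LOCK_SHARE);
        maxoff = PageGetMaxOffsetNumber(BufferGetPage(buf));
        LockBuffer(buf, BUFFER_LOCK_UNLOCK);
        ReleaseBuffer(buf);

        return maxoff;
    }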

◆ LockBufferForCleanup()

void LockBufferForCleanup ( Buffer  buffer)

Definition at line 5211 of file bufmgr.c.

5212 {
5213  BufferDesc *bufHdr;
5214  TimestampTz waitStart = 0;
5215  bool waiting = false;
5216  bool logged_recovery_conflict = false;
5217 
5218  Assert(BufferIsPinned(buffer));
5219  Assert(PinCountWaitBuf == NULL);
5220 
5221  CheckBufferIsPinnedOnce(buffer);
5222 
5223  /* Nobody else to wait for */
5224  if (BufferIsLocal(buffer))
5225  return;
5226 
5227  bufHdr = GetBufferDescriptor(buffer - 1);
5228 
5229  for (;;)
5230  {
5231  uint32 buf_state;
5232 
5233  /* Try to acquire lock */
5234  LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
5235  buf_state = LockBufHdr(bufHdr);
5236 
5237  Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
5238  if (BUF_STATE_GET_REFCOUNT(buf_state) == 1)
5239  {
5240  /* Successfully acquired exclusive lock with pincount 1 */
5241  UnlockBufHdr(bufHdr, buf_state);
5242 
5243  /*
5244  * Emit the log message if recovery conflict on buffer pin was
5245  * resolved but the startup process waited longer than
5246  * deadlock_timeout for it.
5247  */
5248  if (logged_recovery_conflict)
5250  waitStart, GetCurrentTimestamp(),
5251  NULL, false);
5252 
5253  if (waiting)
5254  {
5255  /* reset ps display to remove the suffix if we added one */
5257  waiting = false;
5258  }
5259  return;
5260  }
5261  /* Failed, so mark myself as waiting for pincount 1 */
5262  if (buf_state & BM_PIN_COUNT_WAITER)
5263  {
5264  UnlockBufHdr(bufHdr, buf_state);
5265  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
5266  elog(ERROR, "multiple backends attempting to wait for pincount 1");
5267  }
5269  PinCountWaitBuf = bufHdr;
5270  buf_state |= BM_PIN_COUNT_WAITER;
5271  UnlockBufHdr(bufHdr, buf_state);
5272  LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
5273 
5274  /* Wait to be signaled by UnpinBuffer() */
5275  if (InHotStandby)
5276  {
5277  if (!waiting)
5278  {
5279  /* adjust the process title to indicate that it's waiting */
5280  set_ps_display_suffix("waiting");
5281  waiting = true;
5282  }
5283 
5284  /*
5285  * Emit the log message if the startup process is waiting longer
5286  * than deadlock_timeout for recovery conflict on buffer pin.
5287  *
5288  * Skip this if first time through because the startup process has
5289  * not started waiting yet in this case. So, the wait start
5290  * timestamp is set after this logic.
5291  */
5292  if (waitStart != 0 && !logged_recovery_conflict)
5293  {
5295 
5296  if (TimestampDifferenceExceeds(waitStart, now,
5297  DeadlockTimeout))
5298  {
5300  waitStart, now, NULL, true);
5301  logged_recovery_conflict = true;
5302  }
5303  }
5304 
5305  /*
5306  * Set the wait start timestamp if logging is enabled and first
5307  * time through.
5308  */
5309  if (log_recovery_conflict_waits && waitStart == 0)
5310  waitStart = GetCurrentTimestamp();
5311 
5312  /* Publish the bufid that Startup process waits on */
5313  SetStartupBufferPinWaitBufId(buffer - 1);
5314  /* Set alarm and then wait to be signaled by UnpinBuffer() */
5316  /* Reset the published bufid */
5318  }
5319  else
5320  ProcWaitForSignal(WAIT_EVENT_BUFFER_PIN);
5321 
5322  /*
5323  * Remove flag marking us as waiter. Normally this will not be set
5324  * anymore, but ProcWaitForSignal() can return for other signals as
5325  * well. We take care to only reset the flag if we're the waiter, as
5326  * theoretically another backend could have started waiting. That's
5327  * impossible with the current usages due to table level locking, but
5328  * better be safe.
5329  */
5330  buf_state = LockBufHdr(bufHdr);
5331  if ((buf_state & BM_PIN_COUNT_WAITER) != 0 &&
5333  buf_state &= ~BM_PIN_COUNT_WAITER;
5334  UnlockBufHdr(bufHdr, buf_state);
5335 
5336  PinCountWaitBuf = NULL;
5337  /* Loop back and try again */
5338  }
5339 }
bool TimestampDifferenceExceeds(TimestampTz start_time, TimestampTz stop_time, int msec)
Definition: timestamp.c:1790
TimestampTz GetCurrentTimestamp(void)
Definition: timestamp.c:1654
Datum now(PG_FUNCTION_ARGS)
Definition: timestamp.c:1618
#define BM_PIN_COUNT_WAITER
Definition: buf_internals.h:67
static BufferDesc * PinCountWaitBuf
Definition: bufmgr.c:175
int64 TimestampTz
Definition: timestamp.h:39
static volatile sig_atomic_t waiting
Definition: latch.c:162
@ PROCSIG_RECOVERY_CONFLICT_BUFFERPIN
Definition: procsignal.h:47
void set_ps_display_remove_suffix(void)
Definition: ps_status.c:421
void set_ps_display_suffix(const char *suffix)
Definition: ps_status.c:369
int DeadlockTimeout
Definition: proc.c:57
void SetStartupBufferPinWaitBufId(int bufid)
Definition: proc.c:659
void ProcWaitForSignal(uint32 wait_event_info)
Definition: proc.c:1866
void ResolveRecoveryConflictWithBufferPin(void)
Definition: standby.c:792
bool log_recovery_conflict_waits
Definition: standby.c:41
void LogRecoveryConflict(ProcSignalReason reason, TimestampTz wait_start, TimestampTz now, VirtualTransactionId *wait_list, bool still_waiting)
Definition: standby.c:273
int wait_backend_pgprocno
#define InHotStandby
Definition: xlogutils.h:57

References Assert, BM_PIN_COUNT_WAITER, BUF_STATE_GET_REFCOUNT, PrivateRefCountEntry::buffer, BUFFER_LOCK_EXCLUSIVE, BUFFER_LOCK_UNLOCK, BufferIsLocal, BufferIsPinned, CheckBufferIsPinnedOnce(), DeadlockTimeout, elog, ERROR, GetBufferDescriptor(), GetCurrentTimestamp(), InHotStandby, LockBuffer(), LockBufHdr(), log_recovery_conflict_waits, LogRecoveryConflict(), MyProcNumber, now(), PinCountWaitBuf, PROCSIG_RECOVERY_CONFLICT_BUFFERPIN, ProcWaitForSignal(), ResolveRecoveryConflictWithBufferPin(), set_ps_display_remove_suffix(), set_ps_display_suffix(), SetStartupBufferPinWaitBufId(), TimestampDifferenceExceeds(), UnlockBufHdr(), BufferDesc::wait_backend_pgprocno, and waiting.

Referenced by _bt_upgradelockbufcleanup(), ginVacuumPostingTree(), hashbulkdelete(), heap_force_common(), lazy_scan_heap(), XLogReadBufferForRedoExtended(), and ZeroAndLockBuffer().
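
A hedged sketch of a VACUUM-style caller: a "cleanup lock" is an exclusive content lock held while ours is the only pin, required before tuples can be physically moved or removed. The function name is hypothetical; real callers such as lazy_scan_heap() usually try the conditional variant first and may simply skip busy pages.

    #include "postgres.h"
    #include "storage/bufmgr.h"

    static Buffer
    get_page_for_cleanup(Relation rel, BlockNumber blkno)
    {
        Buffer      buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno,
                                             RBM_NORMAL, NULL);

        if (!ConditionalLockBufferForCleanup(buf))
            LockBufferForCleanup(buf);  /* waits until we are the sole pinner */

        return buf;     /* caller eventually calls UnlockReleaseBuffer(buf) */
    }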

◆ LockBufHdr()

uint32 LockBufHdr ( BufferDesc desc)

Definition at line 5734 of file bufmgr.c.

5735 {
5736  SpinDelayStatus delayStatus;
5737  uint32 old_buf_state;
5738 
5739  Assert(!BufferIsLocal(BufferDescriptorGetBuffer(desc)));
5740 
5741  init_local_spin_delay(&delayStatus);
5742 
5743  while (true)
5744  {
5745  /* set BM_LOCKED flag */
5746  old_buf_state = pg_atomic_fetch_or_u32(&desc->state, BM_LOCKED);
5747  /* if it wasn't set before we're OK */
5748  if (!(old_buf_state & BM_LOCKED))
5749  break;
5750  perform_spin_delay(&delayStatus);
5751  }
5752  finish_spin_delay(&delayStatus);
5753  return old_buf_state | BM_LOCKED;
5754 }
static uint32 pg_atomic_fetch_or_u32(volatile pg_atomic_uint32 *ptr, uint32 or_)
Definition: atomics.h:405
void perform_spin_delay(SpinDelayStatus *status)
Definition: s_lock.c:132
void finish_spin_delay(SpinDelayStatus *status)
Definition: s_lock.c:192
#define init_local_spin_delay(status)
Definition: s_lock.h:843

References Assert, BM_LOCKED, BufferDescriptorGetBuffer(), BufferIsLocal, finish_spin_delay(), init_local_spin_delay, perform_spin_delay(), pg_atomic_fetch_or_u32(), and BufferDesc::state.

Referenced by AbortBufferIO(), apw_dump_now(), BufferAlloc(), BufferGetLSNAtomic(), BufferSync(), ConditionalLockBufferForCleanup(), DropDatabaseBuffers(), DropRelationBuffers(), DropRelationsAllBuffers(), EvictUnpinnedBuffer(), ExtendBufferedRelShared(), FindAndDropRelationBuffers(), FlushBuffer(), FlushDatabaseBuffers(), FlushRelationBuffers(), FlushRelationsAllBuffers(), GetBufferFromRing(), GetVictimBuffer(), InvalidateBuffer(), InvalidateVictimBuffer(), IsBufferCleanupOK(), LockBufferForCleanup(), MarkBufferDirtyHint(), pg_buffercache_pages(), ReadRecentBuffer(), StartBufferIO(), StrategyGetBuffer(), SyncOneBuffer(), TerminateBufferIO(), UnlockBuffers(), UnpinBufferNoOwner(), and WaitIO().
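
A hedged sketch of the header-spinlock protocol: the returned state word has BM_LOCKED set, and the caller passes an updated word back to UnlockBufHdr(), which stores it with BM_LOCKED cleared. The wrapper name is hypothetical; such code lives inside the buffer manager and includes buf_internals.h.

    #include "postgres.h"
    #include "storage/buf_internals.h"

    static bool
    buffer_is_dirty(BufferDesc *desc)
    {
        uint32      buf_state = LockBufHdr(desc);   /* spins until we hold BM_LOCKED */
        bool        dirty = (buf_state & BM_DIRTY) != 0;

        UnlockBufHdr(desc, buf_state);              /* releases the header spinlock */
        return dirty;
    }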

◆ MarkBufferDirty()

void MarkBufferDirty ( Buffer  buffer)

Definition at line 2520 of file bufmgr.c.

2521 {
2522  BufferDesc *bufHdr;
2523  uint32 buf_state;
2524  uint32 old_buf_state;
2525 
2526  if (!BufferIsValid(buffer))
2527  elog(ERROR, "bad buffer ID: %d", buffer);
2528 
2529  if (BufferIsLocal(buffer))
2530  {
2531  MarkLocalBufferDirty(buffer);
2532  return;
2533  }
2534 
2535  bufHdr = GetBufferDescriptor(buffer - 1);
2536 
2537  Assert(BufferIsPinned(buffer));
2538  Assert(LWLockHeldByMeInMode(BufferDescriptorGetContentLock(bufHdr),
2539  LW_EXCLUSIVE));
2540 
2541  old_buf_state = pg_atomic_read_u32(&bufHdr->state);
2542  for (;;)
2543  {
2544  if (old_buf_state & BM_LOCKED)
2545  old_buf_state = WaitBufHdrUnlocked(bufHdr);
2546 
2547  buf_state = old_buf_state;
2548 
2549  Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
2550  buf_state |= BM_DIRTY | BM_JUST_DIRTIED;
2551 
2552  if (pg_atomic_compare_exchange_u32(&bufHdr->state, &old_buf_state,
2553  buf_state))
2554  break;
2555  }
2556 
2557  /*
2558  * If the buffer was not dirty already, do vacuum accounting.
2559  */
2560  if (!(old_buf_state & BM_DIRTY))
2561  {
2562  VacuumPageDirty++;
2563  pgBufferUsage.shared_blks_dirtied++;
2564  if (VacuumCostActive)
2565  VacuumCostBalance += VacuumCostPageDirty;
2566  }
2567 }
static bool pg_atomic_compare_exchange_u32(volatile pg_atomic_uint32 *ptr, uint32 *expected, uint32 newval)
Definition: atomics.h:344
static uint32 WaitBufHdrUnlocked(BufferDesc *buf)
Definition: bufmgr.c:5764
bool VacuumCostActive
Definition: globals.c:159
int64 VacuumPageDirty
Definition: globals.c:156
int VacuumCostBalance
Definition: globals.c:158
int VacuumCostPageDirty
Definition: globals.c:150
void MarkLocalBufferDirty(Buffer buffer)
Definition: localbuf.c:449
int64 shared_blks_dirtied
Definition: instrument.h:28

References Assert, BM_DIRTY, BM_JUST_DIRTIED, BM_LOCKED, BUF_STATE_GET_REFCOUNT, PrivateRefCountEntry::buffer, BufferDescriptorGetContentLock(), BufferIsLocal, BufferIsPinned, BufferIsValid(), elog, ERROR, GetBufferDescriptor(), LW_EXCLUSIVE, LWLockHeldByMeInMode(), MarkLocalBufferDirty(), pg_atomic_compare_exchange_u32(), pg_atomic_read_u32(), pgBufferUsage, BufferUsage::shared_blks_dirtied, BufferDesc::state, VacuumCostActive, VacuumCostBalance, VacuumCostPageDirty, VacuumPageDirty, and WaitBufHdrUnlocked().

Referenced by _bt_clear_incomplete_split(), _bt_dedup_pass(), _bt_delitems_delete(), _bt_delitems_vacuum(), _bt_getroot(), _bt_insertonpg(), _bt_mark_page_halfdead(), _bt_newlevel(), _bt_restore_meta(), _bt_set_cleanup_info(), _bt_split(), _bt_unlink_halfdead_page(), _hash_addovflpage(), _hash_doinsert(), _hash_expandtable(), _hash_freeovflpage(), _hash_init(), _hash_splitbucket(), _hash_squeezebucket(), _hash_vacuum_one_page(), addLeafTuple(), brin_doinsert(), brin_doupdate(), brin_initialize_empty_new_buffer(), brin_xlog_createidx(), brin_xlog_desummarize_page(), brin_xlog_insert_update(), brin_xlog_revmap_extend(), brin_xlog_samepage_update(), brin_xlog_update(), brinbuild(), brinbuildempty(), brinRevmapDesummarizeRange(), btree_xlog_dedup(), btree_xlog_delete(), btree_xlog_insert(), btree_xlog_mark_page_halfdead(), btree_xlog_newroot(), btree_xlog_split(), btree_xlog_unlink_page(), btree_xlog_vacuum(), createPostingTree(), dataExecPlaceToPageInternal(), dataExecPlaceToPageLeaf(), do_setval(), doPickSplit(), entryExecPlaceToPage(), fill_seq_fork_with_data(), FreeSpaceMapPrepareTruncateRel(), generic_redo(), GenericXLogFinish(), ginbuild(), ginbuildempty(), ginbulkdelete(), ginDeletePage(), ginHeapTupleFastInsert(), ginPlaceToPage(), ginRedoClearIncompleteSplit(), ginRedoCreatePTree(), ginRedoDeleteListPages(), ginRedoDeletePage(), ginRedoInsert(), ginRedoInsertListPage(), ginRedoUpdateMetapage(), ginRedoVacuumDataLeafPage(), ginUpdateStats(), ginVacuumPostingTreeLeaf(), gistbuild(), gistbuildempty(), gistdeletepage(), gistplacetopage(), gistprunepage(), gistRedoClearFollowRight(), gistRedoDeleteRecord(), gistRedoPageDelete(), gistRedoPageSplitRecord(), gistRedoPageUpdateRecord(), gistvacuumpage(), hash_xlog_add_ovfl_page(), hash_xlog_delete(), hash_xlog_init_bitmap_page(), hash_xlog_init_meta_page(), hash_xlog_insert(), hash_xlog_move_page_contents(), hash_xlog_split_allocate_page(), hash_xlog_split_cleanup(), hash_xlog_split_complete(), hash_xlog_squeeze_page(), hash_xlog_update_meta_page(), hash_xlog_vacuum_one_page(), hashbucketcleanup(), hashbulkdelete(), heap_abort_speculative(), heap_delete(), heap_finish_speculative(), heap_force_common(), heap_inplace_update(), heap_insert(), heap_lock_tuple(), heap_lock_updated_tuple_rec(), heap_multi_insert(), heap_page_prune_and_freeze(), heap_update(), heap_xlog_confirm(), heap_xlog_delete(), heap_xlog_inplace(), heap_xlog_insert(), heap_xlog_lock(), heap_xlog_lock_updated(), heap_xlog_multi_insert(), heap_xlog_prune_freeze(), heap_xlog_update(), heap_xlog_visible(), lazy_scan_new_or_empty(), lazy_scan_prune(), lazy_vacuum_heap_page(), log_newpage_range(), moveLeafs(), nextval_internal(), RelationAddBlocks(), RelationCopyStorageUsingBuffer(), RelationGetBufferForTuple(), revmap_physical_extend(), saveNodeLink(), seq_redo(), shiftList(), spgAddNodeAction(), spgbuild(), SpGistUpdateMetaPage(), spgRedoAddLeaf(), spgRedoAddNode(), spgRedoMoveLeafs(), spgRedoPickSplit(), spgRedoSplitTuple(), spgRedoVacuumLeaf(), spgRedoVacuumRedirect(), spgRedoVacuumRoot(), spgSplitNodeAction(), vacuumLeafPage(), vacuumLeafRoot(), vacuumRedirectAndPlaceholder(), visibilitymap_clear(), visibilitymap_prepare_truncate(), visibilitymap_set(), writeListPage(), and XLogReadBufferForRedoExtended().
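
A hedged sketch of the standard page-modification sequence (see src/backend/access/transam/README): the buffer is marked dirty inside the critical section before the WAL record is inserted, and the page LSN is stamped afterwards. apply_change, RM_MY_ID, and XLOG_MY_OP are hypothetical placeholders, not real resource-manager identifiers.

    #include "postgres.h"
    #include "access/xloginsert.h"
    #include "miscadmin.h"
    #include "storage/bufmgr.h"

    #define RM_MY_ID    128         /* hypothetical custom rmgr ID */
    #define XLOG_MY_OP  0x00        /* hypothetical record info bits */

    static void
    modify_page(Relation rel, BlockNumber blkno)
    {
        Buffer      buf = ReadBuffer(rel, blkno);
        XLogRecPtr  recptr;

        LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);

        START_CRIT_SECTION();

        /* apply_change(BufferGetPage(buf)); -- the actual page edit goes here */
        MarkBufferDirty(buf);       /* must happen before XLogInsert() */

        XLogBeginInsert();
        XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
        recptr = XLogInsert(RM_MY_ID, XLOG_MY_OP);
        PageSetLSN(BufferGetPage(buf), recptr);

        END_CRIT_SECTION();

        UnlockReleaseBuffer(buf);
    }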

◆ MarkBufferDirtyHint()

void MarkBufferDirtyHint ( Buffer  buffer,
bool  buffer_std 
)

Definition at line 4960 of file bufmgr.c.

4961 {
4962  BufferDesc *bufHdr;
4963  Page page = BufferGetPage(buffer);
4964 
4965  if (!BufferIsValid(buffer))
4966  elog(ERROR, "bad buffer ID: %d", buffer);
4967 
4968  if (BufferIsLocal(buffer))
4969  {
4970  MarkLocalBufferDirty(buffer);
4971  return;
4972  }
4973 
4974  bufHdr = GetBufferDescriptor(buffer - 1);
4975 
4976  Assert(GetPrivateRefCount(buffer) > 0);
4977  /* here, either share or exclusive lock is OK */
4978  Assert(LWLockHeldByMe(BufferDescriptorGetContentLock(bufHdr)));
4979 
4980  /*
4981  * This routine might get called many times on the same page, if we are
4982  * making the first scan after commit of an xact that added/deleted many
4983  * tuples. So, be as quick as we can if the buffer is already dirty. We
4984  * do this by not acquiring spinlock if it looks like the status bits are
4985  * already set. Since we make this test unlocked, there's a chance we
4986  * might fail to notice that the flags have just been cleared, and failed
4987  * to reset them, due to memory-ordering issues. But since this function
4988  * is only intended to be used in cases where failing to write out the
4989  * data would be harmless anyway, it doesn't really matter.
4990  */
4991  if ((pg_atomic_read_u32(&bufHdr->state) & (BM_DIRTY | BM_JUST_DIRTIED)) !=
4992  (BM_DIRTY | BM_JUST_DIRTIED))
4993  {
4994  XLogRecPtr lsn = InvalidXLogRecPtr;
4995  bool dirtied = false;
4996  bool delayChkptFlags = false;
4997  uint32 buf_state;
4998 
4999  /*
5000  * If we need to protect hint bit updates from torn writes, WAL-log a
5001  * full page image of the page. This full page image is only necessary
5002  * if the hint bit update is the first change to the page since the
5003  * last checkpoint.
5004  *
5005  * We don't check full_page_writes here because that logic is included
5006  * when we call XLogInsert() since the value changes dynamically.
5007  */
5008  if (XLogHintBitIsNeeded() &&
5009  (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT))
5010  {
5011  /*
5012  * If we must not write WAL, due to a relfilelocator-specific
5013  * condition or being in recovery, don't dirty the page. We can
5014  * set the hint, just not dirty the page as a result so the hint
5015  * is lost when we evict the page or shutdown.
5016  *
5017  * See src/backend/storage/page/README for longer discussion.
5018  */
5019  if (RecoveryInProgress() ||
5021  return;
5022 
5023  /*
5024  * If the block is already dirty because we either made a change
5025  * or set a hint already, then we don't need to write a full page
5026  * image. Note that aggressive cleaning of blocks dirtied by hint
5027  * bit setting would increase the call rate. Bulk setting of hint
5028  * bits would reduce the call rate...
5029  *
5030  * We must issue the WAL record before we mark the buffer dirty.
5031  * Otherwise we might write the page before we write the WAL. That
5032  * causes a race condition, since a checkpoint might occur between
5033  * writing the WAL record and marking the buffer dirty. We solve
5034  * that with a kluge, but one that is already in use during
5035  * transaction commit to prevent race conditions. Basically, we
5036  * simply prevent the checkpoint WAL record from being written
5037  * until we have marked the buffer dirty. We don't start the
5038  * checkpoint flush until we have marked dirty, so our checkpoint
5039  * must flush the change to disk successfully or the checkpoint
5040  * never gets written, so crash recovery will fix.
5041  *
5042  * It's possible we may enter here without an xid, so it is
5043  * essential that CreateCheckPoint waits for virtual transactions
5044  * rather than full transactionids.
5045  */
5048  delayChkptFlags = true;
5049  lsn = XLogSaveBufferForHint(buffer, buffer_std);
5050  }
5051 
5052  buf_state = LockBufHdr(bufHdr);
5053 
5054  Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
5055 
5056  if (!(buf_state & BM_DIRTY))
5057  {
5058  dirtied = true; /* Means "will be dirtied by this action" */
5059 
5060  /*
5061  * Set the page LSN if we wrote a backup block. We aren't supposed
5062  * to set this when only holding a share lock but as long as we
5063  * serialise it somehow we're OK. We choose to set LSN while
5064  * holding the buffer header lock, which causes any reader of an
5065  * LSN who holds only a share lock to also obtain a buffer header
5066  * lock before using PageGetLSN(), which is enforced in
5067  * BufferGetLSNAtomic().
5068  *
5069  * If checksums are enabled, you might think we should reset the
5070  * checksum here. That will happen when the page is written
5071  * sometime later in this checkpoint cycle.
5072  */
5073  if (!XLogRecPtrIsInvalid(lsn))
5074  PageSetLSN(page, lsn);
5075  }
5076 
5077  buf_state |= BM_DIRTY | BM_JUST_DIRTIED;
5078  UnlockBufHdr(bufHdr, buf_state);
5079 
5080  if (delayChkptFlags)
5082 
5083  if (dirtied)
5084  {
5085  VacuumPageDirty++;
5087  if (VacuumCostActive)
5089  }
5090  }
5091 }
static void PageSetLSN(Page page, XLogRecPtr lsn)
Definition: bufpage.h:388
#define DELAY_CHKPT_START
Definition: proc.h:114
bool RelFileLocatorSkippingWAL(RelFileLocator rlocator)
Definition: storage.c:532
int delayChkptFlags
Definition: proc.h:236
bool RecoveryInProgress(void)
Definition: xlog.c:6290
#define XLogRecPtrIsInvalid(r)
Definition: xlogdefs.h:29
#define InvalidXLogRecPtr
Definition: xlogdefs.h:28
XLogRecPtr XLogSaveBufferForHint(Buffer buffer, bool buffer_std)
Definition: xloginsert.c:1065

References Assert, BM_DIRTY, BM_JUST_DIRTIED, BM_PERMANENT, BUF_STATE_GET_REFCOUNT, PrivateRefCountEntry::buffer, BufferDescriptorGetContentLock(), BufferGetPage(), BufferIsLocal, BufferIsValid(), BufTagGetRelFileLocator(), DELAY_CHKPT_START, PGPROC::delayChkptFlags, elog, ERROR, GetBufferDescriptor(), GetPrivateRefCount(), InvalidXLogRecPtr, LockBufHdr(), LWLockHeldByMe(), MarkLocalBufferDirty(), MyProc, PageSetLSN(), pg_atomic_read_u32(), pgBufferUsage, RecoveryInProgress(), RelFileLocatorSkippingWAL(), BufferUsage::shared_blks_dirtied, BufferDesc::state, BufferDesc::tag, UnlockBufHdr(), VacuumCostActive, VacuumCostBalance, VacuumCostPageDirty, VacuumPageDirty, XLogHintBitIsNeeded, XLogRecPtrIsInvalid, and XLogSaveBufferForHint().

Referenced by _bt_check_unique(), _bt_killitems(), _hash_kill_items(), brin_start_evacuating_page(), btvacuumpage(), fsm_search(), fsm_search_avail(), fsm_set_and_search(), fsm_vacuum_page(), gistkillitems(), heap_page_prune_and_freeze(), read_seq_tuple(), SetHintBits(), and XLogRecordPageWithFreeSpace().
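
The WAL-before-dirty interlock described in the listing is easiest to see from a caller's side. The sketch below is illustrative only: the function name set_tuple_hint and its arguments are invented for this example, loosely modelled on the SetHintBits() caller named above. The caller updates a hint while holding a pin and at least a shared content lock, and MarkBufferDirtyHint() then decides whether the page may actually be dirtied and whether a full-page image has to be logged first.

#include "postgres.h"
#include "access/htup_details.h"
#include "storage/bufmgr.h"

/* Hypothetical hint setter, modelled on SetHintBits(); not PostgreSQL code. */
static void
set_tuple_hint(HeapTupleHeader tuple, Buffer buffer, uint16 infomask)
{
	/* hint bits may be set while holding only a shared content lock plus a pin */
	tuple->t_infomask |= infomask;

	/* buffer_std = true: standard page layout, so the hole can be skipped in any FPI */
	MarkBufferDirtyHint(buffer, true);
}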

◆ NewPrivateRefCountEntry()

static PrivateRefCountEntry * NewPrivateRefCountEntry ( Buffer  buffer)
static

Definition at line 315 of file bufmgr.c.

316 {
317  PrivateRefCountEntry *res;
318 
319  /* only allowed to be called when a reservation has been made */
320  Assert(ReservedRefCountEntry != NULL);
321 
322  /* use up the reserved entry */
323  res = ReservedRefCountEntry;
324  ReservedRefCountEntry = NULL;
325 
326  /* and fill it */
327  res->buffer = buffer;
328  res->refcount = 0;
329 
330  return res;
331 }

References Assert, PrivateRefCountEntry::buffer, res, and ReservedRefCountEntry.

Referenced by PinBuffer(), and PinBuffer_Locked().
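
Both this function and ReservePrivateRefCountEntry() are static to bufmgr.c, so the sketch below could only live inside that file; it is meant to show the calling discipline rather than to be copied. The reservation is made while no buffer-header spinlock is held (it may need to allocate or spill an old entry into the hash table), and NewPrivateRefCountEntry() later consumes it at a point where failure is not acceptable. The helper name remember_new_pin is invented for this example.

/* Sketch of the reserve-then-create discipline used inside bufmgr.c. */
static void
remember_new_pin(BufferDesc *bufHdr)
{
	PrivateRefCountEntry *ref;

	/* may allocate or move an old entry to the hash table; no spinlock held */
	ReservePrivateRefCountEntry();

	/* ... later, typically around the buffer-header spinlock ... */

	/* consumes the reservation; guaranteed not to fail or allocate here */
	ref = NewPrivateRefCountEntry(BufferDescriptorGetBuffer(bufHdr));
	ref->refcount++;			/* a real caller also bumps buf->state's refcount */
}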

◆ PinBuffer()

static bool PinBuffer ( BufferDesc * buf,
BufferAccessStrategy  strategy 
)
static

Definition at line 2641 of file bufmgr.c.

2642 {
2643  Buffer b = BufferDescriptorGetBuffer(buf);
2644  bool result;
2645  PrivateRefCountEntry *ref;
2646 
2647  Assert(!BufferIsLocal(b));
2648  Assert(ReservedRefCountEntry != NULL);
2649 
2650  ref = GetPrivateRefCountEntry(b, true);
2651 
2652  if (ref == NULL)
2653  {
2654  uint32 buf_state;
2655  uint32 old_buf_state;
2656 
2657  ref = NewPrivateRefCountEntry(b);
2658 
2659  old_buf_state = pg_atomic_read_u32(&buf->state);
2660  for (;;)
2661  {
2662  if (old_buf_state & BM_LOCKED)
2663  old_buf_state = WaitBufHdrUnlocked(buf);
2664 
2665  buf_state = old_buf_state;
2666 
2667  /* increase refcount */
2668  buf_state += BUF_REFCOUNT_ONE;
2669 
2670  if (strategy == NULL)
2671  {
2672  /* Default case: increase usagecount unless already max. */
2673  if (BUF_STATE_GET_USAGECOUNT(buf_state) != BM_MAX_USAGE_COUNT)
2674  buf_state += BUF_USAGECOUNT_ONE;
2675  }
2676  else
2677  {
2678  /*
2679  * Ring buffers shouldn't evict others from pool. Thus we
2680  * don't make usagecount more than 1.
2681  */
2682  if (BUF_STATE_GET_USAGECOUNT(buf_state) == 0)
2683  buf_state += BUF_USAGECOUNT_ONE;
2684  }
2685 
2686  if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state,
2687  buf_state))
2688  {
2689  result = (buf_state & BM_VALID) != 0;
2690 
2691  /*
2692  * Assume that we acquired a buffer pin for the purposes of
2693  * Valgrind buffer client checks (even in !result case) to
2694  * keep things simple. Buffers that are unsafe to access are
2695  * not generally guaranteed to be marked undefined or
2696  * non-accessible in any case.
2697  */
2698  VALGRIND_MAKE_MEM_DEFINED(BufHdrGetBlock(buf), BLCKSZ);
2699  break;
2700  }
2701  }
2702  }
2703  else
2704  {
2705  /*
2706  * If we previously pinned the buffer, it is likely to be valid, but
2707  * it may not be if StartReadBuffers() was called and
2708  * WaitReadBuffers() hasn't been called yet. We'll check by loading
2709  * the flags without locking. This is racy, but it's OK to return
2710  * false spuriously: when WaitReadBuffers() calls StartBufferIO(),
2711  * it'll see that it's now valid.
2712  *
2713  * Note: We deliberately avoid a Valgrind client request here.
2714  * Individual access methods can optionally superimpose buffer page
2715  * client requests on top of our client requests to enforce that
2716  * buffers are only accessed while locked (and pinned). It's possible
2717  * that the buffer page is legitimately non-accessible here. We
2718  * cannot meddle with that.
2719  */
2720  result = (pg_atomic_read_u32(&buf->state) & BM_VALID) != 0;
2721  }
2722 
2723  ref->refcount++;
2724  Assert(ref->refcount > 0);
2725  ResourceOwnerRememberBuffer(CurrentResourceOwner, b);
2726  return result;
2727 }

References Assert, b, BM_LOCKED, BM_MAX_USAGE_COUNT, BM_VALID, buf, BUF_REFCOUNT_ONE, BUF_STATE_GET_USAGECOUNT, BUF_USAGECOUNT_ONE, BufferDescriptorGetBuffer(), BufferIsLocal, BufHdrGetBlock, CurrentResourceOwner, GetPrivateRefCountEntry(), NewPrivateRefCountEntry(), pg_atomic_compare_exchange_u32(), pg_atomic_read_u32(), PrivateRefCountEntry::refcount, ReservedRefCountEntry, ResourceOwnerRememberBuffer(), VALGRIND_MAKE_MEM_DEFINED, and WaitBufHdrUnlocked().

Referenced by BufferAlloc(), ExtendBufferedRelShared(), and ReadRecentBuffer().
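
The heart of the function is the lock-free update loop on the buffer's state word. The sketch below isolates that pattern for the refcount only, ignoring the usage count and the BM_VALID result; it uses the same helpers as the listing, including the file-local WaitBufHdrUnlocked(), and is a simplification under those assumptions rather than a replacement for PinBuffer().

/* Simplified restatement of PinBuffer()'s compare-and-swap retry loop. */
static uint32
bump_refcount_sketch(BufferDesc *buf)
{
	uint32		old_buf_state = pg_atomic_read_u32(&buf->state);
	uint32		buf_state;

	for (;;)
	{
		/* never CAS while the header spinlock is held; wait it out first */
		if (old_buf_state & BM_LOCKED)
			old_buf_state = WaitBufHdrUnlocked(buf);

		buf_state = old_buf_state + BUF_REFCOUNT_ONE;

		/* on failure, old_buf_state is reloaded with the current value */
		if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state,
										   buf_state))
			return buf_state;
	}
}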

◆ PinBuffer_Locked()

static void PinBuffer_Locked ( BufferDesc * buf)
static

Definition at line 2752 of file bufmgr.c.

2753 {
2754  Buffer b;
2755  PrivateRefCountEntry *ref;
2756  uint32 buf_state;
2757 
2758  /*
2759  * As explained, we don't expect any preexisting pins. That allows us to
2760  * manipulate the PrivateRefCount after releasing the spinlock
2761  */
2762  Assert(GetPrivateRefCountEntry(BufferDescriptorGetBuffer(buf), false) == NULL);
2763 
2764  /*
2765  * Buffer can't have a preexisting pin, so mark its page as defined to
2766  * Valgrind (this is similar to the PinBuffer() case where the backend
2767  * doesn't already have a buffer pin)
2768  */
2769  VALGRIND_MAKE_MEM_DEFINED(BufHdrGetBlock(buf), BLCKSZ);
2770 
2771  /*
2772  * Since we hold the buffer spinlock, we can update the buffer state and
2773  * release the lock in one operation.
2774  */
2775  buf_state = pg_atomic_read_u32(&buf->state);
2776  Assert(buf_state & BM_LOCKED);
2777  buf_state += BUF_REFCOUNT_ONE;
2778  UnlockBufHdr(buf, buf_state);
2779 
2780  b = BufferDescriptorGetBuffer(buf);
2781 
2782  ref = NewPrivateRefCountEntry(b);
2783  ref->refcount++;
2784 
2785  ResourceOwnerRememberBuffer(CurrentResourceOwner, b);
2786 }

References Assert, b, BM_LOCKED, buf, BUF_REFCOUNT_ONE, BufferDescriptorGetBuffer(), BufHdrGetBlock, CurrentResourceOwner, GetPrivateRefCountEntry(), NewPrivateRefCountEntry(), pg_atomic_read_u32(), PrivateRefCountEntry::refcount, ResourceOwnerRememberBuffer(), UnlockBufHdr(), and VALGRIND_MAKE_MEM_DEFINED.

Referenced by EvictUnpinnedBuffer(), FlushDatabaseBuffers(), FlushRelationBuffers(), FlushRelationsAllBuffers(), GetVictimBuffer(), ReadRecentBuffer(), and SyncOneBuffer().
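
The "Referenced by" list shows the typical shape of a caller: reserve a private refcount entry first, take the buffer-header spinlock to inspect the state, and then either hand the still-locked header to PinBuffer_Locked() (which releases the spinlock) or release the spinlock unchanged. The fragment below sketches that shape inside bufmgr.c; the BM_DIRTY test stands in for whatever condition the real caller checks, and the work done on the pinned page is elided.

/* Sketch of a typical PinBuffer_Locked() caller, e.g. a flush routine. */
static void
pin_if_interesting(BufferDesc *bufHdr)
{
	uint32		buf_state;

	/* must happen before any buffer-header spinlock is taken */
	ReservePrivateRefCountEntry();

	buf_state = LockBufHdr(bufHdr);

	if (buf_state & BM_DIRTY)	/* stand-in for the caller's real test */
	{
		/* pins the buffer and releases the header spinlock for us */
		PinBuffer_Locked(bufHdr);

		LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
		/* ... flush or examine the page ... */
		LWLockRelease(BufferDescriptorGetContentLock(bufHdr));

		UnpinBuffer(bufHdr);
	}
	else
		UnlockBufHdr(bufHdr, buf_state);
}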

◆ PinBufferForBlock()

static pg_attribute_always_inline Buffer PinBufferForBlock ( Relation  rel,
SMgrRelation  smgr,
char  smgr_persistence,
ForkNumber  forkNum,
BlockNumber  blockNum,
BufferAccessStrategy  strategy,
bool * foundPtr 
)
static

Definition at line 1105 of file bufmgr.c.

1112 {
1113  BufferDesc *bufHdr;
1114  IOContext io_context;
1115  IOObject io_object;
1116  char persistence;
1117 
1118  Assert(blockNum != P_NEW);
1119 
1120  /*
1121  * If there is no Relation it usually implies recovery and thus permanent,
1122  * but we take an argument because CreateAndCopyRelationData can reach us
1123  * with only an SMgrRelation for an unlogged relation that we don't want
1124  * to flag with BM_PERMANENT.
1125  */
1126  if (rel)
1127  persistence = rel->rd_rel->relpersistence;
1128  else if (smgr_persistence == 0)
1129  persistence = RELPERSISTENCE_PERMANENT;
1130  else
1131  persistence = smgr_persistence;
1132 
1133  if (persistence == RELPERSISTENCE_TEMP)
1134  {
1135  io_context = IOCONTEXT_NORMAL;
1136  io_object = IOOBJECT_TEMP_RELATION;
1137  }
1138  else
1139  {
1140  io_context = IOContextForStrategy(strategy);
1141  io_object = IOOBJECT_RELATION;
1142  }
1143 
1144  TRACE_POSTGRESQL_BUFFER_READ_START(forkNum, blockNum,
1145  smgr->smgr_rlocator.locator.spcOid,
1146  smgr->smgr_rlocator.locator.dbOid,
1147  smgr->smgr_rlocator.locator.relNumber,
1148  smgr->smgr_rlocator.backend);
1149 
1150  if (persistence == RELPERSISTENCE_TEMP)
1151  {
1152  bufHdr = LocalBufferAlloc(smgr, forkNum, blockNum, foundPtr);
1153  if (*foundPtr)
1154  pgBufferUsage.local_blks_hit++;
1155  }
1156  else
1157  {
1158  bufHdr = BufferAlloc(smgr, persistence, forkNum, blockNum,
1159  strategy, foundPtr, io_context);
1160  if (*foundPtr)
1161  pgBufferUsage.shared_blks_hit++;
1162  }
1163  if (rel)
1164  {
1165  /*
1166  * While pgBufferUsage's "read" counter isn't bumped unless we reach
1167  * WaitReadBuffers() (so, not for hits, and not for buffers that are
1168  * zeroed instead), the per-relation stats always count them.
1169  */
1170  pgstat_count_buffer_read(rel);
1171  if (*foundPtr)
1172  pgstat_count_buffer_hit(rel);
1173  }
1174  if (*foundPtr)
1175  {
1176  VacuumPageHit++;
1177  pgstat_count_io_op(io_object, io_context, IOOP_HIT);
1178  if (VacuumCostActive)
1179  VacuumCostBalance += VacuumCostPageHit;
1180 
1181  TRACE_POSTGRESQL_BUFFER_READ_DONE(forkNum, blockNum,
1182  smgr->smgr_rlocator.locator.spcOid,
1183  smgr->smgr_rlocator.locator.dbOid,
1184  smgr->smgr_rlocator.locator.relNumber,
1185  smgr->smgr_rlocator.backend,
1186  true);
1187  }
1188 
1189  return BufferDescriptorGetBuffer(bufHdr);
1190 }
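
The first half of the function only classifies the request before any buffer lookup happens. The sketch below restates that classification as a standalone helper to make the precedence explicit; the name classify_io is invented for this example, and the snippet assumes bufmgr.c's usual headers. An explicit Relation wins, a zero smgr_persistence means "assume permanent" (the recovery case), and temporary relations bypass the strategy-based IO context.

/* Invented helper restating PinBufferForBlock()'s classification step. */
static void
classify_io(Relation rel, char smgr_persistence, BufferAccessStrategy strategy,
			IOContext *io_context, IOObject *io_object)
{
	char		persistence;

	if (rel)
		persistence = rel->rd_rel->relpersistence;
	else if (smgr_persistence == 0)
		persistence = RELPERSISTENCE_PERMANENT;	/* e.g. called during recovery */
	else
		persistence = smgr_persistence;

	if (persistence == RELPERSISTENCE_TEMP)
	{
		*io_context = IOCONTEXT_NORMAL;
		*io_object = IOOBJECT_TEMP_RELATION;
	}
	else
	{
		*io_context = IOContextForStrategy(strategy);
		*io_object = IOOBJECT_RELATION;
	}
}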